In [1]:
import pandas as pd

In [2]:
# Read the feature table and the label table from the local Datasets/ folder.
# The two tables share the 'P_Id' column, which is used as the join key below.
features_df = pd.read_csv('Datasets/features.csv')
labels_df = pd.read_csv('Datasets/labels.csv')

In [3]:
# Show the (rows, columns) dimensions of the feature table.
features_df.shape

(102, 93)

In [4]:
# Show the (rows, columns) dimensions of the label table.
labels_df.shape

(110, 2)

In [5]:
# Join features and labels on the shared 'P_Id' key.
# This is an inner join (the pandas default, spelled out here): rows whose
# 'P_Id' does not appear in both tables are dropped from the result.
data = features_df.merge(labels_df, on='P_Id', how='inner')

In [6]:
# Preview the first rows of the merged table.
data.head()

Unnamed: 0,P_Id,PPG_Rate_Mean,HRV_MeanNN,HRV_SDNN,HRV_SDANN1,HRV_SDNNI1,HRV_SDANN2,HRV_SDNNI2,HRV_SDANN5,HRV_SDNNI5,...,HRV_ShanEn,HRV_FuzzyEn,HRV_MSEn,HRV_CMSEn,HRV_RCMSEn,HRV_CD,HRV_HFD,HRV_KFD,HRV_LZC,anxiety_meter
0,101,72.160656,838.787356,161.144697,,,,,,,...,7.103751,1.291662,1.229793,1.392336,1.912396,1.691472,1.965384,4.166548,1.197715,7.0
1,102,94.333514,648.155039,196.751139,29.61793,192.373443,,,,,...,7.555224,1.371911,1.174007,1.288954,1.839586,1.707419,1.967041,2.788036,1.117846,16.0
2,103,91.269287,673.426396,240.470162,,,,,,,...,7.31614,1.471038,1.387296,1.315389,1.876066,1.833765,1.939231,4.690344,1.122028,10.0
3,104,76.862836,800.455621,295.652405,,,,,,,...,7.143425,1.156025,1.196983,1.302091,1.612782,1.572197,1.974733,2.787734,1.051012,8.0
4,105,68.40994,918.483221,655.450522,,,,,,,...,7.08494,1.133012,1.458232,1.30275,1.426864,1.427651,1.984928,2.193969,1.162819,15.0


In [7]:
def count_missing_values(df):
    """Tally the missing (NaN/None) entries of every column in a DataFrame.

    Columns without any missing entry are left out of the result.

    Args:
        df (pd.DataFrame): Table to inspect.

    Returns:
        dict: Maps each column label that has at least one missing value
            to the number of missing values found in that column.
    """
    # One count per column; isna() flags both NaN and None.
    na_per_column = df.isna().sum()
    # Boolean mask selecting only the columns that actually have gaps.
    has_missing = na_per_column.gt(0)
    return na_per_column[has_missing].to_dict()


In [8]:
# Per-column missing-value counts for the merged table (only columns with gaps).
missing_data_info = count_missing_values(data)

In [9]:
# Display the missing-value count of each affected column.
missing_data_info

{'HRV_SDANN1': 75,
 'HRV_SDNNI1': 75,
 'HRV_SDANN2': 101,
 'HRV_SDNNI2': 101,
 'HRV_SDANN5': 101,
 'HRV_SDNNI5': 101,
 'HRV_ULF': 101,
 'HRV_VLF': 51,
 'HRV_DFA_alpha2': 7,
 'HRV_MFDFA_alpha2_Width': 7,
 'HRV_MFDFA_alpha2_Peak': 7,
 'HRV_MFDFA_alpha2_Mean': 7,
 'HRV_MFDFA_alpha2_Max': 7,
 'HRV_MFDFA_alpha2_Delta': 7,
 'HRV_MFDFA_alpha2_Asymmetry': 7,
 'HRV_MFDFA_alpha2_Fluctuation': 7,
 'HRV_MFDFA_alpha2_Increment': 7}

In [10]:
# Collect the names of the columns that have more than 20 missing values;
# the actual removal happens in a later cell.
missing_cols_to_remove = [
    column_name
    for column_name, n_missing in missing_data_info.items()
    if n_missing > 20
]

In [11]:
# Display the columns selected for removal.
missing_cols_to_remove

['HRV_SDANN1',
 'HRV_SDNNI1',
 'HRV_SDANN2',
 'HRV_SDNNI2',
 'HRV_SDANN5',
 'HRV_SDNNI5',
 'HRV_ULF',
 'HRV_VLF']

In [12]:
# Drop the columns selected above.
# errors='ignore' keeps this cell safe to re-run: `data` is rebound here, so on
# a second execution the columns are already gone and the default
# errors='raise' would fail with a KeyError.
data = data.drop(columns=missing_cols_to_remove, errors='ignore')

In [13]:
# Check the (rows, columns) dimensions after dropping the selected columns.
data.shape

(101, 86)

In [14]:
# Split the merged table into the target column (y) and all remaining columns (X).
# NOTE(review): X still contains 'Unnamed: 0' and 'P_Id' (see the columns
# displayed for X) - confirm these are excluded before fitting any model.
y = data['anxiety_meter']
X = data.drop(columns='anxiety_meter')

In [15]:
# Display X (every column of `data` except 'anxiety_meter').
X

Unnamed: 0,P_Id,PPG_Rate_Mean,HRV_MeanNN,HRV_SDNN,HRV_RMSSD,HRV_SDSD,HRV_CVNN,HRV_CVSD,HRV_MedianNN,HRV_MadNN,...,HRV_SampEn,HRV_ShanEn,HRV_FuzzyEn,HRV_MSEn,HRV_CMSEn,HRV_RCMSEn,HRV_CD,HRV_HFD,HRV_KFD,HRV_LZC
0,101,72.160656,838.787356,161.144697,216.334225,216.962029,0.192116,0.257913,853.5,106.0059,...,1.662548,7.103751,1.291662,1.229793,1.392336,1.912396,1.691472,1.965384,4.166548,1.197715
1,102,94.333514,648.155039,196.751139,269.605729,270.131772,0.303556,0.415959,636.5,152.7078,...,1.868132,7.555224,1.371911,1.174007,1.288954,1.839586,1.707419,1.967041,2.788036,1.117846
2,103,91.269287,673.426396,240.470162,313.700844,314.500938,0.357085,0.465828,609.0,203.1162,...,1.746639,7.316140,1.471038,1.387296,1.315389,1.876066,1.833765,1.939231,4.690344,1.122028
3,104,76.862836,800.455621,295.652405,419.562698,420.816766,0.369355,0.524155,744.0,182.3598,...,1.377711,7.143425,1.156025,1.196983,1.302091,1.612782,1.572197,1.974733,2.787734,1.051012
4,105,68.409940,918.483221,655.450522,920.360670,923.485198,0.713623,1.002044,776.0,324.6894,...,1.668343,7.084940,1.133012,1.458232,1.302750,1.426864,1.427651,1.984928,2.193969,1.162819
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,215,106.643368,570.586957,199.166962,245.539186,246.212727,0.349056,0.430327,501.5,64.4931,...,0.678190,6.976664,0.903258,1.056822,1.177499,1.092981,1.148055,1.907768,2.744730,0.736001
97,216,88.979798,691.214286,429.343966,515.791973,517.634875,0.621145,0.746211,550.5,123.7971,...,0.770202,6.889927,0.857505,0.837755,1.038622,0.956516,0.913066,1.897684,1.796485,1.018469
98,217,109.566313,553.231834,155.993383,196.165914,196.496027,0.281967,0.354582,500.0,50.4084,...,0.805586,7.184780,0.775767,0.978753,1.191947,1.138375,1.145786,1.932554,2.855720,0.763747
99,218,126.073719,478.073171,112.120465,133.252377,133.476196,0.234526,0.278728,456.0,34.8411,...,0.744972,6.726446,0.600827,0.474042,1.009792,0.961145,0.984105,1.917123,1.941572,0.839453
