# Cleaning Phenotypic Data

This notebook focuses on cleaning the phenotypic data for all sites. 
Many of the insights from the Exploratory Data Analysis notebook were used when writing this notebook.

The purpose of this notebook is to modify the dataframe to be ready to build a machine learning model. 
The primary issue with the current dataframe is the null values and excessive features. 

The resulting file is a .csv file of a cleaned dataframe. 

In [1]:
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def get_base_filepath():
    '''
    Access the filepath for the base folder of the project
    
    Input: None
    
    Output: The filepath to the root of the folder
    '''
    # Get current directory
    os.path.abspath(os.curdir)

    # Go up a directory level
    os.chdir('..')

    # Set baseline filepath to the project folder directory
    base_folder_filepath = os.path.abspath(os.curdir)
    return base_folder_filepath

In [3]:
def get_null_values(features, df):
    '''
    Generate a dataframe of the null value count and the minimum value
    
    Input:
        - A list of numeric features to find the null an min values for
        - A dataframe to access the features from
        
    Output: A dataframe of null value coutn and minimum value for each feature
    '''
    null_vals = dict()
    for col in features:
        null_vals[col] = (df[col].isnull().sum(), df[col].min())
        
    df_null_vals = pd.DataFrame(data=null_vals, index=['null_count', 'min_value'])
    return df_null_vals

In [4]:
def get_null_values(features, df):
    '''
    Generate a dataframe of the null value count and the minimum value
    
    Input:
        - A list of numeric features to find the null an min values for
        - A dataframe to access the features from
        
    Output: A dataframe of null value coutn and minimum value for each feature
    '''
    null_vals = dict()
    for col in features:
        null_vals[col] = (df[col].isnull().sum(), df[col].min())
        
    df_null_vals = pd.DataFrame(data=null_vals, index=['null_count', 'min_value'])
    return df_null_vals

In [5]:
# The folder for the project
base_folder_filepath = get_base_filepath() + '\\Data\\Phenotypic\\'

# Phenotypic data site folder
filepath = base_folder_filepath + 'allSubs_testSet_phenotypic_dx.csv'

phenotypics_filepath = base_folder_filepath + '\\Sites\\'

# Dataframe from filepath
df_pheno = pd.read_csv(filepath, index_col='ID')

In [6]:
# Create empty lists to store important values
features = [] # For the diagnosis
subjects = [] # For the patient id

# Iterate through each file in the folder
for site_pheno in os.listdir(phenotypics_filepath):
    # Access the filepath to the phenotypic data
    site_pheno_filepath = os.path.join(phenotypics_filepath, site_pheno)
    
    # Check if the current item in the directory is a file
    if os.path.isfile(site_pheno_filepath):
        # Read the file as a dataframe
        df_pheno = pd.read_csv(site_pheno_filepath, index_col='ScanDir ID')
        
        # Add the diagnosis to the list
        features.append(df_pheno)
        
        # Add the patient id to the list
        subjects.append(df_pheno.index)

In [7]:
df_all_phenos = pd.concat(features, axis=0)
df_all_phenos

Unnamed: 0_level_0,Site,Gender,Age,Handedness,DX,Secondary Dx,ADHD Measure,ADHD Index,Inattentive,Hyper/Impulsive,...,QC_S1_Rest_1,QC_S1_Rest_2,QC_S1_Rest_3,QC_S1_Rest_4,QC_S1_Rest_5,QC_S1_Rest_6,QC_S1_Anat,QC_S2_Rest_1,QC_S2_Rest_2,QC_S2_Anat
ScanDir ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1018959,3,0.0,12.36,1.0,0,,2.0,44.0,47.0,44.0,...,,,,,,,,,,
1019436,3,1.0,12.98,1.0,3,,2.0,71.0,60.0,66.0,...,,,,,,,,,,
1043241,3,1.0,9.12,1.0,0,,2.0,40.0,40.0,43.0,...,,,,,,,,,,
1266183,3,0.0,9.67,1.0,0,,2.0,47.0,44.0,43.0,...,,,,,,,,,,
1535233,3,1.0,9.64,0.0,0,,2.0,42.0,41.0,43.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15058,8,0.0,14.08,1.0,0,,,,,,...,1.0,1.0,1.0,,,,1.0,,,
15059,8,0.0,9.05,1.0,0,,,,,,...,0.0,1.0,0.0,,,,1.0,,,
15060,8,1.0,9.76,1.0,0,,,,,,...,0.0,0.0,0.0,,,,1.0,,,
15061,8,1.0,12.04,1.0,0,,,,,,...,0.0,0.0,0.0,,,,1.0,,,


In [8]:
df_all_phenos['DX'].isnull().sum()

0

In [9]:
df_all_phenos.columns

Index(['Site', 'Gender', 'Age', 'Handedness', 'DX', 'Secondary Dx ',
       'ADHD Measure', 'ADHD Index', 'Inattentive', 'Hyper/Impulsive',
       'IQ Measure', 'Verbal IQ', 'Performance IQ', 'Full2 IQ', 'Full4 IQ',
       'Med Status', 'QC_Rest_1', 'QC_Rest_2', 'QC_Rest_3', 'QC_Rest_4',
       'QC_Anatomical_1', 'QC_Anatomical_2', 'Study #', 'QC_S1_Rest_1',
       'QC_S1_Rest_2', 'QC_S1_Rest_3', 'QC_S1_Rest_4', 'QC_S1_Rest_5',
       'QC_S1_Rest_6', 'QC_S1_Anat', 'QC_S2_Rest_1', 'QC_S2_Rest_2',
       'QC_S2_Anat'],
      dtype='object')

In [10]:
drop_features = ['QC_Rest_1', 'QC_Rest_2', 'QC_Rest_3', 'QC_Rest_4',
       'QC_Anatomical_1', 'QC_Anatomical_2', 'Study #', 'QC_S1_Rest_1',
       'QC_S1_Rest_2', 'QC_S1_Rest_3', 'QC_S1_Rest_4', 'QC_S1_Rest_5',
       'QC_S1_Rest_6', 'QC_S1_Anat', 'QC_S2_Rest_1', 'QC_S2_Rest_2',
       'QC_S2_Anat']

df_all_phenos_filtered = df_all_phenos.copy()
df_all_phenos_filtered = df_all_phenos_filtered.drop(drop_features, axis=1)
df_all_phenos_filtered

Unnamed: 0_level_0,Site,Gender,Age,Handedness,DX,Secondary Dx,ADHD Measure,ADHD Index,Inattentive,Hyper/Impulsive,IQ Measure,Verbal IQ,Performance IQ,Full2 IQ,Full4 IQ,Med Status
ScanDir ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1018959,3,0.0,12.36,1.0,0,,2.0,44.0,47.0,44.0,1,99.0,115.0,,103,1.0
1019436,3,1.0,12.98,1.0,3,,2.0,71.0,60.0,66.0,1,124.0,108.0,,122,1.0
1043241,3,1.0,9.12,1.0,0,,2.0,40.0,40.0,43.0,1,128.0,106.0,,120,1.0
1266183,3,0.0,9.67,1.0,0,,2.0,47.0,44.0,43.0,1,136.0,96.0,,120,1.0
1535233,3,1.0,9.64,0.0,0,,2.0,42.0,41.0,43.0,1,106.0,135.0,,122,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15058,8,0.0,14.08,1.0,0,,,,,,4,,,,115,
15059,8,0.0,9.05,1.0,0,,,,,,4,,,,103,
15060,8,1.0,9.76,1.0,0,,,,,,4,,,,137,
15061,8,1.0,12.04,1.0,0,,,,,,4,,,,98,


In [11]:
df_all_phenos_filtered.loc[df_all_phenos_filtered['Full4 IQ'].isnull(), 'Full4 IQ'] = df_all_phenos_filtered.loc[df_all_phenos_filtered['Full4 IQ'].isnull(), 'Full2 IQ']

In [12]:
df_all_phenos_filtered.loc[df_all_phenos_filtered['Full4 IQ'].isnull(), 'Full4 IQ']

Series([], Name: Full4 IQ, dtype: int64)

In [13]:
df_all_phenos_filtered['IQ'] = df_all_phenos_filtered['Full4 IQ']
df_all_phenos_filtered = df_all_phenos_filtered.drop(['Full4 IQ', 'Full2 IQ'], axis=1)

In [14]:
df_all_phenos_filtered

Unnamed: 0_level_0,Site,Gender,Age,Handedness,DX,Secondary Dx,ADHD Measure,ADHD Index,Inattentive,Hyper/Impulsive,IQ Measure,Verbal IQ,Performance IQ,Med Status,IQ
ScanDir ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1018959,3,0.0,12.36,1.0,0,,2.0,44.0,47.0,44.0,1,99.0,115.0,1.0,103
1019436,3,1.0,12.98,1.0,3,,2.0,71.0,60.0,66.0,1,124.0,108.0,1.0,122
1043241,3,1.0,9.12,1.0,0,,2.0,40.0,40.0,43.0,1,128.0,106.0,1.0,120
1266183,3,0.0,9.67,1.0,0,,2.0,47.0,44.0,43.0,1,136.0,96.0,1.0,120
1535233,3,1.0,9.64,0.0,0,,2.0,42.0,41.0,43.0,1,106.0,135.0,1.0,122
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15058,8,0.0,14.08,1.0,0,,,,,,4,,,,115
15059,8,0.0,9.05,1.0,0,,,,,,4,,,,103
15060,8,1.0,9.76,1.0,0,,,,,,4,,,,137
15061,8,1.0,12.04,1.0,0,,,,,,4,,,,98


In [15]:
null_values = dict()
numeric_cols = ['Gender', 'Age', 'Handedness',
                'Verbal IQ', 'Performance IQ', 'IQ']

df_null_values_train = get_null_values(numeric_cols, df_all_phenos_filtered)

df_null_values_train.head()

Unnamed: 0,Gender,Age,Handedness,Verbal IQ,Performance IQ,IQ
null_count,1.0,0.0,1.0,140.0,140.0,0
min_value,0.0,7.09,-999.0,-999.0,-999.0,-999


In [16]:
for col in df_all_phenos_filtered.columns:
    df_all_phenos_filtered.loc[df_all_phenos_filtered[col] == -999, col] = None

In [17]:
null_values = dict()
numeric_cols = ['Gender', 'Age', 'Handedness',
                'Verbal IQ', 'Performance IQ', 'IQ']

df_null_values_train = get_null_values(numeric_cols, df_all_phenos_filtered)

df_null_values_train.head()

Unnamed: 0,Gender,Age,Handedness,Verbal IQ,Performance IQ,IQ
null_count,1.0,0.0,7.0,194.0,194.0,12.0
min_value,0.0,7.09,-0.2,65.0,54.0,73.0


In [18]:
df_all_phenos_filtered['Handedness'] = df_all_phenos_filtered['Handedness'].fillna(1)

In [19]:
df_all_phenos_filtered.groupby('DX')['Verbal IQ'].mean()

DX
0.0    114.563694
1.0    109.401639
2.0    105.666667
3.0    107.357895
Name: Verbal IQ, dtype: float64

In [25]:
df_all_phenos_filtered['Verbal IQ Filled'] = df_all_phenos_filtered['Verbal IQ']

df_all_phenos_filtered.loc[df_all_phenos_filtered['Verbal IQ Filled'].isnull(), 'Verbal IQ Filled'] = df_all_phenos_filtered.loc[df_all_phenos_filtered['Verbal IQ Filled'].isnull(), 'DX']

df_all_phenos_filtered.loc[df_all_phenos_filtered['Verbal IQ Filled'] == 0, 'Verbal IQ Filled'] = 114.563694
df_all_phenos_filtered.loc[df_all_phenos_filtered['Verbal IQ Filled'] == 1, 'Verbal IQ Filled'] = 109.401639
df_all_phenos_filtered.loc[df_all_phenos_filtered['Verbal IQ Filled'] == 2, 'Verbal IQ Filled'] = 105.666667
df_all_phenos_filtered.loc[df_all_phenos_filtered['Verbal IQ Filled'] == 3, 'Verbal IQ Filled'] = 107.357895

In [21]:
df_all_phenos_filtered.groupby('DX')['Performance IQ'].mean()

DX
0.0    109.984076
1.0    103.254098
2.0    113.333333
3.0    100.968421
Name: Performance IQ, dtype: float64

In [22]:
df_all_phenos_filtered['Performance IQ Filled'] = df_all_phenos_filtered['Performance IQ']

df_all_phenos_filtered.loc[df_all_phenos_filtered['Performance IQ Filled'].isnull(), 'Performance IQ Filled'] = df_all_phenos_filtered.loc[df_all_phenos_filtered['Performance IQ Filled'].isnull(), 'DX']

df_all_phenos_filtered.loc[df_all_phenos_filtered['Performance IQ Filled'] == 0, 'Performance IQ Filled'] = 109.984076
df_all_phenos_filtered.loc[df_all_phenos_filtered['Performance IQ Filled'] == 1, 'Performance IQ Filled'] = 103.254098
df_all_phenos_filtered.loc[df_all_phenos_filtered['Performance IQ Filled'] == 2, 'Performance IQ Filled'] = 113.333333
df_all_phenos_filtered.loc[df_all_phenos_filtered['Performance IQ Filled'] == 3, 'Performance IQ Filled'] = 100.968421

In [26]:
null_values = dict()
numeric_cols = ['Gender', 'Age', 'Handedness',
                'Verbal IQ Filled', 'Performance IQ Filled', 'IQ']

df_null_values_train = get_null_values(numeric_cols, df_all_phenos_filtered)

df_null_values_train.head()

Unnamed: 0,Gender,Age,Handedness,Verbal IQ Filled,Performance IQ Filled,IQ
null_count,1.0,0.0,0.0,0.0,0.0,12.0
min_value,0.0,7.09,-0.2,65.0,54.0,73.0


In [27]:
df_all_phenos_filtered.to_csv('testing2.csv')