# Cleaning Phenotypic Data

This notebook focuses on cleaning the phenotypic data for all sites. 
Many of the insights from the Exploratory Data Analysis notebook were used when writing this notebook.

The purpose of this notebook is to modify the dataframe to be ready to build a machine learning model. 
The primary issues with the current dataframe are its null values and excessive features. 

The results are .csv files of the cleaned dataframe: one for all subjects and one per site. 

In [1]:
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def get_base_filepath():
    '''
    Access the filepath for the base folder of the project

    The base folder is taken to be the parent of the current working
    directory. The working directory itself is left unchanged, so calling
    this function repeatedly always returns the same path.
    
    Input: None
    
    Output: The absolute filepath to the root of the folder
    '''
    # Resolve the parent of the current directory without os.chdir, which
    # would move the whole process up one more level on every call
    base_folder_filepath = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
    return base_folder_filepath

In [3]:
def get_null_values(features, df):
    '''
    Summarise the missing values and the smallest value of each feature
    
    Input:
        - features: the column names to summarise
        - df: the dataframe that holds those columns
        
    Output: A dataframe with one column per feature and two rows,
            'null_count' and 'min_value'
    '''
    # One (null count, minimum) pair per requested column
    summary = {
        name: (df[name].isnull().sum(), df[name].min())
        for name in features
    }
    return pd.DataFrame(data=summary, index=['null_count', 'min_value'])

In [4]:
# NOTE: this cell repeats the definition of get_null_values from the previous
# cell; being the later definition, it is the one in effect.
def get_null_values(features, df):
    '''
    Build a small report of null counts and minimum values
    
    Input:
        - features: the column names to report on
        - df: the dataframe to read those columns from
        
    Output: A dataframe indexed by 'null_count' and 'min_value' with one
            column per feature
    '''
    report = {}
    for feature in features:
        column = df[feature]
        report[feature] = (column.isnull().sum(), column.min())

    return pd.DataFrame(data=report, index=['null_count', 'min_value'])

In [5]:
# The phenotypic data folder of the project. Built with os.path.join so the
# separators are right on any operating system; the trailing '' keeps a
# trailing separator so file names can be appended with plain concatenation.
base_folder_filepath = os.path.join(get_base_filepath(), 'Data', 'Phenotypic', '')

# Phenotypic data file for all subjects (named as the test-set file)
filepath = os.path.join(base_folder_filepath, 'allSubs_testSet_phenotypic_dx.csv')

# Folder holding the per-site phenotypic files (trailing separator kept)
phenotypics_filepath = os.path.join(base_folder_filepath, 'Sites', '')

# Dataframe from filepath
df_pheno = pd.read_csv(filepath, index_col='ID')

In [6]:
# Create empty lists to store important values
features = [] # One phenotypic dataframe per site file
subjects = [] # The index (ScanDir ID values) of each site dataframe

# Iterate through each entry in the folder. os.listdir returns names in
# arbitrary order, so sort them to keep the combined row order reproducible.
for site_pheno in sorted(os.listdir(phenotypics_filepath)):
    # Access the filepath to the phenotypic data
    site_pheno_filepath = os.path.join(phenotypics_filepath, site_pheno)
    
    # Check if the current item in the directory is a file
    if os.path.isfile(site_pheno_filepath):
        # Read the file as a dataframe
        df_pheno = pd.read_csv(site_pheno_filepath, index_col='ScanDir ID')
        
        # Add the site dataframe to the list
        features.append(df_pheno)
        
        # Add the site's subject ids to the list
        subjects.append(df_pheno.index)

In [7]:
# Stack the per-site dataframes row-wise into a single dataframe
df_all_phenos = pd.concat(features, axis=0)
# Display the combined dataframe
df_all_phenos

In [8]:
# Count the rows that have no diagnosis ('DX') value
df_all_phenos['DX'].isna().sum()

In [9]:
# List the columns of the combined dataframe
df_all_phenos.columns

In [10]:
# Columns to remove: the QC_* columns and 'Study #'
drop_features = [
    'QC_Rest_1', 'QC_Rest_2', 'QC_Rest_3', 'QC_Rest_4',
    'QC_Anatomical_1', 'QC_Anatomical_2', 'Study #',
    'QC_S1_Rest_1', 'QC_S1_Rest_2', 'QC_S1_Rest_3',
    'QC_S1_Rest_4', 'QC_S1_Rest_5', 'QC_S1_Rest_6', 'QC_S1_Anat',
    'QC_S2_Rest_1', 'QC_S2_Rest_2', 'QC_S2_Anat',
]

# Work on a copy so df_all_phenos itself is left untouched
df_all_phenos_filtered = df_all_phenos.copy().drop(columns=drop_features)
df_all_phenos_filtered

In [11]:
# Use 'Full2 IQ' wherever 'Full4 IQ' is missing. -999 counts as missing too:
# a later cell turns every -999 into a null, but only after this step, so
# checking isnull() alone would keep the -999 and never look at 'Full2 IQ'.
full4_missing = (df_all_phenos_filtered['Full4 IQ'].isnull()
                 | (df_all_phenos_filtered['Full4 IQ'] == -999))
df_all_phenos_filtered.loc[full4_missing, 'Full4 IQ'] = df_all_phenos_filtered.loc[full4_missing, 'Full2 IQ']

In [12]:
# Show the 'Full4 IQ' entries that are still null
df_all_phenos_filtered['Full4 IQ'][df_all_phenos_filtered['Full4 IQ'].isnull()]

In [13]:
# Keep the 'Full4 IQ' values under the shorter name 'IQ' (added as the last
# column) and discard both original IQ columns
df_all_phenos_filtered = (
    df_all_phenos_filtered
    .assign(IQ=df_all_phenos_filtered['Full4 IQ'])
    .drop(columns=['Full4 IQ', 'Full2 IQ'])
)

In [14]:
# Display the dataframe after the IQ columns were merged into 'IQ'
df_all_phenos_filtered

In [15]:
null_values = {}

# Columns to check for null counts and minimum values
numeric_cols = [
    'Gender',
    'Age',
    'Handedness',
    'Verbal IQ',
    'Performance IQ',
    'IQ',
]

df_null_values_train = get_null_values(numeric_cols, df_all_phenos_filtered)

df_null_values_train.head()

In [16]:
# Treat -999 as a missing value in every column
for column_name in df_all_phenos_filtered.columns:
    is_sentinel = df_all_phenos_filtered[column_name].eq(-999)
    df_all_phenos_filtered.loc[is_sentinel, column_name] = None

In [17]:
null_values = {}

# Same columns as before, re-checked now that -999 has been replaced by nulls
numeric_cols = [
    'Gender',
    'Age',
    'Handedness',
    'Verbal IQ',
    'Performance IQ',
    'IQ',
]

df_null_values_train = get_null_values(numeric_cols, df_all_phenos_filtered)

df_null_values_train.head()

In [18]:
# Show the rows that have no 'Gender' value
df_all_phenos_filtered[df_all_phenos_filtered['Gender'].isnull()]

In [19]:
# Median 'Gender' value within each diagnosis ('DX') group
df_all_phenos_filtered.groupby('DX')['Gender'].median()

In [20]:
# Median 'Gender' value within each site
df_all_phenos_filtered.groupby('Site')['Gender'].median()

In [21]:
# Fill missing 'Gender' values with 1
# NOTE(review): presumably chosen from the group medians inspected in the
# previous cells — confirm
df_all_phenos_filtered['Gender'] = df_all_phenos_filtered['Gender'].fillna(1)

In [22]:
# Fill missing 'Handedness' values with 1
# NOTE(review): the reason for choosing 1 is not shown in this notebook — confirm
df_all_phenos_filtered['Handedness'] = df_all_phenos_filtered['Handedness'].fillna(1)

In [23]:
# Collapse 'Handedness' towards a binary flag: positive values become 1 and
# negative values become 0 (values that are exactly 0 are left as they are)
positive_hand = df_all_phenos_filtered['Handedness'].gt(0)
df_all_phenos_filtered.loc[positive_hand, 'Handedness'] = 1

negative_hand = df_all_phenos_filtered['Handedness'].lt(0)
df_all_phenos_filtered.loc[negative_hand, 'Handedness'] = 0

In [24]:
# Mean 'Verbal IQ' within each diagnosis ('DX') group
df_all_phenos_filtered.groupby('DX')['Verbal IQ'].mean()

In [25]:
# Fill missing 'Verbal IQ' values with the mean 'Verbal IQ' of the subject's
# 'DX' group. The group means are computed from the data itself, so they stay
# correct if the data changes, and only null entries are touched, so genuine
# scores are never overwritten. Rows without a 'DX' value stay null.
verbal_iq = df_all_phenos_filtered['Verbal IQ']
verbal_iq_dx_mean = df_all_phenos_filtered.groupby('DX')['Verbal IQ'].transform('mean')

# np.where selects by position, so repeated index labels cannot misalign the fill
df_all_phenos_filtered['Verbal IQ Filled'] = np.where(verbal_iq.isnull(), verbal_iq_dx_mean, verbal_iq)

In [26]:
# Mean 'Performance IQ' within each diagnosis ('DX') group
df_all_phenos_filtered.groupby('DX')['Performance IQ'].mean()

In [27]:
# Fill missing 'Performance IQ' values with the mean 'Performance IQ' of the
# subject's 'DX' group. The group means are computed from the data itself, so
# they stay correct if the data changes, and only null entries are touched, so
# genuine scores are never overwritten. Rows without a 'DX' value stay null.
performance_iq = df_all_phenos_filtered['Performance IQ']
performance_iq_dx_mean = df_all_phenos_filtered.groupby('DX')['Performance IQ'].transform('mean')

# np.where selects by position, so repeated index labels cannot misalign the fill
df_all_phenos_filtered['Performance IQ Filled'] = np.where(performance_iq.isnull(), performance_iq_dx_mean, performance_iq)

In [28]:
# Mean 'IQ' within each diagnosis ('DX') group
df_all_phenos_filtered.groupby('DX')['IQ'].mean()

In [29]:
# Fill missing 'IQ' values with the mean 'IQ' of the subject's 'DX' group.
# The group means are computed from the data itself, so they stay correct if
# the data changes, and only null entries are touched, so genuine scores are
# never overwritten. Rows without a 'DX' value stay null.
full_iq = df_all_phenos_filtered['IQ']
full_iq_dx_mean = df_all_phenos_filtered.groupby('DX')['IQ'].transform('mean')

# np.where selects by position, so repeated index labels cannot misalign the fill
df_all_phenos_filtered['IQ Filled'] = np.where(full_iq.isnull(), full_iq_dx_mean, full_iq)

In [30]:
null_values = {}

# Original columns together with their filled counterparts
numeric_cols = [
    'Gender',
    'Age',
    'Handedness',
    'Verbal IQ',
    'Verbal IQ Filled',
    'Performance IQ',
    'Performance IQ Filled',
    'IQ',
    'IQ Filled',
]

df_null_values_train = get_null_values(numeric_cols, df_all_phenos_filtered)

df_null_values_train.head()

In [31]:
# Columns kept in the cleaned output. This rebinds `features`, which earlier
# in the notebook held the list of per-site dataframes.
features = ['Gender', 'Age', 'Handedness', 'Verbal IQ', 'Verbal IQ Filled',
                'Performance IQ', 'Performance IQ Filled', 'IQ', 'IQ Filled', 'DX']

# Preview the selected columns
df_all_phenos_filtered[features]

In [32]:
# Write the selected columns for all subjects to one CSV in the phenotypic folder
all_subjects_csv = base_folder_filepath + '2023.7.13-Cleaned_Phenotypic_All_Subjects.csv'
cleaned_subset = df_all_phenos_filtered[features]
cleaned_subset.to_csv(all_subjects_csv)

In [33]:
# Folder for the per-site cleaned files. os.path.join picks the right separator
# for the operating system; the trailing '' keeps a trailing separator so file
# names can be appended with plain concatenation.
site_filepath = os.path.join(base_folder_filepath, 'Cleaned_Sites', '')

In [34]:
# Site names used in the output file names, and the values of the 'Site'
# column they correspond to; the two lists are paired by position
site_names = ['KKI', 'NYU', 'OHSU', 'Peking', 'Pittsburgh', 'WashU']
site_nums = [3,5,6,1,7,8]

In [35]:
# Write one cleaned CSV per site, pairing each 'Site' value with its name
for site_num, site_name in zip(site_nums, site_names):
    site_pheno = df_all_phenos_filtered.loc[df_all_phenos_filtered['Site'] == site_num]
    site_csv = f'{site_filepath}2023.7.13-Cleaned_{site_name}_pheno.csv'
    site_pheno[features].to_csv(site_csv)