# Exploratory Data Analysis Phenotypic Data

This notebook investigates the contents of the phenotypic data from all sites from the ADHD-200 Competition set. 

## Imports

Since this is only a exploratory data analysis, there aren't very many imports

- `os` for opening files

- `pandas` for dataframes

- `numpy` for arrays

- `matplotlib.pyplot` for plotting

- `seaborn` for customizing plots

In [1]:
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

### get_base_filepath()

Access the filepath for th ebase folder of the project. 
From here, any other asset of the project can be located.

In [2]:
def get_base_filepath():
    '''
    Access the filepath for the base folder of the project
    
    Input: None
    
    Output: The filepath to the root of the folder
    '''
    # Get current directory
    os.path.abspath(os.curdir)

    # Go up a directory level
    os.chdir('..')

    # Set baseline filepath to the project folder directory
    base_folder_filepath = os.path.abspath(os.curdir)
    return base_folder_filepath

In [3]:
# The folder for the project
base_folder_filepath = get_base_filepath()

# Preprocessed data site folder
sites_filepath = base_folder_filepath +  '\\Data\\Preprocessed_data\\Sites\\'

# Phenotypic data site folder
phenotypics_filepath = base_folder_filepath + '\\Data\\Phenotypic\\Sites\\'

In [4]:
# Create empty lists to store important values
subject_pheno = [] # For the patient id

# Iterate through each file in the folder
for site_pheno in os.listdir(phenotypics_filepath):
    # Access the filepath to the phenotypic data
    site_pheno_filepath = os.path.join(phenotypics_filepath, site_pheno)
    
    # Check if the current item in the directory is a file
    if os.path.isfile(site_pheno_filepath):
        # Read the file as a dataframe
        df_pheno = pd.read_csv(site_pheno_filepath, index_col='ScanDir ID')
        
        subject_pheno.append(df_pheno)

In [5]:
df_subject_pheno = pd.concat(subject_pheno)

In [6]:
df_subject_pheno.shape

(728, 33)

In [7]:
df_subject_pheno.head()

Unnamed: 0_level_0,Site,Gender,Age,Handedness,DX,Secondary Dx,ADHD Measure,ADHD Index,Inattentive,Hyper/Impulsive,...,QC_S1_Rest_1,QC_S1_Rest_2,QC_S1_Rest_3,QC_S1_Rest_4,QC_S1_Rest_5,QC_S1_Rest_6,QC_S1_Anat,QC_S2_Rest_1,QC_S2_Rest_2,QC_S2_Anat
ScanDir ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1018959,3,0.0,12.36,1.0,0,,2.0,44.0,47.0,44.0,...,,,,,,,,,,
1019436,3,1.0,12.98,1.0,3,,2.0,71.0,60.0,66.0,...,,,,,,,,,,
1043241,3,1.0,9.12,1.0,0,,2.0,40.0,40.0,43.0,...,,,,,,,,,,
1266183,3,0.0,9.67,1.0,0,,2.0,47.0,44.0,43.0,...,,,,,,,,,,
1535233,3,1.0,9.64,0.0,0,,2.0,42.0,41.0,43.0,...,,,,,,,,,,


In [8]:
df_subject_pheno.columns

Index(['Site', 'Gender', 'Age', 'Handedness', 'DX', 'Secondary Dx ',
       'ADHD Measure', 'ADHD Index', 'Inattentive', 'Hyper/Impulsive',
       'IQ Measure', 'Verbal IQ', 'Performance IQ', 'Full2 IQ', 'Full4 IQ',
       'Med Status', 'QC_Rest_1', 'QC_Rest_2', 'QC_Rest_3', 'QC_Rest_4',
       'QC_Anatomical_1', 'QC_Anatomical_2', 'Study #', 'QC_S1_Rest_1',
       'QC_S1_Rest_2', 'QC_S1_Rest_3', 'QC_S1_Rest_4', 'QC_S1_Rest_5',
       'QC_S1_Rest_6', 'QC_S1_Anat', 'QC_S2_Rest_1', 'QC_S2_Rest_2',
       'QC_S2_Anat'],
      dtype='object')

In [9]:
drop_features = ['QC_Rest_1', 'QC_Rest_2', 'QC_Rest_3', 'QC_Rest_4', 
                 'QC_Anatomical_1', 'QC_Anatomical_2', 
                 'QC_S1_Rest_1', 'QC_S1_Rest_2', 'QC_S1_Rest_3', 'QC_S1_Rest_4', 'QC_S1_Rest_5', 'QC_S1_Rest_6',
                 'QC_S1_Anat', 'QC_S2_Rest_1', 'QC_S2_Rest_2', 'QC_S2_Anat', 'Study #']

df_subject_pheno_filtered = df_subject_pheno.copy()
df_subject_pheno_filtered = df_subject_pheno_filtered.drop(drop_features, axis=1)

In [10]:
df_subject_pheno_filtered.head()

Unnamed: 0_level_0,Site,Gender,Age,Handedness,DX,Secondary Dx,ADHD Measure,ADHD Index,Inattentive,Hyper/Impulsive,IQ Measure,Verbal IQ,Performance IQ,Full2 IQ,Full4 IQ,Med Status
ScanDir ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1018959,3,0.0,12.36,1.0,0,,2.0,44.0,47.0,44.0,1,99.0,115.0,,103,1.0
1019436,3,1.0,12.98,1.0,3,,2.0,71.0,60.0,66.0,1,124.0,108.0,,122,1.0
1043241,3,1.0,9.12,1.0,0,,2.0,40.0,40.0,43.0,1,128.0,106.0,,120,1.0
1266183,3,0.0,9.67,1.0,0,,2.0,47.0,44.0,43.0,1,136.0,96.0,,120,1.0
1535233,3,1.0,9.64,0.0,0,,2.0,42.0,41.0,43.0,1,106.0,135.0,,122,1.0


In [11]:
targets_features = ['DX', 'Secondary Dx ', 'ADHD Measure', 'ADHD Index', 'Inattentive', 'Hyper/Impulsive']

df_targets = df_subject_pheno_filtered[targets_features]
df_subject_pheno_filtered = df_subject_pheno_filtered.drop(targets_features, axis=1)

In [12]:
df_targets.head()

Unnamed: 0_level_0,DX,Secondary Dx,ADHD Measure,ADHD Index,Inattentive,Hyper/Impulsive
ScanDir ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1018959,0,,2.0,44.0,47.0,44.0
1019436,3,,2.0,71.0,60.0,66.0
1043241,0,,2.0,40.0,40.0,43.0
1266183,0,,2.0,47.0,44.0,43.0
1535233,0,,2.0,42.0,41.0,43.0


In [13]:
df_subject_pheno_filtered.head()

Unnamed: 0_level_0,Site,Gender,Age,Handedness,IQ Measure,Verbal IQ,Performance IQ,Full2 IQ,Full4 IQ,Med Status
ScanDir ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1018959,3,0.0,12.36,1.0,1,99.0,115.0,,103,1.0
1019436,3,1.0,12.98,1.0,1,124.0,108.0,,122,1.0
1043241,3,1.0,9.12,1.0,1,128.0,106.0,,120,1.0
1266183,3,0.0,9.67,1.0,1,136.0,96.0,,120,1.0
1535233,3,1.0,9.64,0.0,1,106.0,135.0,,122,1.0


In [14]:
df_subject_pheno_filtered['Gender'].isnull().sum()

1

In [25]:
min(df_subject_pheno_filtered['Gender'])

0.0

In [15]:
df_subject_pheno_filtered['Age'].isnull().sum()

0

In [26]:
min(df_subject_pheno_filtered['Age'])

7.09

In [16]:
df_subject_pheno_filtered['Handedness'].isnull().sum()

1

In [27]:
min(df_subject_pheno_filtered['Handedness'])

-999.0

In [28]:
df_subject_pheno_filtered[df_subject_pheno_filtered['Handedness'] == -999]

Unnamed: 0_level_0,Site,Gender,Age,Handedness,IQ Measure,Verbal IQ,Performance IQ,Full2 IQ,Full4 IQ,Med Status
ScanDir ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1057962,5,1.0,8.78,-999.0,2,131.0,121.0,,129,1.0
1737393,5,1.0,11.24,-999.0,2,126.0,128.0,,130,1.0
10053,5,0.0,15.28,-999.0,2,99.0,103.0,,101,1.0
10058,5,1.0,14.2,-999.0,2,121.0,96.0,,109,1.0
10054,5,1.0,17.83,-999.0,2,-999.0,-999.0,,-999,1.0
10111,5,1.0,7.74,-999.0,2,108.0,88.0,,98,1.0


In [17]:
df_subject_pheno_filtered['IQ Measure'].isnull().sum()

0

In [30]:
min(df_subject_pheno_filtered['IQ Measure'])

-999

In [32]:
df_subject_pheno_filtered[df_subject_pheno_filtered['IQ Measure'] == -999]

Unnamed: 0_level_0,Site,Gender,Age,Handedness,IQ Measure,Verbal IQ,Performance IQ,Full2 IQ,Full4 IQ,Med Status
ScanDir ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
10068,5,1.0,13.3,0.5,-999,-999.0,-999.0,,-999,1.0


In [18]:
df_subject_pheno_filtered['Verbal IQ'].isnull().sum()

140

In [33]:
min(df_subject_pheno_filtered['Verbal IQ'])

-999.0

In [34]:
df_subject_pheno_filtered[df_subject_pheno_filtered['Verbal IQ'] == -999]

Unnamed: 0_level_0,Site,Gender,Age,Handedness,IQ Measure,Verbal IQ,Performance IQ,Full2 IQ,Full4 IQ,Med Status
ScanDir ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2371032,3,0.0,10.73,1.0,1,-999.0,-999.0,,-999,1.0
10004,5,0.0,13.75,0.13,2,-999.0,-999.0,,-999,1.0
10056,5,1.0,15.63,0.64,2,-999.0,-999.0,,-999,2.0
10054,5,1.0,17.83,-999.0,2,-999.0,-999.0,,-999,1.0
10068,5,1.0,13.3,0.5,-999,-999.0,-999.0,,-999,1.0
10093,5,0.0,15.21,0.96,2,-999.0,-999.0,,-999,1.0
10094,5,0.0,15.21,0.55,2,-999.0,-999.0,,-999,1.0
10114,5,1.0,16.73,0.52,2,-999.0,-999.0,,-999,1.0
10120,5,0.0,17.09,0.83,2,-999.0,-999.0,,-999,1.0
10081,5,0.0,15.81,0.6,2,-999.0,-999.0,,-999,-999.0


In [19]:
df_subject_pheno_filtered['Performance IQ'].isnull().sum()

140

In [35]:
min(df_subject_pheno_filtered['Performance IQ'])

-999.0

In [36]:
df_subject_pheno_filtered[df_subject_pheno_filtered['Performance IQ'] == -999]

Unnamed: 0_level_0,Site,Gender,Age,Handedness,IQ Measure,Verbal IQ,Performance IQ,Full2 IQ,Full4 IQ,Med Status
ScanDir ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2371032,3,0.0,10.73,1.0,1,-999.0,-999.0,,-999,1.0
10004,5,0.0,13.75,0.13,2,-999.0,-999.0,,-999,1.0
10056,5,1.0,15.63,0.64,2,-999.0,-999.0,,-999,2.0
10054,5,1.0,17.83,-999.0,2,-999.0,-999.0,,-999,1.0
10068,5,1.0,13.3,0.5,-999,-999.0,-999.0,,-999,1.0
10093,5,0.0,15.21,0.96,2,-999.0,-999.0,,-999,1.0
10094,5,0.0,15.21,0.55,2,-999.0,-999.0,,-999,1.0
10114,5,1.0,16.73,0.52,2,-999.0,-999.0,,-999,1.0
10120,5,0.0,17.09,0.83,2,-999.0,-999.0,,-999,1.0
10081,5,0.0,15.81,0.6,2,-999.0,-999.0,,-999,-999.0


In [20]:
df_subject_pheno_filtered['Full2 IQ'].isnull().sum()

639

In [37]:
min(df_subject_pheno_filtered['Full2 IQ'])

nan

In [38]:
df_subject_pheno_filtered[df_subject_pheno_filtered['Full2 IQ'] == -999]

Unnamed: 0_level_0,Site,Gender,Age,Handedness,IQ Measure,Verbal IQ,Performance IQ,Full2 IQ,Full4 IQ,Med Status
ScanDir ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
16080,7,0.0,18.8,1.0,2,-999.0,-999.0,-999.0,121,
16087,7,1.0,18.96,1.0,2,-999.0,-999.0,-999.0,113,
16006,7,1.0,10.76,1.0,2,-999.0,-999.0,-999.0,78,
16082,7,0.0,18.85,1.0,2,-999.0,-999.0,-999.0,118,
16017,7,0.0,11.75,1.0,2,-999.0,-999.0,-999.0,111,
16050,7,1.0,15.78,1.0,2,-999.0,-999.0,-999.0,108,
16067,7,0.0,17.91,0.0,2,-999.0,-999.0,-999.0,96,
16070,7,0.0,18.31,1.0,2,-999.0,-999.0,-999.0,92,
16061,7,1.0,17.23,1.0,2,-999.0,-999.0,-999.0,90,
16076,7,0.0,18.6,1.0,2,-999.0,-999.0,-999.0,125,


In [40]:
df_subject_pheno_filtered['Full2 IQ'].isnull().sum() + len(df_subject_pheno_filtered[df_subject_pheno_filtered['Full2 IQ'] == -999])

681

In [21]:
df_subject_pheno_filtered['Full4 IQ'].isnull().sum()

0

In [22]:
min(df_subject_pheno_filtered['Full4 IQ'])

-999

In [24]:
df_subject_pheno_filtered[df_subject_pheno_filtered['Full4 IQ'] == -999]

Unnamed: 0_level_0,Site,Gender,Age,Handedness,IQ Measure,Verbal IQ,Performance IQ,Full2 IQ,Full4 IQ,Med Status
ScanDir ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2371032,3,0.0,10.73,1.0,1,-999.0,-999.0,,-999,1.0
10004,5,0.0,13.75,0.13,2,-999.0,-999.0,,-999,1.0
10056,5,1.0,15.63,0.64,2,-999.0,-999.0,,-999,2.0
10054,5,1.0,17.83,-999.0,2,-999.0,-999.0,,-999,1.0
10068,5,1.0,13.3,0.5,-999,-999.0,-999.0,,-999,1.0
10093,5,0.0,15.21,0.96,2,-999.0,-999.0,,-999,1.0
10094,5,0.0,15.21,0.55,2,-999.0,-999.0,,-999,1.0
10114,5,1.0,16.73,0.52,2,-999.0,-999.0,,-999,1.0
10120,5,0.0,17.09,0.83,2,-999.0,-999.0,,-999,1.0
10081,5,0.0,15.81,0.6,2,-999.0,-999.0,,-999,-999.0


In [41]:
df_subject_pheno_filtered['Med Status'].isnull().sum()

150

In [42]:
min(df_subject_pheno_filtered['Med Status'])

-999.0

In [43]:
df_subject_pheno_filtered[df_subject_pheno_filtered['Med Status'] == -999]

Unnamed: 0_level_0,Site,Gender,Age,Handedness,IQ Measure,Verbal IQ,Performance IQ,Full2 IQ,Full4 IQ,Med Status
ScanDir ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1023964,5,1.0,8.290000,0.57,2,115.0,125.0,,123,-999.0
1187766,5,1.0,12.790000,0.73,2,109.0,134.0,,129,-999.0
1208795,5,0.0,9.570000,0.26,2,122.0,109.0,,118,-999.0
1471736,5,1.0,13.320000,0.91,2,106.0,91.0,,99,-999.0
1497055,5,1.0,8.560000,0.62,2,126.0,119.0,,125,-999.0
...,...,...,...,...,...,...,...,...,...,...
2561174,6,1.0,8.666667,1.00,2,,,,98,-999.0
2845989,6,0.0,7.666667,1.00,2,,,,102,-999.0
3286474,6,0.0,8.083333,1.00,2,,,,104,-999.0
3560456,6,0.0,7.833333,1.00,2,,,,126,-999.0


In [None]:
df_targets['Secondary Dx '] = df_targets['Secondary Dx '].fillna('none')

In [None]:
df_targets['DX'].isnull().sum()