# Exploratory Data Analysis Phenotypic Data

This notebook explores the phenotypic data collected across all sites in the ADHD-200 Competition set.

## Imports

Since this is only an exploratory data analysis, only a few imports are needed:

- `os` for building file paths

- `pandas` for dataframes

- `numpy` for arrays

- `matplotlib.pyplot` for plotting

- `seaborn` for customizing plots

In [1]:
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

### get_base_filepath()

Access the filepath for the base folder of the project. 
From here, any other asset of the project can be located.

In [2]:
def get_base_filepath():
    '''
    Access the filepath for the base folder of the project

    The base folder is taken to be the parent of the current working
    directory. The working directory itself is not changed, so calling
    this function (or re-running its cell) repeatedly returns the same path.

    Input: None

    Output: The absolute filepath to the root of the folder
    '''
    # Resolve the parent directory without os.chdir('..'): changing the
    # working directory here would move one level further up on every call.
    base_folder_filepath = os.path.abspath(os.path.join(os.curdir, os.pardir))
    return base_folder_filepath

In [3]:
# The folder for the project
base_folder_filepath = get_base_filepath()

# Phenotypic data file; os.path.join picks the separator for the host OS
# instead of hard-coding Windows backslashes
filepath = os.path.join(base_folder_filepath, 'Data', 'Phenotypic',
                        'allSubs_testSet_phenotypic_dx.csv')

# Dataframe from filepath, indexed by the 'ID' column
df_pheno = pd.read_csv(filepath, index_col='ID')

In [4]:
# Dimensions of the loaded table as (rows, columns)
df_pheno.shape

(197, 23)

In [5]:
# Preview the first rows of the raw phenotypic table
df_pheno.head()

Unnamed: 0_level_0,Disclaimer,Site,Gender,Age,Handedness,DX,Secondary Dx,ADHD Measure,ADHD Index,Inattentive,...,Verbal IQ,Performance IQ,Full2 IQ,Full4 IQ,QC_Rest_1,QC_Rest_2,QC_Rest_3,QC_Rest_4,QC_Anatomical_1,QC_Anatomical_2
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1038415,,1,1,14.92,1,3,ODD,1,52,34,...,109.0,103.0,-999.0,107.0,1,,,,1,
1201251,,1,1,12.33,1,3,,1,49,28,...,115.0,103.0,-999.0,110.0,1,,,,1,
1245758,,1,0,8.58,1,0,,1,35,20,...,121.0,88.0,-999.0,106.0,1,,,,1,
1253411,,1,1,8.08,1,0,,1,35,19,...,119.0,106.0,-999.0,114.0,1,,,,1,
1419103,,1,0,9.92,1,0,,1,41,22,...,124.0,76.0,-999.0,102.0,1,,,,1,


In [6]:
# List every column name exactly as read from the CSV
df_pheno.columns

Index(['Disclaimer', 'Site', 'Gender', 'Age', 'Handedness', 'DX',
       'Secondary Dx ', 'ADHD Measure', 'ADHD Index', 'Inattentive',
       'Hyper/Impulsive', 'Med Status', 'IQ Measure', 'Verbal IQ',
       'Performance IQ', 'Full2 IQ', 'Full4 IQ', 'QC_Rest_1', 'QC_Rest_2',
       'QC_Rest_3', 'QC_Rest_4', 'QC_Anatomical_1', 'QC_Anatomical_2'],
      dtype='object')

In [7]:
# Columns excluded from the analysis: 'Disclaimer' and the QC_* columns
drop_features = ['Disclaimer',
                 'QC_Rest_1', 'QC_Rest_2', 'QC_Rest_3', 'QC_Rest_4', 
                 'QC_Anatomical_1', 'QC_Anatomical_2']

# DataFrame.drop returns a new frame and leaves df_pheno untouched, so no
# separate copy is needed before dropping
df_pheno_filtered = df_pheno.drop(columns=drop_features)

In [8]:
# Preview the table after the unused columns have been dropped
df_pheno_filtered.head()

Unnamed: 0_level_0,Site,Gender,Age,Handedness,DX,Secondary Dx,ADHD Measure,ADHD Index,Inattentive,Hyper/Impulsive,Med Status,IQ Measure,Verbal IQ,Performance IQ,Full2 IQ,Full4 IQ
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1038415,1,1,14.92,1,3,ODD,1,52,34,18,1,3.0,109.0,103.0,-999.0,107.0
1201251,1,1,12.33,1,3,,1,49,28,21,2,3.0,115.0,103.0,-999.0,110.0
1245758,1,0,8.58,1,0,,1,35,20,15,1,3.0,121.0,88.0,-999.0,106.0
1253411,1,1,8.08,1,0,,1,35,19,16,1,3.0,119.0,106.0,-999.0,114.0
1419103,1,0,9.92,1,0,,1,41,22,19,1,3.0,124.0,76.0,-999.0,102.0


In [9]:
# Columns treated as targets rather than features.
# Note: 'Secondary Dx ' carries a trailing space in the CSV header.
targets_features = ['DX', 'Secondary Dx ', 
                    'ADHD Measure', 'ADHD Index', 'Inattentive', 
                    'Hyper/Impulsive', 'Med Status']

# Take an explicit copy so that later column assignments on df_targets do
# not write into a slice of df_pheno_filtered (SettingWithCopyWarning)
df_targets = df_pheno_filtered[targets_features].copy()

# Keep only the non-target columns in the feature table
df_pheno_filtered = df_pheno_filtered.drop(targets_features, axis=1)

In [10]:
# Preview the target columns that were split off
df_targets.head()

Unnamed: 0_level_0,DX,Secondary Dx,ADHD Measure,ADHD Index,Inattentive,Hyper/Impulsive,Med Status
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1038415,3,ODD,1,52,34,18,1
1201251,3,,1,49,28,21,2
1245758,0,,1,35,20,15,1
1253411,0,,1,35,19,16,1
1419103,0,,1,41,22,19,1


In [11]:
# Preview the remaining feature columns after the targets were removed
df_pheno_filtered.head()

Unnamed: 0_level_0,Site,Gender,Age,Handedness,IQ Measure,Verbal IQ,Performance IQ,Full2 IQ,Full4 IQ
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1038415,1,1,14.92,1,3.0,109.0,103.0,-999.0,107.0
1201251,1,1,12.33,1,3.0,115.0,103.0,-999.0,110.0
1245758,1,0,8.58,1,3.0,121.0,88.0,-999.0,106.0
1253411,1,1,8.08,1,3.0,119.0,106.0,-999.0,114.0
1419103,1,0,9.92,1,3.0,124.0,76.0,-999.0,102.0


In [12]:
# Number of missing entries in 'Gender'
df_pheno_filtered['Gender'].isna().sum()

0

In [13]:
# Smallest 'Gender' value, to spot any negative placeholder codes
min(df_pheno_filtered['Gender'])

0

In [14]:
# Number of missing entries in 'Age'
df_pheno_filtered['Age'].isna().sum()

0

In [15]:
# Smallest 'Age' value, to spot any negative placeholder codes
min(df_pheno_filtered['Age'])

7.26

In [16]:
# Number of missing entries in 'Handedness'
df_pheno_filtered['Handedness'].isna().sum()

2

In [17]:
# Rows whose 'Handedness' is recorded as the letter 'L' instead of a number
df_pheno_filtered.loc[df_pheno_filtered['Handedness'].eq('L')]

Unnamed: 0_level_0,Site,Gender,Age,Handedness,IQ Measure,Verbal IQ,Performance IQ,Full2 IQ,Full4 IQ
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
4125514,1,1,9.17,L,3.0,136.0,100.0,-999.0,121.0


In [18]:
# Recode the 'L' entry as 0 so it uses the same numeric coding as the rest
# of the column. .loc is used so the write lands in df_pheno_filtered
# itself rather than in a temporary copy.
df_pheno_filtered.loc[df_pheno_filtered['Handedness'] == 'L', 'Handedness'] = 0

# NOTE(review): the stray 'L' presumably caused the column to be read as
# text; convert it to a numeric dtype so it no longer mixes strings and
# integers. This raises if any other non-numeric entry is present.
df_pheno_filtered['Handedness'] = pd.to_numeric(df_pheno_filtered['Handedness'])

In [19]:
# Number of missing entries in 'IQ Measure'
df_pheno_filtered['IQ Measure'].isna().sum()

26

In [20]:
# Series.min() skips NaN; the builtin min() gives an order-dependent
# result on a column that contains missing values
df_pheno_filtered['IQ Measure'].min()

1.0

In [21]:
# Number of missing entries in 'Verbal IQ'
df_pheno_filtered['Verbal IQ'].isna().sum()

60

In [22]:
# Series.min() skips NaN; the builtin min() gives an order-dependent
# result on a column that contains missing values
df_pheno_filtered['Verbal IQ'].min()

80.0

In [23]:
# Number of missing entries in 'Performance IQ'
df_pheno_filtered['Performance IQ'].isna().sum()

60

In [24]:
# Series.min() skips NaN; the builtin min() gives an order-dependent
# result on a column that contains missing values
df_pheno_filtered['Performance IQ'].min()

67.0

In [25]:
# Number of missing entries in 'Full2 IQ'
df_pheno_filtered['Full2 IQ'].isna().sum()

122

In [26]:
# Series.min() skips NaN; the builtin min() gives an order-dependent
# result on a column that contains missing values
df_pheno_filtered['Full2 IQ'].min()

-999.0

In [27]:
# Rows where 'Full2 IQ' holds the value -999
df_pheno_filtered.loc[df_pheno_filtered['Full2 IQ'].eq(-999)]

Unnamed: 0_level_0,Site,Gender,Age,Handedness,IQ Measure,Verbal IQ,Performance IQ,Full2 IQ,Full4 IQ
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1038415,1,1,14.92,1,3.0,109.0,103.0,-999.0,107.0
1201251,1,1,12.33,1,3.0,115.0,103.0,-999.0,110.0
1245758,1,0,8.58,1,3.0,121.0,88.0,-999.0,106.0
1253411,1,1,8.08,1,3.0,119.0,106.0,-999.0,114.0
1419103,1,0,9.92,1,3.0,124.0,76.0,-999.0,102.0
1517058,1,1,9.75,1,3.0,141.0,138.0,-999.0,144.0
1581470,1,1,8.83,1,3.0,111.0,123.0,-999.0,118.0
1784368,1,1,8.92,1,3.0,136.0,89.0,-999.0,116.0
1849382,1,1,11.67,1,3.0,140.0,114.0,-999.0,131.0
1854691,1,0,8.83,1,3.0,117.0,108.0,-999.0,114.0


In [28]:
# Count of 'Full2 IQ' entries that are either missing or equal to -999.
# The two conditions never overlap, so OR-ing the masks matches adding
# the two separate counts.
(df_pheno_filtered['Full2 IQ'].isna() | df_pheno_filtered['Full2 IQ'].eq(-999)).sum()

173

In [29]:
# Number of missing entries in 'Full4 IQ'
df_pheno_filtered['Full4 IQ'].isna().sum()

27

In [30]:
# Series.min() skips NaN; the builtin min() gives an order-dependent
# result on a column that contains missing values
df_pheno_filtered['Full4 IQ'].min()

77.0

In [32]:
# Number of missing entries in 'Med Status'
df_targets['Med Status'].isna().sum()

111

In [None]:
# 'Med Status' was moved out of df_pheno_filtered into df_targets, so it
# has to be read from df_targets (the old lookup raises KeyError).
# Series.min() is used because it skips the missing values in this column.
df_targets['Med Status'].min()

In [None]:
# Rows where 'Med Status' holds the value -999
df_targets.loc[df_targets['Med Status'].eq(-999)]

In [None]:
# Replace missing secondary diagnoses with the label 'none'.
# assign() returns a new frame, so this is safe even when df_targets is a
# slice of another frame (in-place column assignment would trigger
# SettingWithCopyWarning). The dict form is needed because the column
# name contains spaces.
df_targets = df_targets.assign(
    **{'Secondary Dx ': df_targets['Secondary Dx '].fillna('none')}
)

In [None]:
# Number of missing entries in 'DX'
df_targets['DX'].isna().sum()