In [2]:
import pandas as pd

## Read the Data

In [3]:
df = pd.read_csv('./data/EEG_dataset.csv')

In [4]:
df.shape

(220, 105)

In [5]:
df.head(5)

Unnamed: 0,Subject,medical_history,Session,Age,Had_psychiatrist_followup,alpha_Channel_1,alpha_Channel_2,alpha_Channel_3,alpha_Channel_4,alpha_Channel_5,...,xi_Channel_1,xi_Channel_2,xi_Channel_3,xi_Channel_4,xi_Channel_5,omega_Channel_1,omega_Channel_2,omega_Channel_3,omega_Channel_4,omega_Channel_5
0,S1,,Session1,Age = 30,False,3.021524,1.396534,0.058007,0.12612,0.943253,...,0.247654,0.823943,1.358286,0.02847,2.340824,2.188695,1.502521,1.382312,2.733212,0.729483
1,S1,,Session2,30 Years Old,False,1.468518,1.36728,0.19593,0.273963,1.305134,...,1.142864,0.227482,1.757748,0.091825,0.609705,1.224623,2.127467,0.590222,0.181941,0.795337
2,S2,none,Session1,77 years,False,0.985983,1.177478,0.036685,0.289934,0.189099,...,1.857042,2.100252,0.724515,1.128692,0.221561,0.164472,6.085616,1.059927,0.022024,1.387955
3,S2,none,Session2,77 y/o,False,4.738183,0.13954,0.236183,0.78974,0.163361,...,1.191103,0.899053,0.955767,0.98905,0.689679,0.094675,2.925617,0.703198,0.101002,0.093286
4,S3,diabetes,Session1,50 y/o,False,0.975364,1.393986,0.661216,0.028347,0.372354,...,0.70325,0.637318,1.238808,0.253462,2.351435,1.111522,4.476291,0.623993,0.023572,0.27917


## Basic Exploratory Data Analysis (EDA)
 * Explore the data
 * Clean the data if/when necessary
 * Build new features whenever possible: something that would work for a majority of supervised ML models (classifiers)
 * Create the target column (`has_depression`)

### Data schema

In [6]:
for i, c in enumerate(df.dtypes):
    print(f'{df.dtypes.index[i]:<30}: {c}')

Subject                       : object
medical_history               : object
Session                       : object
Age                           : object
Had_psychiatrist_followup     : bool
alpha_Channel_1               : float64
alpha_Channel_2               : float64
alpha_Channel_3               : float64
alpha_Channel_4               : float64
alpha_Channel_5               : float64
beta_Channel_1                : float64
beta_Channel_2                : float64
beta_Channel_3                : float64
beta_Channel_4                : float64
beta_Channel_5                : float64
gamma_Channel_1               : float64
gamma_Channel_2               : float64
gamma_Channel_3               : float64
gamma_Channel_4               : float64
gamma_Channel_5               : float64
delta_Channel_1               : float64
delta_Channel_2               : float64
delta_Channel_3               : float64
delta_Channel_4               : float64
delta_Channel_5               : float64
epsilon

### Number of Subjects in the Study

In [7]:
df.Subject.nunique()

110

### Sessions

In [8]:
df.Session.unique()

array(['Session1', 'Session2'], dtype=object)

### Clean the Age Column

In [20]:
df.Age.sample(10)

7       35 Years Old
106          Age: 17
40          60 years
48      26 Years Old
57           Age: 23
11      47 Years Old
71      35 Years Old
66          Age = 16
72          Age = 35
67           Age: 16
Name: Age, dtype: object

In [22]:
df['age_years'] = df['Age'].str.extract('(\d+)').astype(int)
df[['Age', 'age_years']].sample(10)

Unnamed: 0,Age,age_years
100,69 Years Old,69
82,49 years,49
198,11 years,11
45,5 years,5
202,43 y/o,43
104,24 Years Old,24
219,22 y/o,22
91,Age: 61,61
77,Age = 45,45
174,Age = 21,21


### Get a group of health-history-related binary features ("yes/no")

In [29]:
df['medical_history'].head(10)

0                                 NaN
1                                 NaN
2                                none
3                                none
4                            diabetes
5                            diabetes
6                                 NaN
7                                 NaN
8    ad, alcohol, ms, ocd, depression
9    ad, alcohol, ms, ocd, depression
Name: medical_history, dtype: object

In [33]:
df['health_cond_list'] = df['medical_history'].apply(lambda lst: [x.strip() for x in str(lst).split(',')]) 

df['health_cond_list'].head(10)

0                                 [nan]
1                                 [nan]
2                                [none]
3                                [none]
4                            [diabetes]
5                            [diabetes]
6                                 [nan]
7                                 [nan]
8    [ad, alcohol, ms, ocd, depression]
9    [ad, alcohol, ms, ocd, depression]
Name: health_cond_list, dtype: object

In [25]:
# pd.get_dummies(df['medical_history'].str.split(',', expand=True)).head()

Unnamed: 0,0_Unknown,0_ad,0_add,0_adhd,0_alcohol,0_dementia,0_depression,0_diabetes,0_heart attack,0_ms,...,1_ ms,2_ alcohol,2_ depress,2_ depression,2_ ms,2_ ocd,3_ anxiety,3_ depression,3_ ocd,4_ depression
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
df = df.join(pd.DataFrame(mlb.fit_transform(df.pop('health_cond_list')),
                          columns=mlb.classes_,
                          index=df.index))

df.head()

Unnamed: 0,Subject,medical_history,Session,Age,Had_psychiatrist_followup,alpha_Channel_1,alpha_Channel_2,alpha_Channel_3,alpha_Channel_4,alpha_Channel_5,...,depression,diabetes,heart attack,marijuana,ms,nan,none,ocd,schizophrenia,unknown
0,S1,,Session1,Age = 30,False,3.021524,1.396534,0.058007,0.12612,0.943253,...,0,0,0,0,0,1,0,0,0,0
1,S1,,Session2,30 Years Old,False,1.468518,1.36728,0.19593,0.273963,1.305134,...,0,0,0,0,0,1,0,0,0,0
2,S2,none,Session1,77 years,False,0.985983,1.177478,0.036685,0.289934,0.189099,...,0,0,0,0,0,0,1,0,0,0
3,S2,none,Session2,77 y/o,False,4.738183,0.13954,0.236183,0.78974,0.163361,...,0,0,0,0,0,0,1,0,0,0
4,S3,diabetes,Session1,50 y/o,False,0.975364,1.393986,0.661216,0.028347,0.372354,...,0,1,0,0,0,0,0,0,0,0


In [36]:
df.columns

Index(['Subject', 'medical_history', 'Session', 'Age',
       'Had_psychiatrist_followup', 'alpha_Channel_1', 'alpha_Channel_2',
       'alpha_Channel_3', 'alpha_Channel_4', 'alpha_Channel_5',
       ...
       'depression', 'diabetes', 'heart attack', 'marijuana', 'ms', 'nan',
       'none', 'ocd', 'schizophrenia', 'unknown'],
      dtype='object', length=129)

### Create the Target Column: `has_depression`

In [9]:
df['has_depression'] = 0
condition_has_depression = df['medical_history'].fillna('N/A').str.lower().str.contains('depres')
df.loc[condition_has_depression,'has_depression'] = 1

In [11]:
df[condition_has_depression][['Subject', 'medical_history', 'has_depression']].head(10)

Unnamed: 0,Subject,medical_history,has_depression
8,S5,"ad, alcohol, ms, ocd, depression",1
9,S5,"ad, alcohol, ms, ocd, depression",1
16,S9,depression,1
17,S9,depression,1
18,S10,depression,1
19,S10,depression,1
22,S12,depression,1
23,S12,depression,1
24,S13,depression,1
25,S13,depression,1


## Question: Are there any subjects that have a contraversial medical history?

In [15]:
df['depression_diagnosis_consistent'] = df.groupby(['Subject'])['has_depression'].transform('sum')
df['depression_diagnosis_consistent'] = df['depression_diagnosis_consistent'].\
                                apply(lambda x: 1 if x == 2 or x == 0 else 0)
df[condition_has_depression][['Subject', 'medical_history', 
                              'has_depression','depression_diagnosis_consistent']].sample(10)

Unnamed: 0,Subject,medical_history,has_depression,depression_diagnosis_consistent
214,S108,"adhd, ms, ocd, depression",1,1
128,S65,"depression, anxiety",1,1
79,S40,"ms, alcohol, depression",1,1
24,S13,depression,1,1
119,S60,"ad, anxiety, depression",1,1
133,S67,"depression, anxiety",1,1
80,S41,"add, marijuana, alcohol, depression",1,1
140,S71,depression,1,1
125,S63,"ad, alcohol, ms, ocd, depression",1,1
105,S53,"ms, alcohol, depression",1,1


In [19]:
df[df['depression_diagnosis_consistent'] == 0]

Unnamed: 0,Subject,medical_history,Session,Age,Had_psychiatrist_followup,alpha_Channel_1,alpha_Channel_2,alpha_Channel_3,alpha_Channel_4,alpha_Channel_5,...,xi_Channel_5,omega_Channel_1,omega_Channel_2,omega_Channel_3,omega_Channel_4,omega_Channel_5,has_depression,depression_contraversial,depression_consistent,depression_diagnosis_consistent


## Answer: No. The medical history appears consistent for all subjects