In [1]:
import pandas as pd

## Read the Data

In [2]:
df = pd.read_csv('./data/EEG_dataset.csv')

In [3]:
df.shape

(220, 105)

In [4]:
df.head(5)

Unnamed: 0,Subject,medical_history,Session,Age,Had_psychiatrist_followup,alpha_Channel_1,alpha_Channel_2,alpha_Channel_3,alpha_Channel_4,alpha_Channel_5,...,xi_Channel_1,xi_Channel_2,xi_Channel_3,xi_Channel_4,xi_Channel_5,omega_Channel_1,omega_Channel_2,omega_Channel_3,omega_Channel_4,omega_Channel_5
0,S1,,Session1,Age = 30,False,3.021524,1.396534,0.058007,0.12612,0.943253,...,0.247654,0.823943,1.358286,0.02847,2.340824,2.188695,1.502521,1.382312,2.733212,0.729483
1,S1,,Session2,30 Years Old,False,1.468518,1.36728,0.19593,0.273963,1.305134,...,1.142864,0.227482,1.757748,0.091825,0.609705,1.224623,2.127467,0.590222,0.181941,0.795337
2,S2,none,Session1,77 years,False,0.985983,1.177478,0.036685,0.289934,0.189099,...,1.857042,2.100252,0.724515,1.128692,0.221561,0.164472,6.085616,1.059927,0.022024,1.387955
3,S2,none,Session2,77 y/o,False,4.738183,0.13954,0.236183,0.78974,0.163361,...,1.191103,0.899053,0.955767,0.98905,0.689679,0.094675,2.925617,0.703198,0.101002,0.093286
4,S3,diabetes,Session1,50 y/o,False,0.975364,1.393986,0.661216,0.028347,0.372354,...,0.70325,0.637318,1.238808,0.253462,2.351435,1.111522,4.476291,0.623993,0.023572,0.27917


## Basic Exploratory Data Analysis (EDA)
 * Explore the data
 * Clean the data if/when necessary
 * Build new features whenever possible: something that would work for a majority of supervised ML models (classifiers)
 * Create the target column (`has_depression`)

### Data schema

In [5]:
for i, c in enumerate(df.dtypes):
    print(f'{df.dtypes.index[i]:<30}: {c}')

Subject                       : object
medical_history               : object
Session                       : object
Age                           : object
Had_psychiatrist_followup     : bool
alpha_Channel_1               : float64
alpha_Channel_2               : float64
alpha_Channel_3               : float64
alpha_Channel_4               : float64
alpha_Channel_5               : float64
beta_Channel_1                : float64
beta_Channel_2                : float64
beta_Channel_3                : float64
beta_Channel_4                : float64
beta_Channel_5                : float64
gamma_Channel_1               : float64
gamma_Channel_2               : float64
gamma_Channel_3               : float64
gamma_Channel_4               : float64
gamma_Channel_5               : float64
delta_Channel_1               : float64
delta_Channel_2               : float64
delta_Channel_3               : float64
delta_Channel_4               : float64
delta_Channel_5               : float64
epsilon

### Number of Subjects in the Study

In [6]:
df.Subject.nunique()

110

### Sessions

In [7]:
df.Session.unique()

array(['Session1', 'Session2'], dtype=object)

### Clean the Age Column

In [8]:
df.Age.sample(10)

166      Age: 81
15      Age = 54
135     77 years
207     Age = 53
119       25 y/o
111     Age = 30
94      68 years
62       100 y/o
55       Age: 72
118     Age = 25
Name: Age, dtype: object

In [9]:
df['age_years'] = df['Age'].str.extract('(\d+)').astype(int)
df[['Age', 'age_years']].sample(10)

Unnamed: 0,Age,age_years
199,Age: 11,11
33,Age: 10,10
25,25 y/o,25
31,73 years,73
58,Age: 51,51
119,25 y/o,25
150,Age = 62,62
86,Age = 16,16
26,59 years,59
59,51 years,51


###  Convert the `Had_psychiatrist_followup` to `int`

In [10]:
df['Had_psychiatrist_followup_int'] = df['Had_psychiatrist_followup'].astype(int)
df[['Had_psychiatrist_followup', 'Had_psychiatrist_followup_int']].sample(5)

Unnamed: 0,Had_psychiatrist_followup,Had_psychiatrist_followup_int
176,True,1
135,False,0
35,True,1
76,False,0
47,True,1


### Get a group of health-history-related binary features ("yes/no")

In [11]:
df['medical_history'].head(10)

0                                 NaN
1                                 NaN
2                                none
3                                none
4                            diabetes
5                            diabetes
6                                 NaN
7                                 NaN
8    ad, alcohol, ms, ocd, depression
9    ad, alcohol, ms, ocd, depression
Name: medical_history, dtype: object

In [12]:
df['health_cond_list'] = df['medical_history'].apply(lambda lst: [x.strip() for x in str(lst).split(',')]) 

df['health_cond_list'].head(10)

0                                 [nan]
1                                 [nan]
2                                [none]
3                                [none]
4                            [diabetes]
5                            [diabetes]
6                                 [nan]
7                                 [nan]
8    [ad, alcohol, ms, ocd, depression]
9    [ad, alcohol, ms, ocd, depression]
Name: health_cond_list, dtype: object

In [13]:
# pd.get_dummies(df['medical_history'].str.split(',', expand=True)).head()

In [14]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
df = df.join(pd.DataFrame(mlb.fit_transform(df.pop('health_cond_list')),
                          columns=mlb.classes_,
                          index=df.index).add_prefix('health_condition_'))

df.head()

Unnamed: 0,Subject,medical_history,Session,Age,Had_psychiatrist_followup,alpha_Channel_1,alpha_Channel_2,alpha_Channel_3,alpha_Channel_4,alpha_Channel_5,...,health_condition_depression,health_condition_diabetes,health_condition_heart attack,health_condition_marijuana,health_condition_ms,health_condition_nan,health_condition_none,health_condition_ocd,health_condition_schizophrenia,health_condition_unknown
0,S1,,Session1,Age = 30,False,3.021524,1.396534,0.058007,0.12612,0.943253,...,0,0,0,0,0,1,0,0,0,0
1,S1,,Session2,30 Years Old,False,1.468518,1.36728,0.19593,0.273963,1.305134,...,0,0,0,0,0,1,0,0,0,0
2,S2,none,Session1,77 years,False,0.985983,1.177478,0.036685,0.289934,0.189099,...,0,0,0,0,0,0,1,0,0,0
3,S2,none,Session2,77 y/o,False,4.738183,0.13954,0.236183,0.78974,0.163361,...,0,0,0,0,0,0,1,0,0,0
4,S3,diabetes,Session1,50 y/o,False,0.975364,1.393986,0.661216,0.028347,0.372354,...,0,1,0,0,0,0,0,0,0,0


In [15]:
health_condition_list = [c for c in list(df.columns) if c.startswith('health_condition_')]
health_condition_list

['health_condition_Unknown',
 'health_condition_ad',
 'health_condition_add',
 'health_condition_adhd',
 'health_condition_alcohol',
 'health_condition_anxiety',
 'health_condition_dementia',
 'health_condition_depresion',
 'health_condition_depress',
 'health_condition_depression',
 'health_condition_diabetes',
 'health_condition_heart attack',
 'health_condition_marijuana',
 'health_condition_ms',
 'health_condition_nan',
 'health_condition_none',
 'health_condition_ocd',
 'health_condition_schizophrenia',
 'health_condition_unknown']

### Create the Target Column: `has_depression`

In [16]:
df['has_depression'] = 0
condition_has_depression = df['medical_history'].fillna('N/A').str.lower().str.contains('depres')
df.loc[condition_has_depression,'has_depression'] = 1

In [17]:
df[condition_has_depression][['Subject', 'medical_history', 'has_depression']].head(10)

Unnamed: 0,Subject,medical_history,has_depression
8,S5,"ad, alcohol, ms, ocd, depression",1
9,S5,"ad, alcohol, ms, ocd, depression",1
16,S9,depression,1
17,S9,depression,1
18,S10,depression,1
19,S10,depression,1
22,S12,depression,1
23,S12,depression,1
24,S13,depression,1
25,S13,depression,1


### Question: Are there any subjects that have a contraversial medical history?

In [18]:
df['depression_diagnosis_consistent'] = df.groupby(['Subject'])['has_depression'].transform('sum')
df['depression_diagnosis_consistent'] = df['depression_diagnosis_consistent'].\
                                apply(lambda x: 1 if x == 2 or x == 0 else 0)
df[condition_has_depression][['Subject', 'medical_history', 
                              'has_depression','depression_diagnosis_consistent']].sample(10)

Unnamed: 0,Subject,medical_history,has_depression,depression_diagnosis_consistent
98,S50,depression,1,1
37,S19,"adhd, ms, ocd, depression",1,1
104,S53,"ms, alcohol, depression",1,1
64,S33,"ms, depresion",1,1
106,S54,"adhd, ms, ocd, depression",1,1
67,S34,depression,1,1
58,S30,"ms, alcohol, depression",1,1
149,S75,"add, marijuana, alcohol, depression",1,1
71,S36,"adhd, ms, ocd, depression",1,1
217,S109,"add, marijuana, alcohol, depression",1,1


In [19]:
df[df['depression_diagnosis_consistent'] == 0]

Unnamed: 0,Subject,medical_history,Session,Age,Had_psychiatrist_followup,alpha_Channel_1,alpha_Channel_2,alpha_Channel_3,alpha_Channel_4,alpha_Channel_5,...,health_condition_heart attack,health_condition_marijuana,health_condition_ms,health_condition_nan,health_condition_none,health_condition_ocd,health_condition_schizophrenia,health_condition_unknown,has_depression,depression_diagnosis_consistent


### Answer: No. The medical history appears consistent for all subjects

### Question: How many subjects/session have listed depression in the medical history?

In [20]:
# Total number of subjects/sessions with depression
df['has_depression'].sum()

128

In [21]:
# Fraction of subjects/sessions with depression
df['has_depression'].sum()/df.shape[0]

0.5818181818181818

### EEG: Range of values

In [22]:
df['alpha_Channel_1'].min(), df['alpha_Channel_1'].max()

(0.010634602, 12.13865509)

In [23]:
df['alpha_Channel_2'].min(), df['alpha_Channel_2'].max()

(0.008892988, 4.616958474)

In [24]:
df['alpha_Channel_3'].min(), df['alpha_Channel_3'].max()

(0.003474498, 7.312990807)

In [25]:
df['alpha_Channel_4'].min(), df['alpha_Channel_4'].max()

(0.000474194, 5.730581384)

In [26]:
df['alpha_Channel_5'].min(), df['alpha_Channel_5'].max()

(0.00028783, 5.668508554)

## Save the features and target in a new file

In [27]:
for c in df.columns:
    print(c)

Subject
medical_history
Session
Age
Had_psychiatrist_followup
alpha_Channel_1
alpha_Channel_2
alpha_Channel_3
alpha_Channel_4
alpha_Channel_5
beta_Channel_1
beta_Channel_2
beta_Channel_3
beta_Channel_4
beta_Channel_5
gamma_Channel_1
gamma_Channel_2
gamma_Channel_3
gamma_Channel_4
gamma_Channel_5
delta_Channel_1
delta_Channel_2
delta_Channel_3
delta_Channel_4
delta_Channel_5
epsilon_Channel_1
epsilon_Channel_2
epsilon_Channel_3
epsilon_Channel_4
epsilon_Channel_5
zeta_Channel_1
zeta_Channel_2
zeta_Channel_3
zeta_Channel_4
zeta_Channel_5
eta_Channel_1
eta_Channel_2
eta_Channel_3
eta_Channel_4
eta_Channel_5
theta_Channel_1
theta_Channel_2
theta_Channel_3
theta_Channel_4
theta_Channel_5
iota_Channel_1
iota_Channel_2
iota_Channel_3
iota_Channel_4
iota_Channel_5
kappa_Channel_1
kappa_Channel_2
kappa_Channel_3
kappa_Channel_4
kappa_Channel_5
lambda_Channel_1
lambda_Channel_2
lambda_Channel_3
lambda_Channel_4
lambda_Channel_5
mu_Channel_1
mu_Channel_2
mu_Channel_3
mu_Channel_4
mu_Channel_5
nu_

In [28]:
drop_columns = ['Subject', 'Session', 'medical_history' , 
                'Age', 'Had_psychiatrist_followup', 
                'depression_diagnosis_consistent', 'has_depression',
               'health_condition_depresion', 'health_condition_depress', 'health_condition_depression']

In [29]:
df.drop(columns=drop_columns).to_csv('./data/eeg_features.csv', index=False)

In [30]:
df[['has_depression']].to_csv('./data/eeg_depression_target.csv', index=False)

In [31]:
df_features = pd.read_csv('./data/eeg_features.csv')
df_features.head()

Unnamed: 0,alpha_Channel_1,alpha_Channel_2,alpha_Channel_3,alpha_Channel_4,alpha_Channel_5,beta_Channel_1,beta_Channel_2,beta_Channel_3,beta_Channel_4,beta_Channel_5,...,health_condition_dementia,health_condition_diabetes,health_condition_heart attack,health_condition_marijuana,health_condition_ms,health_condition_nan,health_condition_none,health_condition_ocd,health_condition_schizophrenia,health_condition_unknown
0,3.021524,1.396534,0.058007,0.12612,0.943253,1.697809,0.449966,0.448933,0.042286,0.503213,...,0,0,0,0,0,1,0,0,0,0
1,1.468518,1.36728,0.19593,0.273963,1.305134,0.679067,2.827446,3.36476,0.34679,0.202916,...,0,0,0,0,0,1,0,0,0,0
2,0.985983,1.177478,0.036685,0.289934,0.189099,0.137623,0.468998,3.133384,0.882983,1.95364,...,0,0,0,0,0,0,1,0,0,0
3,4.738183,0.13954,0.236183,0.78974,0.163361,0.707772,0.793514,2.799493,1.562262,0.158909,...,0,0,0,0,0,0,1,0,0,0
4,0.975364,1.393986,0.661216,0.028347,0.372354,1.112621,3.872213,1.441918,0.82796,1.879537,...,0,1,0,0,0,0,0,0,0,0


In [32]:
for c in df_features.columns:
    print(c)

alpha_Channel_1
alpha_Channel_2
alpha_Channel_3
alpha_Channel_4
alpha_Channel_5
beta_Channel_1
beta_Channel_2
beta_Channel_3
beta_Channel_4
beta_Channel_5
gamma_Channel_1
gamma_Channel_2
gamma_Channel_3
gamma_Channel_4
gamma_Channel_5
delta_Channel_1
delta_Channel_2
delta_Channel_3
delta_Channel_4
delta_Channel_5
epsilon_Channel_1
epsilon_Channel_2
epsilon_Channel_3
epsilon_Channel_4
epsilon_Channel_5
zeta_Channel_1
zeta_Channel_2
zeta_Channel_3
zeta_Channel_4
zeta_Channel_5
eta_Channel_1
eta_Channel_2
eta_Channel_3
eta_Channel_4
eta_Channel_5
theta_Channel_1
theta_Channel_2
theta_Channel_3
theta_Channel_4
theta_Channel_5
iota_Channel_1
iota_Channel_2
iota_Channel_3
iota_Channel_4
iota_Channel_5
kappa_Channel_1
kappa_Channel_2
kappa_Channel_3
kappa_Channel_4
kappa_Channel_5
lambda_Channel_1
lambda_Channel_2
lambda_Channel_3
lambda_Channel_4
lambda_Channel_5
mu_Channel_1
mu_Channel_2
mu_Channel_3
mu_Channel_4
mu_Channel_5
nu_Channel_1
nu_Channel_2
nu_Channel_3
nu_Channel_4
nu_Channel_5


In [33]:
df_target = pd.read_csv('./data/eeg_depression_target.csv')
df_target.sample(10)

Unnamed: 0,has_depression
215,1
166,0
202,1
167,0
15,0
32,1
85,0
106,1
143,0
182,1
