In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from scipy.stats import shapiro
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

# Data preprocessing

In [3]:
data = pd.read_csv('EEG.machinelearning_data_BRMH.csv', header=0, index_col=0)

In [4]:
data['specific.disorder'].value_counts()

Depressive disorder               199
Schizophrenia                     117
Healthy control                    95
Alcohol use disorder               93
Behavioral addiction disorder      93
Bipolar disorder                   67
Panic disorder                     59
Posttraumatic stress disorder      52
Social anxiety disorder            48
Obsessive compulsitve disorder     46
Acute stress disorder              38
Adjustment disorder                38
Name: specific.disorder, dtype: int64

## Checking the features

In [3]:
data.head()

Unnamed: 0_level_0,sex,age,eeg.date,education,IQ,main.disorder,specific.disorder,AB.A.delta.a.FP1,AB.A.delta.b.FP2,AB.A.delta.c.F7,...,COH.F.gamma.o.Pz.p.P4,COH.F.gamma.o.Pz.q.T6,COH.F.gamma.o.Pz.r.O1,COH.F.gamma.o.Pz.s.O2,COH.F.gamma.p.P4.q.T6,COH.F.gamma.p.P4.r.O1,COH.F.gamma.p.P4.s.O2,COH.F.gamma.q.T6.r.O1,COH.F.gamma.q.T6.s.O2,COH.F.gamma.r.O1.s.O2
no.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,M,57.0,2012.8.30,,,Addictive disorder,Alcohol use disorder,35.998557,21.717375,21.51828,...,55.989192,16.739679,23.452271,45.67882,30.16752,16.918761,48.850427,9.42263,34.507082,28.613029
2,M,37.0,2012.9.6,6.0,120.0,Addictive disorder,Alcohol use disorder,13.425118,11.002916,11.942516,...,45.595619,17.510824,26.777368,28.201062,57.108861,32.375401,60.351749,13.900981,57.831848,43.463261
3,M,32.0,2012.9.10,16.0,113.0,Addictive disorder,Alcohol use disorder,29.94178,27.544684,17.150159,...,99.475453,70.654171,39.131547,69.920996,71.063644,38.534505,69.908764,27.180532,64.803155,31.485799
4,M,35.0,2012.10.8,18.0,126.0,Addictive disorder,Alcohol use disorder,21.496226,21.846832,17.364316,...,59.986561,63.822201,36.478254,47.117006,84.658376,24.724096,50.299349,35.319695,79.822944,41.141873
5,M,36.0,2012.10.18,16.0,112.0,Addictive disorder,Alcohol use disorder,37.775667,33.607679,21.865556,...,61.46272,59.166097,51.465531,58.635415,80.685608,62.138436,75.888749,61.003944,87.455509,70.531662


In [4]:
data[['age', 'IQ']].describe()

Unnamed: 0,age,IQ
count,945.0,932.0
mean,30.594804,101.580472
std,11.781592,17.022414
min,18.0,49.0
25%,21.73,91.0
50%,26.15,102.0
75%,35.45,114.0
max,71.88,145.0


In [5]:
data['sex'].value_counts()

M    601
F    344
Name: sex, dtype: int64

Our goal is to predict the specific disorder

In [6]:
data['main.disorder'].value_counts()

Mood disorder                         266
Addictive disorder                    186
Trauma and stress related disorder    128
Schizophrenia                         117
Anxiety disorder                      107
Healthy control                        95
Obsessive compulsive disorder          46
Name: main.disorder, dtype: int64

The data is unbalanced, so let's deal with that later.

## Data cleaning

There are features we are not going to need, sex is a categorical feature, and there are some NaN values. Let's fix that.

In [7]:
# create dummies for sex
dummy = pd.get_dummies(data['sex'])
dummy.rename(columns={'F':'sex.female', 'M':'sex.male'}, inplace=True)
data = pd.concat([data, dummy], axis=1)

In [8]:
# get target variable
y = data['main.disorder'].values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [9]:
data.columns[data.isna().any()]
# the following columns have NaNs
# so we will drop them

Index(['education', 'IQ', 'Unnamed: 122'], dtype='object')

In [10]:
# drop unnecessary columns
data.drop(['sex', 'eeg.date', 'education', 'main.disorder', 'IQ', 'age', 
           'specific.disorder', 'Unnamed: 122'], axis=1, inplace = True)

In [11]:
data.describe()

Unnamed: 0,AB.A.delta.a.FP1,AB.A.delta.b.FP2,AB.A.delta.c.F7,AB.A.delta.d.F3,AB.A.delta.e.Fz,AB.A.delta.f.F4,AB.A.delta.g.F8,AB.A.delta.h.T3,AB.A.delta.i.C3,AB.A.delta.j.Cz,...,COH.F.gamma.o.Pz.r.O1,COH.F.gamma.o.Pz.s.O2,COH.F.gamma.p.P4.q.T6,COH.F.gamma.p.P4.r.O1,COH.F.gamma.p.P4.s.O2,COH.F.gamma.q.T6.r.O1,COH.F.gamma.q.T6.s.O2,COH.F.gamma.r.O1.s.O2,sex.female,sex.male
count,945.0,945.0,945.0,945.0,945.0,945.0,945.0,945.0,945.0,945.0,...,945.0,945.0,945.0,945.0,945.0,945.0,945.0,945.0,945.0,945.0
mean,20.182936,21.177584,17.749553,18.901698,20.447079,19.490329,17.042218,11.790176,16.405732,19.566879,...,56.959796,60.739169,69.829254,47.862489,66.832798,39.301406,66.153213,57.056207,0.364021,0.635979
std,11.282022,12.230662,10.003598,9.079482,9.742912,9.2525,9.272096,7.347929,8.2462,9.525142,...,18.326785,18.052887,17.725321,19.685722,17.028701,20.790933,18.088548,19.600107,0.481409,0.481409
min,3.27226,3.244199,3.050507,3.412618,5.066593,4.048931,2.552213,2.128977,2.21347,3.149313,...,6.926792,0.708008,2.421748,0.036664,1.032207,1.228502,0.363268,3.988805,0.0,0.0
25%,12.784872,13.019269,11.134327,12.460586,13.548645,12.637717,10.38177,7.042545,10.686639,13.130418,...,43.682444,48.374883,58.937785,32.581046,55.87207,22.049743,54.710605,43.955229,0.0,0.0
50%,17.065286,17.838251,15.541469,16.733004,18.065276,17.432568,15.050377,10.044354,14.525494,17.245633,...,56.657348,61.257972,72.298636,45.719426,68.238375,36.549938,67.988937,57.515871,0.0,1.0
75%,24.49276,25.654394,21.623961,23.3479,25.573096,23.956748,20.810308,14.246874,20.27138,23.792159,...,70.649245,73.9791,83.066877,62.081839,79.192418,54.169209,79.527764,71.626382,1.0,1.0
max,92.826192,101.515687,91.373456,82.544167,101.566662,77.197502,69.07123,77.283412,65.932521,88.474026,...,99.307895,100.0,99.581629,98.720067,99.650154,98.41332,100.0,99.287092,1.0,1.0


Normalize the data

In [12]:
scaler = StandardScaler()
data_std = scaler.fit_transform(data)

## Balancing

Since I don't want to use undersampling nor use techniques that are too complex for the course, I will try to use oversampling by dupication with class weights

In [13]:
# get class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_encoded), y=y_encoded)
class_weights

array([0.72580645, 1.26168224, 1.42105263, 0.5075188 , 2.93478261,
       1.15384615, 1.0546875 ])

In [14]:
# get our X
X = data_std
X.shape

(945, 1142)

In [92]:
# duplicate samples according to the weight of their label

X_weighted = []
y_weighted = []
for i, class_weight in enumerate(class_weights):
    samples_in_class = X[y_encoded == i]
    replicated_samples = np.repeat(samples_in_class, int(class_weight), axis=0)
    X_weighted.append(replicated_samples)
    y_weighted.append(np.repeat(i, replicated_samples.shape[0]))

X_weighted = np.concatenate(X_weighted, axis=0)
y_weighted = np.concatenate(y_weighted, axis=0)

X_combined = np.concatenate([X_weighted, X], axis=0)
y_combined = np.concatenate([y_weighted, y_encoded], axis=0)

In [93]:
X_combined.shape

(1484, 1142)

In [94]:
# check the value counts of duplicated labels
y_balanced = pd.Series(y_combined) 

value_counts = y_balanced.value_counts()
value_counts

3    266
6    256
5    234
1    214
2    190
0    186
4    138
dtype: int64

It looks better, let's proceed with splitting the data

In [95]:
feat_labels = data.columns.values.tolist()

In [148]:
# split into 70% training, 15% validation and 15% test

X_train, X_test, y_train, y_test = train_test_split(X_combined, y_combined, test_size=0.3, random_state=0)
X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=0)

In [149]:
print(X_train.shape, X_valid.shape, X_test.shape)
print(y_train.shape, y_valid.shape, y_test.shape)

(1038, 1142) (223, 1142) (223, 1142)
(1038,) (223,) (223,)


## Choosing models to test

As we have a multiclass classification problem, we can use supervised models such as KNN, Naive Bayes, SVM, linear SVC, Random Forest, Logistic Regression and SGD. 

In [166]:
statistic, p_value = shapiro(data)

# interpret the p-value
alpha = 0.05
if p_value > alpha:
    print("Data looks normally distributed (fail to reject H0)")
else:
    print("Data does not look normally distributed (reject H0)")

Data does not look normally distributed (reject H0)


Apparently data doesn't follow the normal distribution so we couldn't use GaussianNB. However, since we have negative values, we can't use other NBs, so let's try GaussianNB anyway.

Additionally, let's all the other models!

In [36]:
models_df = pd.DataFrame(columns =['Accuracy', 'F1-score'])

# Training models

## Random Forest

In [21]:
n_estimators = [100, 1000, 10000]
criterion = ['gini', 'entropy']
max_depth = [3, 5, 10, None]
min_samples_split = [2, 5, 10]
max_features = ['auto', 'sqrt', 'log2']
bootstrap = [True, False] 

In [22]:
randomforest_df = pd.DataFrame(columns = ['n_estimators', 'criterion', 'max_depth', 'min_samples_split', 'max_features', 'bootstrap', 'accuracy', 'f1_score'])

<font size="5">That took more than 6 hours to run! We might not want to run it again</font>

In [23]:
n_samples = 0
for n_estim in n_estimators:
    for crit in criterion:
        for depth in max_depth:
            for min_samples in min_samples_split:
                for max_feat in max_features:
                    for bs in bootstrap:
                        
                        forest = RandomForestClassifier(n_estimators=n_estim, max_depth=depth,
                                                       criterion=crit, min_samples_split=min_samples,
                                                       max_features=max_feat, bootstrap=bs)
                        forest.fit(X_train, y_train)

                        y_pred = forest.predict(X_valid)

                        acc = accuracy_score(y_valid, y_pred)

                        f1 = f1_score(y_valid, y_pred, average='weighted')

                        row = [n_estim, crit, depth, min_samples, max_feat, bs, acc, f1]

                        randomforest_df.loc[n_samples] = row

                        n_samples+=1

In [26]:
randomforest_df.sort_values("f1_score", ascending=False, inplace=True)
randomforest_df.head()

Unnamed: 0,n_estimators,criterion,max_depth,min_samples_split,max_features,bootstrap,accuracy,f1_score
421,10000,entropy,,5,auto,False,0.695067,0.698314
63,100,gini,,5,sqrt,False,0.681614,0.694154
271,1000,entropy,,2,auto,False,0.67713,0.693482
141,100,entropy,,10,sqrt,False,0.686099,0.692269
415,10000,entropy,,2,auto,False,0.681614,0.689099


In [27]:
randomforest_df.to_csv('randomforest_df', header=True, index=True)

In [28]:
best = randomforest_df.iloc[0,:]
best

n_estimators            10000
criterion             entropy
max_depth                None
min_samples_split           5
max_features             auto
bootstrap               False
accuracy             0.695067
f1_score             0.698314
Name: 421, dtype: object

In [29]:
forest = RandomForestClassifier(n_estimators=best.n_estimators, max_depth=best.max_depth,
                                criterion=best.criterion, min_samples_split=best.min_samples_split)

In [30]:
forest.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', min_samples_split=5,
                       n_estimators=10000)

In [31]:
y_pred = forest.predict(X_test)

In [32]:
f1_rf = round(f1_score(y_test, y_pred, average='weighted'), 2)
f1_rf

0.67

In [33]:
acc_rf = round(accuracy_score(y_test, y_pred), 2)
acc_rf

0.68

In [38]:
models_df.loc['Random forest'] = [f1_rf, acc_rf]

## KNN

In [40]:
neighbors = [3,5,7,9]
weights = ['uniform', 'distance']

In [41]:
knn_df = pd.DataFrame(columns = ['neighbors', 'weights', 'accuracy', 'f1_score'])

In [42]:
n_samples = 0

for k in neighbors:
    for w in weights:
        knn = KNeighborsClassifier(n_neighbors=k, weights=w)
        knn.fit(X_train, y_train)
        
        y_pred = knn.predict(X_valid)
                
        acc = accuracy_score(y_valid, y_pred)

        f1 = f1_score(y_valid, y_pred, average='weighted')
                
        row = [k, w, acc, f1]
                
        knn_df.loc[n_samples] = row
                
        n_samples+=1

In [43]:
knn_df.sort_values('f1_score', ascending=False, inplace=True)
knn_df

Unnamed: 0,neighbors,weights,accuracy,f1_score
5,7,distance,0.61435,0.589549
7,9,distance,0.605381,0.586665
3,5,distance,0.600897,0.575976
1,3,distance,0.609865,0.573856
0,3,uniform,0.278027,0.266264
6,9,uniform,0.210762,0.214906
4,7,uniform,0.215247,0.21419
2,5,uniform,0.210762,0.204735


In [44]:
best = knn_df.iloc[0,:]
best

neighbors           7
weights      distance
accuracy      0.61435
f1_score     0.589549
Name: 5, dtype: object

In [45]:
knn = KNeighborsClassifier(n_neighbors=best.neighbors, weights=best.weights)

In [46]:
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=7, weights='distance')

In [47]:
y_pred = knn.predict(X_test)

In [48]:
f1_knn = round(f1_score(y_test, y_pred, average='weighted'),2)
f1_knn

0.58

In [49]:
acc_knn = round(accuracy_score(y_test, y_pred),2)
acc_knn

0.61

In [50]:
models_df.loc['K-Nearest Neighbors'] = [f1_knn, acc_knn]

## Gaussian Naive Bayes

In [51]:
gnb = GaussianNB()

In [52]:
gnb.fit(X_train, y_train)

GaussianNB()

In [53]:
y_pred = gnb.predict(X_test)

In [54]:
f1_gnb = round(f1_score(y_test, y_pred, average='weighted'),2)
f1_gnb

0.19

In [55]:
acc_gnb = round(accuracy_score(y_test, y_pred),2)
acc_gnb

0.22

Features are dependent!

In [135]:
models_df.loc['Gaussian NB'] = [f1_gnb, acc_gnb]

## Logistic Regression

In [57]:
multi_class = ['ovr', 'multinomial']
max_iter = [1000, 10000, 100000]

In [58]:
logisticr_df = pd.DataFrame(columns = ['multi_class', 'max_iter', 'accuracy', 'f1_score'])

In [59]:
n_samples = 0

for multi in multi_class:
    for n_iter in max_iter:
    
        logistic = LogisticRegression(max_iter = n_iter, multi_class=multi)

        logistic.fit(X_train, y_train)

        y_pred = logistic.predict(X_valid)

        acc = accuracy_score(y_valid, y_pred)

        f1 = f1_score(y_valid, y_pred, average='weighted')

        row = [multi, n_iter, acc, f1]

        logisticr_df.loc[n_samples] = row

        n_samples+=1

In [60]:
logisticr_df.sort_values('f1_score', ascending=False, inplace=True)
logisticr_df.head()

Unnamed: 0,multi_class,max_iter,accuracy,f1_score
0,ovr,1000,0.636771,0.624175
1,ovr,10000,0.636771,0.624175
2,ovr,100000,0.636771,0.624175
3,multinomial,1000,0.632287,0.623296
4,multinomial,10000,0.632287,0.623296


In [61]:
best = logisticr_df.iloc[0,:]
best

multi_class         ovr
max_iter           1000
accuracy       0.636771
f1_score       0.624175
Name: 0, dtype: object

In [62]:
logistic = LogisticRegression(max_iter = best.max_iter, multi_class=best.multi_class)

In [63]:
logistic.fit(X_train, y_train)

LogisticRegression(max_iter=1000, multi_class='ovr')

In [64]:
y_pred = logistic.predict(X_test)

In [65]:
f1_log = round(f1_score(y_test, y_pred, average='weighted'),2)
f1_log

0.58

In [66]:
acc_log = round(accuracy_score(y_test, y_pred),2)
acc_log

0.61

In [67]:
models_df.loc['Logistic Regression'] = [f1_log, acc_log]

## Decision trees

In [68]:
criterion = ['gini', 'entropy']
splitter =['best', 'random']
max_depth = [3, 5, 10, None]
min_samples_split = [2, 5, 10]

In [69]:
decisiontree_df = pd.DataFrame(columns = ['criterion', 'splitter', 'max_depth', 'min_samples_split', 'accuracy', 'f1_score'])

In [70]:
n_samples = 0
for crit in criterion:
    for split in splitter:
        for depth in max_depth:
            for min_samples in min_samples_split:
                decisiontree = tree.DecisionTreeClassifier(splitter = split, max_depth=depth,
                                               criterion=crit, min_samples_split=min_samples)
                decisiontree.fit(X_train, y_train)
                
                y_pred = decisiontree.predict(X_valid)
                
                acc = accuracy_score(y_valid, y_pred)

                f1 = f1_score(y_valid, y_pred, average='weighted')
                
                row = [crit, split, depth, min_samples, acc, f1]
                
                decisiontree_df.loc[n_samples] = row
                
                n_samples+=1

In [71]:
decisiontree_df.sort_values('f1_score', ascending=False, inplace=True)
decisiontree_df.head()

Unnamed: 0,criterion,splitter,max_depth,min_samples_split,accuracy,f1_score
21,gini,random,,2,0.654709,0.642051
9,gini,best,,2,0.636771,0.620189
45,entropy,random,,2,0.61435,0.591905
10,gini,best,,5,0.560538,0.553284
33,entropy,best,,2,0.578475,0.547227


In [72]:
best = decisiontree_df.iloc[0,:]
best

criterion                gini
splitter               random
max_depth                None
min_samples_split           2
accuracy             0.654709
f1_score             0.642051
Name: 21, dtype: object

In [73]:
decisiontree = tree.DecisionTreeClassifier(splitter = best.splitter, max_depth=best.max_depth, criterion=best.criterion, min_samples_split=best.min_samples_split)

In [74]:
decisiontree.fit(X_train, y_train)

DecisionTreeClassifier(splitter='random')

In [75]:
y_pred = decisiontree.predict(X_test)

In [76]:
acc_decision = round(accuracy_score(y_test, y_pred), 2)
acc_decision

0.57

In [77]:
f1_decision = round(f1_score(y_test, y_pred, average='weighted'), 2)
f1_decision

0.54

In [78]:
models_df.loc['Decision Tree'] = [f1_decision, acc_decision]

## SVM

In [79]:
kernels = ['linear', 'rbf', 'poly']
regularization_strength = [0.0001, 0.001, 0.01, 0.1, 1, 10]

In [80]:
svm_df = pd.DataFrame(columns = ['kernel', 'c', 'accuracy', 'f1_score'])

In [81]:
n_samples = 0
for kernel in kernels:
    for c in regularization_strength:
        svm = SVC(kernel=kernel, C=c)

        svm.fit(X_train, y_train)
                
        y_pred = svm.predict(X_valid)
                
        acc = accuracy_score(y_valid, y_pred)

        f1 = f1_score(y_valid, y_pred, average='weighted')
                
        row = [kernel, c, acc, f1]
                
        svm_df.loc[n_samples] = row
                
        n_samples+=1

In [82]:
svm_df.sort_values('f1_score', ascending=False, inplace=True)
svm_df.head()

Unnamed: 0,kernel,c,accuracy,f1_score
17,poly,10.0,0.636771,0.651125
4,linear,1.0,0.641256,0.623936
5,linear,10.0,0.641256,0.623936
3,linear,0.1,0.641256,0.622146
11,rbf,10.0,0.609865,0.594561


In [83]:
best = svm_df.iloc[0,:]
best

kernel          poly
c               10.0
accuracy    0.636771
f1_score    0.651125
Name: 17, dtype: object

In [84]:
svm = SVC(kernel=best.kernel, C=best.c)

In [85]:
svm.fit(X_train, y_train)

SVC(C=10.0, kernel='poly')

In [86]:
y_pred = svm.predict(X_test)

In [87]:
acc_svc = round(accuracy_score(y_test, y_pred), 2)
acc_svc

0.65

In [88]:
f1_svc = round(f1_score(y_test, y_pred, average='weighted'), 2)
f1_svc

0.65

In [89]:
models_df.loc['SVM'] = [f1_svc, acc_svc]

## SGD

In [151]:
loss = ['hinge', 'log', 'modified_huber', 'perceptron']
penalty = ['l2', 'l1', 'elasticnet']
alpha = [0.0001, 0.001, 0.01, 0.1]
learning_rate = ['constant', 'optimal', 'invscaling']
max_iter = [1000, 10000]

In [152]:
sgd_df = pd.DataFrame(columns = ['loss', 'penalty', 'alpha', 'learning_rate', 'max_iter', 'accuracy', 'f1_score'])

In [153]:
n_samples = 0
for l in loss:
    for p in penalty:
        for a in alpha:
            for lr in learning_rate:
                for it in max_iter:
                    sgd = SGDClassifier(loss=l, penalty=p,
                                            alpha=a, max_iter=it,
                                            learning_rate=lr,
                                            eta0=0.01)

                    sgd.fit(X_train, y_train)

                    y_pred = sgd.predict(X_valid)

                    acc = accuracy_score(y_valid, y_pred)

                    f1 = f1_score(y_valid, y_pred, average='weighted')

                    row = [l, p, a, lr, it, acc, f1]

                    sgd_df.loc[n_samples] = row

                    n_samples+=1



In [155]:
sgd_df.sort_values('f1_score', ascending=False, inplace=True)
sgd_df.head()

Unnamed: 0,loss,penalty,alpha,learning_rate,max_iter,accuracy,f1_score
99,log,l1,0.0001,optimal,10000,0.636771,0.62822
26,hinge,l1,0.0001,optimal,1000,0.627803,0.623402
170,modified_huber,l1,0.0001,optimal,1000,0.632287,0.621523
243,perceptron,l1,0.0001,optimal,10000,0.627803,0.619234
98,log,l1,0.0001,optimal,1000,0.618834,0.611302


In [156]:
best = sgd_df.iloc[0,:]
best

loss                  log
penalty                l1
alpha              0.0001
learning_rate     optimal
max_iter            10000
accuracy         0.636771
f1_score          0.62822
Name: 99, dtype: object

In [157]:
sgd = SGDClassifier(loss=best.loss, penalty=best.penalty, alpha=best.alpha, max_iter=best.max_iter, learning_rate=best.learning_rate, eta0=0.01)

In [158]:
sgd.fit(X_train, y_train)

SGDClassifier(eta0=0.01, loss='log', max_iter=10000, penalty='l1')

In [159]:
y_pred = sgd.predict(X_test)

In [160]:
acc_sgd = round(accuracy_score(y_test, y_pred), 2)
acc_sgd

0.62

In [161]:
f1_sgd = round(f1_score(y_test, y_pred, average='weighted'), 2)
f1_sgd

0.6

In [162]:
models_df.loc['SGD'] = [f1_sgd, acc_sgd]

# Analyze results

In [163]:
models_df.sort_values('F1-score', ascending=False, inplace=True)
models_df

Unnamed: 0,Accuracy,F1-score
Random forest,0.67,0.68
SVM,0.65,0.65
SGD,0.6,0.62
K-Nearest Neighbors,0.58,0.61
Logistic Regression,0.58,0.61
Decision Tree,0.54,0.57
Gaussian NB,0.19,0.22


# PCA

In [164]:
pca = PCA()

In [165]:
pca.fit(X_combined)

PCA()

In [166]:
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

In [207]:
n_components_95 = np.argmax(cumulative_variance >= 0.95) + 1
n_components_95

94

In [208]:
pca = PCA(n_components=n_components_95)

In [209]:
X_reduced = pca.fit_transform(X_combined, y_combined)
X_reduced.shape

(1484, 94)

In [210]:
X_train_reduced, X_test_reduced, y_train_reduced, y_test_reduced = train_test_split(X_reduced, y_combined, test_size=0.3)

Let's run the best model (Random forest) in reduced dataset

In [211]:
best = randomforest_df.iloc[0,:]
best

n_estimators            10000
criterion             entropy
max_depth                None
min_samples_split           5
max_features             auto
bootstrap               False
accuracy             0.695067
f1_score             0.698314
Name: 421, dtype: object

In [212]:
forest = RandomForestClassifier(n_estimators=best.n_estimators, max_depth=best.max_depth,
                                criterion=best.criterion, min_samples_split=best.min_samples_split,
                                max_features=best.max_features, bootstrap=best.bootstrap)

In [213]:
forest.fit(X_train_reduced, y_train_reduced)

RandomForestClassifier(bootstrap=False, criterion='entropy',
                       min_samples_split=5, n_estimators=10000)

In [214]:
y_pred = forest.predict(X_test_reduced)

In [215]:
f1_rf_reduced = round(f1_score(y_test_reduced, y_pred, average='weighted'), 2)
f1_rf_reduced

0.71

In [216]:
acc_rf_reduced = round(accuracy_score(y_test_reduced, y_pred), 2)
acc_rf_reduced

0.7

In [217]:
print('Random Forest after PCA')
print(f'F1-score: {f1_rf_reduced}')
print(f'Accuracy: {acc_rf_reduced}')

Random Forest after PCA
F1-score: 0.71
Accuracy: 0.7


# Feature selection

In [182]:
lr_feat_sel = LogisticRegression(penalty='l1', solver='liblinear')

In [183]:
lr_feat_sel.fit(X_combined, y_combined)

LogisticRegression(penalty='l1', solver='liblinear')

In [184]:
selector = SelectFromModel(lr_feat_sel, prefit=True)
X_selected = selector.transform(X_combined)
X_selected.shape

(1484, 889)

In [186]:
X_train_selected, X_test_selected, y_train_selected, y_test_selected = train_test_split(X_selected, y_combined, test_size=0.3)

In [187]:
best = randomforest_df.iloc[0,:]

In [188]:
forest = RandomForestClassifier(n_estimators=best.n_estimators, max_depth=best.max_depth,
                                criterion=best.criterion, min_samples_split=best.min_samples_split)

In [189]:
forest.fit(X_train_selected, y_train_selected)

RandomForestClassifier(criterion='entropy', min_samples_split=5,
                       n_estimators=10000)

In [191]:
y_pred = forest.predict(X_test_selected)

In [193]:
f1_rf_selected = round(f1_score(y_test_selected, y_pred, average='weighted'), 2)
f1_rf_selected

0.73

In [194]:
acc_rf_selected = round(accuracy_score(y_test_selected, y_pred), 2)
acc_rf_selected

0.73

In [195]:
print('Random Forest after Feature Selection')
print(f'F1-score: {f1_rf_selected}')
print(f'Accuracy: {acc_rf_selected}')

Random Forest after Feature Selection
F1-score: 0.73
Accuracy: 0.73
