In [1]:
%matplotlib inline

import numpy as np

import joblib


import pandas as pd
import matplotlib.pyplot as plt
import math
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression


from sklearn.metrics import accuracy_score, f1_score, recall_score,precision_score, classification_report, confusion_matrix
import collections
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_recall_curve, roc_curve
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize

np.random.seed(1337)  # for reproducibility

In [2]:
X_train = np.load('X_train.npy') 

Y_train = np.load('Y_train.npy')

X_test = np.load('X_test.npy')

Y_test = np.load('Y_test.npy')

In [3]:
print(X_train.shape)
print(Y_train.shape)

print(X_test.shape)
print(Y_test.shape)

(28368, 42)
(28368,)
(12168, 42)
(12168,)


In [4]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

# Grid Search

In [5]:
LR_clf=LogisticRegression(multi_class='multinomial', max_iter=10000)

In [6]:
param_grid={'C': [0.001,0.01,0.1,1,10,100,1000,10000],'solver': ['lbfgs','sag','saga','newton-cg']}

In [7]:
grid_search = GridSearchCV(LR_clf, param_grid, scoring='accuracy', cv = 5, verbose=5,return_train_score=True)

In [8]:
grid_search.fit(X_train,Y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] C=0.001, solver=lbfgs ...........................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  C=0.001, solver=lbfgs, score=(train=0.979, test=0.979), total=   1.5s
[CV] C=0.001, solver=lbfgs ...........................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.5s remaining:    0.0s


[CV]  C=0.001, solver=lbfgs, score=(train=0.978, test=0.979), total=   1.4s
[CV] C=0.001, solver=lbfgs ...........................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.9s remaining:    0.0s


[CV]  C=0.001, solver=lbfgs, score=(train=0.980, test=0.976), total=   1.3s
[CV] C=0.001, solver=lbfgs ...........................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    4.3s remaining:    0.0s


[CV]  C=0.001, solver=lbfgs, score=(train=0.979, test=0.978), total=   1.4s
[CV] C=0.001, solver=lbfgs ...........................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    5.7s remaining:    0.0s


[CV]  C=0.001, solver=lbfgs, score=(train=0.978, test=0.981), total=   1.4s
[CV] C=0.001, solver=sag .............................................
[CV]  C=0.001, solver=sag, score=(train=0.979, test=0.979), total=   1.2s
[CV] C=0.001, solver=sag .............................................
[CV]  C=0.001, solver=sag, score=(train=0.978, test=0.979), total=   2.1s
[CV] C=0.001, solver=sag .............................................
[CV]  C=0.001, solver=sag, score=(train=0.980, test=0.976), total=   1.9s
[CV] C=0.001, solver=sag .............................................
[CV]  C=0.001, solver=sag, score=(train=0.979, test=0.978), total=   2.2s
[CV] C=0.001, solver=sag .............................................
[CV]  C=0.001, solver=sag, score=(train=0.978, test=0.981), total=   2.1s
[CV] C=0.001, solver=saga ............................................
[CV]  C=0.001, solver=saga, score=(train=0.979, test=0.979), total=   8.0s
[CV] C=0.001, solver=saga ...........................

[CV]  C=1, solver=lbfgs, score=(train=0.999, test=0.999), total=   5.9s
[CV] C=1, solver=lbfgs ...............................................
[CV]  C=1, solver=lbfgs, score=(train=0.999, test=0.998), total=   5.9s
[CV] C=1, solver=lbfgs ...............................................
[CV]  C=1, solver=lbfgs, score=(train=0.999, test=0.998), total=   6.2s
[CV] C=1, solver=lbfgs ...............................................
[CV]  C=1, solver=lbfgs, score=(train=0.999, test=0.999), total=   6.5s
[CV] C=1, solver=sag .................................................
[CV] . C=1, solver=sag, score=(train=0.999, test=0.998), total=  40.1s
[CV] C=1, solver=sag .................................................
[CV] . C=1, solver=sag, score=(train=0.999, test=0.999), total=  36.8s
[CV] C=1, solver=sag .................................................
[CV] . C=1, solver=sag, score=(train=0.999, test=0.998), total=  42.6s
[CV] C=1, solver=sag .................................................
[C

[CV]  C=100, solver=newton-cg, score=(train=1.000, test=0.999), total=   7.6s
[CV] C=100, solver=newton-cg .........................................
[CV]  C=100, solver=newton-cg, score=(train=1.000, test=0.999), total=   7.4s
[CV] C=1000, solver=lbfgs ............................................
[CV]  C=1000, solver=lbfgs, score=(train=1.000, test=0.998), total=  30.6s
[CV] C=1000, solver=lbfgs ............................................
[CV]  C=1000, solver=lbfgs, score=(train=1.000, test=0.999), total=  23.8s
[CV] C=1000, solver=lbfgs ............................................
[CV]  C=1000, solver=lbfgs, score=(train=1.000, test=0.998), total=  19.9s
[CV] C=1000, solver=lbfgs ............................................
[CV]  C=1000, solver=lbfgs, score=(train=1.000, test=0.998), total=  24.6s
[CV] C=1000, solver=lbfgs ............................................
[CV]  C=1000, solver=lbfgs, score=(train=1.000, test=0.999), total=  22.0s
[CV] C=1000, solver=sag ...................

[Parallel(n_jobs=1)]: Done 160 out of 160 | elapsed: 398.2min finished


GridSearchCV(cv=5,
             estimator=LogisticRegression(max_iter=10000,
                                          multi_class='multinomial'),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
                         'solver': ['lbfgs', 'sag', 'saga', 'newton-cg']},
             return_train_score=True, scoring='accuracy', verbose=5)

In [9]:
grid_search.best_score_

0.9987309917964129

In [10]:
grid_search.best_params_

{'C': 100, 'solver': 'saga'}

In [11]:
best_LR_clf=LogisticRegression(multi_class='multinomial', max_iter=10000, C=100, solver='saga',n_jobs=-1)

In [12]:
best_LR_clf.fit(X_train, Y_train)

LogisticRegression(C=100, max_iter=10000, multi_class='multinomial', n_jobs=-1,
                   solver='saga')

In [13]:
Y_predict = best_LR_clf.predict(X_test)

In [14]:
accuracy_score(Y_test, Y_predict)

0.9178994082840237

In [15]:

# save the model to disk

joblib.dump(best_LR_clf, 'LR_emg_clf.pkl')

['LR_emg_clf.pkl']