# Day 09. Exercise 00
# Regularization

## 0. Imports

In [183]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.base import clone
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

import joblib

In [73]:
def crossval(Model, X, y, n_splits: int = 10):
    """Run stratified K-fold cross-validation and print per-fold accuracy.

    For each fold an unfitted copy of ``Model`` is trained on the training
    part of the split and scored (accuracy) on both the training and the
    validation part. One line is printed per fold, followed by the mean and
    the standard deviation of the validation accuracies.

    Parameters
    ----------
    Model : scikit-learn estimator instance
        Template estimator. It is cloned for every fold, so the object passed
        in is left untouched and no fitted state leaks between folds.
    X : object supporting positional ``.iloc`` indexing (e.g. pandas.DataFrame)
        Feature matrix.
    y : object supporting positional ``.iloc`` indexing (e.g. pandas.Series)
        Class labels; also used to stratify the folds.
    n_splits : int, default 10
        Number of folds passed to ``StratifiedKFold``.

    Returns
    -------
    None
        Results are only printed.
    """
    scores = []
    kf = StratifiedKFold(n_splits=n_splits)
    for train, test in kf.split(X, y):
        X_fit, y_fit = X.iloc[train], y.iloc[train]
        X_val, y_val = X.iloc[test], y.iloc[test]

        # A fresh clone per fold keeps the folds independent and avoids
        # mutating the estimator owned by the caller.
        clf = clone(Model).fit(X_fit, y_fit)

        accuracy_train = accuracy_score(y_fit, clf.predict(X_fit))
        accuracy_test = accuracy_score(y_val, clf.predict(X_val))
        scores.append(accuracy_test)
        print(f'train - {accuracy_train:.5f}',
              f'valid - {accuracy_test:.5f}', sep=' \t|\t')
    print(f'Average accuracy on crossval is {np.mean(scores):.5f}')
    print(f'STD is {np.std(scores):.5f}')

## 1. Preprocessing

1. Read the file `dayofweek.csv` that you used in the previous day to a dataframe.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [56]:
# Load the dataset and preview its first rows.
df = pd.read_csv(filepath_or_buffer='../data/dayofweek.csv')
df.head(n=5)

Unnamed: 0,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,uid_user_16,uid_user_17,...,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,numTrials,hour,dayofweek
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.788667,-2.562352,4
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.756764,-2.562352,4
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.724861,-2.562352,4
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.692958,-2.562352,4
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.661055,-2.562352,4


In [57]:
# Target is the weekday column; every other column is a feature.
y = df['dayofweek']
X = df.drop('dayofweek', axis=1)

In [58]:
# Stratified 80/20 split so each weekday keeps its share in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=21,
    test_size=0.2,
)

## 2. Logreg regularization

### a. Default regularization

1. Train a baseline model with the only parameters `random_state=21`, `fit_intercept=False`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model


The result of the code where you trained and evaluated the baseline model should be exactly like this (use `%%time` to get the info about how long it took to run the cell):

```
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63674   |   valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
```

In [78]:
%%time
# Baseline logistic regression with default regularization settings.
clf_logreg = LogisticRegression(fit_intercept=False, random_state=21)
crossval(clf_logreg, X_train, y_train)
print()  # blank line to separate the scores from the %%time report

train - 0.62902 	|	valid - 0.59259
train - 0.64633 	|	valid - 0.62963
train - 0.63479 	|	valid - 0.56296
train - 0.65622 	|	valid - 0.61481
train - 0.63397 	|	valid - 0.57778
train - 0.64056 	|	valid - 0.59259
train - 0.64138 	|	valid - 0.65926
train - 0.65952 	|	valid - 0.56296
train - 0.64333 	|	valid - 0.59701
train - 0.63674 	|	valid - 0.62687
Average accuracy on crossval is 0.60165
STD is 0.02943

CPU times: user 2.27 s, sys: 72.8 ms, total: 2.34 s
Wall time: 322 ms


### b. Optimizing regularization parameters

1. In the cells below try different values of penalty: `none`, `l1`, `l2` – you can change the values of solver too.

In [82]:
%%time
# Logistic regression without any penalty term.
clf_logreg_none = LogisticRegression(
    solver='newton-cg',
    penalty='none',
    fit_intercept=False,
    random_state=21,
)
crossval(clf_logreg_none, X_train, y_train)
print()  # blank line to separate the scores from the %%time report

train - 0.66694 	|	valid - 0.63704
train - 0.65787 	|	valid - 0.65926
train - 0.66694 	|	valid - 0.57778
train - 0.66529 	|	valid - 0.62963
train - 0.66694 	|	valid - 0.62222
train - 0.65952 	|	valid - 0.57778
train - 0.65045 	|	valid - 0.69630
train - 0.68425 	|	valid - 0.61481
train - 0.66474 	|	valid - 0.62687
train - 0.65651 	|	valid - 0.60448
Average accuracy on crossval is 0.62462
STD is 0.03379

CPU times: user 19.1 s, sys: 468 ms, total: 19.6 s
Wall time: 2.58 s


In [86]:
%%time
# Logistic regression with an L1 penalty, fitted by the liblinear solver.
clf_logreg_l1 = LogisticRegression(
    solver='liblinear',
    penalty='l1',
    fit_intercept=False,
    random_state=21,
)
crossval(clf_logreg_l1, X_train, y_train)
print()

train - 0.61830 	|	valid - 0.54815
train - 0.62737 	|	valid - 0.62222
train - 0.60511 	|	valid - 0.54074
train - 0.63644 	|	valid - 0.62222
train - 0.62407 	|	valid - 0.55556
train - 0.62325 	|	valid - 0.58519
train - 0.61253 	|	valid - 0.63704
train - 0.64716 	|	valid - 0.58519
train - 0.63015 	|	valid - 0.59701
train - 0.61367 	|	valid - 0.59701
Average accuracy on crossval is 0.58903
STD is 0.03129

CPU times: user 804 ms, sys: 298 ms, total: 1.1 s
Wall time: 177 ms


In [90]:
%%time
# Logistic regression with an L2 penalty, fitted by the newton-cg solver.
clf_logreg_l2_newton = LogisticRegression(
    solver='newton-cg',
    penalty='l2',
    fit_intercept=False,
    random_state=21,
)
crossval(clf_logreg_l2_newton, X_train, y_train)
print()

train - 0.62902 	|	valid - 0.59259
train - 0.64633 	|	valid - 0.62963
train - 0.63479 	|	valid - 0.56296
train - 0.65622 	|	valid - 0.61481
train - 0.63397 	|	valid - 0.57778
train - 0.64056 	|	valid - 0.59259
train - 0.64221 	|	valid - 0.65926
train - 0.65952 	|	valid - 0.56296
train - 0.64333 	|	valid - 0.59701
train - 0.63674 	|	valid - 0.62687
Average accuracy on crossval is 0.60165
STD is 0.02943

CPU times: user 3.59 s, sys: 172 ms, total: 3.76 s
Wall time: 502 ms


In [100]:
# Solver/penalty combinations to search, one sub-grid per solver family.
parameters = [
    {
        'solver': ['newton-cg', 'lbfgs', 'sag'],
        'penalty': ['l2', 'none'],
    },
    {
        'solver': ['liblinear'],
        'penalty': ['l1', 'l2'],
    },
    {
        'solver': ['saga'],
        'penalty': ['l1', 'l2', 'none'],
    },
]
clf_logreg_ = LogisticRegression(max_iter=2000, fit_intercept=False, random_state=21)
# GridSearchCV.fit returns the fitted search object itself.
clf = GridSearchCV(estimator=clf_logreg_, param_grid=parameters, n_jobs=-1)
clf.fit(X_train, y_train)
clf.best_params_

{'penalty': 'none', 'solver': 'lbfgs'}

In [106]:
%%time
# Re-evaluate the combination picked by the grid search with 10-fold CV.
clf_logreg_best = LogisticRegression(
    solver='lbfgs',
    penalty='none',
    max_iter=400,
    fit_intercept=False,
    random_state=21,
)
crossval(clf_logreg_best, X_train, y_train)
print()

train - 0.66612 	|	valid - 0.63704
train - 0.65787 	|	valid - 0.65926
train - 0.66694 	|	valid - 0.57778
train - 0.66529 	|	valid - 0.62963
train - 0.66777 	|	valid - 0.62222
train - 0.65952 	|	valid - 0.57778
train - 0.65045 	|	valid - 0.69630
train - 0.68425 	|	valid - 0.61481
train - 0.66474 	|	valid - 0.62687
train - 0.65651 	|	valid - 0.60448
Average accuracy on crossval is 0.62462
STD is 0.03379

CPU times: user 9.33 s, sys: 267 ms, total: 9.59 s
Wall time: 1.24 s


## 3. SVM regularization

### a. Default regularization

1. Train a baseline model with the only parameters `probability=True`, `kernel='linear'`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [107]:
# Baseline linear-kernel SVM.
cls_svc = SVC(kernel='linear', probability=True, random_state=21)
crossval(cls_svc, X_train, y_train)

train - 0.70486 	|	valid - 0.65926
train - 0.69662 	|	valid - 0.75556
train - 0.69415 	|	valid - 0.62222
train - 0.70239 	|	valid - 0.65185
train - 0.69085 	|	valid - 0.65185
train - 0.68920 	|	valid - 0.64444
train - 0.69250 	|	valid - 0.72593
train - 0.70074 	|	valid - 0.62222
train - 0.69605 	|	valid - 0.61940
train - 0.71087 	|	valid - 0.63433
Average accuracy on crossval is 0.65871
STD is 0.04359


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `C`.

In [130]:
# Search integer values of C from 1 to 50 for the linear SVM.
parameters = [{'C': range(1, 51)}]
cls_svc = SVC(kernel='linear', probability=True, random_state=21)
# GridSearchCV.fit returns the fitted search object itself.
clf = GridSearchCV(estimator=cls_svc, param_grid=parameters, n_jobs=-1)
clf.fit(X_train, y_train)
clf.best_params_

{'C': 49}

In [131]:
# Evaluate the linear SVM with the C value selected by the grid search.
crossval(
    SVC(kernel='linear', C=49, probability=True, random_state=21),
    X_train,
    y_train,
)

train - 0.78401 	|	valid - 0.75556
train - 0.79720 	|	valid - 0.83704
train - 0.80709 	|	valid - 0.72593
train - 0.78236 	|	valid - 0.76296
train - 0.78483 	|	valid - 0.77778
train - 0.79637 	|	valid - 0.74074
train - 0.78236 	|	valid - 0.77037
train - 0.79967 	|	valid - 0.72593
train - 0.79325 	|	valid - 0.70896
train - 0.80066 	|	valid - 0.73881
Average accuracy on crossval is 0.75441
STD is 0.03438


## 4. Tree

### a. Default regularization

1. Train a baseline model with only the parameters `max_depth=10` and `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [120]:
# Baseline decision tree limited to depth 10.
clf_tree = DecisionTreeClassifier(random_state=21, max_depth=10)
crossval(clf_tree, X_train, y_train)

train - 0.81039 	|	valid - 0.74815
train - 0.77741 	|	valid - 0.74074
train - 0.83347 	|	valid - 0.70370
train - 0.79720 	|	valid - 0.77037
train - 0.82440 	|	valid - 0.75556
train - 0.80379 	|	valid - 0.68889
train - 0.80709 	|	valid - 0.76296
train - 0.80132 	|	valid - 0.65926
train - 0.80807 	|	valid - 0.74627
train - 0.80478 	|	valid - 0.68657
Average accuracy on crossval is 0.72625
STD is 0.03635


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `max_depth`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [139]:
# Grid over depth and the two minimum-sample constraints.
# max_leaf_nodes (range(2, 15)) is deliberately left out of this search.
parameters = {
    'max_depth': range(3, 30),
    'min_samples_split': range(2, 15),
    'min_samples_leaf': range(1, 10),
}
clf_tree_ = DecisionTreeClassifier(random_state=21)
# GridSearchCV.fit returns the fitted search object itself.
clf = GridSearchCV(estimator=clf_tree_, param_grid=parameters, n_jobs=-1)
clf.fit(X_train, y_train)
clf.best_params_

{'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2}

In [141]:
# Decision tree with the hyperparameters reported by the grid search.
clf_tree_best = DecisionTreeClassifier(
    random_state=21,
    max_depth=20,
    min_samples_split=2,
    min_samples_leaf=1,
)
crossval(clf_tree_best, X_train, y_train)

train - 0.98928 	|	valid - 0.86667
train - 0.99011 	|	valid - 0.89630
train - 0.98681 	|	valid - 0.85185
train - 0.98763 	|	valid - 0.90370
train - 0.98928 	|	valid - 0.88148
train - 0.98186 	|	valid - 0.86667
train - 0.98846 	|	valid - 0.91852
train - 0.99093 	|	valid - 0.89630
train - 0.99094 	|	valid - 0.88060
train - 0.98847 	|	valid - 0.88060
Average accuracy on crossval is 0.88427
STD is 0.01883


## 5. Random forest

### a. Default regularization

1. Train a baseline model with the only parameters `n_estimators=50`, `max_depth=14`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [142]:
# Baseline random forest: 50 trees, each limited to depth 14.
clf_forest = RandomForestClassifier(
    max_depth=14,
    n_estimators=50,
    random_state=21,
)
crossval(clf_forest, X_train, y_train)

train - 0.96373 	|	valid - 0.87407
train - 0.97032 	|	valid - 0.91111
train - 0.96867 	|	valid - 0.88889
train - 0.97279 	|	valid - 0.91111
train - 0.96785 	|	valid - 0.91111
train - 0.96620 	|	valid - 0.85185
train - 0.96867 	|	valid - 0.91111
train - 0.96702 	|	valid - 0.85185
train - 0.97199 	|	valid - 0.88060
train - 0.96458 	|	valid - 0.85075
Average accuracy on crossval is 0.88425
STD is 0.02499


### b. Optimizing regularization parameters

1. In the new cells try different values of the parameters `max_depth` and `n_estimators`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [153]:
# Grid over tree depth and forest size. Wider ranges were tried earlier:
# max_depth in range(3, 30) and n_estimators in range(10, 301, 10).
# min_samples_split was also tested with range(2, 15); it and
# min_samples_leaf (range(1, 10)) are not part of this grid.
# NOTE(review): the recorded output of this cell lists min_samples_split,
# so it appears to come from an earlier version of the grid - re-run to refresh.
parameters = {
    'max_depth': range(10, 30),
    'n_estimators': range(10, 201, 10),
}
clf_forest_ = RandomForestClassifier(random_state=21)
# GridSearchCV.fit returns the fitted search object itself.
clf = GridSearchCV(estimator=clf_forest_, param_grid=parameters, n_jobs=-1)
clf.fit(X_train, y_train)
clf.best_params_

{'max_depth': 26, 'min_samples_split': 2, 'n_estimators': 100}

In [154]:
# Random forest with the hyperparameters reported by the grid search.
clf_forest_ = RandomForestClassifier(
    max_depth=26,
    n_estimators=100,
    min_samples_split=2,
    random_state=21,
)
crossval(clf_forest_, X_train, y_train)

train - 0.99918 	|	valid - 0.89630
train - 0.99918 	|	valid - 0.95556
train - 0.99918 	|	valid - 0.88889
train - 1.00000 	|	valid - 0.94074
train - 0.99918 	|	valid - 0.91852
train - 0.99918 	|	valid - 0.89630
train - 0.99918 	|	valid - 0.91111
train - 1.00000 	|	valid - 0.89630
train - 1.00000 	|	valid - 0.94030
train - 1.00000 	|	valid - 0.89552
Average accuracy on crossval is 0.91395
STD is 0.02253


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.
3. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your test dataset).
4. Save the model.

In [161]:
# Final model: the tuned random forest, trained on the full training split
# and scored once on the held-out test split.
clf_final = RandomForestClassifier(
    max_depth=26,
    n_estimators=100,
    min_samples_split=2,
    random_state=21,
)
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)

accuracy_score(y_true=y_test, y_pred=y_pred)

0.9260355029585798

In [178]:
# Put true and predicted labels side by side, then count the misclassified
# samples per true weekday.
compare = pd.DataFrame({'y_real': y_test, 'y_pred': y_pred})
stats = (
    compare.loc[compare.y_real.ne(compare.y_pred), 'y_real']
    .value_counts()
    .sort_index()
)
stats

0    7
1    4
2    2
3    3
4    3
5    3
6    3
Name: y_real, dtype: int64

In [182]:
# Error share per weekday, in % of that weekday's samples in the test set.
# Computing it for every class (instead of only class 0) lets the data decide
# which day is worst: the largest absolute error count is not necessarily the
# largest percentage when class sizes differ.
class_totals = compare.y_real.value_counts().sort_index()
# Weekdays with no misclassifications are absent from `stats`; count them as 0.
error_pct = stats.reindex(class_totals.index, fill_value=0) / class_totals * 100
worst_day = error_pct.idxmax()  # weekday label with the highest error share
mistake = error_pct[worst_day]
mistake

25.925925925925924

> For `Mondays` the model makes the most errors

> `26%` of all Mondays were guessed wrong

In [184]:
# Persist the fitted final model next to the notebook.
joblib.dump(value=clf_final, filename='RandForestClf.joblib')

['RandForestClf.joblib']