# Day 09. Exercise 00
# Regularization

## 0. Imports

In [1]:
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, plot_confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

ModuleNotFoundError: No module named 'scipy.sparse'

## 1. Preprocessing

1. Read the file `dayofweek.csv` that you used on the previous day into a dataframe.
2. Using `train_test_split` with the parameters `test_size=0.2` and `random_state=21`, get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [None]:
# Load the day-of-week dataset; the path is relative to the current working directory.
df = pd.read_csv('../data/dayofweek.csv')
# Bare expression as the last line of the cell so the notebook renders the dataframe.
df

In [None]:
# `dayofweek` is the target; every remaining column is used as a feature.
y = df['dayofweek']
x = df.drop(columns='dayofweek')

In [None]:
# 80/20 hold-out split; stratifying on `y` keeps the class proportions the
# same in both parts, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=21,
)

## 2. Logreg regularization

### a. Default regularization

1. Train a baseline model using only the parameters `random_state=21` and `fit_intercept=False`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.


The result of the code where you trained and evaluated the baseline model should be exactly like this (use `%%time` to get the info about how long it took to run the cell):

```
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63674   |   valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
```

In [None]:
# Baseline logistic regression: no intercept term, fixed seed, everything
# else (including the regularization settings) left at the library defaults.
lr = LogisticRegression(
    random_state=21,
    fit_intercept=False,
)

In [None]:
def crossval(estimator, X, y, n_splits=10):
    """Report train/validation accuracy of ``estimator`` over stratified folds.

    The estimator is refit in place on every fold, so after the call it holds
    the fit from the last fold. One line is printed per fold, followed by the
    mean and the standard deviation of the validation accuracies.

    Args:
        estimator: Object exposing ``fit(X, y)`` and ``predict(X)``.
        X: Feature table; rows are selected positionally through ``.iloc``.
        y: Target values; rows are selected positionally through ``.iloc``.
        n_splits: Number of stratified folds.

    Returns:
        None. The results are only printed.
    """
    folds = StratifiedKFold(n_splits=n_splits)
    fold_scores = []
    for train_idx, valid_idx in folds.split(X, y):
        X_fit, y_fit = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[valid_idx], y.iloc[valid_idx]
        estimator.fit(X_fit, y_fit)
        # Score on the data the model was fitted on first, then on the
        # held-out part of the fold.
        train_acc = accuracy_score(y_fit, estimator.predict(X_fit))
        valid_acc = accuracy_score(y_val, estimator.predict(X_val))
        fold_scores.append((train_acc, valid_acc))

    for train_acc, valid_acc in fold_scores:
        print(f'train -  {train_acc:.5f}   |   valid -  {valid_acc:.5f}')

    valid_only = [valid_acc for _, valid_acc in fold_scores]
    print(f'Average accuracy on crossval is {np.mean(valid_only):.5f}')
    print(f'Std is {np.std(valid_only):.5f}')

In [None]:
%%time
# Per-fold accuracy report for the baseline logistic regression (default 10 folds).
crossval(lr, X_train, y_train)

### b. Optimizing regularization parameters

1. In the cells below, try different values of `penalty` (`none`, `l1`, `l2`); you can change the value of `solver` too.

In [None]:
# Each dict pairs solvers only with penalties they support. Every
# (solver, penalty) pair is listed exactly once: 'saga' used to appear in the
# first dict as well, which made the search fit saga+l1 and saga+l2 twice.
# NOTE(review): the string 'none' matches scikit-learn releases that still
# provide `plot_confusion_matrix` (imported at the top of this notebook);
# newer releases expect `None` instead -- confirm against the installed version.
param_grid = [
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2']},
    {'solver': ['saga'], 'penalty': ['l1', 'l2', 'none']},
    {'solver': ['newton-cg', 'lbfgs', 'sag'], 'penalty': ['l2', 'none']},
]

# Exhaustive search scored by accuracy, using all available cores.
gs = GridSearchCV(lr, param_grid, scoring='accuracy', n_jobs=-1)
gs.fit(X_train, y_train)

In [None]:
# Solver/penalty combination with the best mean cross-validated accuracy.
gs.best_params_

In [None]:
# Mean cross-validated accuracy achieved by that combination.
gs.best_score_

## 3. SVM regularization

### a. Default regularization

1. Train a baseline model using only the parameters `probability=True`, `kernel='linear'`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [None]:
# Baseline linear-kernel SVM with probability estimates enabled and a fixed
# seed; the regularization strength `C` stays at the library default.
svc = SVC(
    kernel='linear',
    probability=True,
    random_state=21,
)

In [None]:
%%time
# Per-fold accuracy report for the baseline SVM (default 10 folds).
crossval(svc, X_train, y_train)

### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `C`.

In [None]:
# Candidate values for the SVM regularization parameter `C`.
param_grid = {
    'C': [15, 40, 60, 100, 150],
}
# Exhaustive search scored by accuracy, using all available cores.
gs = GridSearchCV(
    estimator=svc,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
)
gs.fit(X_train, y_train)

In [None]:
# Value of `C` with the best mean cross-validated accuracy.
gs.best_params_

In [None]:
# Mean cross-validated accuracy achieved with that `C`.
gs.best_score_

## 4. Tree

### a. Default regularization

1. Train a baseline model using only the parameters `max_depth=10` and `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [None]:
# Baseline decision tree: depth capped at 10, fixed seed, other settings default.
dtc = DecisionTreeClassifier(
    random_state=21,
    max_depth=10,
)

In [None]:
%%time
# Per-fold accuracy report for the baseline decision tree (default 10 folds).
crossval(dtc, X_train, y_train)

### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `max_depth`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [None]:
# Search over the split strategy and tree depths 2..24 (the upper bound of
# `np.arange` is exclusive).
param_grid = {
    'splitter': ['best', 'random'],
    'max_depth': np.arange(2, 25),
}
# Exhaustive search scored by accuracy, using all available cores.
gs = GridSearchCV(
    estimator=dtc,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
)
gs.fit(X_train, y_train)

In [None]:
# Splitter/depth combination with the best mean cross-validated accuracy.
gs.best_params_

In [None]:
# Mean cross-validated accuracy achieved by that combination.
gs.best_score_

## 5. Random forest

### a. Default regularization

1. Train a baseline model using only the parameters `n_estimators=50`, `max_depth=14`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [None]:
# Baseline random forest: 50 trees, depth capped at 14, fixed seed.
rfc = RandomForestClassifier(
    n_estimators=50,
    max_depth=14,
    random_state=21,
)

In [None]:
%%time
# Per-fold accuracy report for the baseline random forest (default 10 folds).
crossval(rfc, X_train, y_train)

### b. Optimizing regularization parameters

1. In the new cells try different values of the parameters `max_depth` and `n_estimators`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [None]:
# 'auto' was dropped from `max_features`: for RandomForestClassifier it is an
# alias of 'sqrt' (deprecated and later removed from scikit-learn), so keeping
# both made the search fit one third of its candidates twice. If 'auto' was
# previously reported as the best value, 'sqrt' is the same model.
# Depths 15..29 and 50..149 trees are searched (the upper bound of `np.arange`
# is exclusive).
param_grid = {
    'max_features': ['sqrt', 'log2'],
    'max_depth': np.arange(15, 30),
    'n_estimators': np.arange(50, 150),
}
# Exhaustive search scored by accuracy, using all available cores.
gs = GridSearchCV(rfc, param_grid, scoring='accuracy', n_jobs=-1)
gs.fit(X_train, y_train)

In [None]:
# Forest hyper-parameter combination with the best mean cross-validated accuracy.
gs.best_params_

In [None]:
# Mean cross-validated accuracy achieved by that combination.
gs.best_score_

## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.
3. Analyze: determine the weekday for which your model makes the most errors (as a percentage of the total number of samples of that class in your test dataset).
4. Save the model.

In [None]:
# `gs` is the most recently fitted search (the random-forest one); with the
# default refit, `gs.predict` delegates to its refit best estimator.
# NOTE(review): the earlier searches were overwritten, so this presumes the
# forest is the best model -- confirm by comparing the `best_score_` values.
y_pred = gs.predict(X_test)

In [None]:
# Final accuracy on the held-out test split.
accuracy_score(y_test, y_pred)

In [None]:
# Plot the confusion matrix of the refit best estimator on the test split,
# to inspect which weekday is misclassified most often.
ConfusionMatrixDisplay.from_estimator(gs.best_estimator_, X_test, y_test)

In [None]:
# Persist the refit best estimator to disk; the path is relative to the
# current working directory.
joblib.dump(gs.best_estimator_, '../data/model.joblib')