# Day 08. Exercise 03
# Overfitting

## 0. Imports

In [None]:
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV
import joblib

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
import numpy as np
import matplotlib.pyplot as plt

## 1. Preprocessing

1. Read the file `dayofweek.csv` to a dataframe.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`.
3. Use, for example, `value_counts()` to check whether the distribution of classes is similar in train and test.
4. Use the additional parameter `stratify=` and check the distribution again, now it should be more or less similar in both datasets.

In [None]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/dayofweek.csv')
# Bare expression as the last line of the cell so the notebook renders the frame.
df

In [None]:
# Separate the target column from the feature matrix.
y = df['dayofweek']
X = df.drop(columns='dayofweek')

Используя train_test_split с параметрами test_size=0.2, random_state=21 получаем X_train, y_train, X_test, y_test.

In [None]:
# Plain random 80/20 split (no stratification) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)


Используем, например, value_counts(), чтобы проверить, похоже ли распределение классов в обучающей и тестовой выборках.

In [None]:
# Show class shares rather than raw counts: the train and test parts differ in
# size, so only proportions can be compared directly. Sorting by class label
# keeps the rows in the same order in every distribution cell.
y_train.value_counts(normalize=True).sort_index()

In [None]:
# Show class shares rather than raw counts: the train and test parts differ in
# size, so only proportions can be compared directly. Sorting by class label
# keeps the rows in the same order in every distribution cell.
y_test.value_counts(normalize=True).sort_index()

In [None]:
# Same 80/20 split and seed as before, now stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
    stratify=y,
)

In [None]:
# Show class shares rather than raw counts: the train and test parts differ in
# size, so only proportions can be compared directly. Sorting by class label
# keeps the rows in the same order in every distribution cell.
y_train.value_counts(normalize=True).sort_index()

In [None]:
# Show class shares rather than raw counts: the train and test parts differ in
# size, so only proportions can be compared directly. Sorting by class label
# keeps the rows in the same order in every distribution cell.
y_test.value_counts(normalize=True).sort_index()

Видим, что после стратификации доли классов в обучающей и тестовой выборках практически совпадают.

## 2. Baseline models

1. Train exactly the same baseline models from the previous exercise and calculate the accuracies using the test dataset with stratification.
2. Did all the models show similar values of the metric? Which one has the largest difference between the current exercise and the previous one? Put the answer in the markdown cell at the end of the section.

### a. Logreg

In [None]:
# Baseline logistic regression without an intercept term, trained on the
# training split and scored on the test split.
logreg = LogisticRegression(fit_intercept=False, random_state=21).fit(X_train, y_train)
y_pred = logreg.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

Точность изменилась. Была 0.6405693950177936

### b. SVM

In [None]:
# Baseline SVM with a polynomial kernel, trained on the training split and
# scored on the test split.
svc = SVC(kernel='poly', probability=True, random_state=21).fit(X_train, y_train)
y_pred = svc.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

Точность изменилась. Была 0.8594306049822064

### c. Decision tree

In [None]:
# Baseline decision tree capped at depth 10, trained on the training split and
# scored on the test split.
tree = DecisionTreeClassifier(max_depth=10, random_state=21).fit(X_train, y_train)
y_pred = tree.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

Точность изменилась. Была 0.8196915776986952

### d. Random forest

In [None]:
# Baseline random forest (100 trees, depth up to 25), trained on the training
# split and scored on the test split.
forest = RandomForestClassifier(
    n_estimators=100,
    max_depth=25,
    random_state=21,
).fit(X_train, y_train)
y_pred = forest.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

Точность изменилась. Была 1.0

Больше всего изменилась точность у дерева решений.

## 3. Crossvalidation

We could play with the parameters of the model trying to achieve a better accuracy on the test dataset, but it is a bad practice. It leads us again to overfitting. The test dataset is only for checking the quality of the final model.

But there is another way of solving the problem – crossvalidation. It does not use test dataset, but creates one more split of train dataset. Again, there are different ways of doing it, but the common thing is that there is a validation dataset that is used for hyperparameters optimization.

1. Using `cross_val_score` with `cv=10` calculate the mean accuracy and standard deviation for every model that you used before (logreg with `solver='liblinear'`, SVC, decision tree, random forest).

Используя cross_val_score с cv=10, рассчитаем среднюю точность и стандартное отклонение для каждой модели, которую использовали ранее.

### a. Logreg

In [None]:
# 10-fold cross-validation of the baseline logistic regression
# (solver='liblinear', as the task requires). Only the training split is used,
# so the test set stays untouched until the final model is evaluated.
logreg = LogisticRegression(random_state=21, fit_intercept=False, solver='liblinear')
logreg_scores = cross_val_score(
    logreg, X_train, y_train, scoring='accuracy', cv=10, n_jobs=-1
)
print(f'Mean accuracy: {logreg_scores.mean():.5f}')
print(f'Std: {logreg_scores.std():.5f}')

### b. SVM

In [None]:
# 10-fold cross-validation of the baseline SVM with a polynomial kernel. Only
# the training split is used, so the test set stays untouched until the final
# model is evaluated.
svc = SVC(kernel='poly', probability=True, random_state=21)
svc_scores = cross_val_score(
    svc, X_train, y_train, scoring='accuracy', cv=10, n_jobs=-1
)
print(f'Mean accuracy: {svc_scores.mean():.5f}')
print(f'Std: {svc_scores.std():.5f}')

### c. Decision tree

In [None]:
# 10-fold cross-validation of the baseline decision tree (max_depth=10). Only
# the training split is used, so the test set stays untouched until the final
# model is evaluated.
tree = DecisionTreeClassifier(max_depth=10, random_state=21)
tree_scores = cross_val_score(
    tree, X_train, y_train, scoring='accuracy', cv=10, n_jobs=-1
)
print(f'Mean accuracy: {tree_scores.mean():.5f}')
print(f'Std: {tree_scores.std():.5f}')

### d. Random forest

In [None]:
# 10-fold cross-validation of the baseline random forest (100 trees, depth up
# to 25). Only the training split is used, so the test set stays untouched
# until the final model is evaluated.
forest = RandomForestClassifier(max_depth=25, n_estimators=100, random_state=21)
forest_scores = cross_val_score(
    forest, X_train, y_train, scoring='accuracy', cv=10, n_jobs=-1
)
print(f'Mean accuracy: {forest_scores.mean():.5f}')
print(f'Std: {forest_scores.std():.5f}')

## 4. Optimization

1. Choose the best model and play a little bit with the parameters on cross-validation, find a good enough parameter or a combination of the parameters.
2. Calculate the accuracy for the final model on the test dataset.
3. Draw a plot that displays the top-10 most important features for that model.
4. Save the model using `joblib`.
5. Load the model, make predictions for the test dataset and calculate the accuracy.

Выберем лучшую модель и подберём её параметры на кросс-валидации.

Рассчитаем точность окончательной модели на тестовом наборе данных.

In [None]:
# Final model: a random forest with 60 trees and depth up to 20, trained on the
# training split and scored once on the test split.
model = RandomForestClassifier(
    n_estimators=60,
    max_depth=20,
    random_state=21,
).fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

Нарисуем график, отображающий 10 наиболее важных признаков этой модели.

In [None]:
def plot_feature_importances(importances, feature_names, top_n=10):
    """Draw a horizontal bar chart of the ``top_n`` largest importances.

    Args:
        importances: One importance value per feature.
        feature_names: Feature labels, in the same order as ``importances``.
        top_n: How many of the highest-ranked features to show.
    """
    ranked = pd.Series(importances, index=feature_names).nlargest(top_n)
    fig, ax = plt.subplots(figsize=(15, 8))
    # barh stacks bars from the bottom up, so reverse the order to put the
    # most important feature at the top of the chart.
    ax.barh(ranked.index[::-1], ranked.values[::-1])
    ax.set_xlabel('Importance')
    ax.set_title(f'Top-{len(ranked)} most important features')
    plt.show()


plot_feature_importances(model.feature_importances_, X.columns)

Сохраним модель с помощью joblib.

In [None]:
# Persist the fitted model to a file in the current working directory.
joblib.dump(model, 'best_model.joblib')


Загрузим модель, сделаем прогнозы для тестового набора данных и рассчитаем точность.

In [None]:
# Round-trip check: reload the saved model and score it on the test split.
best_model = joblib.load('best_model.joblib')
y_pred = best_model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)