# Day 09. Exercise 01
# Gridsearch

## 0. Imports

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection \
    import train_test_split, GridSearchCV, cross_val_score, ParameterGrid
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

from tqdm.notebook import tqdm

## 1. Preprocessing

1. Read the file [`day-of-week-not-scaled.csv`](https://drive.google.com/file/d/1AlGvsJDSzPT_70caausx8bFuupIEZkfh/view?usp=sharing). It is similar to the one from the previous exercise, but this time we did not scale continuous features (we are not going to use logreg anymore).
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [5]:
df = pd.read_csv('../data/dayofweek-not-scaled.csv')
df.head()

Unnamed: 0,numTrials,hour,dayofweek,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,1,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [6]:
X = df.drop(columns=['dayofweek'])
y = df.dayofweek

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=.2,
                                                    random_state=21,
                                                    stratify=y)

## 2. SVM gridsearch

1. Using `GridSearchCV` try different parameters of kernel (`linear`, `rbf`, `sigmoid`), C (`0.01`, `0.1`, `1`, `1.5`, `5`, `10`), gamma (`scale`, `auto`), class_weight (`balanced`, `None`) use `random_state=21` and `probability=True` and get the best combination of them in terms of accuracy.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`. Check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [8]:
parameters = {
    'kernel': ['linear', 'rbf', 'sigmoid'],
    'C': [.01, .1, 1, 1.5, 5, 10],
    'gamma': ['scale', 'auto'],
    'class_weight': ['balanced', None]
}

cls_svc = SVC(probability=True, random_state=21)
clf = GridSearchCV(cls_svc, parameters, n_jobs=-1, scoring='accuracy').fit(X_train, y_train)
clf.best_params_

{'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf'}

In [9]:
cv_results = pd.DataFrame(clf.cv_results_).set_index('rank_test_score').sort_index()
# cv_results.head(15)
cv_results.head()

Unnamed: 0_level_0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,0.55257,0.006706,0.038345,0.001143,10,,auto,rbf,"{'C': 10, 'class_weight': None, 'gamma': 'auto...",0.9,0.848148,0.885185,0.884758,0.862454,0.876109,0.018419
2,0.561443,0.007748,0.036995,0.002214,10,balanced,auto,rbf,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.877778,0.851852,0.862963,0.873606,0.851301,0.8635,0.01087
3,0.52272,0.017769,0.038577,0.000515,5,,auto,rbf,"{'C': 5, 'class_weight': None, 'gamma': 'auto'...",0.825926,0.811111,0.818519,0.821561,0.802974,0.816018,0.008116
4,0.539723,0.007726,0.040516,0.003799,5,balanced,auto,rbf,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.844444,0.785185,0.792593,0.817844,0.802974,0.808608,0.021007
5,37.734621,3.523031,0.01272,0.004298,10,balanced,auto,linear,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.72963,0.7,0.755556,0.754647,0.665428,0.721052,0.034438


> There's a significant difference in terms of accuracy and fit time combined

## 3. Decision tree

1. Using `GridSearchCV` try different parameters of `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use `random_state=21`.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [10]:
parameters = {
    'max_depth': range(1, 49),
    'class_weight': ['balanced', None],
    'criterion': ['entropy', 'gini'],
}
clf_tree_ = DecisionTreeClassifier(random_state=21)
clf = GridSearchCV(clf_tree_, parameters, n_jobs=-1, scoring='accuracy').fit(X_train, y_train)
clf.best_params_

{'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 22}

In [11]:
cv_results = pd.DataFrame(clf.cv_results_).set_index('rank_test_score').sort_index()
# cv_results.head(15)
cv_results.head()

Unnamed: 0_level_0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,0.004238,0.001597,0.001044,0.0006,balanced,gini,22,"{'class_weight': 'balanced', 'criterion': 'gin...",0.885185,0.862963,0.903704,0.881041,0.832714,0.873121,0.023998
2,0.003743,0.001704,0.000849,0.000333,balanced,gini,21,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.859259,0.903704,0.884758,0.828996,0.873121,0.0263
3,0.002906,8.7e-05,0.000716,1e-05,balanced,gini,48,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.866667,0.903704,0.873606,0.832714,0.873116,0.023911
3,0.003876,0.001542,0.001,0.00052,balanced,gini,46,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.866667,0.903704,0.873606,0.832714,0.873116,0.023911
3,0.004624,0.002128,0.001014,0.000471,balanced,gini,45,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.866667,0.903704,0.873606,0.832714,0.873116,0.023911


> There's no significant difference neither in terms of time nor in accuracy score

## 4. Random forest

1. Using `GridSearchCV` try different parameters of `n_estimators` (`5`, `10`, `50`, `100`), `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use `random_state=21`.
2. Create a dataframe from the results of the gridsearch and sort it ascendengly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [12]:
parameters = {
    'max_depth': range(1, 49),
    'n_estimators': [5, 10, 50, 100],
    'class_weight': ['balanced', None],
    'criterion': ['entropy', 'gini'],
}
clf_forest_ = RandomForestClassifier(random_state=21)
clf = GridSearchCV(clf_forest_, parameters, n_jobs=-1, scoring='accuracy').fit(X_train, y_train)
clf.best_params_

{'class_weight': None,
 'criterion': 'gini',
 'max_depth': 28,
 'n_estimators': 50}

In [13]:
columns = ['rank_test_score', 'mean_fit_time', 'mean_score_time',
           'param_class_weight', 'param_criterion', 'param_max_depth', 'param_n_estimators',
           'mean_test_score', 'std_test_score']
cv_results = pd.DataFrame(clf.cv_results_, columns=columns).set_index('rank_test_score').sort_index()
cv_results.head(15)
# cv_results.head()

Unnamed: 0_level_0,mean_fit_time,mean_score_time,param_class_weight,param_criterion,param_max_depth,param_n_estimators,mean_test_score,std_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0.071462,0.004372,,gini,28,50,0.90429,0.010961
2,0.139701,0.010035,,gini,31,100,0.903547,0.01438
3,0.068241,0.004562,balanced,gini,30,50,0.902817,0.013554
4,0.071291,0.004255,balanced,gini,34,50,0.902809,0.01301
5,0.130957,0.009804,,gini,48,100,0.902806,0.01046
5,0.13894,0.011017,,gini,36,100,0.902806,0.01046
5,0.13626,0.01161,,gini,37,100,0.902806,0.01046
5,0.139518,0.009353,,gini,38,100,0.902806,0.01046
5,0.139893,0.009061,,gini,39,100,0.902806,0.01046
5,0.138116,0.009704,,gini,40,100,0.902806,0.01046


> There's almost no difference but the first place params take less time and have the lowest possible subparams

## 5. Progress bar

Gridsearch can be a quite long process and you may find yourself wondering when it will end.
1. Create a manual gridsearch for the same parameters values of random forest iterating through the list of the possible values and calculating `cross_val_score` for each combination. Try to increase `n_jobs`. The value `cv` for `cross_val_score` is 5.
2. Track the progress using the library `tqdm.notebook`.
3. Create a dataframe from the results of the gridsearch with the columns corresponding to the names of the parameters and `mean_accuracy` and `std_accuracy`.
4. Sort it descendingly by the `mean_accuracy`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [14]:
params_grid = list(ParameterGrid(parameters))
results = pd.DataFrame({'class_weight': [],
                        'criterion': [],
                        'max_depth': [],
                        'n_estimators': [],
                        'mean_accuracy': [],
                        'std_accuracy': [],
                       })

In [15]:
for params in tqdm(params_grid):
    forest = RandomForestClassifier(random_state=21, n_jobs=-1, **params)
    cross_val_res = cross_val_score(forest, X_train, y_train, n_jobs=-1, cv=5)
    results.loc[len(results.index)] = (list(params.values()) + [cross_val_res.mean(), cross_val_res.std()])
    

  0%|          | 0/768 [00:00<?, ?it/s]

In [19]:
results.sort_values('mean_accuracy', ascending=False, ignore_index=True)

Unnamed: 0,class_weight,criterion,max_depth,n_estimators,mean_accuracy,std_accuracy
0,,gini,28,50,0.904290,0.010961
1,,gini,31,100,0.903547,0.014380
2,balanced,gini,30,50,0.902817,0.013554
3,balanced,gini,34,50,0.902809,0.013010
4,,gini,41,100,0.902806,0.010460
...,...,...,...,...,...,...
763,,entropy,1,5,0.353832,0.016467
764,balanced,entropy,2,5,0.353110,0.021165
765,balanced,gini,2,5,0.346419,0.029749
766,balanced,gini,1,5,0.283390,0.011062


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.

In [20]:
final_forest = RandomForestClassifier(random_state=21, n_jobs=-1,
                                      class_weight=None, criterion='gini',
                                      max_depth=28, n_estimators=50)
final_forest.fit(X_train, y_train)
y_pred = final_forest.predict(X_test)
accuracy_score(y_test, y_pred)

0.9289940828402367