# Day 09. Exercise 01
# Gridsearch

## 0. Imports

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

## 1. Preprocessing

1. Read the file [`day-of-week-not-scaled.csv`](https://drive.google.com/file/d/1AlGvsJDSzPT_70caausx8bFuupIEZkfh/view?usp=sharing). It is similar to the one from the previous exercise, but this time we did not scale continuous features (we are not going to use logreg anymore).
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [2]:
# Load the unscaled day-of-week dataset and preview its first rows.
df = pd.read_csv(
    '../data/day-of-week-not-scaled.csv',
)
df.head(n=5)

Unnamed: 0,numTrials,hour,dayofweek,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,1,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,3,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,4,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [3]:
# Separate the `dayofweek` target from the remaining feature columns.
y = df['dayofweek']
X = df.drop(columns=['dayofweek'])

# Hold out 20% of the rows for the final evaluation; stratifying on the
# target keeps the class proportions similar in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
    stratify=y,
)

## 2. SVM gridsearch

1. Using `GridSearchCV`, try different values of `kernel` (`linear`, `rbf`, `sigmoid`), `C` (`0.01`, `0.1`, `1`, `1.5`, `5`, `10`), `gamma` (`scale`, `auto`) and `class_weight` (`balanced`, `None`). Use `random_state=21` and `probability=True`, and get the best combination of them in terms of accuracy.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`. Check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [4]:
# Search space for the SVM: every combination of kernel, C, gamma and
# class weighting is cross-validated.
# NOTE: the linear kernel does not use `gamma`, so its 'scale'/'auto' rows
# come out with identical scores in the results table.
params = dict(
    kernel=['linear', 'rbf', 'sigmoid'],
    C=[0.01, 0.1, 1, 1.5, 5, 10],
    gamma=['scale', 'auto'],
    class_weight=[None, 'balanced'],
)
estimator = SVC(probability=True, random_state=21)

# `fit` returns the fitted search object, so building and fitting are chained.
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=params,
    scoring='accuracy',
    n_jobs=-1,
).fit(X_train, y_train)

# One row per parameter combination, with per-fold and aggregated scores.
results = pd.DataFrame(grid_search.cv_results_)

In [5]:
# Show the ten best-ranked SVM combinations (rank 1 first).
results.sort_values(by='rank_test_score').head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
64,0.967703,0.092776,0.028648,0.010347,10,,auto,rbf,"{'C': 10, 'class_weight': None, 'gamma': 'auto...",0.9,0.848148,0.885185,0.884758,0.862454,0.876109,0.018419,1
70,0.831982,0.034128,0.022656,0.001647,10,balanced,auto,rbf,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.877778,0.851852,0.862963,0.873606,0.851301,0.8635,0.01087,2
52,0.874211,0.197651,0.028493,0.008753,5,,auto,rbf,"{'C': 5, 'class_weight': None, 'gamma': 'auto'...",0.825926,0.811111,0.818519,0.821561,0.802974,0.816018,0.008116,3
58,0.938389,0.177729,0.029007,0.009963,5,balanced,auto,rbf,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.844444,0.785185,0.792593,0.817844,0.802974,0.808608,0.021007,4
69,62.558534,8.652536,0.011739,0.00279,10,balanced,auto,linear,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.72963,0.7,0.755556,0.754647,0.665428,0.721052,0.034438,5
66,76.760747,5.743802,0.014662,0.001451,10,balanced,scale,linear,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.72963,0.7,0.755556,0.754647,0.665428,0.721052,0.034438,5
63,65.72623,10.049474,0.014864,0.000154,10,,auto,linear,"{'C': 10, 'class_weight': None, 'gamma': 'auto...",0.737037,0.711111,0.707407,0.743494,0.698885,0.719587,0.017463,7
60,64.486893,8.618637,0.014797,0.000208,10,,scale,linear,"{'C': 10, 'class_weight': None, 'gamma': 'scal...",0.737037,0.711111,0.707407,0.743494,0.698885,0.719587,0.017463,7
57,46.545499,2.545537,0.016743,0.001155,5,balanced,auto,linear,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.725926,0.692593,0.696296,0.754647,0.66171,0.706234,0.031619,9
54,48.197806,2.654716,0.019264,0.003111,5,balanced,scale,linear,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.725926,0.692593,0.696296,0.754647,0.66171,0.706234,0.031619,9


## 3. Decision tree

1. Using `GridSearchCV` try different parameters of `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use `random_state=21`.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [6]:
# Search space for the decision tree: depths 1..49 crossed with class
# weighting and split criterion.
params = dict(
    max_depth=np.arange(1, 50),
    class_weight=[None, 'balanced'],
    criterion=['gini', 'entropy'],
)
estimator = DecisionTreeClassifier(random_state=21)

# `fit` returns the fitted search object, so building and fitting are chained.
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=params,
    scoring='accuracy',
    n_jobs=-1,
).fit(X_train, y_train)

# One row per parameter combination, with per-fold and aggregated scores.
results = pd.DataFrame(grid_search.cv_results_)

In [7]:
# Show the ten best-ranked decision-tree combinations (rank 1 first).
results.sort_values(by='rank_test_score').head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
118,0.012226,0.003154,0.005139,0.002727,balanced,gini,21,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.859259,0.903704,0.884758,0.832714,0.873865,0.025066,1
122,0.011917,0.003928,0.003123,0.001641,balanced,gini,25,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.874074,0.903704,0.873606,0.828996,0.873854,0.025018,2
119,0.012446,0.005029,0.002159,4.7e-05,balanced,gini,22,"{'class_weight': 'balanced', 'criterion': 'gin...",0.885185,0.862963,0.903704,0.881041,0.828996,0.872378,0.025263,3
134,0.009349,0.001675,0.002422,0.000299,balanced,gini,37,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.866667,0.903704,0.873606,0.828996,0.872372,0.025179,4
141,0.008062,0.000416,0.004549,0.001939,balanced,gini,44,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.866667,0.903704,0.873606,0.828996,0.872372,0.025179,4
140,0.009223,0.002888,0.002213,5.6e-05,balanced,gini,43,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.866667,0.903704,0.873606,0.828996,0.872372,0.025179,4
139,0.007385,0.000208,0.002892,0.001259,balanced,gini,42,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.866667,0.903704,0.873606,0.828996,0.872372,0.025179,4
138,0.007392,0.000266,0.002208,0.000101,balanced,gini,41,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.866667,0.903704,0.873606,0.828996,0.872372,0.025179,4
137,0.011549,0.002771,0.002213,8.7e-05,balanced,gini,40,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.866667,0.903704,0.873606,0.828996,0.872372,0.025179,4
136,0.013856,0.006831,0.004437,0.001891,balanced,gini,39,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.866667,0.903704,0.873606,0.828996,0.872372,0.025179,4


## 4. Random forest

1. Using `GridSearchCV` try different parameters of `n_estimators` (`5`, `10`, `50`, `100`), `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use `random_state=21`.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [8]:
# Search space for the random forest: forest size crossed with depths 1..49,
# class weighting and split criterion.
params = dict(
    n_estimators=[5, 10, 50, 100],
    max_depth=np.arange(1, 50),
    class_weight=[None, 'balanced'],
    criterion=['gini', 'entropy'],
)
estimator = RandomForestClassifier(random_state=21)

# `fit` returns the fitted search object, so building and fitting are chained.
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=params,
    scoring='accuracy',
    n_jobs=-1,
).fit(X_train, y_train)

# One row per parameter combination, with per-fold and aggregated scores.
results = pd.DataFrame(grid_search.cv_results_)

In [9]:
# Show the ten best-ranked random-forest combinations (rank 1 first).
results.sort_values(by='rank_test_score').head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
683,0.26897,0.002483,0.013177,0.000198,balanced,entropy,24,100,"{'class_weight': 'balanced', 'criterion': 'ent...",0.922222,0.9,0.903704,0.910781,0.884758,0.904293,0.012361,1
110,0.128071,0.007363,0.007562,0.000264,,gini,28,50,"{'class_weight': None, 'criterion': 'gini', 'm...",0.922222,0.9,0.907407,0.903346,0.888476,0.90429,0.010961,2
703,0.300829,0.02261,0.015573,0.002505,balanced,entropy,29,100,"{'class_weight': 'balanced', 'criterion': 'ent...",0.922222,0.9,0.907407,0.907063,0.884758,0.90429,0.012156,2
510,0.125276,0.001583,0.007576,0.000127,balanced,gini,30,50,"{'class_weight': 'balanced', 'criterion': 'gin...",0.922222,0.903704,0.9,0.907063,0.884758,0.903549,0.012056,4
123,0.241198,0.005638,0.012985,0.000723,,gini,31,100,"{'class_weight': None, 'criterion': 'gini', 'm...",0.918519,0.911111,0.9,0.910781,0.877323,0.903547,0.01438,5
687,0.311577,0.025658,0.013134,0.000477,balanced,entropy,25,100,"{'class_weight': 'balanced', 'criterion': 'ent...",0.922222,0.9,0.9,0.910781,0.881041,0.902809,0.013639,6
522,0.125298,0.001934,0.007802,0.000519,balanced,gini,33,50,"{'class_weight': 'balanced', 'criterion': 'gin...",0.925926,0.896296,0.9,0.907063,0.884758,0.902809,0.013628,7
171,0.240414,0.002409,0.012773,0.000232,,gini,43,100,"{'class_weight': None, 'criterion': 'gini', 'm...",0.914815,0.911111,0.9,0.903346,0.884758,0.902806,0.01046,8
151,0.250847,0.008243,0.012896,0.000254,,gini,38,100,"{'class_weight': None, 'criterion': 'gini', 'm...",0.914815,0.911111,0.9,0.903346,0.884758,0.902806,0.01046,8
155,0.254474,0.01093,0.014329,0.001947,,gini,39,100,"{'class_weight': None, 'criterion': 'gini', 'm...",0.914815,0.911111,0.9,0.903346,0.884758,0.902806,0.01046,8


## 5. Progress bar

Gridsearch can be quite a long process and you may find yourself wondering when it will end.
1. Create a manual gridsearch for the same parameters values of random forest iterating through the list of the possible values and calculating `cross_val_score` for each combination. Try to increase `n_jobs`. The value `cv` for `cross_val_score` is 5.
2. Track the progress using the library `tqdm.notebook`.
3. Create a dataframe from the results of the gridsearch with the columns corresponding to the names of the parameters and `mean_accuracy` and `std_accuracy`.
4. Sort it descendingly by the `mean_accuracy`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [10]:
# Manual grid search over the same random-forest search space as the
# GridSearchCV run. `random_state` is a single-value entry so that it is
# passed to (and recorded with) every combination.
params = {
    'n_estimators': [5, 10, 50, 100],
    'max_depth': np.arange(1, 50),
    'class_weight': [None, 'balanced'],
    'criterion': ['gini', 'entropy'],
    'random_state': [21],
}

# Materialise the grid as a list so tqdm knows the total number of steps.
params_list = list(ParameterGrid(params))
data = []

for params_model in tqdm(params_list, desc='Random forest grid'):
    estimator = RandomForestClassifier(**params_model)
    # Pin the scorer explicitly: the results are labelled as accuracy and the
    # GridSearchCV cells use scoring='accuracy', so this must not depend on
    # the estimator's default scorer.
    cvs = cross_val_score(
        estimator,
        X_train,
        y_train,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )
    # One record per combination: its parameters plus the fold statistics.
    info = {**params_model, 'mean_accuracy': cvs.mean(), 'std_accuracy': cvs.std()}
    data.append(info)

100%|██████████| 784/784 [01:35<00:00,  8.20it/s]


In [11]:
# Tabulate the manual search records and show the ten combinations with the
# highest mean cross-validated accuracy.
results = pd.DataFrame(data)
results.sort_values(by='mean_accuracy', ascending=False).head(10)

Unnamed: 0,class_weight,criterion,max_depth,n_estimators,random_state,mean_accuracy,std_accuracy
683,balanced,entropy,24,100,21,0.904293,0.012361
703,balanced,entropy,29,100,21,0.90429,0.012156
110,,gini,28,50,21,0.90429,0.010961
510,balanced,gini,30,50,21,0.903549,0.012056
123,,gini,31,100,21,0.903547,0.01438
687,balanced,entropy,25,100,21,0.902809,0.013639
522,balanced,gini,33,50,21,0.902809,0.013628
114,,gini,29,50,21,0.902806,0.011698
171,,gini,43,100,21,0.902806,0.01046
167,,gini,42,100,21,0.902806,0.01046


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.

In [12]:
# Hyperparameters of the top-ranked random forest from the searches above.
best_params = {
    'n_estimators': 100,
    'max_depth': 24,
    'criterion': 'entropy',
    'class_weight': 'balanced',
}

# Refit on the full training split, then report accuracy on the held-out test set.
forest = RandomForestClassifier(random_state=21, **best_params).fit(X_train, y_train)
forest.score(X_test, y_test)

0.9260355029585798