# Day 09. Exercise 00
# Regularization

## 0. Imports

In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

## 1. Preprocessing

1. Read the file `dayofweek.csv` that you used in the previous day to a dataframe.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [4]:
df = pd.read_csv("../data/dayofweek.csv", index_col=0)
df.head()

Unnamed: 0,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,uid_user_16,uid_user_17,...,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,numTrials,hour,dayofweek
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.788667,-2.562352,4
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.756764,-2.562352,4
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.724861,-2.562352,4
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.692958,-2.562352,4
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.661055,-2.562352,4


In [113]:
X = df.drop(columns=['dayofweek'])
y = df['dayofweek']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=21, stratify=y, test_size=0.2)

## 2. Logreg regularization

### a. Default regularization

1. Train a baseline model with the only parameters `random_state=21`, `fit_intercept=False`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model


The result of the code where you trained and evaluated the baseline model should be exactly like this (use `%%time` to get the info about how long it took to run the cell):

```
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63674   |   valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
```

In [114]:
def cv(model, X, y):

    accuracy = []
    kf = KFold(n_splits=10)
    for train_ind, test_ind in kf.split(X):
        X_train, X_val = X.iloc[train_ind], X.iloc[test_ind]
        y_train, y_val = y.iloc[train_ind], y.iloc[test_ind]

        model.fit(X_train, y_train)
        y_val_pred = model.predict(X_val)
        y_train_pred = model.predict(X_train)
        train_acc = accuracy_score(y_train, y_train_pred)
        val_acc = accuracy_score(y_val, y_val_pred)
        accuracy.append(val_acc)

        print(f"train -  {train_acc}  |   valid -  {val_acc}")
    print(f"Average accuracy on crossval is {np.mean(accuracy)}")
    print(f"Std is {np.std(accuracy)}")


In [115]:
%%time
logreg = LogisticRegression(random_state=21, fit_intercept=False)
cv(logreg, X_train, y_train)

train -  0.6215993404781534  |   valid -  0.6222222222222222
train -  0.6504534212695795  |   valid -  0.6222222222222222
train -  0.6446826051112943  |   valid -  0.5555555555555556
train -  0.651277823577906  |   valid -  0.5851851851851851
train -  0.6306677658697445  |   valid -  0.6148148148148148
train -  0.6496290189612531  |   valid -  0.6370370370370371
train -  0.6232481450948063  |   valid -  0.6148148148148148
train -  0.6628194558944766  |   valid -  0.5333333333333333
train -  0.6293245469522241  |   valid -  0.6492537313432836
train -  0.6507413509060955  |   valid -  0.5895522388059702
Average accuracy on crossval is 0.6023991155334439
Std is 0.0345182762397802
CPU times: user 216 ms, sys: 3.96 ms, total: 220 ms
Wall time: 219 ms


### b. Optimizing regularization parameters

1. In the cells below try different values of penalty: `none`, `l1`, `l2` – you can change the values of solver too.

In [116]:
%%time
logreg = LogisticRegression(random_state=21, fit_intercept=False, solver='saga', penalty='l1', max_iter=1000)
cv(logreg, X_train, y_train)

train -  0.6331409727947238  |   valid -  0.6148148148148148
train -  0.642209398186315  |   valid -  0.6148148148148148
train -  0.6471558120362737  |   valid -  0.5481481481481482
train -  0.6537510305028854  |   valid -  0.5777777777777777
train -  0.6339653751030503  |   valid -  0.5925925925925926
train -  0.6455070074196207  |   valid -  0.6444444444444445
train -  0.6224237427864798  |   valid -  0.6148148148148148
train -  0.6636438582028029  |   valid -  0.5333333333333333
train -  0.6243822075782537  |   valid -  0.6343283582089553
train -  0.6507413509060955  |   valid -  0.5895522388059702
Average accuracy on crossval is 0.5964621337755666
Std is 0.03387909031089976
CPU times: user 3.66 s, sys: 2.9 ms, total: 3.67 s
Wall time: 3.7 s


In [117]:
%%time
logreg = LogisticRegression(random_state=21, fit_intercept=False, penalty='l2')
cv(logreg, X_train, y_train)

train -  0.6215993404781534  |   valid -  0.6222222222222222
train -  0.6504534212695795  |   valid -  0.6222222222222222
train -  0.6446826051112943  |   valid -  0.5555555555555556
train -  0.651277823577906  |   valid -  0.5851851851851851
train -  0.6306677658697445  |   valid -  0.6148148148148148
train -  0.6496290189612531  |   valid -  0.6370370370370371
train -  0.6232481450948063  |   valid -  0.6148148148148148
train -  0.6628194558944766  |   valid -  0.5333333333333333
train -  0.6293245469522241  |   valid -  0.6492537313432836
train -  0.6507413509060955  |   valid -  0.5895522388059702
Average accuracy on crossval is 0.6023991155334439
Std is 0.0345182762397802
CPU times: user 221 ms, sys: 1.99 ms, total: 223 ms
Wall time: 224 ms


In [118]:
%%time
logreg = LogisticRegression(random_state=21, fit_intercept=False, penalty=None, max_iter=1000)
cv(logreg, X_train, y_train)

train -  0.6578730420445177  |   valid -  0.6518518518518519
train -  0.6611706512778236  |   valid -  0.6444444444444445
train -  0.6735366859027205  |   valid -  0.5703703703703704
train -  0.6694146743610883  |   valid -  0.6222222222222222
train -  0.6570486397361912  |   valid -  0.6666666666666666
train -  0.6702390766694146  |   valid -  0.6370370370370371
train -  0.642209398186315  |   valid -  0.6296296296296297
train -  0.686727122835944  |   valid -  0.5777777777777777
train -  0.6499176276771005  |   valid -  0.6791044776119403
train -  0.6680395387149918  |   valid -  0.5671641791044776
Average accuracy on crossval is 0.6246268656716418
Std is 0.03807161455518351
CPU times: user 497 ms, sys: 3.97 ms, total: 501 ms
Wall time: 500 ms


## 3. SVM regularization

### a. Default regularization

1. Train a baseline model with the only parameters `probability=True`, `kernel='linear'`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [119]:
%%time
svc = SVC(probability=True, kernel='linear', random_state=21)
cv(svc, X_train, y_train)

train -  0.6957955482275351  |   valid -  0.6962962962962963
train -  0.7056883759274526  |   valid -  0.7037037037037037
train -  0.7065127782357791  |   valid -  0.6222222222222222
train -  0.7056883759274526  |   valid -  0.6370370370370371
train -  0.6941467436108821  |   valid -  0.6962962962962963
train -  0.6941467436108821  |   valid -  0.6518518518518519
train -  0.6916735366859027  |   valid -  0.725925925925926
train -  0.708161582852432  |   valid -  0.6
train -  0.6935749588138386  |   valid -  0.7014925373134329
train -  0.7182866556836903  |   valid -  0.582089552238806
Average accuracy on crossval is 0.6616915422885572
Std is 0.04719638707178252
CPU times: user 2.46 s, sys: 4.86 ms, total: 2.46 s
Wall time: 2.45 s


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `C`.

In [120]:
%%time
svc = SVC(probability=True, kernel='linear', random_state=21, C=2)
cv(svc, X_train, y_train)

train -  0.7007419620774938  |   valid -  0.7037037037037037
train -  0.718878812860676  |   valid -  0.7111111111111111
train -  0.7427864798021434  |   valid -  0.6518518518518519
train -  0.7155812036273702  |   valid -  0.6592592592592592
train -  0.7073371805441055  |   valid -  0.6962962962962963
train -  0.7197032151690025  |   valid -  0.6592592592592592
train -  0.7032151690024732  |   valid -  0.7037037037037037
train -  0.7106347897774113  |   valid -  0.6370370370370371
train -  0.7372322899505767  |   valid -  0.7089552238805971
train -  0.729818780889621  |   valid -  0.5970149253731343
Average accuracy on crossval is 0.6728192371475954
Std is 0.03614552085548642
CPU times: user 2.7 s, sys: 1.93 ms, total: 2.7 s
Wall time: 2.7 s


In [121]:
%%time
svc = SVC(probability=True, kernel='linear', random_state=21, C=3)
cv(svc, X_train, y_train)

train -  0.7114591920857378  |   valid -  0.674074074074074
train -  0.7502061005770816  |   valid -  0.762962962962963
train -  0.7592745259686727  |   valid -  0.6666666666666666
train -  0.731244847485573  |   valid -  0.6592592592592592
train -  0.7122835943940643  |   valid -  0.7037037037037037
train -  0.7254740313272877  |   valid -  0.6592592592592592
train -  0.720527617477329  |   valid -  0.7111111111111111
train -  0.7370156636438582  |   valid -  0.6592592592592592
train -  0.7528830313014827  |   valid -  0.7164179104477612
train -  0.7364085667215815  |   valid -  0.5895522388059702
Average accuracy on crossval is 0.6802266445550028
Std is 0.044071076015416655
CPU times: user 2.84 s, sys: 2.9 ms, total: 2.84 s
Wall time: 2.85 s


In [122]:
%%time
svc = SVC(probability=True, kernel='linear', random_state=21, C=6)
cv(svc, X_train, y_train)

train -  0.7180544105523495  |   valid -  0.6888888888888889
train -  0.7600989282769992  |   valid -  0.7555555555555555
train -  0.7807089859851608  |   valid -  0.7111111111111111
train -  0.7600989282769992  |   valid -  0.7037037037037037
train -  0.7246496290189612  |   valid -  0.7555555555555555
train -  0.7551525144270403  |   valid -  0.7111111111111111
train -  0.72959604286892  |   valid -  0.6888888888888889
train -  0.7551525144270403  |   valid -  0.674074074074074
train -  0.7742998352553542  |   valid -  0.7388059701492538
train -  0.7611202635914333  |   valid -  0.6194029850746269
Average accuracy on crossval is 0.704709784411277
Std is 0.03890051306217774
CPU times: user 3.28 s, sys: 2.87 ms, total: 3.28 s
Wall time: 3.28 s


## 4. Tree

### a. Default regularization

1. Train a baseline model with the only parameter `max_depth=10` and `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [123]:
%%time
tree = DecisionTreeClassifier(max_depth=10, random_state=21)
cv(tree, X_train, y_train)

train -  0.807914262159934  |   valid -  0.762962962962963
train -  0.7848309975267931  |   valid -  0.7481481481481481
train -  0.8153338829348722  |   valid -  0.7111111111111111
train -  0.7897774113767518  |   valid -  0.7333333333333333
train -  0.8120362737015664  |   valid -  0.762962962962963
train -  0.8046166529266282  |   valid -  0.6962962962962963
train -  0.7930750206100577  |   valid -  0.762962962962963
train -  0.7922506183017313  |   valid -  0.6296296296296297
train -  0.8047775947281713  |   valid -  0.7985074626865671
train -  0.8080724876441515  |   valid -  0.6567164179104478
Average accuracy on crossval is 0.7262631288004423
Std is 0.05009110896487229
CPU times: user 76.2 ms, sys: 1 ms, total: 77.2 ms
Wall time: 77.8 ms


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `max_depth`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [124]:
%%time
tree = DecisionTreeClassifier(max_depth=15, random_state=21)
cv(tree, X_train, y_train)

train -  0.957131079967024  |   valid -  0.837037037037037
train -  0.9274525968672712  |   valid -  0.837037037037037
train -  0.9439406430338005  |   valid -  0.8592592592592593
train -  0.9414674361088211  |   valid -  0.8296296296296296
train -  0.9497114591920858  |   valid -  0.8962962962962963
train -  0.9530090684253916  |   valid -  0.8518518518518519
train -  0.9406430338004946  |   valid -  0.8740740740740741
train -  0.9521846661170651  |   valid -  0.8592592592592593
train -  0.9530477759472817  |   valid -  0.8582089552238806
train -  0.943986820428336  |   valid -  0.8582089552238806
Average accuracy on crossval is 0.8560862354892207
Std is 0.01847480794659924
CPU times: user 82.7 ms, sys: 1.01 ms, total: 83.7 ms
Wall time: 82.9 ms


In [125]:
%%time
tree = DecisionTreeClassifier(max_depth=20, random_state=21)
cv(tree, X_train, y_train)

train -  0.989282769991756  |   valid -  0.8666666666666667
train -  0.9859851607584501  |   valid -  0.8888888888888888
train -  0.9868095630667766  |   valid -  0.8444444444444444
train -  0.9851607584501236  |   valid -  0.8740740740740741
train -  0.9925803792250618  |   valid -  0.8962962962962963
train -  0.9884583676834295  |   valid -  0.9259259259259259
train -  0.9859851607584501  |   valid -  0.9037037037037037
train -  0.9917559769167353  |   valid -  0.8962962962962963
train -  0.9917627677100495  |   valid -  0.8955223880597015
train -  0.9876441515650741  |   valid -  0.8805970149253731
Average accuracy on crossval is 0.8872415699281371
Std is 0.021154660591549572
CPU times: user 101 ms, sys: 1.01 ms, total: 103 ms
Wall time: 114 ms


In [126]:
%%time
tree = DecisionTreeClassifier(max_depth=25, random_state=21)
cv(tree, X_train, y_train)

train -  1.0  |   valid -  0.8666666666666667
train -  1.0  |   valid -  0.8814814814814815
train -  0.998351195383347  |   valid -  0.8592592592592593
train -  0.9975267930750206  |   valid -  0.8518518518518519
train -  0.9991755976916735  |   valid -  0.9037037037037037
train -  0.9991755976916735  |   valid -  0.9037037037037037
train -  0.998351195383347  |   valid -  0.9037037037037037
train -  1.0  |   valid -  0.8814814814814815
train -  1.0  |   valid -  0.8955223880597015
train -  1.0  |   valid -  0.8805970149253731
Average accuracy on crossval is 0.8827971254836926
Std is 0.018036968922696572
CPU times: user 82.5 ms, sys: 27 μs, total: 82.5 ms
Wall time: 81.6 ms


In [127]:
%%time
tree = DecisionTreeClassifier(max_depth=20, random_state=21, min_samples_split=6)
cv(tree, X_train, y_train)

train -  0.964550700741962  |   valid -  0.837037037037037
train -  0.9670239076669415  |   valid -  0.8666666666666667
train -  0.964550700741962  |   valid -  0.8518518518518519
train -  0.9653751030502885  |   valid -  0.8666666666666667
train -  0.9629018961253092  |   valid -  0.8740740740740741
train -  0.9612530915086562  |   valid -  0.8888888888888888
train -  0.9587798845836768  |   valid -  0.8962962962962963
train -  0.9629018961253092  |   valid -  0.8888888888888888
train -  0.9678747940691927  |   valid -  0.8805970149253731
train -  0.9662273476112027  |   valid -  0.8731343283582089
Average accuracy on crossval is 0.8724101713653953
Std is 0.01709923809876975
CPU times: user 76.3 ms, sys: 2.01 ms, total: 78.3 ms
Wall time: 77.9 ms


## 5. Random forest

### a. Default regularization

1. Train a baseline model with the only parameters `n_estimators=50`, `max_depth=14`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [128]:
%%time
rf = RandomForestClassifier(n_estimators=50, max_depth=14, random_state=21)
cv(rf, X_train, y_train)

train -  0.9694971145919209  |   valid -  0.8962962962962963
train -  0.9637262984336357  |   valid -  0.9259259259259259
train -  0.966199505358615  |   valid -  0.8888888888888888
train -  0.9711459192085737  |   valid -  0.8888888888888888
train -  0.9670239076669415  |   valid -  0.8888888888888888
train -  0.9703215169002474  |   valid -  0.8666666666666667
train -  0.9727947238252267  |   valid -  0.9037037037037037
train -  0.9736191261335532  |   valid -  0.8814814814814815
train -  0.9736408566721582  |   valid -  0.917910447761194
train -  0.9736408566721582  |   valid -  0.835820895522388
Average accuracy on crossval is 0.8894472084024324
Std is 0.024187048130158796
CPU times: user 850 ms, sys: 3.07 ms, total: 853 ms
Wall time: 849 ms


### b. Optimizing regularization parameters

1. In the new cells try different values of the parameters `max_depth` and `n_estimators`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [129]:
%%time
rf = RandomForestClassifier(n_estimators=50, max_depth=16, random_state=21)
cv(rf, X_train, y_train)

train -  0.9851607584501236  |   valid -  0.8962962962962963
train -  0.9868095630667766  |   valid -  0.9407407407407408
train -  0.989282769991756  |   valid -  0.8888888888888888
train -  0.9835119538334708  |   valid -  0.8962962962962963
train -  0.9868095630667766  |   valid -  0.9259259259259259
train -  0.9934047815333883  |   valid -  0.8814814814814815
train -  0.9859851607584501  |   valid -  0.9111111111111111
train -  0.989282769991756  |   valid -  0.8814814814814815
train -  0.985172981878089  |   valid -  0.917910447761194
train -  0.9802306425041186  |   valid -  0.8507462686567164
Average accuracy on crossval is 0.8990878938640133
Std is 0.024557230878437038
CPU times: user 863 ms, sys: 5.99 ms, total: 869 ms
Wall time: 868 ms


In [130]:
%%time
rf = RandomForestClassifier(n_estimators=50, max_depth=25, random_state=21)
cv(rf, X_train, y_train)

train -  1.0  |   valid -  0.9037037037037037
train -  1.0  |   valid -  0.9333333333333333
train -  0.998351195383347  |   valid -  0.9037037037037037
train -  1.0  |   valid -  0.8962962962962963
train -  0.9975267930750206  |   valid -  0.9259259259259259
train -  0.9991755976916735  |   valid -  0.9037037037037037
train -  1.0  |   valid -  0.9111111111111111
train -  1.0  |   valid -  0.9037037037037037
train -  1.0  |   valid -  0.9328358208955224
train -  0.999176276771005  |   valid -  0.8731343283582089
Average accuracy on crossval is 0.9087451630735213
Std is 0.01735075017628308
CPU times: user 970 ms, sys: 11.9 ms, total: 982 ms
Wall time: 986 ms


In [131]:
%%time
rf = RandomForestClassifier(n_estimators=100, max_depth=25, random_state=21)
cv(rf, X_train, y_train)

train -  1.0  |   valid -  0.9037037037037037
train -  1.0  |   valid -  0.9259259259259259
train -  0.9991755976916735  |   valid -  0.9037037037037037
train -  0.9991755976916735  |   valid -  0.9037037037037037
train -  0.998351195383347  |   valid -  0.9259259259259259
train -  0.998351195383347  |   valid -  0.9037037037037037
train -  0.9991755976916735  |   valid -  0.9111111111111111
train -  1.0  |   valid -  0.9111111111111111
train -  1.0  |   valid -  0.9253731343283582
train -  1.0  |   valid -  0.8880597014925373
Average accuracy on crossval is 0.9102321724709783
Std is 0.011770657051809642
CPU times: user 2.06 s, sys: 9.94 ms, total: 2.07 s
Wall time: 2.08 s


In [132]:
%%time
rf = RandomForestClassifier(n_estimators=150, max_depth=14, random_state=21)
cv(rf, X_train, y_train)

train -  0.964550700741962  |   valid -  0.8888888888888888
train -  0.964550700741962  |   valid -  0.9185185185185185
train -  0.9744435284418796  |   valid -  0.8888888888888888
train -  0.9769167353668591  |   valid -  0.8814814814814815
train -  0.9678483099752679  |   valid -  0.9037037037037037
train -  0.9694971145919209  |   valid -  0.8592592592592593
train -  0.9752679307502061  |   valid -  0.8962962962962963
train -  0.9744435284418796  |   valid -  0.8740740740740741
train -  0.9728171334431631  |   valid -  0.9029850746268657
train -  0.971169686985173  |   valid -  0.8507462686567164
Average accuracy on crossval is 0.8864842454394694
Std is 0.019749090050729168
CPU times: user 2.67 s, sys: 7.92 ms, total: 2.68 s
Wall time: 2.68 s


In [133]:
%%time
rf = RandomForestClassifier(n_estimators=25, max_depth=14, random_state=21)
cv(rf, X_train, y_train)

train -  0.9530090684253916  |   valid -  0.8814814814814815
train -  0.9488870568837593  |   valid -  0.9037037037037037
train -  0.9431162407254741  |   valid -  0.8666666666666667
train -  0.9604286892003298  |   valid -  0.8592592592592593
train -  0.9629018961253092  |   valid -  0.9037037037037037
train -  0.9563066776586975  |   valid -  0.837037037037037
train -  0.9612530915086562  |   valid -  0.8962962962962963
train -  0.9703215169002474  |   valid -  0.8666666666666667
train -  0.9728171334431631  |   valid -  0.9104477611940298
train -  0.9637561779242174  |   valid -  0.835820895522388
Average accuracy on crossval is 0.8761083471531232
Std is 0.025981637906136886
CPU times: user 489 ms, sys: 4 ms, total: 493 ms
Wall time: 493 ms


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.
3. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your test dataset).
4. Save the model.

In [134]:
best_model = RandomForestClassifier(n_estimators=100, max_depth=25, random_state=21).fit(X_train, y_train)
y_pred = best_model.predict(X_test)
accuracy_score(y_test, y_pred)

0.9319526627218935

In [135]:
dict_ = {}


for i in range(len(y_test)):
    if y_test.iloc[i] != y_pred[i]:
        dict_[y_test.iloc[i]] = dict_.get(y_test.iloc[i], 0) + 1
dict_

{np.int64(1): 5,
 np.int64(5): 3,
 np.int64(4): 3,
 np.int64(3): 2,
 np.int64(0): 7,
 np.int64(2): 2,
 np.int64(6): 1}

In [136]:
sorted_errors = sorted(dict_.items(), key= lambda item: item[1], reverse=True)

In [137]:
y_test.value_counts()

dayofweek
3    80
6    71
1    55
5    54
2    30
0    27
4    21
Name: count, dtype: int64

In [141]:
print(f"most errors - weekday {int(sorted_errors[0][0])}, percetage = {sorted_errors[0][1]/y_test.value_counts()[0]}")

most errors - weekday 0, percetage = 0.25925925925925924


In [142]:
from joblib import dump
dump(best_model, 'model.joblib', compress=9)

['model.joblib']