# Day 09. Exercise 00
# Regularization

## 0. Imports

In [39]:
import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## 1. Preprocessing

1. Read the file `dayofweek.csv` that you used on the previous day into a dataframe.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [4]:
# Load the day-of-week dataset; the path is relative to the notebook's working directory.
data_path = '../data/dayofweek.csv'
df = pd.read_csv(data_path)

In [5]:
# Separate the feature matrix from the `dayofweek` target column.
target_column = 'dayofweek'
X = df.drop(columns=target_column)
y = df[target_column]

In [6]:
# Hold out 20% of the rows for the final test; stratifying on `y` and fixing
# the seed are the settings requested by the exercise.
split_options = {'test_size': 0.2, 'random_state': 21, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## 2. Logreg regularization

### a. Default regularization

1. Train a baseline model with only the parameters `random_state=21` and `fit_intercept=False`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.


The result of the code where you trained and evaluated the baseline model should be exactly like this (use `%%time` to get the info about how long it took to run the cell):

```
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63674   |   valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
```

In [12]:
!%%time
logreg_baseline = LogisticRegression(random_state=21, fit_intercept=False, max_iter=1000)

skf = StratifiedKFold(n_splits=10)

train_scores = []
valid_scores = []

for train_index, valid_index in skf.split(X_train, y_train):
    X_train_fold, X_valid_fold = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_fold, y_valid_fold = y_train.iloc[train_index], y_train.iloc[valid_index]
    
    logreg_baseline.fit(X_train_fold, y_train_fold)
    
    t_score = logreg_baseline.score(X_train_fold, y_train_fold)
    v_score = logreg_baseline.score(X_valid_fold, y_valid_fold)
    
    train_scores.append(t_score)
    valid_scores.append(v_score)
    
    print(f"train -  {t_score:.5f}   |   valid -  {v_score:.5f}")

print(f"Average accuracy on crossval is {np.mean(valid_scores):.5f}")
print(f"Std is {np.std(valid_scores):.5f}")

/bin/bash: line 1: fg: no job control
train -  0.62819   |   valid -  0.59259
train -  0.64716   |   valid -  0.62963
train -  0.63479   |   valid -  0.57037
train -  0.65540   |   valid -  0.61481
train -  0.63314   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64221   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63591   |   valid -  0.62687
Average accuracy on crossval is 0.60239
Std is 0.02852


### b. Optimizing regularization parameters

1. In the cells below try different values of penalty: `none`, `l1`, `l2` – you can change the values of solver too.

In [25]:
def test_logreg_final(penalty_type):
    """Print the mean 10-fold CV accuracy of a saga logistic regression.

    Parameters
    ----------
    penalty_type : str
        Penalty to evaluate. The string 'none' is translated to
        ``penalty=None``; any other value is passed to
        ``LogisticRegression`` unchanged.

    Returns
    -------
    None
        The average validation accuracy is only printed.

    Notes
    -----
    Reads the notebook-level ``X_train`` and ``y_train``. ConvergenceWarning,
    FutureWarning and UserWarning are silenced while the folds are fitted.
    """
    penalty = penalty_type if penalty_type != 'none' else None

    # NOTE(review): tol=0.1 is a very loose stopping tolerance - presumably
    # chosen to keep saga fast; confirm it is intended.
    model = LogisticRegression(
        penalty=penalty,
        solver='saga',
        tol=0.1,
        max_iter=1000,
        fit_intercept=False,
        random_state=21,
    )
    skf = StratifiedKFold(n_splits=10)
    valid_scores = []

    with warnings.catch_warnings():
        for category in (ConvergenceWarning, FutureWarning, UserWarning):
            warnings.simplefilter("ignore", category=category)

        for fit_rows, holdout_rows in skf.split(X_train, y_train):
            # The same estimator object is refit on each fold's training part.
            model.fit(X_train.iloc[fit_rows], y_train.iloc[fit_rows])
            fold_accuracy = model.score(X_train.iloc[holdout_rows], y_train.iloc[holdout_rows])
            valid_scores.append(fold_accuracy)

    print(f"Penalty: {penalty_type:4} | Average accuracy: {np.mean(valid_scores):.5f}")


In [26]:
# Compare the three penalty settings with the helper defined above.
for penalty_type in ('none', 'l1', 'l2'):
    test_logreg_final(penalty_type)

Penalty: none | Average accuracy: 0.59940
Penalty: l1   | Average accuracy: 0.58828
Penalty: l2   | Average accuracy: 0.58753


## 3. SVM regularization

### a. Default regularization

1. Train a baseline model with only the parameters `probability=True`, `kernel='linear'`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [27]:
!%%time
svm_baseline = SVC(kernel='linear', probability=True, random_state=21)

skf = StratifiedKFold(n_splits=10)
valid_scores = []

for train_index, valid_index in skf.split(X_train, y_train):
    X_tr, X_val = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]
    
    svm_baseline.fit(X_tr, y_tr)
    
    t_score = svm_baseline.score(X_tr, y_tr)
    v_score = svm_baseline.score(X_val, y_val)
    valid_scores.append(v_score)
    
    print(f"train -  {t_score:.5f}   |   valid -  {v_score:.5f}")

print(f"Average accuracy on crossval is {np.mean(valid_scores):.5f}")
print(f"Std is {np.std(valid_scores):.5f}")

/bin/bash: line 1: fg: no job control
train -  0.70486   |   valid -  0.65926
train -  0.69662   |   valid -  0.75556
train -  0.69415   |   valid -  0.62222
train -  0.70239   |   valid -  0.65185
train -  0.69085   |   valid -  0.65185
train -  0.68920   |   valid -  0.64444
train -  0.69250   |   valid -  0.72593
train -  0.70074   |   valid -  0.62222
train -  0.69605   |   valid -  0.61940
train -  0.71087   |   valid -  0.63433
Average accuracy on crossval is 0.65871
Std is 0.04359


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `C`.

In [28]:
# Sweep the SVM regularization parameter C and report the mean 10-fold
# validation accuracy for each value.
c_values = [0.1, 0.5, 1, 5, 10]

for c in c_values:
    model = SVC(kernel='linear', C=c, random_state=21)
    skf = StratifiedKFold(n_splits=10)

    # One validation accuracy per fold; the estimator is refit on each
    # fold's training part before being scored on the held-out part.
    valid_scores = [
        model.fit(X_train.iloc[fit_rows], y_train.iloc[fit_rows]).score(
            X_train.iloc[holdout_rows], y_train.iloc[holdout_rows]
        )
        for fit_rows, holdout_rows in skf.split(X_train, y_train)
    ]

    print(f"C = {c:4} | Average accuracy: {np.mean(valid_scores):.5f}")

C =  0.1 | Average accuracy: 0.56230
C =  0.5 | Average accuracy: 0.63349
C =    1 | Average accuracy: 0.65871
C =    5 | Average accuracy: 0.69952
C =   10 | Average accuracy: 0.72771


## 4. Tree

### a. Default regularization

1. Train a baseline model with only the parameters `max_depth=10` and `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [29]:
%%time
tree_baseline = DecisionTreeClassifier(max_depth=10, random_state=21)

skf = StratifiedKFold(n_splits=10)
valid_scores = []

for train_index, valid_index in skf.split(X_train, y_train):
    X_tr, X_val = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]
    
    tree_baseline.fit(X_tr, y_tr)
    
    t_score = tree_baseline.score(X_tr, y_tr)
    v_score = tree_baseline.score(X_val, y_val)
    valid_scores.append(v_score)
    
    print(f"train -  {t_score:.5f}   |   valid -  {v_score:.5f}")

print(f"Average accuracy on crossval is {np.mean(valid_scores):.5f}")
print(f"Std is {np.std(valid_scores):.5f}")

train -  0.81039   |   valid -  0.74074
train -  0.77741   |   valid -  0.74074
train -  0.83347   |   valid -  0.70370
train -  0.79720   |   valid -  0.76296
train -  0.82440   |   valid -  0.75556
train -  0.80379   |   valid -  0.68889
train -  0.80709   |   valid -  0.76296
train -  0.80132   |   valid -  0.65926
train -  0.80807   |   valid -  0.75373
train -  0.80478   |   valid -  0.68657
Average accuracy on crossval is 0.72551
Std is 0.03562
CPU times: user 53.1 ms, sys: 2 μs, total: 53.1 ms
Wall time: 52.3 ms


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `max_depth`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [30]:
# Sweep the tree depth limit and report mean train / validation accuracy over
# 10 stratified folds, so the gap between the two is visible per depth.
depths = [3, 5, 7, 10, 15, 20]

for d in depths:
    model = DecisionTreeClassifier(max_depth=d, random_state=21)
    skf = StratifiedKFold(n_splits=10)
    train_scores, valid_scores = [], []

    for fit_rows, holdout_rows in skf.split(X_train, y_train):
        X_fit, y_fit = X_train.iloc[fit_rows], y_train.iloc[fit_rows]
        X_holdout, y_holdout = X_train.iloc[holdout_rows], y_train.iloc[holdout_rows]

        # The same estimator object is refit on every fold.
        model.fit(X_fit, y_fit)
        train_scores.append(model.score(X_fit, y_fit))
        valid_scores.append(model.score(X_holdout, y_holdout))

    print(f"Max Depth: {d:2} | Train: {np.mean(train_scores):.5f} | Valid: {np.mean(valid_scores):.5f}")

Max Depth:  3 | Train: 0.49069 | Valid: 0.46140
Max Depth:  5 | Train: 0.58984 | Valid: 0.54301
Max Depth:  7 | Train: 0.69214 | Valid: 0.64989
Max Depth: 10 | Train: 0.80679 | Valid: 0.72551
Max Depth: 15 | Train: 0.94972 | Valid: 0.85459
Max Depth: 20 | Train: 0.98838 | Valid: 0.88649


## 5. Random forest

### a. Default regularization

1. Train a baseline model with only the parameters `n_estimators=50`, `max_depth=14`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [31]:
%%time
rf_baseline = RandomForestClassifier(n_estimators=50, max_depth=14, random_state=21)

skf = StratifiedKFold(n_splits=10)
valid_scores = []

for train_index, valid_index in skf.split(X_train, y_train):
    X_tr, X_val = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]
    
    rf_baseline.fit(X_tr, y_tr)
    
    t_score = rf_baseline.score(X_tr, y_tr)
    v_score = rf_baseline.score(X_val, y_val)
    valid_scores.append(v_score)
    
    print(f"train -  {t_score:.5f}   |   valid -  {v_score:.5f}")

print(f"Average accuracy on crossval is {np.mean(valid_scores):.5f}")
print(f"Std is {np.std(valid_scores):.5f}")

train -  0.96455   |   valid -  0.88148
train -  0.96208   |   valid -  0.91852
train -  0.96785   |   valid -  0.86667
train -  0.96455   |   valid -  0.89630
train -  0.96538   |   valid -  0.91111
train -  0.96538   |   valid -  0.88148
train -  0.97115   |   valid -  0.91852
train -  0.96867   |   valid -  0.85185
train -  0.97364   |   valid -  0.88060
train -  0.97941   |   valid -  0.86567
Average accuracy on crossval is 0.88722
Std is 0.02204
CPU times: user 811 ms, sys: 17 ms, total: 828 ms
Wall time: 828 ms


### b. Optimizing regularization parameters

1. In the new cells try different values of the parameters `max_depth` and `n_estimators`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [32]:
# Small grid over forest size and depth; each combination is scored with
# 10-fold stratified cross-validation on the training split.
n_trees = [50, 100]
depths = [15, 20]

# Build the splitter here rather than relying on the `skf` left behind by an
# earlier cell, so this cell does not depend on hidden kernel state.
skf = StratifiedKFold(n_splits=10)

for n in n_trees:
    for d in depths:
        model = RandomForestClassifier(n_estimators=n, max_depth=d, random_state=21)
        # `cross_val_score` comes from the notebook's import cell.
        scores = cross_val_score(model, X_train, y_train, cv=skf)
        print(f"Trees: {n:3} | Depth: {d:2} | Valid Accuracy: {scores.mean():.5f}")

Trees:  50 | Depth: 15 | Valid Accuracy: 0.89018
Trees:  50 | Depth: 20 | Valid Accuracy: 0.90874
Trees: 100 | Depth: 15 | Valid Accuracy: 0.89612
Trees: 100 | Depth: 20 | Valid Accuracy: 0.91023


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.
3. Analyze: determine for which weekday your model makes the most errors (as a % of the total number of samples of that class in your test dataset).
4. Save the model.

In [33]:
# Refit the best configuration from the search above on the whole training split.
best_params = {'n_estimators': 100, 'max_depth': 20, 'random_state': 21}
best_model = RandomForestClassifier(**best_params)
best_model.fit(X_train, y_train)

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",100
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. Note: This parameter is tree-specific.",'gini'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",20
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=""sqrt"" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to `""sqrt""`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",'sqrt'
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [34]:
# Predict labels for the held-out test split with the selected model.
y_pred = best_model.predict(X_test)

In [35]:
# Share of test samples whose predicted label matches the true one.
final_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Final accuracy on the test set: {final_accuracy:.5f}")

Final accuracy on the test set: 0.93195


In [36]:
# Per-weekday error rate on the test split, expressed as a percentage of that
# weekday's samples.
analysis_df = pd.DataFrame({'actual': y_test, 'predicted': y_pred})

actual = analysis_df['actual']
mispredicted = analysis_df['actual'] != analysis_df['predicted']

error_rates = {}
for day in range(7):
    is_day = actual == day
    total_samples = int(is_day.sum())
    errors = int((is_day & mispredicted).sum())

    # A weekday with no test samples is reported as 0 instead of dividing by zero.
    error_rates[day] = (errors / total_samples) * 100 if total_samples > 0 else 0

In [37]:
# One line per weekday, in the order the rates were computed.
for weekday, error_pct in error_rates.items():
    print(f"Weekday {weekday}: Error Rate = {error_pct:.2f}%")

Weekday 0: Error Rate = 25.93%
Weekday 1: Error Rate = 9.09%
Weekday 2: Error Rate = 6.67%
Weekday 3: Error Rate = 2.50%
Weekday 4: Error Rate = 14.29%
Weekday 5: Error Rate = 5.56%
Weekday 6: Error Rate = 1.41%


In [38]:
# Weekday with the highest error rate; on a tie the first one encountered wins.
worst_day = max(error_rates.items(), key=lambda item: item[1])[0]
print(f"\nThe model makes the most errors for weekday: {worst_day}")


The model makes the most errors for weekday: 0


In [40]:
# Persist the fitted model; the relative path resolves against the notebook's
# working directory.
model_path = 'best_model.joblib'
joblib.dump(best_model, model_path)
print("Model saved successfully!")

Model saved successfully!
