# Day 09. Exercise 00
# Regularization

## 0. Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

## 1. Preprocessing

1. Read the file `dayofweek.csv` that you used on the previous day into a dataframe.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [2]:
# Load the preprocessed day-of-week dataset used on the previous day.
dataset_path = "data/dayofweek.csv"
df = pd.read_csv(dataset_path)
# Keep the frame as the last expression so the notebook renders it.
df

Unnamed: 0,numTrials,hour,dayofweek,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,-0.788667,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.756764,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.724861,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.692958,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.661055,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,-0.533442,0.945382,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1682,-0.629151,0.945382,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1683,-0.597248,0.945382,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1684,-0.565345,0.945382,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [3]:
# The label is the day-of-week column; every remaining column is a feature.
target_column = 'dayofweek'
y = df[target_column]
X = df.drop(columns=[target_column])

In [4]:
# Hold out 20% of the rows for the final test; stratify on the label so both
# partitions keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
    stratify=y,
)

# Report the shape of every partition.
for caption, part in (
    ("X_train shakli:", X_train),
    ("y_train shakli:", y_train),
    ("X_test shakli:", X_test),
    ("y_test shakli:", y_test),
):
    print(caption, part.shape)

X_train shakli: (1348, 43)
y_train shakli: (1348,)
X_test shakli: (338, 43)
y_test shakli: (338,)


## 2. Logreg regularization

### a. Default regularization

1. Train a baseline model with only the parameters `random_state=21` and `fit_intercept=False`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.


The result of the code where you trained and evaluated the baseline model should be exactly like this (use `%%time` to get the info about how long it took to run the cell):

```
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63674   |   valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
```

In [5]:
%%time
logreg_baseline = LogisticRegression(random_state=21, fit_intercept=False)

# Stratified K-Fold cross-validationni sozlash
kf = StratifiedKFold(n_splits=10,  random_state=21) # shuffle=False bo'lganda natija to'g'ri chiqdi

train_scores = []
valid_scores = []

print("K-fold cross-validation natijalari:")

for fold, (train_index, val_index) in enumerate(kf.split(X_train, y_train)):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    logreg_baseline.fit(X_train_fold, y_train_fold)

    train_accuracy = logreg_baseline.score(X_train_fold, y_train_fold)
    val_accuracy = logreg_baseline.score(X_val_fold, y_val_fold)

    train_scores.append(train_accuracy)
    valid_scores.append(val_accuracy)

    print(f"train - {train_accuracy:.5f}   |   valid - {val_accuracy:.5f}")

average_accuracy = np.mean(valid_scores)
std_accuracy = np.std(valid_scores)

print(f"Average accuracy on crossval is {average_accuracy:.5f}")
print(f"Std is {std_accuracy:.5f}")

K-fold cross-validation natijalari:




train - 0.62902   |   valid - 0.59259
train - 0.64633   |   valid - 0.62963
train - 0.63479   |   valid - 0.56296
train - 0.65622   |   valid - 0.61481
train - 0.63397   |   valid - 0.57778
train - 0.64056   |   valid - 0.59259
train - 0.64138   |   valid - 0.65926
train - 0.65952   |   valid - 0.56296
train - 0.64333   |   valid - 0.59701
train - 0.63674   |   valid - 0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
CPU times: total: 3.88 s
Wall time: 3.23 s


### b. Optimizing regularization parameters

1. In the cells below try different values of penalty: `none`, `l1`, `l2` – you can change the values of solver too.

In [6]:
print("--- Penalty: 'none' ---")
# Unregularized logistic regression.
# NOTE(review): scikit-learn 1.2 deprecated the string 'none' in favour of
# penalty=None (the string was removed in 1.4); switch the spelling when this
# notebook is run on those versions.
logreg_none = LogisticRegression(penalty='none', solver='lbfgs', random_state=21, fit_intercept=False, max_iter=1000)

# `random_state` is omitted: without shuffle=True it has no effect, and
# scikit-learn >= 0.24 raises a ValueError if it is passed. Folds are unchanged.
# This splitter is also reused by the following 'l1' and 'l2' cells.
kf = StratifiedKFold(n_splits=10)

train_scores_none = []
valid_scores_none = []

for train_index, val_index in kf.split(X_train, y_train):
    # Slice the current fold out of the training partition by position.
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    logreg_none.fit(X_train_fold, y_train_fold)

    # Accuracy on the fitted rows and on the held-out rows.
    train_accuracy = logreg_none.score(X_train_fold, y_train_fold)
    val_accuracy = logreg_none.score(X_val_fold, y_val_fold)

    train_scores_none.append(train_accuracy)
    valid_scores_none.append(val_accuracy)

    print(f"train - {train_accuracy:.5f}   |   valid - {val_accuracy:.5f}")

print(f"Average accuracy on crossval is {np.mean(valid_scores_none):.5f}")
print(f"Std is {np.std(valid_scores_none):.5f}")

--- Penalty: 'none' ---




train - 0.66612   |   valid - 0.63704
train - 0.65787   |   valid - 0.65926
train - 0.66694   |   valid - 0.57778
train - 0.66529   |   valid - 0.62963
train - 0.66694   |   valid - 0.62222
train - 0.65952   |   valid - 0.57778
train - 0.65045   |   valid - 0.69630
train - 0.68425   |   valid - 0.61481
train - 0.66474   |   valid - 0.62687
train - 0.65651   |   valid - 0.60448
Average accuracy on crossval is 0.62462
Std is 0.03379


In [7]:
print("\n--- Penalty: 'l1' ---")
# L1-penalized logistic regression, fitted with the liblinear solver.
logreg_l1 = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    random_state=21,
    fit_intercept=False,
    max_iter=1000,
)

train_scores_l1, valid_scores_l1 = [], []

# The splitter `kf` comes from the preceding cell.
for train_index, val_index in kf.split(X_train, y_train):
    X_train_fold = X_train.iloc[train_index]
    y_train_fold = y_train.iloc[train_index]
    X_val_fold = X_train.iloc[val_index]
    y_val_fold = y_train.iloc[val_index]

    # fit() returns the estimator itself, so scoring can be chained onto it.
    train_accuracy = logreg_l1.fit(X_train_fold, y_train_fold).score(X_train_fold, y_train_fold)
    val_accuracy = logreg_l1.score(X_val_fold, y_val_fold)

    train_scores_l1.append(train_accuracy)
    valid_scores_l1.append(val_accuracy)

    print(f"train - {train_accuracy:.5f}   |   valid - {val_accuracy:.5f}")

# Summarize the validation accuracy across the folds.
mean_valid_l1 = np.mean(valid_scores_l1)
std_valid_l1 = np.std(valid_scores_l1)
print(f"Average accuracy on crossval is {mean_valid_l1:.5f}")
print(f"Std is {std_valid_l1:.5f}")


--- Penalty: 'l1' ---
train - 0.61830   |   valid - 0.54815
train - 0.62737   |   valid - 0.62222
train - 0.60511   |   valid - 0.54074
train - 0.63644   |   valid - 0.62222
train - 0.62407   |   valid - 0.55556
train - 0.62325   |   valid - 0.58519
train - 0.61253   |   valid - 0.63704
train - 0.64716   |   valid - 0.58519
train - 0.63015   |   valid - 0.59701
train - 0.61367   |   valid - 0.59701
Average accuracy on crossval is 0.58903
Std is 0.03129


In [8]:
print("\n--- Penalty: 'l2' ---")
# L2-penalized logistic regression, fitted with the lbfgs solver.
l2_params = dict(penalty='l2', solver='lbfgs', random_state=21, fit_intercept=False, max_iter=1000)
logreg_l2 = LogisticRegression(**l2_params)

train_scores_l2 = []
valid_scores_l2 = []

# The splitter `kf` comes from an earlier cell.
for fit_rows, held_rows in kf.split(X_train, y_train):
    X_train_fold, y_train_fold = X_train.iloc[fit_rows], y_train.iloc[fit_rows]
    X_val_fold, y_val_fold = X_train.iloc[held_rows], y_train.iloc[held_rows]

    logreg_l2.fit(X_train_fold, y_train_fold)

    # Record accuracy on the fitted rows first, then on the held-out rows.
    train_scores_l2.append(logreg_l2.score(X_train_fold, y_train_fold))
    valid_scores_l2.append(logreg_l2.score(X_val_fold, y_val_fold))

    train_accuracy, val_accuracy = train_scores_l2[-1], valid_scores_l2[-1]
    print(f"train - {train_accuracy:.5f}   |   valid - {val_accuracy:.5f}")

# Summarize the validation accuracy across the folds.
print(f"Average accuracy on crossval is {np.mean(valid_scores_l2):.5f}")
print(f"Std is {np.std(valid_scores_l2):.5f}")


--- Penalty: 'l2' ---
train - 0.62902   |   valid - 0.59259
train - 0.64633   |   valid - 0.62963
train - 0.63479   |   valid - 0.56296
train - 0.65622   |   valid - 0.61481
train - 0.63397   |   valid - 0.57778
train - 0.64056   |   valid - 0.59259
train - 0.64138   |   valid - 0.65926
train - 0.65952   |   valid - 0.56296
train - 0.64333   |   valid - 0.59701
train - 0.63674   |   valid - 0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943


## 3. SVM regularization

### a. Default regularization

1. Train a baseline model with the only parameters `probability=True`, `kernel='linear'`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [9]:
# Baseline linear-kernel SVM with the library-default C.
svm_baseline = SVC(probability=True, kernel='linear', random_state=21)

# `random_state` is omitted: without shuffle=True it has no effect, and
# scikit-learn >= 0.24 raises a ValueError if it is passed. Folds are unchanged.
kf = StratifiedKFold(n_splits=10)

train_scores = []
valid_scores = []

print("K-fold cross-validation natijalari:")

for train_index, val_index in kf.split(X_train, y_train):
    # Slice the current fold out of the training partition by position.
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Train the model on this fold's training rows.
    svm_baseline.fit(X_train_fold, y_train_fold)

    # Compute accuracy on the training and validation rows.
    train_accuracy = svm_baseline.score(X_train_fold, y_train_fold)
    val_accuracy = svm_baseline.score(X_val_fold, y_val_fold)

    train_scores.append(train_accuracy)
    valid_scores.append(val_accuracy)

    print(f"train - {train_accuracy:.5f}   |   valid - {val_accuracy:.5f}")

# Summary statistics are computed over the validation scores only.
average_accuracy = np.mean(valid_scores)
std_accuracy = np.std(valid_scores)

print(f"Average accuracy on crossval is {average_accuracy:.5f}")
print(f"Std is {std_accuracy:.5f}")

K-fold cross-validation natijalari:




train - 0.70486   |   valid - 0.65926
train - 0.69662   |   valid - 0.75556
train - 0.69415   |   valid - 0.62222
train - 0.70239   |   valid - 0.65185
train - 0.69085   |   valid - 0.65185
train - 0.68920   |   valid - 0.64444
train - 0.69250   |   valid - 0.72593
train - 0.70074   |   valid - 0.62222
train - 0.69605   |   valid - 0.61940
train - 0.71087   |   valid - 0.63433
Average accuracy on crossval is 0.65871
Std is 0.04359


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `C`.

In [10]:
# Candidate values for the SVM regularization parameter C.
c_values = [0.01, 0.1, 1, 10, 100]

# One splitter serves every C value: the folds are unshuffled and therefore the
# same on each pass. `random_state` is omitted because it has no effect without
# shuffle=True and scikit-learn >= 0.24 raises a ValueError if it is passed.
kf = StratifiedKFold(n_splits=10)

for c_val in c_values:
    print(f"\n--- C qiymati: {c_val} ---")
    # Build the SVM with the current C value.
    svm_tuned = SVC(C=c_val, probability=True, kernel='linear', random_state=21)

    train_scores_c = []
    valid_scores_c = []

    for train_index, val_index in kf.split(X_train, y_train):
        # Slice the current fold out of the training partition by position.
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

        # Train the model on this fold's training rows.
        svm_tuned.fit(X_train_fold, y_train_fold)

        # Compute accuracy on the training and validation rows.
        train_accuracy = svm_tuned.score(X_train_fold, y_train_fold)
        val_accuracy = svm_tuned.score(X_val_fold, y_val_fold)

        train_scores_c.append(train_accuracy)
        valid_scores_c.append(val_accuracy)

        print(f"train - {train_accuracy:.5f}   |   valid - {val_accuracy:.5f}")

    # Summary statistics for this C value, over the validation scores only.
    average_accuracy = np.mean(valid_scores_c)
    std_accuracy = np.std(valid_scores_c)

    print(f"Average accuracy on crossval is {average_accuracy:.5f}")
    print(f"Std is {std_accuracy:.5f}")


--- C qiymati: 0.01 ---




train - 0.37923   |   valid - 0.40000
train - 0.37923   |   valid - 0.40000
train - 0.38417   |   valid - 0.35556
train - 0.35449   |   valid - 0.36296
train - 0.38252   |   valid - 0.37037
train - 0.38087   |   valid - 0.38519
train - 0.37923   |   valid - 0.40000
train - 0.38252   |   valid - 0.37037
train - 0.38468   |   valid - 0.35075
train - 0.38386   |   valid - 0.35821
Average accuracy on crossval is 0.37534
Std is 0.01848

--- C qiymati: 0.1 ---




train - 0.58120   |   valid - 0.55556
train - 0.57543   |   valid - 0.56296
train - 0.57378   |   valid - 0.57037
train - 0.59275   |   valid - 0.57037
train - 0.58120   |   valid - 0.54815
train - 0.57955   |   valid - 0.54815
train - 0.57296   |   valid - 0.61481
train - 0.59192   |   valid - 0.54815
train - 0.59967   |   valid - 0.52985
train - 0.57825   |   valid - 0.57463
Average accuracy on crossval is 0.56230
Std is 0.02177

--- C qiymati: 1 ---




train - 0.70486   |   valid - 0.65926
train - 0.69662   |   valid - 0.75556
train - 0.69415   |   valid - 0.62222
train - 0.70239   |   valid - 0.65185
train - 0.69085   |   valid - 0.65185
train - 0.68920   |   valid - 0.64444
train - 0.69250   |   valid - 0.72593
train - 0.70074   |   valid - 0.62222
train - 0.69605   |   valid - 0.61940
train - 0.71087   |   valid - 0.63433
Average accuracy on crossval is 0.65871
Std is 0.04359

--- C qiymati: 10 ---




train - 0.75021   |   valid - 0.72593
train - 0.77741   |   valid - 0.82963
train - 0.78566   |   valid - 0.68148
train - 0.76834   |   valid - 0.73333
train - 0.75185   |   valid - 0.77778
train - 0.75598   |   valid - 0.68889
train - 0.76257   |   valid - 0.74074
train - 0.77411   |   valid - 0.68889
train - 0.78254   |   valid - 0.71642
train - 0.78418   |   valid - 0.69403
Average accuracy on crossval is 0.72771
Std is 0.04417

--- C qiymati: 100 ---




train - 0.78401   |   valid - 0.74815
train - 0.79720   |   valid - 0.84444
train - 0.80956   |   valid - 0.72593
train - 0.79060   |   valid - 0.76296
train - 0.79060   |   valid - 0.77778
train - 0.79637   |   valid - 0.74815
train - 0.78401   |   valid - 0.77037
train - 0.80462   |   valid - 0.73333
train - 0.79819   |   valid - 0.70896
train - 0.79901   |   valid - 0.73881
Average accuracy on crossval is 0.75589
Std is 0.03550


## 4. Tree

### a. Default regularization

1. Train a baseline model with only the parameters `max_depth=10` and `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [11]:
# Baseline decision tree limited to depth 10.
tree_baseline = DecisionTreeClassifier(max_depth=10, random_state=21)

# `random_state` is omitted: without shuffle=True it has no effect, and
# scikit-learn >= 0.24 raises a ValueError if it is passed. Folds are unchanged.
kf = StratifiedKFold(n_splits=10)

train_scores = []
valid_scores = []

print("K-fold cross-validation natijalari:")

for train_index, val_index in kf.split(X_train, y_train):
    # Slice the current fold out of the training partition by position.
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Train the model on this fold's training rows.
    tree_baseline.fit(X_train_fold, y_train_fold)

    # Compute accuracy on the training and validation rows.
    train_accuracy = tree_baseline.score(X_train_fold, y_train_fold)
    val_accuracy = tree_baseline.score(X_val_fold, y_val_fold)

    train_scores.append(train_accuracy)
    valid_scores.append(val_accuracy)

    print(f"train - {train_accuracy:.5f}   |   valid - {val_accuracy:.5f}")

# Summary statistics are computed over the validation scores only.
average_accuracy = np.mean(valid_scores)
std_accuracy = np.std(valid_scores)

print(f"Average accuracy on crossval is {average_accuracy:.5f}")
print(f"Std is {std_accuracy:.5f}")

K-fold cross-validation natijalari:
train - 0.81039   |   valid - 0.74074
train - 0.77741   |   valid - 0.74074
train - 0.83347   |   valid - 0.70370
train - 0.79720   |   valid - 0.76296
train - 0.82440   |   valid - 0.75556
train - 0.80379   |   valid - 0.68889
train - 0.80709   |   valid - 0.76296
train - 0.80132   |   valid - 0.65926
train - 0.80807   |   valid - 0.75373




train - 0.80478   |   valid - 0.68657
Average accuracy on crossval is 0.72551
Std is 0.03562


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `max_depth`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [12]:
def _crossval_report(model, X, y, cv):
    """Cross-validate `model` and print per-fold and summary accuracy.

    For every (train, validation) split produced by `cv.split(X, y)` the model
    is re-fitted on the training rows and scored on both parts; one line is
    printed per fold, followed by the mean and standard deviation of the
    validation scores.

    Parameters
    ----------
    model : estimator exposing `fit(X, y)` and `score(X, y)`.
    X, y : objects indexable by position through `.iloc`.
    cv : splitter exposing `split(X, y)` that yields positional index arrays.

    Returns
    -------
    (train_scores, valid_scores) : two lists with one accuracy per fold.
    """
    fold_train_scores, fold_valid_scores = [], []

    for train_index, val_index in cv.split(X, y):
        X_fit, X_held = X.iloc[train_index], X.iloc[val_index]
        y_fit, y_held = y.iloc[train_index], y.iloc[val_index]

        model.fit(X_fit, y_fit)

        fit_accuracy = model.score(X_fit, y_fit)
        held_accuracy = model.score(X_held, y_held)

        fold_train_scores.append(fit_accuracy)
        fold_valid_scores.append(held_accuracy)

        print(f"train - {fit_accuracy:.5f}   |   valid - {held_accuracy:.5f}")

    print(f"Average accuracy on crossval is {np.mean(fold_valid_scores):.5f}")
    print(f"Std is {np.std(fold_valid_scores):.5f}")

    return fold_train_scores, fold_valid_scores


# Depths to compare against the max_depth=10 baseline.
max_depth_values = [5, 15, 20, 25]

# Unshuffled stratified folds, shared by every depth. `random_state` is omitted:
# it has no effect without shuffle=True and scikit-learn >= 0.24 raises a
# ValueError if it is passed.
kf = StratifiedKFold(n_splits=10)

for depth in max_depth_values:
    print(f"\n--- max_depth: {depth} ---")
    tree_tuned = DecisionTreeClassifier(max_depth=depth, random_state=21)

    train_scores_depth, valid_scores_depth = _crossval_report(tree_tuned, X_train, y_train, kf)

    # Kept as notebook-level names, as in the original cell.
    average_accuracy = np.mean(valid_scores_depth)
    std_accuracy = np.std(valid_scores_depth)


print("\n--- Bonus: Boshqa regularizatsiya parametrlari bilan o'ynash ---")

# NOTE(review): the bonus runs use *shuffled* folds (shuffle=True), so their
# splits differ from the unshuffled folds of the depth sweep above and the
# scores are not measured on the same partitions.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

print("\nKombinatsiya 1: max_depth=10, min_samples_split=10, min_samples_leaf=5")
tree_bonus1 = DecisionTreeClassifier(max_depth=10, min_samples_split=10, min_samples_leaf=5, random_state=21)

train_scores_b1, valid_scores_b1 = _crossval_report(tree_bonus1, X_train, y_train, kf)

print("\nKombinatsiya 2: max_depth=15, max_features='sqrt'")
# 'sqrt': each split considers the square root of the number of features.
tree_bonus2 = DecisionTreeClassifier(max_depth=15, max_features='sqrt', random_state=21)

train_scores_b2, valid_scores_b2 = _crossval_report(tree_bonus2, X_train, y_train, kf)


--- max_depth: 5 ---
train - 0.59522   |   valid - 0.53333
train - 0.56307   |   valid - 0.53333
train - 0.60181   |   valid - 0.55556
train - 0.59604   |   valid - 0.57037
train - 0.60264   |   valid - 0.57778
train - 0.57955   |   valid - 0.53333
train - 0.58368   |   valid - 0.54815
train - 0.59275   |   valid - 0.51111
train - 0.58237   |   valid - 0.56716
train - 0.60132   |   valid - 0.50000
Average accuracy on crossval is 0.54301
Std is 0.02423

--- max_depth: 15 ---
train - 0.95796   |   valid - 0.82222
train - 0.93075   |   valid - 0.83704
train - 0.95631   |   valid - 0.83704




train - 0.95301   |   valid - 0.86667
train - 0.95136   |   valid - 0.88889
train - 0.94724   |   valid - 0.82222
train - 0.95466   |   valid - 0.90370
train - 0.94971   |   valid - 0.87407
train - 0.95305   |   valid - 0.83582
train - 0.94316   |   valid - 0.85821
Average accuracy on crossval is 0.85459
Std is 0.02682

--- max_depth: 20 ---
train - 0.98846   |   valid - 0.86667
train - 0.99011   |   valid - 0.91111
train - 0.98681   |   valid - 0.85926
train - 0.98763   |   valid - 0.91111
train - 0.98928   |   valid - 0.88148
train - 0.98186   |   valid - 0.85926
train - 0.98846   |   valid - 0.91852
train - 0.99176   |   valid - 0.89630
train - 0.99094   |   valid - 0.88060




train - 0.98847   |   valid - 0.88060
Average accuracy on crossval is 0.88649
Std is 0.02075

--- max_depth: 25 ---
train - 1.00000   |   valid - 0.85926
train - 1.00000   |   valid - 0.91852
train - 0.99918   |   valid - 0.86667
train - 1.00000   |   valid - 0.91111
train - 0.99918   |   valid - 0.88889
train - 0.99835   |   valid - 0.85185
train - 0.99753   |   valid - 0.92593
train - 1.00000   |   valid - 0.88148
train - 1.00000   |   valid - 0.88060
train - 1.00000   |   valid - 0.88060
Average accuracy on crossval is 0.88649
Std is 0.02371

--- Bonus: Boshqa regularizatsiya parametrlari bilan o'ynash ---

Kombinatsiya 1: max_depth=10, min_samples_split=10, min_samples_leaf=5
train - 0.75021   |   valid - 0.73333
train - 0.74361   |   valid - 0.72593
train - 0.75350   |   valid - 0.65185
train - 0.73042   |   valid - 0.68889
train - 0.75350   |   valid - 0.66667
train - 0.73372   |   valid - 0.68889
train - 0.74773   |   valid - 0.60741
train - 0.76669   |   valid - 0.64444
train -

## 5. Random forest

### a. Default regularization

1. Train a baseline model with the only parameters `n_estimators=50`, `max_depth=14`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [13]:
# n_estimators=50 is the number of trees in the forest.
forest_baseline = RandomForestClassifier(n_estimators=50, max_depth=14, random_state=21)

# `random_state` is omitted: without shuffle=True it has no effect, and
# scikit-learn >= 0.24 raises a ValueError if it is passed. Folds are unchanged.
kf = StratifiedKFold(n_splits=10)

train_scores = []
valid_scores = []

print("K-fold cross-validation natijalari:")

for train_index, val_index in kf.split(X_train, y_train):
    # Slice the current fold out of the training partition by position.
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Train the model on this fold's training rows.
    forest_baseline.fit(X_train_fold, y_train_fold)

    # Compute accuracy on the training and validation rows.
    train_accuracy = forest_baseline.score(X_train_fold, y_train_fold)
    val_accuracy = forest_baseline.score(X_val_fold, y_val_fold)

    train_scores.append(train_accuracy)
    valid_scores.append(val_accuracy)

    print(f"train - {train_accuracy:.5f}   |   valid - {val_accuracy:.5f}")

# Summary statistics are computed over the validation scores only.
average_accuracy = np.mean(valid_scores)
std_accuracy = np.std(valid_scores)

print(f"Average accuracy on crossval is {average_accuracy:.5f}")
print(f"Std is {std_accuracy:.5f}")

K-fold cross-validation natijalari:




train - 0.96455   |   valid - 0.88148
train - 0.96208   |   valid - 0.91852
train - 0.96785   |   valid - 0.86667
train - 0.96455   |   valid - 0.89630
train - 0.96538   |   valid - 0.91111
train - 0.96538   |   valid - 0.88148
train - 0.97115   |   valid - 0.91852
train - 0.96867   |   valid - 0.85185
train - 0.97364   |   valid - 0.88060
train - 0.97941   |   valid - 0.86567
Average accuracy on crossval is 0.88722
Std is 0.02204


### b. Optimizing regularization parameters

1. In the new cells try different values of the parameters `max_depth` and `n_estimators`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [14]:
# Grid of forest sizes and tree depths to evaluate.
n_estimators_values = [30, 70, 100]
max_depth_values = [10, 18, 25]

# One splitter serves the whole grid: the folds are unshuffled and therefore
# the same for every combination. `random_state` is omitted because it has no
# effect without shuffle=True and scikit-learn >= 0.24 raises a ValueError if
# it is passed.
kf = StratifiedKFold(n_splits=10)

for n_est in n_estimators_values:
    for depth in max_depth_values:
        print(f"\n--- n_estimators: {n_est}, max_depth: {depth} ---")
        forest_tuned = RandomForestClassifier(n_estimators=n_est, max_depth=depth, random_state=21)

        train_scores_comb = []
        valid_scores_comb = []

        for train_index, val_index in kf.split(X_train, y_train):
            # Slice the current fold out of the training partition by position.
            X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
            y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

            # Train the model on this fold's training rows.
            forest_tuned.fit(X_train_fold, y_train_fold)

            # Compute accuracy on the training and validation rows.
            train_accuracy = forest_tuned.score(X_train_fold, y_train_fold)
            val_accuracy = forest_tuned.score(X_val_fold, y_val_fold)

            train_scores_comb.append(train_accuracy)
            valid_scores_comb.append(val_accuracy)

            print(f"train - {train_accuracy:.5f}   |   valid - {val_accuracy:.5f}")

        # Summary statistics for this combination, over validation scores only.
        average_accuracy = np.mean(valid_scores_comb)
        std_accuracy = np.std(valid_scores_comb)

        print(f"Average accuracy on crossval is {average_accuracy:.5f}")
        print(f"Std is {std_accuracy:.5f}")


--- n_estimators: 30, max_depth: 10 ---
train - 0.82852   |   valid - 0.76296
train - 0.85408   |   valid - 0.80741




train - 0.87139   |   valid - 0.80741
train - 0.86480   |   valid - 0.80000
train - 0.88458   |   valid - 0.86667
train - 0.88046   |   valid - 0.76296
train - 0.85573   |   valid - 0.80741
train - 0.85491   |   valid - 0.72593
train - 0.89456   |   valid - 0.80597
train - 0.87974   |   valid - 0.76119
Average accuracy on crossval is 0.79079
Std is 0.03679

--- n_estimators: 30, max_depth: 18 ---
train - 0.99340   |   valid - 0.89630
train - 0.99176   |   valid - 0.94074




train - 0.99176   |   valid - 0.89630
train - 0.99258   |   valid - 0.90370
train - 0.98681   |   valid - 0.91111
train - 0.99588   |   valid - 0.89630
train - 0.99423   |   valid - 0.91111
train - 0.99505   |   valid - 0.88148
train - 0.99588   |   valid - 0.90299
train - 0.98929   |   valid - 0.88806
Average accuracy on crossval is 0.90281
Std is 0.01542

--- n_estimators: 30, max_depth: 25 ---




train - 0.99918   |   valid - 0.90370
train - 0.99835   |   valid - 0.93333
train - 0.99753   |   valid - 0.88889
train - 0.99918   |   valid - 0.94074
train - 0.99753   |   valid - 0.91852
train - 0.99918   |   valid - 0.88889
train - 0.99918   |   valid - 0.92593
train - 0.99918   |   valid - 0.89630
train - 0.99918   |   valid - 0.91045
train - 0.99918   |   valid - 0.89552
Average accuracy on crossval is 0.91023
Std is 0.01773

--- n_estimators: 70, max_depth: 10 ---




train - 0.86315   |   valid - 0.77778
train - 0.87634   |   valid - 0.83704
train - 0.88788   |   valid - 0.80000
train - 0.90107   |   valid - 0.82222
train - 0.88293   |   valid - 0.85185
train - 0.87881   |   valid - 0.77037
train - 0.87716   |   valid - 0.82963
train - 0.86562   |   valid - 0.75556
train - 0.89127   |   valid - 0.82090
train - 0.89292   |   valid - 0.76866
Average accuracy on crossval is 0.80340
Std is 0.03175

--- n_estimators: 70, max_depth: 18 ---




train - 0.99176   |   valid - 0.90370
train - 0.99423   |   valid - 0.94074
train - 0.99340   |   valid - 0.89630
train - 0.99505   |   valid - 0.92593
train - 0.99176   |   valid - 0.91852
train - 0.99423   |   valid - 0.88148
train - 0.99423   |   valid - 0.91852
train - 0.99670   |   valid - 0.87407
train - 0.99423   |   valid - 0.91791
train - 0.99094   |   valid - 0.89552
Average accuracy on crossval is 0.90727
Std is 0.01968

--- n_estimators: 70, max_depth: 25 ---




train - 0.99918   |   valid - 0.90370
train - 0.99918   |   valid - 0.95556
train - 0.99918   |   valid - 0.90370
train - 0.99918   |   valid - 0.94074
train - 0.99918   |   valid - 0.91852
train - 0.99753   |   valid - 0.89630
train - 0.99918   |   valid - 0.92593
train - 0.99918   |   valid - 0.88889
train - 1.00000   |   valid - 0.92537
train - 1.00000   |   valid - 0.91045
Average accuracy on crossval is 0.91692
Std is 0.01954

--- n_estimators: 100, max_depth: 10 ---
train - 0.86562   |   valid - 0.77037




train - 0.88293   |   valid - 0.85185
train - 0.89118   |   valid - 0.80000
train - 0.90107   |   valid - 0.82963
train - 0.87634   |   valid - 0.84444
train - 0.87716   |   valid - 0.77037
train - 0.87057   |   valid - 0.81481
train - 0.87799   |   valid - 0.76296
train - 0.88056   |   valid - 0.79104
train - 0.88056   |   valid - 0.78358
Average accuracy on crossval is 0.80191
Std is 0.03034

--- n_estimators: 100, max_depth: 18 ---




train - 0.99258   |   valid - 0.89630
train - 0.99505   |   valid - 0.93333
train - 0.99340   |   valid - 0.89630
train - 0.99588   |   valid - 0.92593
train - 0.99093   |   valid - 0.91852
train - 0.99423   |   valid - 0.88889
train - 0.99340   |   valid - 0.91111
train - 0.99505   |   valid - 0.88889
train - 0.99423   |   valid - 0.92537
train - 0.99259   |   valid - 0.89552
Average accuracy on crossval is 0.90802
Std is 0.01595

--- n_estimators: 100, max_depth: 25 ---




train - 0.99918   |   valid - 0.90370
train - 0.99918   |   valid - 0.96296
train - 0.99918   |   valid - 0.89630
train - 1.00000   |   valid - 0.94074
train - 0.99918   |   valid - 0.91852
train - 0.99835   |   valid - 0.89630
train - 0.99918   |   valid - 0.92593
train - 0.99918   |   valid - 0.89630
train - 1.00000   |   valid - 0.93284
train - 1.00000   |   valid - 0.90299
Average accuracy on crossval is 0.91766
Std is 0.02160


In [15]:
print("\nKombinatsiya 1: n_estimators=70, max_depth=18, min_samples_split=5, min_samples_leaf=2")
# Forest with additional limits on split size and leaf size.
forest_bonus1 = RandomForestClassifier(n_estimators=70, max_depth=18, min_samples_split=5, min_samples_leaf=2, random_state=21)

train_scores_b1 = []
valid_scores_b1 = []

# `random_state` is omitted: without shuffle=True it has no effect, and
# scikit-learn >= 0.24 raises a ValueError if it is passed. Folds are unchanged.
kf = StratifiedKFold(n_splits=10)

for train_index, val_index in kf.split(X_train, y_train):
    # Slice the current fold out of the training partition by position.
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    forest_bonus1.fit(X_train_fold, y_train_fold)

    # Accuracy on the fitted rows and on the held-out rows.
    train_accuracy = forest_bonus1.score(X_train_fold, y_train_fold)
    val_accuracy = forest_bonus1.score(X_val_fold, y_val_fold)

    train_scores_b1.append(train_accuracy)
    valid_scores_b1.append(val_accuracy)

    print(f"train - {train_accuracy:.5f}   |   valid - {val_accuracy:.5f}")

print(f"Average accuracy on crossval is {np.mean(valid_scores_b1):.5f}")
print(f"Std is {np.std(valid_scores_b1):.5f}")


Kombinatsiya 1: n_estimators=70, max_depth=18, min_samples_split=5, min_samples_leaf=2
train - 0.92333   |   valid - 0.84444




train - 0.93570   |   valid - 0.88889
train - 0.93405   |   valid - 0.85185
train - 0.93487   |   valid - 0.85926
train - 0.93487   |   valid - 0.89630
train - 0.92168   |   valid - 0.85926
train - 0.93322   |   valid - 0.89630
train - 0.92828   |   valid - 0.81481
train - 0.92751   |   valid - 0.86567
train - 0.93328   |   valid - 0.85075
Average accuracy on crossval is 0.86275
Std is 0.02421


In [16]:
print("\nKombinatsiya 2: n_estimators=100, max_depth=14, max_features='sqrt', bootstrap=False")
# Bonus combination 2, evaluated with 10-fold stratified cross-validation.
# bootstrap=False: every tree is fitted on the whole training data of the fold
# instead of a bootstrap sample (no sampling with replacement).
forest_bonus2 = RandomForestClassifier(n_estimators=100, max_depth=14, max_features='sqrt', bootstrap=False, random_state=21)

# Per-fold accuracies on the training part and on the validation part.
train_scores_b2 = []
valid_scores_b2 = []

# `random_state` only matters together with `shuffle=True`. Passing it with the
# default `shuffle=False` never influenced the folds and raises a ValueError in
# scikit-learn >= 0.24, so it is omitted; the (unshuffled) folds are unchanged.
kf = StratifiedKFold(n_splits=10)

for train_index, val_index in kf.split(X_train, y_train):
    # Positional indices from the splitter -> .iloc selection.
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # fit() re-trains the forest from scratch on every fold.
    forest_bonus2.fit(X_train_fold, y_train_fold)

    train_accuracy = forest_bonus2.score(X_train_fold, y_train_fold)
    val_accuracy = forest_bonus2.score(X_val_fold, y_val_fold)

    train_scores_b2.append(train_accuracy)
    valid_scores_b2.append(val_accuracy)

    print(f"train - {train_accuracy:.5f}   |   valid - {val_accuracy:.5f}")

# Summary over the validation folds only.
print(f"Average accuracy on crossval is {np.mean(valid_scores_b2):.5f}")
print(f"Std is {np.std(valid_scores_b2):.5f}")


Kombinatsiya 2: n_estimators=100, max_depth=14, max_features='sqrt', bootstrap=False
train - 0.97527   |   valid - 0.90370




train - 0.97692   |   valid - 0.90370
train - 0.98434   |   valid - 0.89630
train - 0.97692   |   valid - 0.89630
train - 0.97444   |   valid - 0.91111
train - 0.98516   |   valid - 0.86667
train - 0.97774   |   valid - 0.91111
train - 0.97527   |   valid - 0.88889
train - 0.97694   |   valid - 0.88060
train - 0.98188   |   valid - 0.88060
Average accuracy on crossval is 0.89390
Std is 0.01385


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.
3. Analyze which weekday your model makes the most errors on (as a percentage of the total number of samples of that class in your test dataset).
4. Save the model.

In [17]:
# Final model: fit on the full training split, then evaluate once on the test split.
# NOTE(review): this exact combination (max_depth=25 together with bootstrap=False)
# does not appear among the cross-validated configurations visible in this
# notebook section - confirm that it was actually validated before being chosen.
best_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=25,
    max_features='sqrt',
    bootstrap=False,
    random_state=21,
).fit(X_train, y_train)  # fit() returns the fitted estimator itself

y_pred = best_model.predict(X_test)

# Overall accuracy first, then the per-class precision / recall / F1 breakdown.
final_accuracy = accuracy_score(y_test, y_pred)
print(f"Modelning test ma'lumotlar to'plamidagi yakuniy aniqligi: {final_accuracy:.5f}")
print(classification_report(y_test, y_pred))

Modelning test ma'lumotlar to'plamidagi yakuniy aniqligi: 0.93787
              precision    recall  f1-score   support

           0       0.87      0.74      0.80        27
           1       1.00      0.95      0.97        55
           2       1.00      0.93      0.97        30
           3       0.94      0.97      0.96        80
           4       0.90      0.86      0.88        21
           5       0.88      0.94      0.91        54
           6       0.95      0.99      0.97        71

    accuracy                           0.94       338
   macro avg       0.93      0.91      0.92       338
weighted avg       0.94      0.94      0.94       338



In [18]:
# Per-class error analysis: for every weekday present in the test split, count
# the misclassified samples and express them as a percentage of that weekday's
# test samples.
total_by_class = y_test.value_counts().sort_index()  # test-set sample count per weekday

# Boolean mask of misclassified samples; y_pred is positionally aligned with y_test.
is_error = np.asarray(y_test) != np.asarray(y_pred)

# Error count per weekday. Weekdays without a single error are kept with a
# count of 0 so that the report covers every class in the test split.
errors_by_class = (
    y_test[is_error]
    .value_counts()
    .reindex(total_by_class.index, fill_value=0)
)
error_pct_by_class = errors_by_class / total_by_class * 100

print("\nModelning haftaning kunlari bo'yicha xato foizlari:")
for weekday in total_by_class.index:
    error_count = errors_by_class.loc[weekday]
    total_samples = total_by_class.loc[weekday]
    error_percentage = error_pct_by_class.loc[weekday]
    print(f"Hafta kuni {weekday}: {error_count} ta xato / {total_samples} ta umumiy namuna ({error_percentage:.2f}%)")

# The worst weekday is selected by error *percentage* (as the task requires),
# not by the absolute number of errors, which would favour the larger classes.
if is_error.any():
    most_errors_weekday = error_pct_by_class.idxmax()
    most_errors_percentage = error_pct_by_class.loc[most_errors_weekday]
    print(f"\nModel eng ko'p xatolarni Haftaning kuni {most_errors_weekday} uchun qildi: {most_errors_percentage:.2f}% xato.")
else:
    print("\nModel test ma'lumotlarida xato qilmadi (bu juda kam ehtimol).")


Modelning haftaning kunlari bo'yicha xato foizlari:
Hafta kuni 0: 7 ta xato / 27 ta umumiy namuna (25.93%)
Hafta kuni 1: 3 ta xato / 55 ta umumiy namuna (5.45%)
Hafta kuni 2: 2 ta xato / 30 ta umumiy namuna (6.67%)
Hafta kuni 3: 2 ta xato / 80 ta umumiy namuna (2.50%)
Hafta kuni 4: 3 ta xato / 21 ta umumiy namuna (14.29%)
Hafta kuni 5: 3 ta xato / 54 ta umumiy namuna (5.56%)
Hafta kuni 6: 1 ta xato / 71 ta umumiy namuna (1.41%)

Model eng ko'p xatolarni Haftaning kuni 0 uchun qildi: 25.93% xato.


In [19]:
# Persist the fitted final model to disk; joblib.dump returns the list of
# written file names, which the notebook shows as the cell output.
model_filename = "best_regularized_model.pkl"
joblib.dump(value=best_model, filename=model_filename)

['best_regularized_model.pkl']