# Day 09. Exercise 01
# Gridsearch

## 0. Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import warnings
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from tqdm.notebook import tqdm
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score
import joblib

## 1. Preprocessing

1. Read the file [`day-of-week-not-scaled.csv`](https://drive.google.com/file/d/1AlGvsJDSzPT_70caausx8bFuupIEZkfh/view?usp=sharing). It is similar to the one from the previous exercise, but this time we did not scale continuous features (we are not going to use logreg anymore). Don't forget to enrich the table with the 'dayofweek' column from the previous day's .csv-file.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [2]:
df_not_scaled = pd.read_csv('data/day-of-week-not-scaled.csv')
df_dayofweek = pd.read_csv('../ex00/data/dayofweek.csv')
df_not_scaled['dayofweek'] = df_dayofweek['dayofweek']
df_not_scaled

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,dayofweek
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,9,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
1682,6,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
1683,7,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
1684,8,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3


In [3]:
X = df_not_scaled.drop('dayofweek', axis=1)
y = df_not_scaled['dayofweek']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y) # test_size=0.2 (20% test uchun)

print(f"\nX_train shakli: {X_train.shape}")
print(f"y_train shakli: {y_train.shape}")
print(f"X_test shakli: {X_test.shape}")
print(f"y_test shakli: {y_test.shape}")


X_train shakli: (1348, 43)
y_train shakli: (1348,)
X_test shakli: (338, 43)
y_test shakli: (338,)


## 2. SVM gridsearch

1. Using `GridSearchCV` try different parameters of kernel (`linear`, `rbf`, `sigmoid`), C (`0.01`, `0.1`, `1`, `1.5`, `5`, `10`), gamma (`scale`, `auto`), class_weight (`balanced`, `None`) use `random_state=21` and `probability=True` and get the best combination of them in terms of accuracy.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`. Check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [4]:
warnings.filterwarnings('ignore')

In [5]:
svm = SVC(random_state=21, probability=True)

param_grid = {
    'kernel': ['linear', 'rbf', 'sigmoid'],
    'C': [0.01, 0.1, 1, 1.5, 5, 10],
    'gamma': ['scale', 'auto'],
    'class_weight': ['balanced', None]
}

# cv=10 StratifiedKFold bilan bir xil ishlaydi, chunki GridSearchCV ichida ham shunday mexanizm mavjud.
# n_jobs=-1 - barcha mavjud CPU yadrolarini ishlatadi, bu tezroq ishlashga yordam beradi.
grid_search_svm = GridSearchCV(estimator=svm, param_grid=param_grid, cv=10, scoring='accuracy', n_jobs=-1, verbose=1)

grid_search_svm.fit(X_train, y_train)

best_params_found = grid_search_svm.best_params_
fixed_params = {'probability': True, 'random_state': 21}

full_best_params = {**best_params_found, **fixed_params}

print(f"Eng yaxshi aniqlik (Best Score): {grid_search_svm.best_score_:.5f}")
print(f"Eng yaxshi parametrlar kombinatsiyasi (Best Parameters): {full_best_params}")

Fitting 10 folds for each of 72 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   19.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 19.9min finished


Eng yaxshi aniqlik (Best Score): 0.88575
Eng yaxshi parametrlar kombinatsiyasi (Best Parameters): {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}


In [6]:
results_df_svm = pd.DataFrame(grid_search_svm.cv_results_)

# 'rank_test_score' - test to'plamidagi aniqlik reytingi (1 eng yaxshi)
# 'mean_test_score' - kross-validatsiyada o'rtacha aniqlik
# 'std_test_score' - kross-validatsiyada aniqlikning standart og'ishi
results_df_svm_sorted = results_df_svm.sort_values(by='rank_test_score')

columns_to_show = [
    'param_C',
    'param_class_weight',
    'param_gamma',
    'param_kernel',
    'mean_test_score',
    'std_test_score',
    'rank_test_score'
]

print("\nGridSearchCV natijalari (eng yaxshi reyting bo'yicha saralangan):")
print(results_df_svm_sorted[columns_to_show].head(10))

# 'mean_test_score' va 'rank_test_score' ustunlariga e'tibor bering.
# Eng yaxshi natija (rank_test_score=1) va unga yaqin bo'lgan boshqa natijalarni solishtiring.
# Agar 1-o'rindagi model va 2-3-o'rindagi modellar orasida 'mean_test_score' bo'yicha katta farq bo'lmasa,
# ba'zida soddaroq parametrlar (masalan, kichikroq C yoki 'linear' kernel) afzalroq bo'lishi mumkin.

print("\nNatijalarni tahlil qilish:")
best_row = results_df_svm_sorted.iloc[0]
print(f"Eng yaxshi natija: C={best_row['param_C']}, kernel='{best_row['param_kernel']}', gamma='{best_row['param_gamma']}', class_weight={best_row['param_class_weight']}")
print(f"O'rtacha aniqlik: {best_row['mean_test_score']:.5f}, Standart og'ish: {best_row['std_test_score']:.5f}")

if len(results_df_svm_sorted) > 1:
    second_best_row = results_df_svm_sorted.iloc[1]
    score_diff = best_row['mean_test_score'] - second_best_row['mean_test_score']
    print(f"Eng yaxshi va ikkinchi eng yaxshi natija o'rtasidagi farq: {score_diff:.5f}")
    if score_diff < 0.005:
        print("Eng yaxshi natijalar orasida katta farq yo'q. Soddaroq parametrlarni ham ko'rib chiqish mumkin.")


GridSearchCV natijalari (eng yaxshi reyting bo'yicha saralangan):
   param_C param_class_weight param_gamma param_kernel  mean_test_score  \
70      10               None        auto          rbf         0.885749   
64      10           balanced        auto          rbf         0.873875   
58       5               None        auto          rbf         0.832349   
52       5           balanced        auto          rbf         0.820453   
66      10               None       scale       linear         0.728452   
69      10               None        auto       linear         0.728452   
60      10           balanced       scale       linear         0.720260   
63      10           balanced        auto       linear         0.720260   
48       5           balanced       scale       linear         0.713577   
51       5           balanced        auto       linear         0.713577   

    std_test_score  rank_test_score  
70        0.018843                1  
64        0.022551             

## 3. Decision tree

1. Using `GridSearchCV` try different parameters of `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use `random_state=21`.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [7]:
tree = DecisionTreeClassifier(random_state=21)

param_grid_tree = {
    'max_depth': np.arange(1, 50),
    'class_weight': ['balanced', None],
    'criterion': ['gini', 'entropy']
}

grid_search_tree = GridSearchCV(estimator=tree, param_grid=param_grid_tree, scoring='accuracy', n_jobs=-1)

grid_search_tree.fit(X_train, y_train)

best_params_found = grid_search_tree.best_params_

full_best_params = {**best_params_found, 'random_state': 21}

print(f"Eng yaxshi aniqlik (Best Score): {grid_search_tree.best_score_:.5f}")
print(f"Eng yaxshi parametrlar kombinatsiyasi (Best Parameters): {full_best_params}")

Eng yaxshi aniqlik (Best Score): 0.87386
Eng yaxshi parametrlar kombinatsiyasi (Best Parameters): {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 21, 'random_state': 21}


In [8]:
results_df_tree = pd.DataFrame(grid_search_tree.cv_results_)

results_df_tree_sorted = results_df_tree.sort_values(by='rank_test_score')

# Faqat muhim ustunlarni ko'rsatish
columns_to_show_tree = [
    'param_max_depth',
    'param_class_weight',
    'param_criterion',
    'mean_test_score',
    'std_test_score',
    'rank_test_score'
]

print("\nGridSearchCV natijalari (eng yaxshi reyting bo'yicha saralangan):")
print(results_df_tree_sorted[columns_to_show_tree].head(10))

best_row_tree = results_df_tree_sorted.iloc[0]
print(f"Eng yaxshi natija: max_depth={best_row_tree['param_max_depth']}, class_weight={best_row_tree['param_class_weight']}, criterion='{best_row_tree['param_criterion']}'")
print(f"O'rtacha aniqlik: {best_row_tree['mean_test_score']:.5f}, Standart og'ish: {best_row_tree['std_test_score']:.5f}")

# Eng yaxshi va ikkinchi eng yaxshi natija orasidagi farqni tekshirish
if len(results_df_tree_sorted) > 1:
    second_best_row_tree = results_df_tree_sorted.iloc[1]
    score_diff_tree = best_row_tree['mean_test_score'] - second_best_row_tree['mean_test_score']
    print(f"Eng yaxshi va ikkinchi eng yaxshi natija o'rtasidagi farq: {score_diff_tree:.5f}")
    if score_diff_tree < 0.005: # Kichik farqni ko'rsatuvchi shart
        print("Eng yaxshi natijalar orasida katta farq yo'q. Ehtimol, kichikroq max_depth qiymati bilan ham yaxshi natija olish mumkin.")
    else:
        print("Eng yaxshi natija sezilarli darajada ustun.")


GridSearchCV natijalari (eng yaxshi reyting bo'yicha saralangan):
   param_max_depth param_class_weight param_criterion  mean_test_score  \
20              21           balanced            gini         0.873865   
24              25           balanced            gini         0.873854   
21              22           balanced            gini         0.872378   
30              31           balanced            gini         0.872372   
28              29           balanced            gini         0.872372   
27              28           balanced            gini         0.872372   
26              27           balanced            gini         0.872372   
22              23           balanced            gini         0.872372   
32              33           balanced            gini         0.872372   
33              34           balanced            gini         0.872372   

    std_test_score  rank_test_score  
20        0.025066                1  
24        0.025018                2  
21  

## 4. Random forest

1. Using `GridSearchCV` try different parameters of `n_estimators` (`5`, `10`, `50`, `100`), `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use random_state=21.
2. Create a dataframe from the results of the gridsearch and sort it ascendengly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [9]:
forest = RandomForestClassifier(random_state=21)

param_grid_forest = {
    'n_estimators': [5, 10, 50, 100],
    'max_depth': np.arange(1, 50),
    'class_weight': ['balanced', None],
    'criterion': ['entropy', 'gini']
}

grid_search_forest = GridSearchCV(estimator=forest, param_grid=param_grid_forest, scoring='accuracy', n_jobs=-1)

grid_search_forest.fit(X_train, y_train)

best_params_found = grid_search_forest.best_params_

full_best_params = {**best_params_found, 'random_state': 21}

print(f"Eng yaxshi aniqlik (Best Score): {grid_search_forest.best_score_:.5f}")
print(f"Eng yaxshi parametrlar kombinatsiyasi (Best Parameters): {full_best_params}")

Eng yaxshi aniqlik (Best Score): 0.90429
Eng yaxshi parametrlar kombinatsiyasi (Best Parameters): {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 24, 'n_estimators': 100, 'random_state': 21}


In [10]:
results_df_forest = pd.DataFrame(grid_search_forest.cv_results_)

results_df_forest_sorted = results_df_forest.sort_values(by='rank_test_score')

columns_to_show_forest = [
    'param_n_estimators',
    'param_max_depth',
    'param_class_weight',
    'param_criterion',
    'mean_test_score',
    'std_test_score',
    'rank_test_score'
]

print("\nGridSearchCV natijalari (eng yaxshi reyting bo'yicha saralangan):")
print(results_df_forest_sorted[columns_to_show_forest].head(10))

print("\nNatijalarni tahlil qilish:")
best_row_forest = results_df_forest_sorted.iloc[0]
print(f"Eng yaxshi natija: n_estimators={best_row_forest['param_n_estimators']}, max_depth={best_row_forest['param_max_depth']}, class_weight={best_row_forest['param_class_weight']}, criterion='{best_row_forest['param_criterion']}'")
print(f"O'rtacha aniqlik: {best_row_forest['mean_test_score']:.5f}, Standart og'ish: {best_row_forest['std_test_score']:.5f}")

if len(results_df_forest_sorted) > 1:
    second_best_row_forest = results_df_forest_sorted.iloc[1]
    score_diff_forest = best_row_forest['mean_test_score'] - second_best_row_forest['mean_test_score']
    print(f"Eng yaxshi va ikkinchi eng yaxshi natija o'rtasidagi farq: {score_diff_forest:.5f}")
    if score_diff_forest < 0.005:
        print("Eng yaxshi natijalar orasida katta farq yo'q.")
    else:
        print("Eng yaxshi natija sezilarli darajada ustun.")


GridSearchCV natijalari (eng yaxshi reyting bo'yicha saralangan):
    param_n_estimators param_max_depth param_class_weight param_criterion  \
95                 100              24           balanced         entropy   
115                100              29           balanced         entropy   
698                 50              28               None            gini   
314                 50              30           balanced            gini   
711                100              31               None            gini   
99                 100              25           balanced         entropy   
326                 50              33           balanced            gini   
767                100              45               None            gini   
779                100              48               None            gini   
775                100              47               None            gini   

     mean_test_score  std_test_score  rank_test_score  
95          0.904293        0

## 5. Progress bar

Gridsearch can be a quite long process and you may find yourself wondering when it will end.
1. Create a manual gridsearch for the same parameters values of random forest iterating through the list of the possible values and calculating `cross_val_score` for each combination. Try to increase `n_jobs`. The value `cv` for `cross_val_score` is 5.
2. Track the progress using the library `tqdm.notebook`.
3. Create a dataframe from the results of the gridsearch with the columns corresponding to the names of the parameters and `mean_accuracy` and `std_accuracy`.
4. Sort it descendingly by the `mean_accuracy`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [11]:
n_estimators_values = [5, 10, 50, 100]
max_depth_values = np.arange(1, 50)
class_weight_values = ['balanced', None]
criterion_values = ['entropy', 'gini']

results = []

# Barcha kombinatsiyalar sonini hisoblash (tqdm uchun umumiy iteratsiya soni)
total_combinations = (len(n_estimators_values) *
                      len(max_depth_values) *
                      len(class_weight_values) *
                      len(criterion_values))

print(f"Jami kombinatsiyalar soni: {total_combinations}")

# Progress bar bilan qo'lda GridSearch
with tqdm(total=total_combinations, desc="Grid Search Progress") as pbar:
    for n_est in n_estimators_values:
        for depth in max_depth_values:
            for cw in class_weight_values:
                for crit in criterion_values:
                    model = RandomForestClassifier(n_estimators=n_est,
                                                   max_depth=depth,
                                                   class_weight=cw,
                                                   criterion=crit,
                                                   random_state=21)

                    scores = cross_val_score(model, X_train, y_train, scoring='accuracy', n_jobs=-1)

                    results.append({
                        'n_estimators': n_est,
                        'max_depth': depth,
                        'class_weight': cw,
                        'criterion': crit,
                        'mean_accuracy': np.mean(scores),
                        'std_accuracy': np.std(scores)
                    })
                    pbar.update(1)

Jami kombinatsiyalar soni: 784


HBox(children=(FloatProgress(value=0.0, description='Grid Search Progress', max=784.0, style=ProgressStyle(des…




In [12]:
results_df_manual = pd.DataFrame(results)
results_df_manual

Unnamed: 0,n_estimators,max_depth,class_weight,criterion,mean_accuracy,std_accuracy
0,5,1,balanced,entropy,0.270794,0.024718
1,5,1,balanced,gini,0.283390,0.011062
2,5,1,,entropy,0.353832,0.016467
3,5,1,,gini,0.364219,0.021651
4,5,2,balanced,entropy,0.353110,0.021165
...,...,...,...,...,...,...
779,100,48,,gini,0.902806,0.010460
780,100,49,balanced,entropy,0.898359,0.013872
781,100,49,balanced,gini,0.894652,0.015726
782,100,49,,entropy,0.898362,0.014986


In [13]:
results_df_manual_sorted = results_df_manual.sort_values(by='mean_accuracy', ascending=False)

print("\nNatijalar (eng yuqori o'rtacha aniqlik bo'yicha saralangan):")
print(results_df_manual_sorted.head(10))

print("\nNatijalarni tahlil qilish:")
best_row_manual = results_df_manual_sorted.iloc[0]
print(f"Eng yaxshi natija: n_estimators={best_row_manual['n_estimators']}, max_depth={best_row_manual['max_depth']}, class_weight={best_row_manual['class_weight']}, criterion='{best_row_manual['criterion']}'")
print(f"O'rtacha aniqlik: {best_row_manual['mean_accuracy']:.5f}, Standart og'ish: {best_row_manual['std_accuracy']:.5f}")

if len(results_df_manual_sorted) > 1:
    second_best_row_manual = results_df_manual_sorted.iloc[1]
    score_diff_manual = best_row_manual['mean_accuracy'] - second_best_row_manual['mean_accuracy']
    print(f"Eng yaxshi va ikkinchi eng yaxshi natija o'rtasidagi farq: {score_diff_manual:.5f}")
    if score_diff_manual < 0.005:
        print("Eng yaxshi natijalar orasida katta farq yo'q.")
    else:
        print("Eng yaxshi natija sezilarli darajada ustun.")


Natijalar (eng yuqori o'rtacha aniqlik bo'yicha saralangan):
     n_estimators  max_depth class_weight criterion  mean_accuracy  \
680           100         24     balanced   entropy       0.904293   
503            50         28         None      gini       0.904290   
700           100         29     balanced   entropy       0.904290   
509            50         30     balanced      gini       0.903549   
711           100         31         None      gini       0.903547   
684           100         25     balanced   entropy       0.902809   
521            50         33     balanced      gini       0.902809   
783           100         49         None      gini       0.902806   
507            50         29         None      gini       0.902806   
731           100         36         None      gini       0.902806   

     std_accuracy  
680      0.012361  
503      0.010961  
700      0.012156  
509      0.012056  
711      0.014380  
684      0.013639  
521      0.013628  
783    

## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.

In [14]:
# Eng yaxshi Random Forest modelini tanlash va yaratish
# Yuqorida GridSearch dan olingan eng yaxshi parametrlarni ishlatamiz
best_random_forest_model = RandomForestClassifier(
    n_estimators=100,         
    max_depth=21,        
    class_weight='balanced',
    criterion='gini',
    random_state=21
)

print("Tanlangan Random Forest modeli parametrlari:", best_random_forest_model.get_params())

best_random_forest_model.fit(X_train, y_train)

y_pred = best_random_forest_model.predict(X_test)

Tanlangan Random Forest modeli parametrlari: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 21, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 21, 'verbose': 0, 'warm_start': False}


In [15]:
final_accuracy = accuracy_score(y_test, y_pred)
print(f"\nModelning test ma'lumotlar to'plamidagi yakuniy aniqligi: {final_accuracy:.5f}")


Modelning test ma'lumotlar to'plamidagi yakuniy aniqligi: 0.92899
