# Day 09. Exercise 03
# Ensembles

## 0. Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from itertools import product
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
import joblib

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test` and then get `X_train`, `y_train`, `X_valid`, `y_valid` from the previous `X_train`, `y_train`. Use the additional parameter `stratify`.

In [2]:
df_not_scaled = pd.read_csv('../ex01/data/day-of-week-not-scaled.csv')
df_dayofweek = pd.read_csv('../ex00/data/dayofweek.csv')
df_not_scaled['dayofweek'] = df_dayofweek['dayofweek']
df_not_scaled

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,dayofweek
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,9,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
1682,6,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
1683,7,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
1684,8,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3


In [3]:
X = df_not_scaled.drop('dayofweek', axis=1)
y = df_not_scaled['dayofweek']

X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

print(f"\nBirinchi ajratish (Train va Test):")
print(f"X_train_full shakli: {X_train_full.shape}")
print(f"y_train_full shakli: {y_train_full.shape}")
print(f"X_test shakli: {X_test.shape}")
print(f"y_test shakli: {y_test.shape}")


Birinchi ajratish (Train va Test):
X_train_full shakli: (1348, 43)
y_train_full shakli: (1348,)
X_test shakli: (338, 43)
y_test shakli: (338,)


In [4]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=21, stratify=y_train_full)

print(f"\nIkkinchi ajratish (Train va Valid):")
print(f"X_train shakli: {X_train.shape}")
print(f"y_train shakli: {y_train.shape}")
print(f"X_valid shakli: {X_valid.shape}")
print(f"y_valid shakli: {y_valid.shape}")


Ikkinchi ajratish (Train va Valid):
X_train shakli: (1078, 43)
y_train shakli: (1078,)
X_valid shakli: (270, 43)
y_valid shakli: (270,)


## 2. Individual classifiers

1. Train SVM, decision tree and random forest again with the best parameters that you got from the 01 exercise with `random_state=21` for all of them.
2. Evaluate `accuracy`, `precision`, and `recall` for them on the validation set.
3. The result of each cell of the section should look like this:

```
accuracy is 0.87778
precision is 0.88162
recall is 0.87778
```

In [5]:
best_svm_params = {
    'C': 10,
    'class_weight': None,
    'gamma': 'auto',
    'kernel': 'rbf'
}

svm_classifier = SVC(random_state=21, probability=True, **best_svm_params)

svm_classifier.fit(X_train, y_train)

y_pred_svm_valid = svm_classifier.predict(X_valid)

accuracy_svm_valid = accuracy_score(y_valid, y_pred_svm_valid)
precision_svm_valid = precision_score(y_valid, y_pred_svm_valid, average='weighted', zero_division=0)
recall_svm_valid = recall_score(y_valid, y_pred_svm_valid, average='weighted', zero_division=0)

print(f"\naccuracy is {accuracy_svm_valid:.5f}")
print(f"precision is {precision_svm_valid:.5f}")
print(f"recall is {recall_svm_valid:.5f}")


accuracy is 0.87778
precision is 0.88162
recall is 0.87778


In [6]:
best_tree_params = {
    'class_weight': 'balanced',
    'criterion': 'gini',
    'max_depth': 21
}

tree_classifier = DecisionTreeClassifier(random_state=21, **best_tree_params)
tree_classifier.fit(X_train, y_train)

y_pred_tree_valid = tree_classifier.predict(X_valid)

accuracy_tree_valid = accuracy_score(y_valid, y_pred_tree_valid)
precision_tree_valid = precision_score(y_valid, y_pred_tree_valid, average='weighted', zero_division=0)
recall_tree_valid = recall_score(y_valid, y_pred_tree_valid, average='weighted', zero_division=0)

print(f"\naccuracy is {accuracy_tree_valid:.5f}")
print(f"precision is {precision_tree_valid:.5f}")
print(f"recall is {recall_tree_valid:.5f}")


accuracy is 0.86667
precision is 0.87170
recall is 0.86667


In [7]:
best_forest_params = {
    'n_estimators': 100,
    'max_depth': 24,
    'class_weight': 'balanced',
    'criterion': 'entropy'
}

forest_classifier = RandomForestClassifier(random_state=21, **best_forest_params)
forest_classifier.fit(X_train, y_train)

y_pred_forest_valid = forest_classifier.predict(X_valid)

accuracy_forest_valid = accuracy_score(y_valid, y_pred_forest_valid)
precision_forest_valid = precision_score(y_valid, y_pred_forest_valid, average='weighted', zero_division=0)
recall_forest_valid = recall_score(y_valid, y_pred_forest_valid, average='weighted', zero_division=0)

print(f"\naccuracy is {accuracy_forest_valid:.5f}")
print(f"precision is {precision_forest_valid:.5f}")
print(f"recall is {recall_forest_valid:.5f}")


accuracy is 0.89630
precision is 0.89698
recall is 0.89630


## 3. Voting classifiers

1. Using `VotingClassifier` and the three models that you have just trained, calculate the `accuracy`, `precision`, and `recall` on the validation set.
2. Play with the other parameteres.
3. Calculate the `accuracy`, `precision` and `recall` on the test set for the model with the best weights in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).

In [8]:
estimators = [
    ('svm', svm_classifier),
    ('tree', tree_classifier),
    ('forest', forest_classifier)
]

voting_clf_default = VotingClassifier(estimators=estimators, voting='soft', n_jobs=-1)

voting_clf_default.fit(X_train, y_train)

y_pred_voting_default_valid = voting_clf_default.predict(X_valid)

acc_v_d = accuracy_score(y_valid, y_pred_voting_default_valid)
prec_v_d = precision_score(y_valid, y_pred_voting_default_valid, average='weighted', zero_division=0)
rec_v_d = recall_score(y_valid, y_pred_voting_default_valid, average='weighted', zero_division=0)

print(f"\nDefault vaznlar (soft voting) bilan metrikalar (Validatsiya to'plamida):")
print(f"accuracy is {acc_v_d:.5f}")
print(f"precision is {prec_v_d:.5f}")
print(f"recall is {rec_v_d:.5f}")


Default vaznlar (soft voting) bilan metrikalar (Validatsiya to'plamida):
accuracy is 0.88519
precision is 0.88840
recall is 0.88519


In [9]:

best_accuracy_valid = 0
best_precision_valid = 0
best_weights = None
best_voting_type = None
best_voting_clf = None

# Vaznlar kombinatsiyasini sinab ko'rish (faqat soft voting uchun)
# Har bir modelga 1 dan 5 gacha vazn beramiz
weight_combinations = list(product(range(1, 6), repeat=len(estimators)))

print("\nTurli vaznlar va ovoza berish turlari bilan sinovlar o'tkazilmoqda...")

# Hard voting bilan sinab ko'rish
voting_clf_hard = VotingClassifier(estimators=estimators, voting='hard', n_jobs=-1)
voting_clf_hard.fit(X_train, y_train)
y_pred_hard_valid = voting_clf_hard.predict(X_valid)

acc_hard = accuracy_score(y_valid, y_pred_hard_valid)
prec_hard = precision_score(y_valid, y_pred_hard_valid, average='weighted', zero_division=0)
rec_hard = recall_score(y_valid, y_pred_hard_valid, average='weighted', zero_division=0)

print(f"\nHard Voting bilan metrikalar (Validatsiya to'plamida):")
print(f"accuracy is {acc_hard:.5f}")
print(f"precision is {prec_hard:.5f}")
print(f"recall is {rec_hard:.5f}")

# Eng yaxshi natijani yangilash (agar hard voting yaxshi bo'lsa)
if acc_hard > best_accuracy_valid:
    best_accuracy_valid = acc_hard
    best_precision_valid = prec_hard
    best_weights = None # Hard votingda vaznlar bo'lmaydi
    best_voting_type = 'hard'
    best_voting_clf = voting_clf_hard
elif acc_hard == best_accuracy_valid and prec_hard > best_precision_valid:
    best_precision_valid = prec_hard
    best_weights = None
    best_voting_type = 'hard'
    best_voting_clf = voting_clf_hard


# Soft voting va turli vaznlar bilan sinab ko'rish
for weights in weight_combinations:
    voting_clf_current = VotingClassifier(estimators=estimators, voting='soft', weights=weights, n_jobs=-1)
    voting_clf_current.fit(X_train, y_train)
    y_pred_current_valid = voting_clf_current.predict(X_valid)

    current_accuracy = accuracy_score(y_valid, y_pred_current_valid)
    current_precision = precision_score(y_valid, y_pred_current_valid, average='weighted', zero_division=0)

    # Eng yaxshi modelni tanlash: avval aniqlik bo'yicha, keyin precision bo'yicha
    if current_accuracy > best_accuracy_valid:
        best_accuracy_valid = current_accuracy
        best_precision_valid = current_precision
        best_weights = weights
        best_voting_type = 'soft'
        best_voting_clf = voting_clf_current
    elif current_accuracy == best_accuracy_valid and current_precision > best_precision_valid:
        best_precision_valid = current_precision
        best_weights = weights
        best_voting_type = 'soft'
        best_voting_clf = voting_clf_current

print(f"  Voting type: {best_voting_type}")
if best_weights:
    print(f"  Weights: {best_weights} (SVM, Tree, Forest uchun)")
print(f"  Validatsiya to'plamidagi eng yaxshi aniqlik: {best_accuracy_valid:.5f}")
print(f"  Validatsiya to'plamidagi mos keluvchi precision: {best_precision_valid:.5f}")


Turli vaznlar va ovoza berish turlari bilan sinovlar o'tkazilmoqda...

Hard Voting bilan metrikalar (Validatsiya to'plamida):
accuracy is 0.90000
precision is 0.89993
recall is 0.90000
  Voting type: soft
  Weights: (4, 1, 4) (SVM, Tree, Forest uchun)
  Validatsiya to'plamidagi eng yaxshi aniqlik: 0.91111
  Validatsiya to'plamidagi mos keluvchi precision: 0.91288


In [10]:
if best_voting_clf is not None:
    print("\n--- Eng yaxshi Voting Classifier modelini test to'plamida baholash ---")

    y_pred_best_test = best_voting_clf.predict(X_test)

    acc_v_test = accuracy_score(y_test, y_pred_best_test)
    prec_v_test = precision_score(y_test, y_pred_best_test, average='weighted', zero_division=0)
    rec_v_test = recall_score(y_test, y_pred_best_test, average='weighted', zero_division=0)

    print(f"\nTest to'plamidagi metrikalar (Eng yaxshi Voting Classifier):")
    print(f"accuracy is {acc_v_test:.5f}")
    print(f"precision is {prec_v_test:.5f}")
    print(f"recall is {rec_v_test:.5f}")


--- Eng yaxshi Voting Classifier modelini test to'plamida baholash ---

Test to'plamidagi metrikalar (Eng yaxshi Voting Classifier):
accuracy is 0.90533
precision is 0.90881
recall is 0.90533


## 4. Bagging classifiers

1. Using `BaggingClassifier` and `SVM` with the best parameters create an ensemble, try different values of the `n_estimators`, use `random_state=21`.
2. Play with the other parameters.
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision)

In [11]:
best_svm_params = {
    'C': 10,
    'class_weight': None,
    'gamma': 'auto',
    'kernel': 'rbf'
}

base_svm_classifier = SVC(random_state=21, **best_svm_params)


bagging_clf_default = BaggingClassifier(
    base_estimator=base_svm_classifier,
    n_estimators=10,
    random_state=21,
    n_jobs=-1
)

bagging_clf_default.fit(X_train, y_train)

y_pred_bagging_default_valid = bagging_clf_default.predict(X_valid)

acc_b_d = accuracy_score(y_valid, y_pred_bagging_default_valid)
prec_b_d = precision_score(y_valid, y_pred_bagging_default_valid, average='weighted', zero_division=0)
rec_b_d = recall_score(y_valid, y_pred_bagging_default_valid, average='weighted', zero_division=0)

print(f"\nDefault parametrlar bilan Bagging Classifier metrikalari (Validatsiya to'plamida):")
print(f"accuracy is {acc_b_d:.5f}")
print(f"precision is {prec_b_d:.5f}")
print(f"recall is {rec_b_d:.5f}")


Default parametrlar bilan Bagging Classifier metrikalari (Validatsiya to'plamida):
accuracy is 0.87778
precision is 0.88589
recall is 0.87778


In [12]:
best_accuracy_bagging_valid = 0
best_precision_bagging_valid = 0
best_bagging_params = {}
best_bagging_clf = None

n_estimators_options = [50, 100]
max_samples_options = [0.5, 0.7, 1.0]
max_features_options = [0.5, 0.7, 1.0] 

print("\nTurli BaggingClassifier parametrlari bilan sinovlar o'tkazilmoqda...")

for n_est in n_estimators_options:
    for max_samp in max_samples_options:
        for max_feat in max_features_options:
            current_bagging_clf = BaggingClassifier(
                base_estimator=base_svm_classifier,
                n_estimators=n_est,
                max_samples=max_samp,
                max_features=max_feat,
                random_state=21,
                n_jobs=-1
            )
            current_bagging_clf.fit(X_train, y_train)
            y_pred_current_valid = current_bagging_clf.predict(X_valid)

            current_accuracy = accuracy_score(y_valid, y_pred_current_valid)
            current_precision = precision_score(y_valid, y_pred_current_valid, average='weighted', zero_division=0)
            
            # Eng yaxshi modelni tanlash: avval aniqlik bo'yicha, keyin precision bo'yicha
            if current_accuracy > best_accuracy_bagging_valid:
                best_accuracy_bagging_valid = current_accuracy
                best_precision_bagging_valid = current_precision
                best_bagging_params = {
                    'n_estimators': n_est,
                    'max_samples': max_samp,
                    'max_features': max_feat
                }
                best_bagging_clf = current_bagging_clf
            elif current_accuracy == best_accuracy_bagging_valid and current_precision > best_precision_bagging_valid:
                best_precision_bagging_valid = current_precision
                best_bagging_params = {
                    'n_estimators': n_est,
                    'max_samples': max_samp,
                    'max_features': max_feat
                }
                best_bagging_clf = current_bagging_clf

print(f"  Parametrlar: {best_bagging_params}")
print(f"  Validatsiya to'plamidagi eng yaxshi aniqlik: {best_accuracy_bagging_valid:.5f}")
print(f"  Validatsiya to'plamidagi mos keluvchi precision: {best_precision_bagging_valid:.5f}")


Turli BaggingClassifier parametrlari bilan sinovlar o'tkazilmoqda...
  Parametrlar: {'n_estimators': 50, 'max_samples': 1.0, 'max_features': 1.0}
  Validatsiya to'plamidagi eng yaxshi aniqlik: 0.88519
  Validatsiya to'plamidagi mos keluvchi precision: 0.89205


In [13]:
if best_bagging_clf is not None:
    print("\n--- Eng yaxshi Bagging Classifier modelini test to'plamida baholash ---")

    # Test to'plamida bashoratlar qilish
    y_pred_best_bagging_test = best_bagging_clf.predict(X_test)

    # Metrikalarni hisoblash
    acc_b_test = accuracy_score(y_test, y_pred_best_bagging_test)
    prec_b_test = precision_score(y_test, y_pred_best_bagging_test, average='weighted', zero_division=0)
    rec_b_test = recall_score(y_test, y_pred_best_bagging_test, average='weighted', zero_division=0)

    print(f"\nTest to'plamidagi metrikalar (Eng yaxshi Bagging Classifier):")
    print(f"accuracy is {acc_b_test:.5f}")
    print(f"precision is {prec_b_test:.5f}")
    print(f"recall is {rec_b_test:.5f}")


--- Eng yaxshi Bagging Classifier modelini test to'plamida baholash ---

Test to'plamidagi metrikalar (Eng yaxshi Bagging Classifier):
accuracy is 0.87278
precision is 0.87673
recall is 0.87278


## 5. Stacking classifiers

1. To achieve reproducibility in this case you will have to create an object of cross-validation generator: `StratifiedKFold(n_splits=n, shuffle=True, random_state=21)`, where `n` you will try to optimize (the details are below).
2. Using `StackingClassifier` and the three models that you have recently trained, calculate the `accuracy`, `precision` and `recall` on the validation set, try different values of `n_splits` `[2, 3, 4, 5, 6, 7]` in the cross-validation generator and parameter `passthrough` in the classifier itself,
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision). Use `final_estimator=LogisticRegression(solver='liblinear')`.

In [14]:
estimators_stacking = [
    ('svm', svm_classifier),
    ('tree', tree_classifier),
    ('forest', forest_classifier)
]

# Final estimator (meta-klassifikator)
final_estimator_lr = LogisticRegression(solver='liblinear', random_state=21)

best_accuracy_stacking_valid = 0
best_precision_stacking_valid = 0
best_n_splits = None
best_passthrough = None
best_stacking_clf = None

# Optimallashtirish uchun n_splits qiymatlari
n_splits_options = [2, 3, 4, 5, 6, 7]
passthrough_options = [True, False]

In [15]:
print("Stacking Classifier parametrlari bilan sinovlar o'tkazilmoqda...")

for n_split in n_splits_options:
    cv_strategy = StratifiedKFold(n_splits=n_split, shuffle=True, random_state=21)

    for passthrough_val in passthrough_options:
        current_stacking_clf = StackingClassifier(
            estimators=estimators_stacking,
            final_estimator=final_estimator_lr,
            cv=cv_strategy,
            passthrough=passthrough_val,
            n_jobs=-1
        )

        current_stacking_clf.fit(X_train, y_train)

        y_pred_current_valid = current_stacking_clf.predict(X_valid)

        current_accuracy = accuracy_score(y_valid, y_pred_current_valid)
        current_precision = precision_score(y_valid, y_pred_current_valid, average='weighted', zero_division=0)

        print(f"\n  n_splits={n_split}, passthrough={passthrough_val}:")
        print(f"  accuracy is {current_accuracy:.5f}")
        print(f"  precision is {current_precision:.5f}")
        print(f"  recall is {recall_score(y_valid, y_pred_current_valid, average='weighted', zero_division=0):.5f}")

        # Eng yaxshi modelni tanlash: avval aniqlik bo'yicha, keyin precision bo'yicha
        if current_accuracy > best_accuracy_stacking_valid:
            best_accuracy_stacking_valid = current_accuracy
            best_precision_stacking_valid = current_precision
            best_n_splits = n_split
            best_passthrough = passthrough_val
            best_stacking_clf = current_stacking_clf
        elif current_accuracy == best_accuracy_stacking_valid and current_precision > best_precision_stacking_valid:
            best_precision_stacking_valid = current_precision
            best_n_splits = n_split
            best_passthrough = passthrough_val
            best_stacking_clf = current_stacking_clf

print("\nStacking Classifier parametrlari bilan sinovlar yakunlandi.")
print(f"Eng yaxshi natija quyidagi parametrlar bilan topildi:")
print(f"  n_splits: {best_n_splits}")
print(f"  passthrough: {best_passthrough}")
print(f"  Validatsiya to'plamidagi eng yaxshi aniqlik: {best_accuracy_stacking_valid:.5f}")
print(f"  Validatsiya to'plamidagi mos keluvchi precision: {best_precision_stacking_valid:.5f}")

Stacking Classifier parametrlari bilan sinovlar o'tkazilmoqda...

  n_splits=2, passthrough=True:
  accuracy is 0.90370
  precision is 0.90508
  recall is 0.90370

  n_splits=2, passthrough=False:
  accuracy is 0.89630
  precision is 0.89678
  recall is 0.89630

  n_splits=3, passthrough=True:
  accuracy is 0.90370
  precision is 0.90632
  recall is 0.90370

  n_splits=3, passthrough=False:
  accuracy is 0.89630
  precision is 0.89759
  recall is 0.89630

  n_splits=4, passthrough=True:
  accuracy is 0.91111
  precision is 0.91327
  recall is 0.91111

  n_splits=4, passthrough=False:
  accuracy is 0.90741
  precision is 0.90945
  recall is 0.90741

  n_splits=5, passthrough=True:
  accuracy is 0.90000
  precision is 0.90217
  recall is 0.90000

  n_splits=5, passthrough=False:
  accuracy is 0.90000
  precision is 0.90056
  recall is 0.90000

  n_splits=6, passthrough=True:
  accuracy is 0.90370
  precision is 0.90450
  recall is 0.90370

  n_splits=6, passthrough=False:
  accuracy is 0

In [23]:
if best_stacking_clf is not None:

    y_pred_best_stacking_test = best_stacking_clf.predict(X_test)

    # Metrikalarni hisoblash
    acc_s_test = accuracy_score(y_test, y_pred_best_stacking_test)
    prec_s_test = precision_score(y_test, y_pred_best_stacking_test, average='weighted', zero_division=0)
    rec_s_test = recall_score(y_test, y_pred_best_stacking_test, average='weighted', zero_division=0)

    print(f"accuracy is {acc_s_test:.5f}")
    print(f"precision is {prec_s_test:.5f}")
    print(f"recall is {rec_s_test:.5f}")

accuracy is 0.90533
precision is 0.90844
recall is 0.90533


## 6. Predictions

1. Choose the best model in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).
2. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which labname and for which users.
3. Save the model.

In [17]:
best_n_splits_for_stacking = 4
best_passthrough_for_stacking = True

best_svm_params = {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf'}
best_tree_params = {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 21}
best_forest_params = {'n_estimators': 100, 'max_depth': 24, 'class_weight': 'balanced', 'criterion': 'entropy'}

svm_classifier = SVC(random_state=21, probability=True, **best_svm_params)
tree_classifier = DecisionTreeClassifier(random_state=21, **best_tree_params)
forest_classifier = RandomForestClassifier(random_state=21, **best_forest_params)

In [18]:
estimators_for_stacking = [
    ('svm', svm_classifier),
    ('tree', tree_classifier),
    ('forest', forest_classifier)
]

final_estimator_lr = LogisticRegression(solver='liblinear', random_state=21)

cv_strategy_best = StratifiedKFold(n_splits=best_n_splits_for_stacking, shuffle=True, random_state=21)

best_model = StackingClassifier(
    estimators=estimators_for_stacking,
    final_estimator=final_estimator_lr,
    cv=cv_strategy_best,
    passthrough=best_passthrough_for_stacking,
    n_jobs=-1
)

best_model.fit(X_train_full, y_train_full)

y_pred = best_model.predict(X_test)

# Bashoratlar va haqiqiy qiymatlarni solishtirish
errors_mask = (y_test != y_pred)
errors = y_test[errors_mask] # Noto'g'ri bashorat qilingan y_test qiymatlari
X_test_errors = X_test[errors_mask] # Noto'g'ri bashorat qilingan X_test qatorlari

In [19]:
print("\n1. Haftaning kunlari bo'yicha xatolar:")
total_dayofweek_counts = y_test.value_counts().sort_index()
error_dayofweek_counts = errors.value_counts().sort_index()

for day in total_dayofweek_counts.index:
    total_count = total_dayofweek_counts.get(day, 0)
    error_count = error_dayofweek_counts.get(day, 0)
    if total_count > 0:
        error_percentage = (error_count / total_count) * 100
        print(f"  Day {day}: {error_count} ta xato / {total_count} ta umumiy ({error_percentage:.2f}%)")
    else:
        print(f"  Day {day}: {error_count} ta xato (umumiy namuna yo'q)")

if not error_dayofweek_counts.empty:
    most_errors_day = error_dayofweek_counts.idxmax()
    most_errors_day_percent = (error_dayofweek_counts[most_errors_day] / total_dayofweek_counts[most_errors_day]) * 100
    print(f"\n  Model eng ko'p xatolarni Haftaning kuni **{most_errors_day}** uchun qildi: **{most_errors_day_percent:.2f}%** xato.")


1. Haftaning kunlari bo'yicha xatolar:
  Day 0: 4 ta xato / 27 ta umumiy (14.81%)
  Day 1: 6 ta xato / 55 ta umumiy (10.91%)
  Day 2: 2 ta xato / 30 ta umumiy (6.67%)
  Day 3: 4 ta xato / 80 ta umumiy (5.00%)
  Day 4: 1 ta xato / 21 ta umumiy (4.76%)
  Day 5: 3 ta xato / 54 ta umumiy (5.56%)
  Day 6: 2 ta xato / 71 ta umumiy (2.82%)

  Model eng ko'p xatolarni Haftaning kuni **1** uchun qildi: **10.91%** xato.


In [20]:
labname_cols = [col for col in X.columns if 'labname_' in col]

if not X_test_errors.empty and len(labname_cols) > 0:
    print("\n2. Laboratoriya nomlari bo'yicha xatolar:")
    labname_error_counts = {}
    labname_total_counts = {}

    for lab_col in labname_cols:
        errors_in_lab = X_test_errors[lab_col].sum()
        total_in_lab = X_test[lab_col].sum()

        if errors_in_lab > 0:
            labname_error_counts[lab_col] = errors_in_lab
            labname_total_counts[lab_col] = total_in_lab

    sorted_labname_errors = sorted(labname_error_counts.items(), key=lambda item: item[1], reverse=True)

    if sorted_labname_errors:
        for lab_col, error_count in sorted_labname_errors:
            total_count = labname_total_counts.get(lab_col, 0)
            if total_count > 0:
                error_percentage = (error_count / total_count) * 100
                print(f"  **{lab_col.replace('labname_', '')}**: {int(error_count)} ta xato / {int(total_count)} ta umumiy ({error_percentage:.2f}%)")
            else:
                 print(f"  **{lab_col.replace('labname_', '')}**: {int(error_count)} ta xato (umumiy namuna yo'q)")

        most_errors_labname = sorted_labname_errors[0][0].replace('labname_', '')
        most_errors_labname_percent = (sorted_labname_errors[0][1] / labname_total_counts[sorted_labname_errors[0][0]]) * 100
        print(f"\n  Model eng ko'p xatolarni **'{most_errors_labname}'** laboratoriya nomi uchun qildi: **{most_errors_labname_percent:.2f}%** xato.")


2. Laboratoriya nomlari bo'yicha xatolar:
  **project1**: 7 ta xato / 186 ta umumiy (3.76%)
  **laba04**: 6 ta xato / 35 ta umumiy (17.14%)
  **laba04s**: 3 ta xato / 25 ta umumiy (12.00%)
  **laba06**: 2 ta xato / 9 ta umumiy (22.22%)
  **laba06s**: 2 ta xato / 15 ta umumiy (13.33%)
  **code_rvw**: 1 ta xato / 13 ta umumiy (7.69%)
  **lab05s**: 1 ta xato / 6 ta umumiy (16.67%)

  Model eng ko'p xatolarni **'project1'** laboratoriya nomi uchun qildi: **3.76%** xato.


In [21]:
# 3. Foydalanuvchilar (`users`) bo'yicha xatolar
user_cols = [col for col in X.columns if 'uid_user_' in col]

if not X_test_errors.empty and len(user_cols) > 0:
    print("\n3. Foydalanuvchilar bo'yicha xatolar:")
    user_error_counts = {}
    user_total_counts = {}

    for user_col in user_cols:
        errors_for_user = X_test_errors[user_col].sum()
        total_for_user = X_test[user_col].sum()

        if errors_for_user > 0:
            user_error_counts[user_col] = errors_for_user
            user_total_counts[user_col] = total_for_user

    sorted_user_errors = sorted(user_error_counts.items(), key=lambda item: item[1], reverse=True)

    if sorted_user_errors:
        for user_col, error_count in sorted_user_errors:
            total_count = user_total_counts.get(user_col, 0)
            if total_count > 0:
                error_percentage = (error_count / total_count) * 100
                print(f"  **{user_col.replace('uid_user_', 'User ')}**: {int(error_count)} ta xato / {int(total_count)} ta umumiy ({error_percentage:.2f}%)")
            else:
                print(f"  **{user_col.replace('uid_user_', 'User ')}**: {int(error_count)} ta xato (umumiy namuna yo'q)")

        most_errors_user = sorted_user_errors[0][0].replace('uid_user_', 'User ')
        most_errors_user_percent = (sorted_user_errors[0][1] / user_total_counts[sorted_user_errors[0][0]]) * 100
        print(f"\n  Model eng ko'p xatolarni **'{most_errors_user}'** foydalanuvchisi uchun qildi: **{most_errors_user_percent:.2f}%** xato.")


3. Foydalanuvchilar bo'yicha xatolar:
  **User 19**: 4 ta xato / 19 ta umumiy (21.05%)
  **User 14**: 3 ta xato / 31 ta umumiy (9.68%)
  **User 17**: 2 ta xato / 7 ta umumiy (28.57%)
  **User 3**: 2 ta xato / 14 ta umumiy (14.29%)
  **User 4**: 2 ta xato / 27 ta umumiy (7.41%)
  **User 13**: 1 ta xato / 17 ta umumiy (5.88%)
  **User 18**: 1 ta xato / 6 ta umumiy (16.67%)
  **User 2**: 1 ta xato / 28 ta umumiy (3.57%)
  **User 22**: 1 ta xato / 1 ta umumiy (100.00%)
  **User 24**: 1 ta xato / 11 ta umumiy (9.09%)
  **User 25**: 1 ta xato / 22 ta umumiy (4.55%)
  **User 29**: 1 ta xato / 11 ta umumiy (9.09%)
  **User 30**: 1 ta xato / 8 ta umumiy (12.50%)
  **User 6**: 1 ta xato / 4 ta umumiy (25.00%)

  Model eng ko'p xatolarni **'User 19'** foydalanuvchisi uchun qildi: **21.05%** xato.


In [22]:
model_filename = "best_ensembles_stacking_model.pkl"
joblib.dump(best_model, model_filename)

['best_ensembles_stacking_model.pkl']