# Лабораторная работа №4 - Random Forest


In [19]:
import os
import glob
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

import matplotlib.pyplot as plt

# Single seed reused as `random_state` by the splits, models and searches in this notebook.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)  # also seeds NumPy's legacy global RNG


## Загрузка датасетов и подготовка данных



In [20]:
def _find_csv_files(path: str):
    return sorted(glob.glob(os.path.join(path, "**", "*.csv"), recursive=True))

def load_kaggle_dataset(handle: str, local_fallback_dir: str = "data"):
    """Download a Kaggle dataset with ``kagglehub`` and list its CSV files.

    If the download fails for any reason (``kagglehub`` missing, no network,
    bad handle, ...), a warning with the cause is emitted and the function
    falls back to local files:

    1. ``<local_fallback_dir>/<dataset-name>/`` is used when it exists and
       contains CSV files, where ``<dataset-name>`` is the part of ``handle``
       after the last ``/``. This keeps several datasets from colliding in one
       shared fallback directory.
    2. Otherwise every CSV under ``local_fallback_dir`` is returned, as before.

    Parameters
    ----------
    handle : str
        Kaggle dataset handle, e.g. ``"owner/dataset-name"``.
    local_fallback_dir : str
        Directory searched for CSV files when the download fails; it is
        created if it does not exist.

    Returns
    -------
    (str, list[str])
        The directory that was searched and the sorted CSV paths found in it.
        The list may be empty; callers must check before indexing.
    """
    try:
        import kagglehub
        ds_path = kagglehub.dataset_download(handle)
        csvs = _find_csv_files(ds_path)
        return ds_path, csvs
    except Exception as e:
        # Keep the best-effort fallback, but do not hide why it was needed.
        import warnings
        warnings.warn(
            f"Could not download {handle!r} via kagglehub ({e!r}); "
            f"falling back to local directory {local_fallback_dir!r}."
        )
        os.makedirs(local_fallback_dir, exist_ok=True)

        # Prefer a dataset-specific subfolder so two datasets sharing the same
        # fallback directory do not both resolve to the same first CSV.
        dataset_dir = os.path.join(local_fallback_dir, handle.rsplit("/", 1)[-1])
        if os.path.isdir(dataset_dir):
            dataset_csvs = _find_csv_files(dataset_dir)
            if dataset_csvs:
                return dataset_dir, dataset_csvs

        csvs = _find_csv_files(local_fallback_dir)
        return local_fallback_dir, csvs

cls_handle = "uciml/faulty-steel-plates"
cls_path, cls_csvs = load_kaggle_dataset(cls_handle)

# load_kaggle_dataset may return an empty list (e.g. offline with an empty
# fallback directory); fail with an actionable message instead of a bare
# IndexError on cls_csvs[0].
if not cls_csvs:
    raise FileNotFoundError(
        f"No CSV files found for dataset {cls_handle!r} in {cls_path!r}. "
        "Install kagglehub with network access or place the dataset CSV in that directory."
    )

# The first CSV in sorted order is taken as the classification dataset.
df_cls = pd.read_csv(cls_csvs[0])
print("df_cls shape:", df_cls.shape)
df_cls.head()


df_cls shape: (1941, 34)


Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,42,50,270900,270944,267,17,44,24220,76,108,...,0.8182,-0.2913,0.5822,1,0,0,0,0,0,0
1,645,651,2538079,2538108,108,10,30,11397,84,123,...,0.7931,-0.1756,0.2984,1,0,0,0,0,0,0
2,829,835,1553913,1553931,71,8,19,7972,99,125,...,0.6667,-0.1228,0.215,1,0,0,0,0,0,0
3,853,860,369370,369415,176,13,45,18996,99,126,...,0.8444,-0.1568,0.5212,1,0,0,0,0,0,0
4,1289,1306,498078,498335,2409,60,260,246930,37,126,...,0.9338,-0.1992,1.0,1,0,0,0,0,0,0


### Формирование X и y для классификации

В датасете Steel Plates Faults целевая переменная часто представлена либо:
- одним столбцом с названием класса, либо
- набором столбцов (7 колонок по классам).

Код ниже делает вектор меток классов.

In [21]:
# One-hot fault indicator columns. Order matters: idxmax returns the first
# column holding the row maximum, so ties resolve to the earliest name here.
CLASS_COLS = ["Other_Faults","Bumps","K_Scatch","Z_Scratch","Pastry","Stains","Dirtiness"]

# Columns whose lower-cased name looks like a ready-made label column.
possible_label_cols = [
    col for col in df_cls.columns
    if col.lower() in ["class", "label", "target", "fault"]
]

if possible_label_cols:
    # Case 1: a single label column already exists; use it as a string target.
    label_col = possible_label_cols[0]
    y_cls = df_cls[label_col].astype(str)
    X_cls = df_cls.drop(columns=[label_col])
elif set(CLASS_COLS).issubset(df_cls.columns):
    # Case 2: one-hot class columns; collapse them into a single label per row.
    y_cls = df_cls[CLASS_COLS].idxmax(axis=1)
    X_cls = df_cls.drop(columns=CLASS_COLS)
else:
    raise ValueError(
        "Не удалось определить целевую переменную. "
        "Ожидаем либо колонку 'class/label/target', либо one-hot колонки классов: " + str(CLASS_COLS)
    )

# Force every feature to numeric; values that cannot be parsed become NaN.
X_cls = X_cls.apply(pd.to_numeric, errors="coerce")

print("X_cls:", X_cls.shape, "y_cls:", y_cls.shape)
print("Пропуски в X_cls:", int(X_cls.isna().sum().sum()))
print("Классов:", y_cls.nunique())
y_cls.value_counts()


X_cls: (1941, 27) y_cls: (1941,)
Пропуски в X_cls: 0
Классов: 7


Other_Faults    673
Bumps           402
K_Scatch        391
Z_Scratch       190
Pastry          158
Stains           72
Dirtiness        55
Name: count, dtype: int64

### Регрессия: Concrete Compressive Strength

In [22]:
reg_handle = "elikplim/concrete-compressive-strength-data-set"
reg_path, reg_csvs = load_kaggle_dataset(reg_handle)

# load_kaggle_dataset may return an empty list (e.g. offline with an empty
# fallback directory); fail with an actionable message instead of a bare
# IndexError on reg_csvs[0].
if not reg_csvs:
    raise FileNotFoundError(
        f"No CSV files found for dataset {reg_handle!r} in {reg_path!r}. "
        "Install kagglehub with network access or place the dataset CSV in that directory."
    )

# The first CSV in sorted order is taken as the regression dataset.
df_reg = pd.read_csv(reg_csvs[0])
print("df_reg shape:", df_reg.shape)
df_reg.head()


df_reg shape: (1030, 9)


Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,concrete_compressive_strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


### Формирование X и y для регрессии

Целевая переменная — прочность бетона. Берём первую колонку, в имени которой есть «strength»; если такой колонки нет, используем последнюю колонку датасета.

In [23]:
# Target = first column whose name mentions "strength"; otherwise the last column.
cands = [name for name in df_reg.columns if "strength" in name.lower()]
target_reg = cands[0] if cands else df_reg.columns[-1]

# Coerce target and features to numeric; unparseable values become NaN.
y_reg = pd.to_numeric(df_reg[target_reg], errors="coerce")
X_reg = (
    df_reg
    .drop(columns=[target_reg])
    .apply(pd.to_numeric, errors="coerce")
)

print("target_reg:", target_reg)
print("X_reg:", X_reg.shape, "y_reg:", y_reg.shape)
print("Пропуски в X_reg:", int(X_reg.isna().sum().sum()))
print("Пропуски в y_reg:", int(y_reg.isna().sum()))
y_reg.describe()


target_reg: concrete_compressive_strength
X_reg: (1030, 8) y_reg: (1030,)
Пропуски в X_reg: 0
Пропуски в y_reg: 0


count    1030.000000
mean       35.817961
std        16.705742
min         2.330000
25%        23.710000
50%        34.445000
75%        46.135000
max        82.600000
Name: concrete_compressive_strength, dtype: float64

## Train/Test split

In [24]:
# Keep only rows whose features are all finite and whose label is present,
# then switch from pandas objects to plain NumPy arrays.
mask_cls = np.isfinite(X_cls.values).all(axis=1) & y_cls.notna().values
Xc = X_cls.loc[mask_cls].values
yc = y_cls.loc[mask_cls].values

# Same filtering for regression; here the target itself must also be finite.
mask_reg = np.isfinite(X_reg.values).all(axis=1) & np.isfinite(y_reg.values)
Xr = X_reg.loc[mask_reg].values
yr = y_reg.loc[mask_reg].values

# 80/20 split; stratified by class label for the classification task.
Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    Xc, yc, test_size=0.2, random_state=RANDOM_STATE, stratify=yc
)

# 80/20 split for regression (no stratification).
Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    Xr, yr, test_size=0.2, random_state=RANDOM_STATE
)

print("  Classification train/test:", Xc_train.shape, Xc_test.shape)
print("  Regression train/test:    ", Xr_train.shape, Xr_test.shape)


  Classification train/test: (1552, 27) (389, 27)
  Regression train/test:     (824, 8) (206, 8)


## Бейзлайн: RandomForest (sklearn) + оценка качества

### Классификация

In [25]:
# Baseline classifier: 300 trees, all other hyperparameters left at sklearn defaults.
rf_cls_base = RandomForestClassifier(
    n_estimators=300,
    random_state=RANDOM_STATE,
    n_jobs=-1
)

rf_cls_base.fit(Xc_train, yc_train)
yc_pred_base = rf_cls_base.predict(Xc_test)

# Accuracy plus macro-averaged F1 (every class weighted equally) on the held-out test set.
acc_base = accuracy_score(yc_test, yc_pred_base)
f1m_base = f1_score(yc_test, yc_pred_base, average="macro")

print("Baseline RandomForestClassifier")
print("  Accuracy :", round(acc_base, 4))
print("  Macro F1 :", round(f1m_base, 4))
print("Classification report:")
# zero_division=0 reports 0 instead of warning when a class has no predictions.
print(classification_report(yc_test, yc_pred_base, zero_division=0))


Baseline RandomForestClassifier
  Accuracy : 0.8021
  Macro F1 : 0.8117
Classification report:
              precision    recall  f1-score   support

       Bumps       0.74      0.70      0.72        81
   Dirtiness       0.90      0.82      0.86        11
    K_Scatch       0.97      0.91      0.94        78
Other_Faults       0.72      0.84      0.78       135
      Pastry       0.70      0.50      0.58        32
      Stains       0.92      0.86      0.89        14
   Z_Scratch       0.97      0.87      0.92        38

    accuracy                           0.80       389
   macro avg       0.85      0.79      0.81       389
weighted avg       0.81      0.80      0.80       389



### Регрессия

In [26]:
# Baseline regressor: 500 trees, all other hyperparameters left at sklearn defaults.
rf_reg_base = RandomForestRegressor(
    n_estimators=500,
    random_state=RANDOM_STATE,
    n_jobs=-1
)

rf_reg_base.fit(Xr_train, yr_train)
yr_pred_base = rf_reg_base.predict(Xr_test)

# Test-set metrics; RMSE is computed as the square root of the MSE.
mae_base = mean_absolute_error(yr_test, yr_pred_base)
rmse_base = np.sqrt(mean_squared_error(yr_test, yr_pred_base))
r2_base = r2_score(yr_test, yr_pred_base)

print("Baseline RandomForestRegressor")
print("  MAE  :", round(mae_base, 4))
print("  RMSE :", round(rmse_base, 4))
print("  R2   :", round(r2_base, 4))


Baseline RandomForestRegressor
  MAE  : 3.7693
  RMSE : 5.4908
  R2   : 0.883


## Улучшение бейзлайна: гипотезы и проверка

Для RandomForest масштабирование обычно не нужно, поэтому делаем упор на:
- подбор глубины и ограничений (`max_depth`, `min_samples_leaf`),
- количество деревьев (`n_estimators`),
- случайный выбор признаков (`max_features`),
- учёт дисбаланса классов (`class_weight`).

Используем RandomizedSearchCV.

### Подбор гиперпараметров для классификации

In [27]:
from scipy.stats import randint

# Stratified, shuffled 5-fold CV on the training part only.
cv_cls = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Search space: randint(a, b) samples integers from [a, b), lists are sampled uniformly.
param_dist_cls = {
    "n_estimators": randint(200, 801),
    "max_depth": [None, 5, 8, 12, 16, 20],
    "min_samples_split": randint(2, 21),
    "min_samples_leaf": randint(1, 11),
    "max_features": ["sqrt", "log2", None],
    "class_weight": [None, "balanced", "balanced_subsample"]
}

# 30 sampled configurations x 5 folds, ranked by macro F1.
# NOTE(review): both the estimator and the search request n_jobs=-1; nested
# parallelism may oversubscribe CPU cores - consider n_jobs=1 on the inner
# forest. TODO confirm on the target machine.
search_cls = RandomizedSearchCV(
    RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1),
    param_distributions=param_dist_cls,
    n_iter=30,
    cv=cv_cls,
    scoring="f1_macro",
    n_jobs=-1,
    random_state=RANDOM_STATE
)

search_cls.fit(Xc_train, yc_train)

print("Best CV params (classification):", search_cls.best_params_)
print("Best CV macro F1:", round(search_cls.best_score_, 4))

# predict() delegates to the best configuration (RandomizedSearchCV's default refit behaviour).
yc_pred_impr = search_cls.predict(Xc_test)
acc_impr = accuracy_score(yc_test, yc_pred_impr)
f1m_impr = f1_score(yc_test, yc_pred_impr, average="macro")

print("Improved RandomForest (classification) on test")
print("  Accuracy :", round(acc_impr, 4))
print("  Macro F1 :", round(f1m_impr, 4))


Best CV params (classification): {'class_weight': 'balanced', 'max_depth': 16, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 11, 'n_estimators': 675}
Best CV macro F1: 0.7884
Improved RandomForest (classification) on test
  Accuracy : 0.7892
  Macro F1 : 0.8213


### Подбор гиперпараметров для регрессии

In [28]:
# Shuffled 5-fold CV on the training part only.
cv_reg = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Search space; `randint` is the scipy.stats distribution imported in the
# classification-search cell, so that cell must have been run first.
param_dist_reg = {
    "n_estimators": randint(300, 1201),
    "max_depth": [None, 5, 8, 12, 16, 20],
    "min_samples_split": randint(2, 21),
    "min_samples_leaf": randint(1, 11),
    "max_features": ["sqrt", "log2", 1.0],
}

# 30 sampled configurations x 5 folds, ranked by negated MAE (higher is better).
# NOTE(review): both the estimator and the search request n_jobs=-1; nested
# parallelism may oversubscribe CPU cores - TODO confirm on the target machine.
search_reg = RandomizedSearchCV(
    RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=-1),
    param_distributions=param_dist_reg,
    n_iter=30,
    cv=cv_reg,
    scoring="neg_mean_absolute_error",
    n_jobs=-1,
    random_state=RANDOM_STATE
)

search_reg.fit(Xr_train, yr_train)

print("Best CV params (regression):", search_reg.best_params_)
# best_score_ is the negated MAE, so flip the sign for display.
print("Best CV MAE (approx):", round(-search_reg.best_score_, 4))

# predict() delegates to the best configuration (RandomizedSearchCV's default refit behaviour).
yr_pred_impr = search_reg.predict(Xr_test)
mae_impr = mean_absolute_error(yr_test, yr_pred_impr)
rmse_impr = np.sqrt(mean_squared_error(yr_test, yr_pred_impr))
r2_impr = r2_score(yr_test, yr_pred_impr)

print("Improved RandomForest (regression) on test")
print("  MAE  :", round(mae_impr, 4))
print("  RMSE :", round(rmse_impr, 4))
print("  R2   :", round(r2_impr, 4))


Best CV params (regression): {'max_depth': 16, 'max_features': 1.0, 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 820}
Best CV MAE (approx): 3.9966
Improved RandomForest (regression) on test
  MAE  : 4.1539
  RMSE : 5.7607
  R2   : 0.8712


### Сравнение бейзлайна и улучшенной модели

In [29]:
# Side-by-side test-set metrics of the sklearn baseline and the tuned model.
print("=== Classification (test) ===")
print("Baseline: acc =", round(acc_base, 4), "macro_f1 =", round(f1m_base, 4))
print("Improved: acc =", round(acc_impr, 4), "macro_f1 =", round(f1m_impr, 4))

print("=== Regression (test) ===")
print("Baseline: MAE =", round(mae_base, 4), "RMSE =", round(rmse_base, 4), "R2 =", round(r2_base, 4))
print("Improved: MAE =", round(mae_impr, 4), "RMSE =", round(rmse_impr, 4), "R2 =", round(r2_impr, 4))


=== Classification (test) ===
Baseline: acc = 0.8021 macro_f1 = 0.8117
Improved: acc = 0.7892 macro_f1 = 0.8213
=== Regression (test) ===
Baseline: MAE = 3.7693 RMSE = 5.4908 R2 = 0.883
Improved: MAE = 4.1539 RMSE = 5.7607 R2 = 0.8712


## Моя реализация дерева + случайный лес

Ниже реализован упрощённый вариант CART-дерева и случайного леса:
- признаки числовые;
- для ускорения пороги-кандидаты берутся по квантилям признака, а их число ограничено параметром `n_thresholds`;
- случайный выбор признаков делается **на каждом разбиении**.

In [30]:
from dataclasses import dataclass
from typing import Optional, Tuple, Any, List

@dataclass
class _Node:
    """One node of ``MyDecisionTree``: either an internal split or a leaf.

    Prediction treats a node as a leaf as soon as ``value`` is not ``None``;
    otherwise it compares ``x[feature]`` with ``threshold`` and descends.
    """
    feature: Optional[int] = None  # column index tested at an internal node
    threshold: Optional[float] = None  # samples with x[feature] <= threshold go left
    left: Optional[Any] = None  # child _Node for the "<= threshold" branch
    right: Optional[Any] = None  # child _Node for the "> threshold" branch
    value: Optional[Any] = None  # leaf prediction; None for internal nodes

def _gini(y: np.ndarray) -> float:
    _, counts = np.unique(y, return_counts=True)
    p = counts / counts.sum()
    return 1.0 - np.sum(p**2)

def _mse(y: np.ndarray) -> float:
    if y.size == 0:
        return 0.0
    return float(np.mean((y - y.mean())**2))

class MyDecisionTree:
    """Simplified CART decision tree for numeric features.

    Candidate thresholds for each feature are a fixed number of quantiles of
    that feature (between the 5% and 95% quantile) rather than every unique
    value, and a fresh random subset of features is drawn at every split.

    For classification the labels must be integer-coded: leaf values are
    stored with ``int(...)``.

    Parameters
    ----------
    task : {"classification", "regression"}
        Gini impurity is used for classification, MSE for regression.
    max_depth : int or None
        Maximum depth of the tree; ``None`` means no depth limit.
    min_samples_split : int
        A node with fewer samples than this becomes a leaf.
    min_samples_leaf : int
        A split is rejected if either child would get fewer samples than this.
    max_features : None, "sqrt", "log2", float or int
        Number of features examined per split. ``None`` means all features, a
        float is a fraction of the features, an int is an absolute count.
        Numeric values are clamped to ``[1, n_features]``.
    n_thresholds : int
        Number of quantile thresholds tried per feature.
    random_state : int
        Seed of the RNG used for feature sub-sampling.
    """

    def __init__(self, task: str, max_depth: Optional[int] = 10, min_samples_split: int = 2,
                 min_samples_leaf: int = 1, max_features: Optional[Any] = None,
                 n_thresholds: int = 16, random_state: int = 42):
        assert task in ["classification", "regression"]
        self.task = task
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.n_thresholds = n_thresholds
        self.random_state = random_state
        self.root = None
        self.n_features_ = None

    def _resolve_max_features(self, n_features: int) -> int:
        """Translate ``self.max_features`` into a feature count in ``[1, n_features]``."""
        mf = self.max_features
        if mf is None:
            return n_features
        if isinstance(mf, str):
            if mf == "sqrt":
                k = int(np.sqrt(n_features))
            elif mf == "log2":
                k = int(np.log2(n_features))
            else:
                raise ValueError("Unknown max_features string")
        elif isinstance(mf, (float, np.floating)):
            k = int(round(float(mf) * n_features))
        elif isinstance(mf, (int, np.integer)):
            # NumPy integers (e.g. values sampled by scipy/sklearn) count as ints too.
            k = int(mf)
        else:
            return n_features
        # Clamp: asking for more features than exist would make rng.choice fail.
        return max(1, min(k, n_features))

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Grow the tree on ``X`` (n_samples, n_features) and ``y`` (n_samples,).

        Array-likes such as pandas objects are converted with ``np.asarray``.
        Raises ``ValueError`` for empty or mismatched input. Returns ``self``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2 or X.shape[0] == 0:
            raise ValueError("X must be a non-empty 2-D array")
        if y.shape[:1] != X.shape[:1]:
            raise ValueError("X and y must contain the same number of samples")

        rng = np.random.RandomState(self.random_state)
        self.n_features_ = X.shape[1]
        self.root = self._build(X, y, depth=0, rng=rng)
        return self

    def _leaf_value(self, y: np.ndarray):
        """Leaf prediction: the majority class (as ``int``) or the mean target."""
        if self.task == "classification":
            vals, counts = np.unique(y, return_counts=True)
            return int(vals[np.argmax(counts)])
        else:
            return float(np.mean(y))

    def _best_split(self, X: np.ndarray, y: np.ndarray, rng: np.random.RandomState) -> Tuple[Optional[int], Optional[float], float]:
        """Find the split with the lowest weighted child impurity.

        Returns ``(feature, threshold, score)``, or ``(None, None, inf)`` when
        no admissible split improves on the parent's impurity.
        """
        n_samples, n_features = X.shape
        if n_samples < self.min_samples_split:
            return None, None, np.inf

        mf = self._resolve_max_features(n_features)
        feat_idx = rng.choice(n_features, size=mf, replace=False)

        best_feat, best_thr = None, None
        best_score = np.inf

        if self.task == "classification":
            impurity = _gini
        else:
            impurity = _mse

        parent_imp = impurity(y)

        # The quantile grid is the same for every feature, so build it once.
        qs = np.linspace(0.05, 0.95, self.n_thresholds)

        for f in feat_idx:
            col = X[:, f]
            thrs = np.unique(np.quantile(col, qs))
            for thr in thrs:
                left_mask = col <= thr
                right_mask = ~left_mask

                if left_mask.sum() < self.min_samples_leaf or right_mask.sum() < self.min_samples_leaf:
                    continue

                yL, yR = y[left_mask], y[right_mask]
                score = (yL.size * impurity(yL) + yR.size * impurity(yR)) / n_samples

                if score < best_score:
                    best_score = score
                    best_feat = int(f)
                    best_thr = float(thr)

        # Reject splits that do not strictly reduce impurity.
        if best_feat is None or best_score >= parent_imp:
            return None, None, np.inf

        return best_feat, best_thr, best_score

    def _build(self, X: np.ndarray, y: np.ndarray, depth: int, rng: np.random.RandomState) -> "_Node":
        """Recursively grow the subtree for the samples ``(X, y)`` at ``depth``."""
        node = _Node()

        # max_depth=None means "no depth limit".
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or X.shape[0] < self.min_samples_split:
            node.value = self._leaf_value(y)
            return node

        # A pure classification node cannot be improved by splitting.
        if self.task == "classification" and np.unique(y).size == 1:
            node.value = int(y[0])
            return node

        feat, thr, score = self._best_split(X, y, rng)
        if feat is None:
            node.value = self._leaf_value(y)
            return node

        node.feature = feat
        node.threshold = thr

        left_mask = X[:, feat] <= thr
        right_mask = ~left_mask

        node.left = self._build(X[left_mask], y[left_mask], depth + 1, rng)
        node.right = self._build(X[right_mask], y[right_mask], depth + 1, rng)
        return node

    def _predict_one(self, x: np.ndarray) -> Any:
        """Walk from the root to a leaf for one sample and return the leaf value."""
        node = self.root
        while node.value is None:
            if x[node.feature] <= node.threshold:
                node = node.left
            else:
                node = node.right
        return node.value

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict for every row of ``X``; array-likes are converted with ``np.asarray``."""
        X = np.asarray(X)
        return np.array([self._predict_one(x) for x in X])

class MyRandomForest:
    """Random forest built from ``MyDecisionTree`` estimators.

    Each tree is fit on a bootstrap sample of the training rows (or on all
    rows when ``bootstrap=False``) and receives its own seed drawn from the
    forest's RNG. Classification predictions are a majority vote over trees
    (ties go to the class that sorts first); regression predictions are the
    mean over trees.

    Parameters
    ----------
    task : {"classification", "regression"}
    n_estimators : int
        Number of trees.
    max_depth, min_samples_split, min_samples_leaf, max_features, n_thresholds
        Passed unchanged to every ``MyDecisionTree``.
    bootstrap : bool
        Whether to resample rows with replacement for each tree.
    random_state : int
        Seed for row resampling and for the per-tree seeds.
    """

    def __init__(self, task: str, n_estimators: int = 50, max_depth: int = 10,
                 min_samples_split: int = 2, min_samples_leaf: int = 1,
                 max_features: Optional[Any] = "sqrt", n_thresholds: int = 16,
                 bootstrap: bool = True, random_state: int = 42):
        assert task in ["classification", "regression"]
        self.task = task
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.n_thresholds = n_thresholds
        self.bootstrap = bootstrap
        self.random_state = random_state
        self.trees: List[MyDecisionTree] = []
        self.classes_ = None  # sorted unique labels, set by fit() for classification

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Fit ``n_estimators`` trees on ``X`` (n_samples, n_features) and ``y``.

        Inputs are converted with ``np.asarray`` so pandas objects are indexed
        by position rather than by label. Raises ``ValueError`` on empty input.
        Returns ``self``.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        rng = np.random.RandomState(self.random_state)
        n = X.shape[0]
        if n == 0:
            raise ValueError("Cannot fit MyRandomForest on an empty dataset")

        if self.task == "classification":
            # Trees work on integer codes; classes_ maps the codes back to labels.
            self.classes_, y_enc = np.unique(y, return_inverse=True)
            y_use = y_enc
        else:
            y_use = y.astype(float)

        self.trees = []
        for i in range(self.n_estimators):
            if self.bootstrap:
                idx = rng.randint(0, n, size=n)
            else:
                idx = np.arange(n)

            tree = MyDecisionTree(
                task=self.task,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                max_features=self.max_features,
                n_thresholds=self.n_thresholds,
                random_state=int(rng.randint(0, 1_000_000))
            )
            tree.fit(X[idx], y_use[idx])
            self.trees.append(tree)

        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Aggregate tree predictions for every row of ``X``.

        Returns original class labels for classification and the mean
        prediction for regression. Raises ``ValueError`` if the forest has not
        been fitted.
        """
        if not self.trees:
            raise ValueError("MyRandomForest is not fitted yet; call fit() before predict()")

        X = np.asarray(X)
        preds = np.stack([t.predict(X) for t in self.trees], axis=0)  # (n_trees, n_samples)

        if self.task == "classification":
            n_samples = preds.shape[1]
            codes = preds.astype(np.intp)
            # Accumulate one vote per tree; each row index appears once per
            # tree, so the fancy-indexed += never collides.
            votes = np.zeros((n_samples, len(self.classes_)), dtype=np.int64)
            rows = np.arange(n_samples)
            for tree_codes in codes:
                votes[rows, tree_codes] += 1
            # argmax returns the first maximum, so ties go to the lowest code.
            return self.classes_[votes.argmax(axis=1)]
        else:
            return preds.mean(axis=0)


### Самописный RandomForest — бейзлайн и сравнение со sklearn

In [31]:
# Custom forest, classification baseline.
# NOTE(review): n_estimators is far below the sklearn baseline, presumably
# because the pure-Python trees are slow to fit - confirm.
my_rf_cls_base = MyRandomForest(
    task="classification",
    n_estimators=80,
    max_depth=12,
    max_features="sqrt",
    n_thresholds=16,
    random_state=RANDOM_STATE
)
my_rf_cls_base.fit(Xc_train, yc_train)
yc_pred_my_base = my_rf_cls_base.predict(Xc_test)

# Same test-set metrics as for the sklearn models, for a like-for-like comparison.
acc_my_base = accuracy_score(yc_test, yc_pred_my_base)
f1m_my_base = f1_score(yc_test, yc_pred_my_base, average="macro")

print("Самописный RandomForest (классификация) — baseline")
print("  Accuracy :", round(acc_my_base, 4))
print("  Macro F1 :", round(f1m_my_base, 4))

# Custom forest, regression baseline; max_features=1.0 means every feature is tried at each split.
my_rf_reg_base = MyRandomForest(
    task="regression",
    n_estimators=120,
    max_depth=16,
    max_features=1.0,
    n_thresholds=16,
    random_state=RANDOM_STATE
)
my_rf_reg_base.fit(Xr_train, yr_train)
yr_pred_my_base = my_rf_reg_base.predict(Xr_test)

mae_my_base = mean_absolute_error(yr_test, yr_pred_my_base)
rmse_my_base = np.sqrt(mean_squared_error(yr_test, yr_pred_my_base))
r2_my_base = r2_score(yr_test, yr_pred_my_base)

print("Самописный RandomForest (регрессия) — baseline")
print("  MAE  :", round(mae_my_base, 4))
print("  RMSE :", round(rmse_my_base, 4))
print("  R2   :", round(r2_my_base, 4))


Самописный RandomForest (классификация) — baseline
  Accuracy : 0.7841
  Macro F1 : 0.787
Самописный RandomForest (регрессия) — baseline
  MAE  : 3.7609
  RMSE : 5.4524
  R2   : 0.8846


### Моя реализация RandomForest + техники улучшенного бейзлайна

Берём лучшие гиперпараметры из `search_cls.best_params_` и `search_reg.best_params_` и переносим их (частично) в самописную модель.


In [32]:
# Hyperparameters selected by the sklearn randomized searches.
best_cls = search_cls.best_params_
best_reg = search_reg.best_params_

print("Best (classification):", best_cls)
print("Best (regression):    ", best_reg)

# Only the parameters MyRandomForest accepts are carried over; `class_weight`
# from the sklearn search has no counterpart in its constructor and is dropped.
my_rf_cls_impr = MyRandomForest(
    task="classification",
    n_estimators=int(best_cls.get("n_estimators", 300)),
    max_depth=best_cls.get("max_depth", None) or 20,  # sklearn's None (unlimited) is mapped to a finite cap of 20
    min_samples_split=int(best_cls.get("min_samples_split", 2)),
    min_samples_leaf=int(best_cls.get("min_samples_leaf", 1)),
    max_features=best_cls.get("max_features", "sqrt"),
    n_thresholds=16,
    random_state=RANDOM_STATE
)
my_rf_cls_impr.fit(Xc_train, yc_train)
yc_pred_my_impr = my_rf_cls_impr.predict(Xc_test)

acc_my_impr = accuracy_score(yc_test, yc_pred_my_impr)
f1m_my_impr = f1_score(yc_test, yc_pred_my_impr, average="macro")

# Same transfer of tuned parameters for the regression forest.
my_rf_reg_impr = MyRandomForest(
    task="regression",
    n_estimators=int(best_reg.get("n_estimators", 500)),
    max_depth=best_reg.get("max_depth", None) or 20,  # sklearn's None (unlimited) is mapped to a finite cap of 20
    min_samples_split=int(best_reg.get("min_samples_split", 2)),
    min_samples_leaf=int(best_reg.get("min_samples_leaf", 1)),
    max_features=best_reg.get("max_features", 1.0),
    n_thresholds=16,
    random_state=RANDOM_STATE
)
my_rf_reg_impr.fit(Xr_train, yr_train)
yr_pred_my_impr = my_rf_reg_impr.predict(Xr_test)

mae_my_impr = mean_absolute_error(yr_test, yr_pred_my_impr)
rmse_my_impr = np.sqrt(mean_squared_error(yr_test, yr_pred_my_impr))
r2_my_impr = r2_score(yr_test, yr_pred_my_impr)

print("Самописный RandomForest + лучшие гиперпараметры (TEST)")
print("  Классификация: accuracy =", round(acc_my_impr, 4), "macro_f1 =", round(f1m_my_impr, 4))
print("  Регрессия:     MAE =", round(mae_my_impr, 4), "RMSE =", round(rmse_my_impr, 4), "R2 =", round(r2_my_impr, 4))


Best (classification): {'class_weight': 'balanced', 'max_depth': 16, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 11, 'n_estimators': 675}
Best (regression):     {'max_depth': 16, 'max_features': 1.0, 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 820}
Самописный RandomForest + лучшие гиперпараметры (TEST)
  Классификация: accuracy = 0.7918 macro_f1 = 0.7917
  Регрессия:     MAE = 3.8951 RMSE = 5.4721 R2 = 0.8838


## Итоговое сравнение

In [33]:
# Final test-set comparison of all four classification models.
print("Итог classification")
print("sklearn baseline:  acc =", round(acc_base, 4), "macro_f1 =", round(f1m_base, 4))
print("sklearn improved:  acc =", round(acc_impr, 4), "macro_f1 =", round(f1m_impr, 4))
print("custom  baseline:  acc =", round(acc_my_base, 4), "macro_f1 =", round(f1m_my_base, 4))
print("custom  improved:  acc =", round(acc_my_impr, 4), "macro_f1 =", round(f1m_my_impr, 4))

# Final test-set comparison of all four regression models.
# The header previously had a stray, unbalanced "(" and now matches the classification header.
print("Итог regression")
print("sklearn baseline:  MAE =", round(mae_base, 4), "RMSE =", round(rmse_base, 4), "R2 =", round(r2_base, 4))
print("sklearn improved:  MAE =", round(mae_impr, 4), "RMSE =", round(rmse_impr, 4), "R2 =", round(r2_impr, 4))
print("custom  baseline:  MAE =", round(mae_my_base, 4), "RMSE =", round(rmse_my_base, 4), "R2 =", round(r2_my_base, 4))
print("custom  improved:  MAE =", round(mae_my_impr, 4), "RMSE =", round(rmse_my_impr, 4), "R2 =", round(r2_my_impr, 4))


Итог classification
sklearn baseline:  acc = 0.8021 macro_f1 = 0.8117
sklearn improved:  acc = 0.7892 macro_f1 = 0.8213
custom  baseline:  acc = 0.7841 macro_f1 = 0.787
custom  improved:  acc = 0.7918 macro_f1 = 0.7917
Итог (regression
sklearn baseline:  MAE = 3.7693 RMSE = 5.4908 R2 = 0.883
sklearn improved:  MAE = 4.1539 RMSE = 5.7607 R2 = 0.8712
custom  baseline:  MAE = 3.7609 RMSE = 5.4524 R2 = 0.8846
custom  improved:  MAE = 3.8951 RMSE = 5.4721 R2 = 0.8838
