# ЛР4 (Случайный лес)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, ParameterGrid, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from collections import Counter
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils import resample
import warnings
warnings.filterwarnings("ignore")

# Предобработка данных для задачи классификации

In [2]:
data_classification = pd.read_csv(r'C:\Users\xwxsz\Desktop\Video_games_esrb_rating.csv')

label_encoder = LabelEncoder()
data_classification['title'] = label_encoder.fit_transform(data_classification['title'])
data_classification['esrb_rating'] = label_encoder.fit_transform(data_classification['esrb_rating'])

X = data_classification.drop('esrb_rating', axis=1) 
y = data_classification['esrb_rating'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Классификация (бейзлайн)

In [3]:
# Инициализация и обучение модели случайного леса
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Предикт
y_pred = rf.predict(X_test)

# Оценка
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')  # Для многоклассовой классификации используем 'weighted'

print(f'Accuracy: {accuracy}')
print(f'F1-Score: {f1}')

Accuracy: 0.8443271767810027
F1-Score: 0.845342626435832


# Классификация (улучшенный бейзлайн)

In [4]:
# Инициализация модели случайного леса
rf = RandomForestClassifier(random_state=42)

# Параметры для поиска оптимальных гиперпараметров
param_grid = {
    'n_estimators': [100, 200, 300],  # Количество деревьев
    'max_depth': [None, 10, 20, 30],  # Глубина деревьев
    'max_features': ['auto', 'sqrt', 'log2'],  # Количество признаков для разделения
    'class_weight': [None, 'balanced'],  # Для борьбы с несбалансированными классами
}

# Используем GridSearchCV для поиска оптимальных гиперпараметров
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, verbose=2, n_jobs=-1, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Лучшие параметры
print(f"{grid_search.best_params_}")

# Предикт
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

# Оценка
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {accuracy}')
print(f'F1-Score: {f1}')

Fitting 5 folds for each of 72 candidates, totalling 360 fits
{'class_weight': 'balanced', 'max_depth': 20, 'max_features': 'sqrt', 'n_estimators': 200}
Accuracy: 0.8522427440633246
F1-Score: 0.8528096577802824


# Классификация (самостоятельная имплементация)

# Предобработка данных для задачи регрессии

In [5]:
data_regression = pd.read_excel(r'C:\Users\xwxsz\Desktop\Real estate valuation data set.xlsx')
X = data_regression.drop(columns=['Y house price of unit area'])
y = data_regression['Y house price of unit area']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Регрессия (бейзлайн)

In [6]:
# Создание модели случайного леса
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

# Предикт
y_pred = rf.predict(X_test)

# Оценка
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MAE: {mae}')
print(f'R^2: {r2}')

MAE: 3.8787951807228906
R^2: 0.8101253381007657


# Регрессия (улучшенный бейзлайн)

In [7]:
# Определение модели случайного леса
rf = RandomForestRegressor(random_state=42)

# Параметры для поиска
param_grid = {
    'n_estimators': [100, 200, 300],  # Количество деревьев
    'max_depth': [None, 10, 20, 30],  # Максимальная глубина дерева
    'min_samples_split': [2, 5, 10],  # Минимальное количество образцов для разделения
    'min_samples_leaf': [1, 2, 4],  # Минимальное количество образцов в листьях
    'max_features': ['auto', 'sqrt']  # Количество признаков для использования на каждом узле
}

# Поиск оптимальных гиперпараметров
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=0, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Лучшая модель
print(f'{grid_search.best_params_}')
best_rf = grid_search.best_estimator_

# Предикт
y_pred = best_rf.predict(X_test)

# Оценка
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MAE: {mae}')
print(f'R²: {r2}')

{'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 300}
MAE: 3.6110411685080823
R²: 0.8291237493563459


# Регрессия (самостоятельная имплементация)

In [8]:
class CustomRandomForestRegressor:
    def __init__(self, n_estimators=100, max_features="sqrt", max_depth=None, random_state=None):
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.max_depth = max_depth
        self.random_state = random_state
        self.trees = []
        self.features_indices = []

    def _bootstrap_sample(self, X, y):
        n_samples = X.shape[0]
        indices = np.random.choice(n_samples, n_samples, replace=True)
        return X[indices], y[indices]

    def _get_max_features(self, n_features):
        if self.max_features == "sqrt":
            return max(1, int(np.sqrt(n_features)))
        elif self.max_features == "log2":
            return max(1, int(np.log2(n_features)))
        elif isinstance(self.max_features, int):
            return self.max_features
        else:
            return n_features

    def fit(self, X, y):
        np.random.seed(self.random_state)
        self.trees = []
        self.features_indices = []

        # Преобразование DataFrame в NumPy массив
        X = X.values if hasattr(X, 'values') else X
        y = y.values if hasattr(y, 'values') else y

        n_features = X.shape[1]
        max_features = self._get_max_features(n_features)

        for _ in range(self.n_estimators):
            X_sample, y_sample = self._bootstrap_sample(X, y)

            feature_indices = np.random.choice(n_features, max_features, replace=False)
            self.features_indices.append(feature_indices)

            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X_sample[:, feature_indices], y_sample)
            self.trees.append(tree)

    def predict(self, X):
        # Преобразование DataFrame в NumPy массив
        X = X.values if hasattr(X, 'values') else X

        tree_predictions = np.array([
            tree.predict(X[:, feature_indices]) for tree, feature_indices in zip(self.trees, self.features_indices)
        ])
        return tree_predictions.mean(axis=0)

rf = CustomRandomForestRegressor(n_estimators=100, max_depth=10)
rf.fit(X_train, y_train)

# Предикт
y_pred = rf.predict(X_test)

# Оценка
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MAE (до улучшения): {mae}')
print(f'R^2 (до улучшения): {r2}')


MAE (до улучшения): 4.508121185485079
R^2 (до улучшения): 0.773731657426662


In [9]:
class CustomRandomForestRegressor:
    def __init__(self, n_estimators=100, max_features="sqrt", max_depth=None, random_state=None):
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.max_depth = max_depth
        self.random_state = random_state
        self.trees = []
        self.features_indices = []

    def _bootstrap_sample(self, X, y):
        n_samples = X.shape[0]
        indices = np.random.choice(n_samples, n_samples, replace=True)
        return X[indices], y[indices]

    def _get_max_features(self, n_features):
        if self.max_features == "sqrt":
            return max(1, int(np.sqrt(n_features)))
        elif self.max_features == "log2":
            return max(1, int(np.log2(n_features)))
        elif isinstance(self.max_features, int):
            return self.max_features
        else:
            return n_features

    def fit(self, X, y):
        np.random.seed(self.random_state)
        self.trees = []
        self.features_indices = []

        X = X.values if hasattr(X, 'values') else X
        y = y.values if hasattr(y, 'values') else y

        n_features = X.shape[1]
        max_features = self._get_max_features(n_features)

        for _ in range(self.n_estimators):
            X_sample, y_sample = self._bootstrap_sample(X, y)

            feature_indices = np.random.choice(n_features, max_features, replace=False)
            self.features_indices.append(feature_indices)

            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X_sample[:, feature_indices], y_sample)
            self.trees.append(tree)

    def predict(self, X):
        X = X.values if hasattr(X, 'values') else X
        tree_predictions = np.array([
            tree.predict(X[:, feature_indices]) for tree, feature_indices in zip(self.trees, self.features_indices)
        ])
        return tree_predictions.mean(axis=0)

    def score(self, X, y):
        y_pred = self.predict(X)
        return r2_score(y, y_pred)

    # Добавление метода get_params
    def get_params(self, deep=True):
        return {
            'n_estimators': self.n_estimators,
            'max_features': self.max_features,
            'max_depth': self.max_depth,
            'random_state': self.random_state
        }
    
    # Добавление метода set_params
    def set_params(self, **params):
        for param, value in params.items():
            setattr(self, param, value)
        return self

In [10]:
# Подбор гиперпараметров с использованием RandomizedSearchCV
param_dist = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [None, 5, 10, 15, 20],
    'max_features': ['sqrt', 'log2', 5, 10]
}

# Создание модели случайного леса
rf = CustomRandomForestRegressor(random_state=42)

# Обучение модели с подбором гиперпараметров
random_search = RandomizedSearchCV(rf, param_distributions=param_dist, n_iter=10, cv=5, scoring='r2', random_state=42)
random_search.fit(X_train, y_train)

# Лучшие параметры
print(random_search.best_params_)

# Предикт
y_pred = random_search.best_estimator_.predict(X_test)

# Оценка
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MAE (после улучшения): {mae}')
print(f'R^2 (после улучшения): {r2}')

{'n_estimators': 150, 'max_features': 5, 'max_depth': None}
MAE (после улучшения): 3.694038063364568
R^2 (после улучшения): 0.8094543680267501
