# ЛР5 (Градиентный бустинг)

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, ParameterGrid, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from collections import Counter
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils import resample
import warnings
warnings.filterwarnings("ignore")

# Предобработка данных для задачи классификации

In [3]:
# Load the ESRB rating table from disk
data_classification = pd.read_csv(r'C:\Users\xwxsz\Desktop\Video_games_esrb_rating.csv')

# Integer-encode 'title' and then 'esrb_rating' with the same encoder object;
# it is refit per column, so afterwards it holds the 'esrb_rating' classes
label_encoder = LabelEncoder()
for column in ('title', 'esrb_rating'):
    data_classification[column] = label_encoder.fit_transform(data_classification[column])

# Target is the encoded rating; features are every remaining column
y = data_classification['esrb_rating']
X = data_classification.drop(columns='esrb_rating')

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Классификация (бейзлайн)

In [9]:
# Baseline: always predict the most frequent class of the training split
class_counts = Counter(y_train)
most_common_class = class_counts.most_common(1)[0][0]
y_pred_baseline = np.full_like(y_test, most_common_class)

# Gradient boosting with library-default hyperparameters (fit returns the estimator)
gb_model = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)

# Score on the held-out split; F1 is averaged with per-class support weights
gb_accuracy = accuracy_score(y_test, y_pred_gb)
gb_f1 = f1_score(y_test, y_pred_gb, average='weighted')

print(f"Gradient Boosting Accuracy: {gb_accuracy}")
print(f"Gradient Boosting F1-score: {gb_f1}")

Gradient Boosting Accuracy: 0.862796833773087
Gradient Boosting F1-score: 0.8645957756957025


# Классификация (улучшенный бейзлайн)

In [13]:
# Reload the raw data
data_classification = pd.read_csv(r'C:\Users\xwxsz\Desktop\Video_games_esrb_rating.csv')

# Preprocessing: integer-encode 'title' and 'esrb_rating' (encoder refit per column)
label_encoder = LabelEncoder()
for column in ('title', 'esrb_rating'):
    data_classification[column] = label_encoder.fit_transform(data_classification[column])

# Work with plain NumPy arrays from here on
y = data_classification['esrb_rating'].to_numpy()
X = data_classification.drop(columns='esrb_rating').to_numpy()

# Train/test split: 20% held out, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Baseline: always predict the most frequent training class
class_counts = Counter(y_train)
most_common_class = class_counts.most_common(1)[0][0]
y_pred_baseline = np.full_like(y_test, most_common_class)

baseline_accuracy = accuracy_score(y_test, y_pred_baseline)

# Improved model: scaling + gradient boosting, tuned by grid search
pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("gb", GradientBoostingClassifier(random_state=42)),
])

# Keys use the "<step>__<param>" form so they reach the "gb" pipeline step
param_grid = {
    "gb__n_estimators": [100, 200, 300],
    "gb__learning_rate": [0.01, 0.1, 0.2],
    "gb__max_depth": [3, 4, 5],
    "gb__subsample": [0.8, 1.0],
}

# 3-fold search over the whole grid, ranked by weighted F1, using all cores
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='f1_weighted',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best pipeline found by the search
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

# Evaluate the tuned model on the held-out split
best_accuracy = accuracy_score(y_test, y_pred_best)
best_f1 = f1_score(y_test, y_pred_best, average='weighted')

print(f"Accuracy: {best_accuracy}")
print(f"F1-score: {best_f1}")

Accuracy: 0.8575197889182058
F1-score: 0.857851122513756


# Классификация (самостоятельная имплементация)

In [14]:
# Самостоятельная реализация градиентного бустинга для классификации
class GradientBoostingClassifierCustom:
    """Multiclass gradient boosting built on softmax residuals.

    Every boosting round fits one ``DecisionTreeRegressor`` per class to the
    difference between the one-hot targets and the current softmax
    probabilities, then adds the shrunken tree output to that class's raw
    score.

    Parameters
    ----------
    n_estimators : int
        Number of boosting rounds; each round stores one tree per class.
    learning_rate : float
        Shrinkage factor applied to every tree's contribution.
    max_depth : int
        Maximum depth of each regression tree.

    Attributes
    ----------
    models : list of list
        One inner list per boosting round, holding that round's per-class trees.
    classes : ndarray or None
        Sorted unique labels seen by ``fit``; ``None`` before fitting.
    """

    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.models = []
        self.classes = None

    def fit(self, X, y):
        """Fit the ensemble on features ``X`` and class labels ``y``.

        Labels may be any values ``np.unique`` can sort; they do not have to
        be the integers ``0..K-1``. Calling ``fit`` again discards the trees
        of the previous fit.
        """
        self.classes = np.unique(y)
        # Start from an empty ensemble so a refit does not stack new trees on
        # top of the previous run's trees.
        self.models = []
        y_encoded = self._one_hot_encode(y)

        # Raw scores start at zero, which softmax turns into uniform probabilities
        predictions = np.zeros_like(y_encoded, dtype=np.float64)

        for _ in range(self.n_estimators):
            # Residuals are computed once per round, before any class is updated
            residuals = y_encoded - self._softmax(predictions)
            model = []

            for k in range(len(self.classes)):
                tree = DecisionTreeRegressor(max_depth=self.max_depth)
                tree.fit(X, residuals[:, k])
                model.append(tree)
                predictions[:, k] += self.learning_rate * tree.predict(X)

            self.models.append(model)

    def predict(self, X):
        """Return the predicted label (an element of ``self.classes``) per row of ``X``."""
        predictions = np.zeros((X.shape[0], len(self.classes)), dtype=np.float64)

        for model in self.models:
            for k, tree in enumerate(model):
                predictions[:, k] += self.learning_rate * tree.predict(X)

        # Map the winning column back to the original label instead of
        # returning the bare column index.
        best_columns = np.argmax(self._softmax(predictions), axis=1)
        return self.classes[best_columns]

    def _one_hot_encode(self, y):
        """Return a ``(len(y), n_classes)`` float matrix with a single 1 per row."""
        labels = np.asarray(y)
        # self.classes is sorted (it comes from np.unique), so searchsorted
        # yields each label's column even when labels are not 0..K-1.
        columns = np.searchsorted(self.classes, labels)
        encoded = np.zeros((len(labels), len(self.classes)))
        encoded[np.arange(len(labels)), columns] = 1
        return encoded

    def _softmax(self, z):
        """Row-wise softmax; the row maximum is subtracted first to avoid overflow."""
        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

In [15]:
# Train the hand-written gradient boosting classifier
custom_gb = GradientBoostingClassifierCustom(
    max_depth=3,
    learning_rate=0.1,
    n_estimators=100,
)
custom_gb.fit(X_train, y_train)

# Predict on the held-out split
y_pred_custom = custom_gb.predict(X_test)

# Score the predictions (weighted F1, same as for the library models)
custom_f1 = f1_score(y_test, y_pred_custom, average='weighted')
custom_accuracy = accuracy_score(y_test, y_pred_custom)

print("\nCustom Gradient Boosting Metrics:")
print(f"Accuracy: {custom_accuracy}")
print(f"F1-score: {custom_f1}")


Custom Gradient Boosting Metrics:
Accuracy: 0.8258575197889182
F1-score: 0.8283595364212384


# Предобработка данных для задачи регрессии

In [16]:
# Load the real-estate valuation spreadsheet
data_regression = pd.read_excel(r'C:\Users\xwxsz\Desktop\Real estate valuation data set.xlsx')

# Target is the unit-area price column; all other columns are used as features
y = data_regression['Y house price of unit area']
X = data_regression.drop('Y house price of unit area', axis=1)

# Hold out 20% of the rows for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Регрессия (бейзлайн)

In [19]:
# Baseline: constant prediction equal to the mean of the training targets
baseline_pred = np.full_like(y_test, fill_value=y_train.mean())

# Gradient boosting regressor with explicitly stated settings (fit returns the estimator)
gb_model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
).fit(X_train, y_train)

y_pred_gb = gb_model.predict(X_test)

# Held-out error metrics
gb_r2 = r2_score(y_test, y_pred_gb)
gb_mae = mean_absolute_error(y_test, y_pred_gb)

print(f"MAE: {gb_mae}")
print(f"R2: {gb_r2}")

MAE: 4.083143821562733
R2: 0.7799860116073392


# Регрессия (улучшенный бейзлайн)

In [20]:
# Baseline: constant prediction equal to the mean of the training targets
baseline_pred = np.full_like(y_test, fill_value=y_train.mean())

# Hyperparameter search space for the gradient boosting regressor
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'min_samples_split': [2, 5, 10],
}

# Exhaustive 5-fold search ranked by negated MSE, run on all cores
gb_model = GradientBoostingRegressor(random_state=42)
grid_search = GridSearchCV(
    gb_model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Best estimator found by the search
best_gb_model = grid_search.best_estimator_
y_pred_best_gb = best_gb_model.predict(X_test)

# Held-out error metrics for the tuned model
best_gb_r2 = r2_score(y_test, y_pred_best_gb)
best_gb_mae = mean_absolute_error(y_test, y_pred_best_gb)

print(f"MAE: {best_gb_mae}")
print(f"R2: {best_gb_r2}")


Fitting 5 folds for each of 162 candidates, totalling 810 fits
MAE: 3.935477685692727
R2: 0.8004287033391411


# Регрессия (самостоятельная имплементация)

In [21]:
# Самостоятельная реализация градиентного бустинга
class GradientBoostingRegressorCustom:
    """Gradient boosting for regression with squared-error residual fitting.

    The model starts from the mean of the training targets and repeatedly
    fits a ``DecisionTreeRegressor`` to the current residuals, adding each
    tree's output scaled by ``learning_rate``.

    Parameters
    ----------
    n_estimators : int
        Number of trees fitted in sequence.
    learning_rate : float
        Shrinkage factor applied to every tree's contribution.
    max_depth : int
        Maximum depth of each regression tree.

    Attributes
    ----------
    models : list
        Fitted trees, in the order they were trained.
    init_prediction : float or None
        Mean of the training targets; ``None`` before fitting.
    """

    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.models = []
        self.init_prediction = None

    def fit(self, X, y):
        """Fit the ensemble on features ``X`` and numeric targets ``y``.

        Calling ``fit`` again discards the trees of the previous fit.
        """
        targets = np.asarray(y, dtype=np.float64)

        # Start from an empty ensemble: without this a second fit would append
        # to the old trees and predict() would sum both runs.
        self.models = []
        self.init_prediction = np.mean(targets)

        # The subtraction allocates a new array, so the in-place updates below
        # never modify the caller's y.
        residual = targets - self.init_prediction

        for _ in range(self.n_estimators):
            model = DecisionTreeRegressor(max_depth=self.max_depth)
            model.fit(X, residual)
            self.models.append(model)

            # Remove the part of the residual this tree now explains
            residual -= self.learning_rate * model.predict(X)

    def predict(self, X):
        """Return the initial mean plus the shrunken sum of all tree outputs."""
        prediction = np.full(X.shape[0], self.init_prediction)
        for model in self.models:
            prediction += self.learning_rate * model.predict(X)
        return prediction

In [22]:
# Train the hand-written regressor on plain NumPy arrays
custom_gb_model = GradientBoostingRegressorCustom(
    max_depth=3,
    learning_rate=0.1,
    n_estimators=100,
)
custom_gb_model.fit(X_train.to_numpy(), y_train.to_numpy())

# Predict on the held-out split
y_pred_custom_gb = custom_gb_model.predict(X_test.to_numpy())

# Held-out error metrics
custom_gb_r2 = r2_score(y_test, y_pred_custom_gb)
custom_gb_mae = mean_absolute_error(y_test, y_pred_custom_gb)

print(f"MAE: {custom_gb_mae}")
print(f"R2: {custom_gb_r2}")

MAE: 4.072886304831286
R2: 0.7819252789346214


In [23]:
# Улучшенная реализация градиентного бустинга
class GradientBoostingRegressorImproved:
    """Gradient boosting for regression with extra tree regularisation knobs.

    The model starts from the mean of the training targets and repeatedly
    fits a ``DecisionTreeRegressor`` to the current residuals, adding each
    tree's output scaled by ``learning_rate``. ``min_samples_split`` and
    ``min_samples_leaf`` are passed straight to every tree.

    Parameters
    ----------
    n_estimators : int
        Number of trees fitted in sequence.
    learning_rate : float
        Shrinkage factor applied to every tree's contribution.
    max_depth : int
        Maximum depth of each regression tree.
    min_samples_split : int
        Forwarded to ``DecisionTreeRegressor(min_samples_split=...)``.
    min_samples_leaf : int
        Forwarded to ``DecisionTreeRegressor(min_samples_leaf=...)``.

    Attributes
    ----------
    models : list
        Fitted trees, in the order they were trained.
    init_prediction : float or None
        Mean of the training targets; ``None`` before fitting.
    """

    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3, min_samples_split=2, min_samples_leaf=1):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.models = []
        self.init_prediction = None

    def fit(self, X, y):
        """Fit the ensemble on features ``X`` and numeric targets ``y``.

        Calling ``fit`` again discards the trees of the previous fit.
        """
        targets = np.asarray(y, dtype=np.float64)

        # Start from an empty ensemble: without this a second fit would append
        # to the old trees and predict() would sum both runs.
        self.models = []
        self.init_prediction = np.mean(targets)

        # The subtraction allocates a new array, so the in-place updates below
        # never modify the caller's y.
        residual = targets - self.init_prediction

        for _ in range(self.n_estimators):
            model = DecisionTreeRegressor(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
            )
            model.fit(X, residual)
            self.models.append(model)

            # Remove the part of the residual this tree now explains
            residual -= self.learning_rate * model.predict(X)

    def predict(self, X):
        """Return the initial mean plus the shrunken sum of all tree outputs."""
        prediction = np.full(X.shape[0], self.init_prediction)
        for model in self.models:
            prediction += self.learning_rate * model.predict(X)
        return prediction

In [24]:
# Train the regularised hand-written regressor on plain NumPy arrays
improved_gb_model = GradientBoostingRegressorImproved(
    n_estimators=150,
    learning_rate=0.05,
    max_depth=4,
    min_samples_split=5,
    min_samples_leaf=2,
)
improved_gb_model.fit(X_train.to_numpy(), y_train.to_numpy())

# Predict on the held-out split
y_pred_improved_gb = improved_gb_model.predict(X_test.to_numpy())

# Held-out error metrics
improved_gb_r2 = r2_score(y_test, y_pred_improved_gb)
improved_gb_mae = mean_absolute_error(y_test, y_pred_improved_gb)

print(f"MAE: {improved_gb_mae}")
print(f"R2: {improved_gb_r2}")

MAE: 3.935219334924898
R2: 0.7917778988478528
