# ЛР3 (Решающее дерево)

In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score, mean_absolute_error
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
import warnings
warnings.filterwarnings("ignore")

# Предобработка данных для задачи классификации

In [27]:
data_classification = pd.read_csv(r'C:\Users\xwxsz\Desktop\Video_games_esrb_rating.csv')

label_encoder = LabelEncoder()
data_classification['title'] = label_encoder.fit_transform(data_classification['title'])
data_classification['esrb_rating'] = label_encoder.fit_transform(data_classification['esrb_rating'])

X = data_classification.drop('esrb_rating', axis=1) 
y = data_classification['esrb_rating'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Классификация (бейзлайн)

In [28]:
# Создание и обучение модели
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Предикт
y_pred = clf.predict(X_test)

# Оценка
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')  # average='weighted' учитывает дисбаланс классов

print(f'Accuracy: {accuracy}')
print(f'F1: {f1}')

Accuracy: 0.8179419525065963
F1: 0.8179669477129313


# Классификация (улучшенный бейзлайн)

In [29]:
# Определение гиперпараметров для подбора
param_grid = {
    'max_depth': [3, 5, 10, 15, None],  # максимальная глубина дерева
    'min_samples_split': [2, 5, 10],  # минимальное количество образцов для разделения
    'min_samples_leaf': [1, 2, 4],  # минимальное количество образцов в листьях
    'criterion': ['gini', 'entropy'],  # критерий разделения
    'max_features': [None, 'sqrt', 'log2']  # количество признаков для разделения
}

In [30]:
# Настройка StratifiedKFold для стратифицированной кросс-валидации
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Настройка и выполнение GridSearchCV с использованием StratifiedKFold
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    cv=cv,  # Используем стратифицированную кросс-валидацию
    n_jobs=-1,
    verbose=0
)

In [31]:
# Обучение модели с GridSearchCV
grid_search.fit(X, y)
print(f'{grid_search.best_params_}')

# Оценка на данных с лучшими параметрами
best_clf = grid_search.best_estimator_

{'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [32]:
# Разделение данных на тестовую и обучающую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Обучение 
best_clf.fit(X_train, y_train)

In [33]:
# Прндикт
y_pred_best = best_clf.predict(X_test)

# Оценка
accuracy_best = accuracy_score(y_test, y_pred_best)
f1_best = f1_score(y_test, y_pred_best, average='weighted')

print(f'Accuracy (Improved): {accuracy_best}')
print(f'F1 Score (Improved, weighted): {f1_best}')

Accuracy (Improved): 0.8284960422163589
F1 Score (Improved, weighted): 0.8295677824673502


# Классификация (самостоятельная имплементация)

In [34]:
class DecisionTree:
    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def fit(self, X, y):
        data = np.hstack((X, y.to_numpy().reshape(-1, 1)))  # Преобразуем Series в NumPy массив
        self.tree = self._build_tree(data)

    def predict(self, X):
        return np.array([self._predict_row(x, self.tree) for x in X])

    def _gini(self, groups, classes):
        n_instances = sum(len(group) for group in groups)
        gini = 0.0
        for group in groups:
            size = len(group)
            if size == 0:
                continue
            score = 0.0
            for class_val in classes:
                proportion = (group[:, -1] == class_val).sum() / size
                score += proportion ** 2
            gini += (1.0 - score) * (size / n_instances)
        return gini

    def _split(self, index, value, data):
        left = data[data[:, index] <= value]
        right = data[data[:, index] > value]
        return left, right

    def _get_split(self, data):
        classes = np.unique(data[:, -1])
        best_index, best_value, best_score, best_groups = None, None, float("inf"), None
        for index in range(data.shape[1] - 1):
            for row in data:
                groups = self._split(index, row[index], data)
                gini = self._gini(groups, classes)
                if gini < best_score:
                    best_index, best_value, best_score, best_groups = index, row[index], gini, groups
        return {'index': best_index, 'value': best_value, 'groups': best_groups}

    def _to_terminal(self, group):
        outcomes = group[:, -1]
        return np.bincount(outcomes.astype(int)).argmax()

    def _split_node(self, node, depth):
        left, right = node['groups']
        del node['groups']

        if len(left) == 0 or len(right) == 0:
            node['left'] = node['right'] = self._to_terminal(np.vstack((left, right)))
            return

        if self.max_depth and depth >= self.max_depth:
            node['left'], node['right'] = self._to_terminal(left), self._to_terminal(right)
            return

        if len(left) >= self.min_samples_split:
            node['left'] = self._get_split(left)
            self._split_node(node['left'], depth + 1)
        else:
            node['left'] = self._to_terminal(left)

        if len(right) >= self.min_samples_split:
            node['right'] = self._get_split(right)
            self._split_node(node['right'], depth + 1)
        else:
            node['right'] = self._to_terminal(right)

    def _build_tree(self, data):
        root = self._get_split(data)
        self._split_node(root, 1)
        return root

    def _predict_row(self, row, node):
        if row[node['index']] <= node['value']:
            if isinstance(node['left'], dict):
                return self._predict_row(row, node['left'])
            else:
                return node['left']
        else:
            if isinstance(node['right'], dict):
                return self._predict_row(row, node['right'])
            else:
                return node['right']

In [35]:
# Разделение данных
X = data_classification.drop('esrb_rating', axis=1).values
y = data_classification['esrb_rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Обучение модели
tree = DecisionTree(max_depth=5, min_samples_split=10)
tree.fit(X_train, y_train)

# Предикт
y_pred = tree.predict(X_test)

# Оценка 
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {accuracy:.4f}')
print(f'F1 Score (weighted): {f1:.4f}')

Accuracy: 0.7678
F1 Score (weighted): 0.7683


# Предобработка данных для задачи регресии 

In [36]:
data_regression = pd.read_excel(r'C:\Users\xwxsz\Desktop\Real estate valuation data set.xlsx')
X = data_regression.drop(columns=['Y house price of unit area'])
y = data_regression['Y house price of unit area']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Регрессия (бейзлайн)

In [37]:
# Базовый регрессор
regressor = DecisionTreeRegressor(random_state=42)
regressor.fit(X_train, y_train)

# Предикт
y_pred = regressor.predict(X_test)

# Оценка
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE (Baseline): {mae}")
print(f"R^2 Score (Baseline): {r2}")

MAE (Baseline): 5.360240963855421
R^2 Score (Baseline): 0.6787466883184802


# Регрессия (улучшенный бейзлайн)

In [38]:
# Масштабирование данных
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Настройка гиперпараметров
param_grid = {
    'max_depth': range(3, 10),
    'min_samples_split': range(5, 21, 5),
}
regressor = DecisionTreeRegressor(random_state=42)
grid_search = GridSearchCV(regressor, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Оптимальная модель
best_regressor = grid_search.best_estimator_
print(f"{grid_search.best_params_}")

# Предикт
y_pred = best_regressor.predict(X_test_scaled)

# Оценка
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE (Improved): {mae}")
print(f"R^2 Score (Improved): {r2}")

{'max_depth': 3, 'min_samples_split': 20}
MAE (Improved): 4.719388106677952
R^2 Score (Improved): 0.7512372868038653


# Регрессия (самостоятельная имплементация)

In [39]:
class DecisionTreeRegressorCustom:
    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def _mae(self, groups, targets):
        mae = 0.0
        for group in groups:
            if len(group) == 0:
                continue
            group_targets = group[:, -1]
            mean_target = np.mean(group_targets)
            mae += np.sum(np.abs(group_targets - mean_target))
        return mae

    def _split(self, index, value, data):
        left = data[data[:, index] <= value]
        right = data[data[:, index] > value]
        return left, right

    def _get_split(self, data):
        best_index, best_value, best_score, best_groups = None, None, float("inf"), None
        for index in range(data.shape[1] - 1):
            for value in np.unique(data[:, index]):
                groups = self._split(index, value, data)
                mae = self._mae(groups, data[:, -1])
                if mae < best_score:
                    best_index, best_value, best_score, best_groups = index, value, mae, groups
        return {'index': best_index, 'value': best_value, 'groups': best_groups}

    def _build_tree(self, data, depth=0):
        X, y = data[:, :-1], data[:, -1]
        if len(np.unique(y)) == 1 or len(data) < self.min_samples_split or (self.max_depth and depth >= self.max_depth):
            return np.mean(y)
        split = self._get_split(data)
        left = self._build_tree(split['groups'][0], depth + 1)
        right = self._build_tree(split['groups'][1], depth + 1)
        return {'index': split['index'], 'value': split['value'], 'left': left, 'right': right}

    def fit(self, X, y):
        data = np.hstack((X, y.reshape(-1, 1)))
        self.tree = self._build_tree(data)

    def _predict_row(self, row, node):
        if not isinstance(node, dict):
            return node
        if row[node['index']] <= node['value']:
            return self._predict_row(row, node['left'])
        return self._predict_row(row, node['right'])

    def predict(self, X):
        return np.array([self._predict_row(row, self.tree) for row in X])

In [40]:
# Обучение модели
tree = DecisionTreeRegressorCustom(max_depth=5, min_samples_split=10)
tree.fit(X_train.values, y_train.values)

# Предикт
y_pred = tree.predict(X_test.values)

# Оценка 
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}")
print(f"R^2: {r2}")

MAE: 4.597153184399963
R^2: 0.7472048899118222
