In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

In [2]:
# Генерируем случайные данные
X, y = make_regression(n_samples=100, n_features=5, noise=0.1, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Дерево

In [3]:
# Дерево из первой домашки

class MyDecisionTree:
    def __init__(self, max_depth=None, min_samples_leaf=1):
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.tree = None

    def gini(self, y):
        classes, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        gini_impurity = 1 - np.sum(probabilities ** 2)
        return gini_impurity

    def split(self, X, y, feature_index, threshold):
        mask = X[:, feature_index] <= threshold
        return X[mask], y[mask], X[~mask], y[~mask]

    def find_best_split(self, X, y):
        m, n = X.shape
        if m <= 1:
            return None, None

        current_gini = self.gini(y)
        best_gini = 1
        best_feature = None
        best_threshold = None

        for feature_index in range(n):
            thresholds = np.unique(X[:, feature_index])
            for threshold in thresholds:
                X_left, y_left, X_right, y_right = self.split(X, y, feature_index, threshold)

                if len(y_left) < self.min_samples_leaf or len(y_right) < self.min_samples_leaf:
                    continue

                gini_left = self.gini(y_left)
                gini_right = self.gini(y_right)

                weighted_gini = (len(y_left) / m) * gini_left + (len(y_right) / m) * gini_right

                if weighted_gini < best_gini:
                    best_gini = weighted_gini
                    best_feature = feature_index
                    best_threshold = threshold

        return best_feature, best_threshold

    def build_tree(self, X, y, depth):
        if depth == 0 or len(np.unique(y)) == 1:
            return np.mean(y)

        best_feature, best_threshold = self.find_best_split(X, y)

        if best_feature is None:
            return np.mean(y)

        X_left, y_left, X_right, y_right = self.split(X, y, best_feature, best_threshold)

        left_subtree = self.build_tree(X_left, y_left, depth - 1)
        right_subtree = self.build_tree(X_right, y_right, depth - 1)

        return (best_feature, best_threshold, left_subtree, right_subtree)

    def fit(self, X, y):
        self.tree = self.build_tree(X, y.astype(int), self.max_depth)

    def predict_instance(self, x, node):
        if np.isscalar(node):
            return node  # leaf node

        feature, threshold, left_subtree, right_subtree = node
        if x[feature] <= threshold:
            return self.predict_instance(x, left_subtree)
        else:
            return self.predict_instance(x, right_subtree)

    def predict(self, X):
        return np.array([self.predict_instance(x, self.tree) for x in X])

### Градиентный бустинг

In [4]:
class MyGradientBoosting:
    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=None, min_samples_leaf=1):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.trees = []

    def fit(self, X, y):
        predictions = np.zeros_like(y, dtype=np.float64)

        for _ in range(self.n_estimators):
            residuals = y - predictions
            
            tree = MyDecisionTree(max_depth=self.max_depth, min_samples_leaf=self.min_samples_leaf)
            tree.fit(X, residuals)
            
            tree_pred = tree.predict(X)
            predictions += self.learning_rate * tree_pred

            self.trees.append(tree)

    def predict(self, X):
        predictions = np.zeros(len(X))

        for tree in self.trees:
            tree_pred = tree.predict(X)
            predictions += self.learning_rate * tree_pred

        return predictions

In [5]:
# Обучаем градиентный бустинг
gb = MyGradientBoosting(n_estimators=100, learning_rate=0.1, max_depth=3, min_samples_leaf=5)
gb.fit(X_train, y_train)

In [6]:
# Предиктим
y_pred = gb.predict(X_test)

In [7]:
# Оцениваем качество модели
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")
print(f"Mean Absolute Error: {mae}")

Mean Squared Error: 2282.4360685281927
R^2 Score: 0.8863451236457112
Mean Absolute Error: 33.883434156300666
