## Регрессия

### Предобработка данных

In [36]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

data = pd.read_csv('anime.csv')

data = data[~data.applymap(lambda x: x == 'UNKNOWN').any(axis=1)]

y = data['Score']

encoder = OneHotEncoder(sparse_output=False)
x_types = encoder.fit_transform(data[['Type']])
x_types = pd.DataFrame(x_types, columns=encoder.get_feature_names_out(['Type']))

x_episodes = pd.DataFrame(data['Episodes']).reset_index(drop=True)

# encoder = OneHotEncoder(sparse_output=False)
x_source = encoder.fit_transform(data[['Source']])
x_source = pd.DataFrame(x_source, columns=encoder.get_feature_names_out(['Source']))

# encoder = OneHotEncoder(sparse_output=False)
x_rating = encoder.fit_transform(data[['Rating']])
x_rating = pd.DataFrame(x_rating, columns=encoder.get_feature_names_out(['Rating']))

mlb = MultiLabelBinarizer()
x_producers = data['Producers'].str.split(', ')
x_producers = mlb.fit_transform(x_producers)
x_producers = pd.DataFrame(x_producers, columns=mlb.classes_)

x_licensors = data['Licensors'].str.split(', ')
x_licensors = mlb.fit_transform(x_licensors)
x_licensors = pd.DataFrame(x_licensors, columns=mlb.classes_)

x_studios = data['Studios'].str.split(', ')
x_studios = mlb.fit_transform(x_studios)
x_studios = pd.DataFrame(x_studios, columns=mlb.classes_)

x_premiered = data['Premiered'].str.split(' ')
x_season = pd.DataFrame(x_premiered.apply(lambda x: x[0]).tolist(), columns=['Season'])
x_year = pd.DataFrame(x_premiered.apply(lambda x: x[1]).tolist(), columns=['Year'])
x_year['Year'] = pd.to_numeric(x_year['Year'])

# encoder = OneHotEncoder(sparse_output=False)
x_season = encoder.fit_transform(x_season)
x_season = pd.DataFrame(x_season, columns=encoder.get_feature_names_out(['Season']))

vectorizer = TfidfVectorizer()
x_synopsis = vectorizer.fit_transform(data['Synopsis'])
x_synopsis = pd.DataFrame(x_synopsis.toarray(), columns=vectorizer.get_feature_names_out())

# X = pd.concat([x_types, x_episodes, x_rating, x_season, x_year, x_studios, x_licensors, x_producers, x_synopsis], axis=1)
X = pd.concat([x_types, x_episodes, x_rating, x_season, x_year], axis=1)
scaler = StandardScaler()
X = scaler.fit_transform(X)
# print(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  data = data[~data.applymap(lambda x: x == 'UNKNOWN').any(axis=1)]


### Применение алгоритмов при помощи sklearn

In [37]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import mean_absolute_error

import numpy as np

X_train = np.array(X_train, dtype=float)
y_train = np.array(y_train, dtype=float)
X_test = np.array(X_test, dtype=float)
y_test = np.array(y_test, dtype=float)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')

model = KNeighborsRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')

model = DecisionTreeRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')

model = RandomForestRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')

model = GradientBoostingRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')


MAE: 0.5506963724475853
MAE: 0.565099730458221
MAE: 0.6628440942854689
MAE: 0.5818611820596566
MAE: 0.5277646359381031


### Самостоятельная имплементация линейной регрессии

In [12]:
import numpy as np

class LinearRegressionSGD:
    def __init__(self, learning_rate=0.01, epochs=1000):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.weights = None

    def fit(self, X, y):
        m, n = X.shape
        self.weights = np.zeros(n + 1)
        X_b = np.c_[np.ones((m, 1)), X]

        for epoch in range(self.epochs):
            for i in range(m):
                random_index = np.random.randint(m)
                xi = X_b[random_index:random_index+1]
                yi = y[random_index]
                gradient = 2 * xi.T.dot(xi.dot(self.weights) - yi)
                self.weights -= self.learning_rate * gradient

    def predict(self, X):
        X_b = np.c_[np.ones((X.shape[0], 1)), X]
        return X_b.dot(self.weights)

def mean_absolute_error(y_true, y_pred):
    return np.mean(np.abs(y_true - y_pred))

X_train = np.array(X_train, dtype=float)
y_train = np.array(y_train, dtype=float)
X_test = np.array(X_test, dtype=float)
y_test = np.array(y_test, dtype=float)

model = LinearRegressionSGD(learning_rate=0.01, epochs=1000)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

mae = mean_absolute_error(y_test, predictions)
print("MAE:", mae)

MAE: 0.5991580775966912


### Самостоятельная имплементация KNN

In [13]:
import numpy as np

class KNearestNeighbors:
    def __init__(self, n_neighbors=5):
        self.n_neighbors = n_neighbors
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = np.array(X)
        self.y_train = np.array(y)

    def predict(self, X):
        X = np.array(X)
        predictions = []
        for x in X:
            distances = np.linalg.norm(self.X_train - x, axis=1)
            nearest_indices = np.argsort(distances)[:self.n_neighbors]
            nearest_labels = self.y_train[nearest_indices]
            predictions.append(np.mean(nearest_labels))
        return np.array(predictions)

def mean_absolute_error(y_true, y_pred):
    return np.mean(np.abs(y_true - y_pred))

X_train = np.array(X_train, dtype=float)
y_train = np.array(y_train, dtype=float)
X_test = np.array(X_test, dtype=float)
y_test = np.array(y_test, dtype=float)

model = KNearestNeighbors(n_neighbors=5)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

mae = mean_absolute_error(y_test, predictions)
print("MAE:", mae)

MAE: 0.5738544474393532


### Самостоятельная имплементация решающего дерева

In [14]:
import numpy as np

class DecisionTreeRegressor:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    class Node:
        def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
            self.feature = feature
            self.threshold = threshold
            self.left = left
            self.right = right
            self.value = value

    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)
        self.tree = self._build_tree(X, y, depth=0)

    def _build_tree(self, X, y, depth):
        if len(set(y)) == 1 or (self.max_depth is not None and depth >= self.max_depth):
            return self.Node(value=np.mean(y))

        best_feature, best_threshold = self._find_best_split(X, y)
        if best_feature is None:
            return self.Node(value=np.mean(y))

        left_indices = X[:, best_feature] <= best_threshold
        right_indices = X[:, best_feature] > best_threshold

        left = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return self.Node(feature=best_feature, threshold=best_threshold, left=left, right=right)

    def _find_best_split(self, X, y):
        best_feature, best_threshold = None, None
        best_loss = float('inf')

        for feature in range(X.shape[1]):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                left_indices = X[:, feature] <= threshold
                right_indices = X[:, feature] > threshold

                if len(y[left_indices]) == 0 or len(y[right_indices]) == 0:
                    continue

                left_mean = np.mean(y[left_indices])
                right_mean = np.mean(y[right_indices])

                loss = (np.sum((y[left_indices] - left_mean) ** 2) +
                        np.sum((y[right_indices] - right_mean) ** 2))

                if loss < best_loss:
                    best_loss = loss
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.tree) for x in X])

    def _traverse_tree(self, x, node):
        if node.value is not None:
            return node.value
        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        else:
            return self._traverse_tree(x, node.right)

def mean_absolute_error(y_true, y_pred):
    return np.mean(np.abs(y_true - y_pred))

X_train = np.array(X_train, dtype=float)
y_train = np.array(y_train, dtype=float)
X_test = np.array(X_test, dtype=float)
y_test = np.array(y_test, dtype=float)

model = DecisionTreeRegressor(max_depth=5)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

mae = mean_absolute_error(y_test, predictions)
print("MAE:", mae)


MAE: 0.5518996065967602


### Самостоятельная имплементация случайного леса

In [15]:
import numpy as np

class DecisionTreeRegressor:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    class Node:
        def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
            self.feature = feature
            self.threshold = threshold
            self.left = left
            self.right = right
            self.value = value

    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)
        self.tree = self._build_tree(X, y, depth=0)

    def _build_tree(self, X, y, depth):
        if len(set(y)) == 1 or (self.max_depth is not None and depth >= self.max_depth):
            return self.Node(value=np.mean(y))

        best_feature, best_threshold = self._find_best_split(X, y)
        if best_feature is None:
            return self.Node(value=np.mean(y))

        left_indices = X[:, best_feature] <= best_threshold
        right_indices = X[:, best_feature] > best_threshold

        left = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return self.Node(feature=best_feature, threshold=best_threshold, left=left, right=right)

    def _find_best_split(self, X, y):
        best_feature, best_threshold = None, None
        best_loss = float('inf')

        for feature in range(X.shape[1]):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                left_indices = X[:, feature] <= threshold
                right_indices = X[:, feature] > threshold

                if len(y[left_indices]) == 0 or len(y[right_indices]) == 0:
                    continue

                left_mean = np.mean(y[left_indices])
                right_mean = np.mean(y[right_indices])

                loss = (np.sum((y[left_indices] - left_mean) ** 2) +
                        np.sum((y[right_indices] - right_mean) ** 2))

                if loss < best_loss:
                    best_loss = loss
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.tree) for x in X])

    def _traverse_tree(self, x, node):
        if node.value is not None:
            return node.value
        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        else:
            return self._traverse_tree(x, node.right)

class RandomForestRegressor:
    def __init__(self, n_estimators=10, max_depth=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.trees = []

    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)
        for _ in range(self.n_estimators):
            indices = np.random.choice(len(X), len(X), replace=True)
            X_sample = X[indices]
            y_sample = y[indices]
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def predict(self, X):
        predictions = np.array([tree.predict(X) for tree in self.trees])
        return np.mean(predictions, axis=0)

def mean_absolute_error(y_true, y_pred):
    return np.mean(np.abs(y_true - y_pred))

X_train = np.array(X_train, dtype=float)
y_train = np.array(y_train, dtype=float)
X_test = np.array(X_test, dtype=float)
y_test = np.array(y_test, dtype=float)

model = RandomForestRegressor(n_estimators=10, max_depth=5)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

mae = mean_absolute_error(y_test, predictions)
print("MAE:", mae)


MAE: 0.534441994674516


### Самостоятельная имплементация градиентного бустинга

In [18]:
import numpy as np

class DecisionTreeRegressor:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    class Node:
        def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
            self.feature = feature
            self.threshold = threshold
            self.left = left
            self.right = right
            self.value = value

    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)
        self.tree = self._build_tree(X, y, depth=0)

    def _build_tree(self, X, y, depth):
        if len(set(y)) == 1 or (self.max_depth is not None and depth >= self.max_depth):
            return self.Node(value=np.mean(y))

        best_feature, best_threshold = self._find_best_split(X, y)
        if best_feature is None:
            return self.Node(value=np.mean(y))

        left_indices = X[:, best_feature] <= best_threshold
        right_indices = X[:, best_feature] > best_threshold

        left = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return self.Node(feature=best_feature, threshold=best_threshold, left=left, right=right)

    def _find_best_split(self, X, y):
        best_feature, best_threshold = None, None
        best_loss = float('inf')

        for feature in range(X.shape[1]):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                left_indices = X[:, feature] <= threshold
                right_indices = X[:, feature] > threshold

                if len(y[left_indices]) == 0 or len(y[right_indices]) == 0:
                    continue

                left_mean = np.mean(y[left_indices])
                right_mean = np.mean(y[right_indices])

                loss = (np.sum((y[left_indices] - left_mean) ** 2) +
                        np.sum((y[right_indices] - right_mean) ** 2))

                if loss < best_loss:
                    best_loss = loss
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.tree) for x in X])

    def _traverse_tree(self, x, node):
        if node.value is not None:
            return node.value
        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        else:
            return self._traverse_tree(x, node.right)

class GradientBoostingRegressor:
    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.trees = []

    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)
        predictions = np.zeros_like(y)

        for _ in range(self.n_estimators):
            residuals = y - predictions
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X, residuals)
            self.trees.append(tree)
            predictions += self.learning_rate * tree.predict(X)

    def predict(self, X):
        X = np.array(X)
        predictions = np.zeros(X.shape[0])
        for tree in self.trees:
            predictions += self.learning_rate * tree.predict(X)
        return predictions

def mean_absolute_error(y_true, y_pred):
    return np.mean(np.abs(y_true - y_pred))

X_train = np.array(X_train, dtype=float)
y_train = np.array(y_train, dtype=float)
X_test = np.array(X_test, dtype=float)
y_test = np.array(y_test, dtype=float)

model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

mae = mean_absolute_error(y_test, predictions)
print("MAE:", mae)

MAE: 0.5309886123446206


## Классификация

### Разведывательный анализ данных

In [None]:
print(data['type'].value_counts())

type
INFP    1832
INFJ    1470
INTP    1304
INTJ    1091
ENTP     685
ENFP     675
ISTP     337
ISFP     271
ENTJ     231
ISTJ     205
ENFJ     190
ISFJ     166
ESTP      89
ESFP      48
ESFJ      42
ESTJ      39
Name: count, dtype: int64


### Предобработка данных

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import numpy as np

data = pd.read_csv('mbti.csv')[:1000]
X, y = data['posts'], data['type']

class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)

vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')
X = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Реализация при помощи sklearn

In [None]:
model = LogisticRegression(class_weight=dict(zip(np.unique(y), class_weights)))
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"LogisticRegression f1_score: {f1_score(y_test, y_pred, average='weighted')}")

# model = KNeighborsClassifier()
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# print(f"KNeighborsClassifier f1_score: {f1_score(y_test, y_pred, average='weighted')}")

# model = DecisionTreeClassifier(class_weight=dict(zip(np.unique(y), class_weights)))
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# print(f"DecisionTreeClassifier f1_score: {f1_score(y_test, y_pred, average='weighted')}")

# model = RandomForestClassifier(class_weight=dict(zip(np.unique(y), class_weights)))
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# print(f"RandomForestClassifier f1_score: {f1_score(y_test, y_pred, average='weighted')}")

# model = GradientBoostingClassifier(n_estimators=5)
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# print(f"GradientBoostingClassifier f1_score: {f1_score(y_test, y_pred, average='weighted')}")

LogisticRegression f1_score: 0.48435489613633786


### Самостоятельная имплементация логистической регрессии

In [None]:
import numpy as np

from scipy.sparse import issparse
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

if issparse(X_train):
    X_train = X_train.toarray()
if issparse(X_test):
    X_test = X_test.toarray()

X_train = np.array(X_train, dtype=float)
X_test = np.array(X_test, dtype=float)

class SoftmaxRegression:
    def __init__(self, learning_rate=0.01, epochs=1000):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.weights = None

    def softmax(self, z):
        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))  # Численная стабильность
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)
        m, n = X.shape
        num_classes = len(np.unique(y))
        self.weights = np.zeros((n + 1, num_classes))
        X_b = np.c_[np.ones((m, 1)), X]

        # One-hot encoding целевых меток
        y_one_hot = np.zeros((m, num_classes))
        y_one_hot[np.arange(m), y] = 1

        for epoch in range(self.epochs):
            logits = X_b.dot(self.weights)
            probabilities = self.softmax(logits)
            gradient = X_b.T.dot(probabilities - y_one_hot) / m
            self.weights -= self.learning_rate * gradient

    def predict(self, X):
        X = np.array(X)
        X_b = np.c_[np.ones((X.shape[0], 1)), X]
        logits = X_b.dot(self.weights)
        probabilities = self.softmax(logits)
        return np.argmax(probabilities, axis=1)

def accuracy_score(y_true, y_pred):
    return np.mean(y_true == y_pred)

# Данные
X_train = np.array(X_train, dtype=float)
y_train = np.array(y_train, dtype=int)
X_test = np.array(X_test, dtype=float)
y_test = np.array(y_test, dtype=int)

model = SoftmaxRegression(learning_rate=0.01, epochs=1000)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)




Accuracy: 0.26


### Самостоятельная имплементация KNN

In [None]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import issparse

class KNearestNeighbors:
    def __init__(self, n_neighbors=5):
        self.n_neighbors = n_neighbors
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = np.array(X)
        self.y_train = np.array(y)

    def predict(self, X):
        X = np.array(X)
        predictions = []
        for x in X:
            distances = np.linalg.norm(self.X_train - x, axis=1)
            nearest_indices = np.argsort(distances)[:self.n_neighbors]
            nearest_labels = self.y_train[nearest_indices]
            # Используем мажоритарное голосование
            unique, counts = np.unique(nearest_labels, return_counts=True)
            predictions.append(unique[np.argmax(counts)])
        return np.array(predictions)

def accuracy_score(y_true, y_pred):
    return np.mean(y_true == y_pred)

# Кодирование текстовых меток
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# Преобразование разреженных матриц в плотные
if issparse(X_train):
    X_train = X_train.toarray()
if issparse(X_test):
    X_test = X_test.toarray()

X_train = np.array(X_train, dtype=float)
X_test = np.array(X_test, dtype=float)

model = KNearestNeighbors(n_neighbors=5)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)


Accuracy: 0.295


### Самостоятельная имплементация дерева решений

In [None]:
import numpy as np

from scipy.sparse import issparse
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

if issparse(X_train):
    X_train = X_train.toarray()
if issparse(X_test):
    X_test = X_test.toarray()

X_train = np.array(X_train, dtype=float)
X_test = np.array(X_test, dtype=float)

class DecisionTreeClassifier:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    class Node:
        def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
            self.feature = feature
            self.threshold = threshold
            self.left = left
            self.right = right
            self.value = value

    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)
        self.tree = self._build_tree(X, y, depth=0)

    def _build_tree(self, X, y, depth):
        if len(np.unique(y)) == 1 or (self.max_depth is not None and depth >= self.max_depth):
            return self.Node(value=self._majority_vote(y))

        best_feature, best_threshold = self._find_best_split(X, y)
        if best_feature is None:
            return self.Node(value=self._majority_vote(y))

        left_indices = X[:, best_feature] <= best_threshold
        right_indices = X[:, best_feature] > best_threshold

        left = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return self.Node(feature=best_feature, threshold=best_threshold, left=left, right=right)

    def _find_best_split(self, X, y):
        best_feature, best_threshold = None, None
        best_gini = float('inf')

        for feature in range(X.shape[1]):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                left_indices = X[:, feature] <= threshold
                right_indices = X[:, feature] > threshold

                if len(y[left_indices]) == 0 or len(y[right_indices]) == 0:
                    continue

                gini = self._gini_impurity(y[left_indices], y[right_indices])

                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def _gini_impurity(self, left, right):
        def gini(labels):
            _, counts = np.unique(labels, return_counts=True)
            probs = counts / len(labels)
            return 1 - np.sum(probs ** 2)

        m = len(left) + len(right)
        gini_left = gini(left)
        gini_right = gini(right)
        return (len(left) / m) * gini_left + (len(right) / m) * gini_right

    def _majority_vote(self, y):
        unique, counts = np.unique(y, return_counts=True)
        return unique[np.argmax(counts)]

    def predict(self, X):
        X = np.array(X)
        return np.array([self._traverse_tree(x, self.tree) for x in X])

    def _traverse_tree(self, x, node):
        if node.value is not None:
            return node.value
        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        else:
            return self._traverse_tree(x, node.right)

def accuracy_score(y_true, y_pred):
    return np.mean(y_true == y_pred)

X_train = np.array(X_train, dtype=float)
y_train = np.array(y_train, dtype=int)
X_test = np.array(X_test, dtype=float)
y_test = np.array(y_test, dtype=int)

model = DecisionTreeClassifier(max_depth=5)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)


Accuracy: 0.41


### Самостоятельная имплементация случайного леса

In [None]:
import numpy as np

from scipy.sparse import issparse
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

if issparse(X_train):
    X_train = X_train.toarray()
if issparse(X_test):
    X_test = X_test.toarray()

X_train = np.array(X_train, dtype=float)
X_test = np.array(X_test, dtype=float)

class DecisionTreeClassifier:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    class Node:
        def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
            self.feature = feature
            self.threshold = threshold
            self.left = left
            self.right = right
            self.value = value

    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)
        self.tree = self._build_tree(X, y, depth=0)

    def _build_tree(self, X, y, depth):
        if len(np.unique(y)) == 1 or (self.max_depth is not None and depth >= self.max_depth):
            return self.Node(value=self._majority_vote(y))

        best_feature, best_threshold = self._find_best_split(X, y)
        if best_feature is None:
            return self.Node(value=self._majority_vote(y))

        left_indices = X[:, best_feature] <= best_threshold
        right_indices = X[:, best_feature] > best_threshold

        left = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return self.Node(feature=best_feature, threshold=best_threshold, left=left, right=right)

    def _find_best_split(self, X, y):
        best_feature, best_threshold = None, None
        best_gini = float('inf')

        for feature in range(X.shape[1]):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                left_indices = X[:, feature] <= threshold
                right_indices = X[:, feature] > threshold

                if len(y[left_indices]) == 0 or len(y[right_indices]) == 0:
                    continue

                gini = self._gini_impurity(y[left_indices], y[right_indices])

                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def _gini_impurity(self, left, right):
        def gini(labels):
            _, counts = np.unique(labels, return_counts=True)
            probs = counts / len(labels)
            return 1 - np.sum(probs ** 2)

        m = len(left) + len(right)
        gini_left = gini(left)
        gini_right = gini(right)
        return (len(left) / m) * gini_left + (len(right) / m) * gini_right

    def _majority_vote(self, y):
        unique, counts = np.unique(y, return_counts=True)
        return unique[np.argmax(counts)]

    def predict(self, X):
        X = np.array(X)
        return np.array([self._traverse_tree(x, self.tree) for x in X])

    def _traverse_tree(self, x, node):
        if node.value is not None:
            return node.value
        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        else:
            return self._traverse_tree(x, node.right)

class RandomForestClassifier:
    def __init__(self, n_estimators=10, max_depth=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.trees = []

    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)
        for _ in range(self.n_estimators):
            indices = np.random.choice(len(X), len(X), replace=True)
            X_sample = X[indices]
            y_sample = y[indices]
            tree = DecisionTreeClassifier(max_depth=self.max_depth)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def predict(self, X):
        X = np.array(X)
        predictions = np.array([tree.predict(X) for tree in self.trees])
        return np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=predictions)

def accuracy_score(y_true, y_pred):
    return np.mean(y_true == y_pred)

X_train = np.array(X_train, dtype=float)
y_train = np.array(y_train, dtype=int)
X_test = np.array(X_test, dtype=float)
y_test = np.array(y_test, dtype=int)

model = RandomForestClassifier(n_estimators=1, max_depth=1)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)


Accuracy: 0.235


### Самостоятельная имплементация градиентного бустинга

In [None]:
import numpy as np

from scipy.sparse import issparse
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

if issparse(X_train):
    X_train = X_train.toarray()
if issparse(X_test):
    X_test = X_test.toarray()

X_train = np.array(X_train, dtype=float)
X_test = np.array(X_test, dtype=float)

class DecisionTreeRegressor:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    class Node:
        def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
            self.feature = feature
            self.threshold = threshold
            self.left = left
            self.right = right
            self.value = value

    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)
        self.tree = self._build_tree(X, y, depth=0)

    def _build_tree(self, X, y, depth):
        if len(np.unique(y)) == 1 or (self.max_depth is not None and depth >= self.max_depth):
            return self.Node(value=np.mean(y))

        best_feature, best_threshold = self._find_best_split(X, y)
        if best_feature is None:
            return self.Node(value=np.mean(y))

        left_indices = X[:, best_feature] <= best_threshold
        right_indices = X[:, best_feature] > best_threshold

        left = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return self.Node(feature=best_feature, threshold=best_threshold, left=left, right=right)

    def _find_best_split(self, X, y):
        best_feature, best_threshold = None, None
        best_loss = float('inf')

        for feature in range(X.shape[1]):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                left_indices = X[:, feature] <= threshold
                right_indices = X[:, feature] > threshold

                if len(y[left_indices]) == 0 or len(y[right_indices]) == 0:
                    continue

                left_mean = np.mean(y[left_indices])
                right_mean = np.mean(y[right_indices])

                loss = (np.sum((y[left_indices] - left_mean) ** 2) +
                        np.sum((y[right_indices] - right_mean) ** 2))

                if loss < best_loss:
                    best_loss = loss
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.tree) for x in X])

    def _traverse_tree(self, x, node):
        if node.value is not None:
            return node.value
        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        else:
            return self._traverse_tree(x, node.right)

class GradientBoostingClassifier:
    def __init__(self, n_estimators=10, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.trees = []
        self.init_prediction = None

    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)
        self.init_prediction = np.mean(y)
        predictions = np.full_like(y, self.init_prediction, dtype=float)

        for _ in range(self.n_estimators):
            residuals = y - predictions
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X, residuals)
            self.trees.append(tree)
            predictions += self.learning_rate * tree.predict(X)

    def predict(self, X):
        X = np.array(X)
        predictions = np.full(X.shape[0], self.init_prediction, dtype=float)
        for tree in self.trees:
            predictions += self.learning_rate * tree.predict(X)
        return (predictions >= 0.5).astype(int)

def accuracy_score(y_true, y_pred):
    return np.mean(y_true == y_pred)

X_train = np.array(X_train, dtype=float)
y_train = np.array(y_train, dtype=int)
X_test = np.array(X_test, dtype=float)
y_test = np.array(y_test, dtype=int)

model = GradientBoostingClassifier(n_estimators=1, learning_rate=0.1, max_depth=1)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.09
