# Классификация

Получение данных

In [None]:
import kagglehub
import os
import pandas as pd

path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")

csv_path = os.path.join(path, "creditcard.csv")

df = pd.read_csv(csv_path)
df.head()

features = df[['Time', 'Amount']]

Downloading from https://www.kaggle.com/api/v1/datasets/download/mlg-ulb/creditcardfraud?dataset_version_number=3...


100%|██████████| 66.0M/66.0M [00:00<00:00, 98.2MB/s]

Extracting files...





In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_recall_curve, auc, precision_score, recall_score


# Подготовка данных
X = df.drop('Class', axis=1)
y = df['Class']

# Разделение данных
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Масштабирование признаков
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Создание и обучение модели с оптимизированными параметрами
gb_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
    subsample=0.8,
    n_iter_no_change=10  # ранняя остановка
)

gb_model.fit(X_train_scaled, y_train)

# Получение предсказаний
y_pred = gb_model.predict(X_test_scaled)
y_pred_proba = gb_model.predict_proba(X_test_scaled)[:, 1]

# Вычисление метрик
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
auprc = auc(recall, precision)
precision_score_val = precision_score(y_test, y_pred)
recall_score_val = recall_score(y_test, y_pred)

# Вывод метрик
print(f"AUPRC : {auprc:.4f}")
print(f"Precision : {precision_score_val:.4f}")
print(f"Recall : {recall_score_val:.4f}")


AUPRC : 0.4075
Precision : 0.9048
Recall : 0.3878


AUPRC: 0.4075
Значение AUPRC показывает среднюю точность модели при различных уровнях полноты. Учитывая сильный дисбаланс классов в данных, это неплохой результат, хотя есть потенциал для улучшения.

Precision: 0.9048
Очень хороший показатель точности - 90.48% предсказанных мошеннических операций действительно являются мошенническими

Recall: 0.3878
Показатель полноты говорит о том, что модель обнаруживает только 38.78% всех фактических мошеннических операций. Это относительно низкий показатель, который желательно улучшить.

Модель демонстрирует высокую точность в определении мошенничества, но при этом пропускает значительную часть мошеннических операций.

Гипотеза: если добавим методы сэмплирования (smote), то модель значительна улучшится.

Применим

In [None]:

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_recall_curve, auc, precision_score, recall_score

X = df.drop('Class', axis=1)
y = df['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Масштабирование признаков
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Применение SMOTE
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

gb_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
    subsample=0.8,
    n_iter_no_change=5
)

gb_model.fit(X_train_smote, y_train_smote)

y_pred = gb_model.predict(X_test_scaled)
y_pred_proba = gb_model.predict_proba(X_test_scaled)[:, 1]

precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
auprc = auc(recall, precision)
precision_score_val = precision_score(y_test, y_pred)
recall_score_val = recall_score(y_test, y_pred)

print(f"AUPRC Score: {auprc:.4f}")
print(f"Precision Score: {precision_score_val:.4f}")
print(f"Recall Score: {recall_score_val:.4f}")






AUPRC Score: 0.7437
Precision Score: 0.1141
Recall Score: 0.9184



AUPRC (0.7437) демонстрирует приемлемую общую эффективность модели. Высокий показатель Recall (0.9184) указывает на отличное обнаружение положительных случаев. Однако низкий Precision (0.1141) свидетельствует о значительном количестве ложноположительных результатов. Модель склонна к переобнаружению мошеннических транзакций.

In [3]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import precision_recall_curve, auc, precision_score, recall_score

class MyGradientBoostingRegressor:
    def __init__(self, n_estimators=100, learning_rate=0.05, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.trees = []
        self.initial_value = None

    def fit(self, features, target):
        self.initial_value = np.mean(target)
        residual = target - self.initial_value

        for _ in range(self.n_estimators):
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(features, residual)
            predictions = tree.predict(features)
            residual -= self.learning_rate * predictions
            self.trees.append(tree)

    def predict(self, features):
        predictions = np.full(features.shape[0], self.initial_value)
        for tree in self.trees:
            predictions += self.learning_rate * tree.predict(features)
        return predictions

    def predict_proba(self, features):
        # Преобразование регрессионных предсказаний в вероятности
        raw_predictions = self.predict(features)
        probabilities = 1 / (1 + np.exp(-raw_predictions))
        return np.vstack((1 - probabilities, probabilities)).T

# Подготовка данных
X = df.drop('Class', axis=1)
y = df['Class']

# Разделение данных
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Масштабирование признаков
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Создание и обучение модели
gb_model = MyGradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3
)

gb_model.fit(X_train_scaled, y_train)

# Получение предсказаний
y_pred_proba = gb_model.predict_proba(X_test_scaled)[:, 1]
y_pred = (y_pred_proba > 0.5).astype(int)

# Вычисление метрик
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
auprc = auc(recall, precision)
precision_score_val = precision_score(y_test, y_pred)
recall_score_val = recall_score(y_test, y_pred)

# Вывод метрик
print(f"AUPRC : {auprc:.4f}")
print(f"Precision : {precision_score_val:.4f}")
print(f"Recall : {recall_score_val:.4f}")


AUPRC : 0.8492
Precision : 0.0017
Recall : 1.0000


Попытаемся улучшить метрики с приминением гиперпараметров

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import precision_recall_curve, auc, precision_score, recall_score
from sklearn.base import BaseEstimator, RegressorMixin

class OptimizedGradientBoostingRegressor(BaseEstimator, RegressorMixin):
    def init(self, n_estimators=100, learning_rate=0.05, max_depth=3, min_samples_split=2,
                 min_samples_leaf=1, subsample=1.0, random_state=None):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.subsample = subsample
        self.random_state = random_state
        self.trees = []
        self.feature_importances_ = None

    def _subsample(self, X, y):
        if self.subsample == 1.0:
            return X, y
        n_samples = int(X.shape[0] * self.subsample)
        indices = np.random.choice(X.shape[0], n_samples, replace=False)
        return X[indices], y[indices]

    def fit(self, X, y):
        np.random.seed(self.random_state)
        self.initial_prediction = np.mean(y)
        current_predictions = np.full_like(y, self.initial_prediction, dtype=np.float64)

        # Инициализация массива важности признаков
        self.feature_importances_ = np.zeros(X.shape[1])

        for i in range(self.n_estimators):
            residuals = y - current_predictions

            # Подвыборка данных
            X_subset, residuals_subset = self._subsample(X, residuals)

            # Создание и обучение дерева
            tree = DecisionTreeRegressor(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                random_state=self.random_state
            )

            tree.fit(X_subset, residuals_subset)

            # Обновление важности признаков
            self.feature_importances_ += tree.feature_importances_

            # Обновление предсказаний
            current_predictions += self.learning_rate * tree.predict(X)
            self.trees.append(tree)

        # Нормализация важности признаков
        self.feature_importances_ /= self.n_estimators
        return self

    def predict(self, X):
        predictions = np.full(X.shape[0], self.initial_prediction)
        for tree in self.trees:
            predictions += self.learning_rate * tree.predict(X)
        return predictions

    def predict_proba(self, X):
        raw_predictions = self.predict(X)
        probabilities = 1 / (1 + np.exp(-raw_predictions))
        return np.vstack((1 - probabilities, probabilities)).T

# Функция для поиска оптимальных гиперпараметров
def find_best_params(X_train, y_train):
    param_grid = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 4, 5],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'subsample': [0.8, 1.0]
    }

    model = OptimizedGradientBoostingRegressor()
    grid_search = GridSearchCV(
        model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    return grid_search.best_params_

# Использование модели
X = df.drop('Class', axis=1)
y = df['Class']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Поиск лучших параметров
best_params = find_best_params(X_train_scaled, y_train)

# Обучение модели с лучшими параметрами
optimized_model = OptimizedGradientBoostingRegressor(**best_params, random_state=42)
optimized_model.fit(X_train_scaled, y_train)

# Получение предсказаний
y_pred_proba = optimized_model.predict_proba(X_test_scaled)[:, 1]
y_pred = (y_pred_proba > 0.5).astype(int)

# Вычисление и вывод метрик
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
auprc = auc(recall, precision)
precision_score_val = precision_score(y_test, y_pred)
recall_score_val = recall_score(y_test, y_pred)

print(f"AUPRC: {auprc:.4f}")
print(f"Precision: {precision_score_val:.4f}")
print(f"Recall: {recall_score_val:.4f}")

## Регрессия

In [4]:
import kagglehub
import os

path = kagglehub.dataset_download("rohitsahoo/sales-forecasting")

csv_path = os.path.join(path, "train.csv")

df = pd.read_csv(csv_path)
df.head()

Downloading from https://www.kaggle.com/api/v1/datasets/download/rohitsahoo/sales-forecasting?dataset_version_number=2...


100%|██████████| 480k/480k [00:00<00:00, 17.8MB/s]

Extracting files...





Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,State,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales
0,1,CA-2017-152156,08/11/2017,11/11/2017,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420.0,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96
1,2,CA-2017-152156,08/11/2017,11/11/2017,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420.0,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94
2,3,CA-2017-138688,12/06/2017,16/06/2017,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,California,90036.0,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62
3,4,US-2016-108966,11/10/2016,18/10/2016,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311.0,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775
4,5,US-2016-108966,11/10/2016,18/10/2016,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311.0,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368


In [7]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Создаем копию датафрейма
df = df.copy()

# Заполняем пропуски в Postal Code
df.loc[:, 'Postal Code'] = df['Postal Code'].fillna(df['Postal Code'].median())

# Преобразуем даты с указанием формата и обработкой ошибок
try:
    df['Order Date'] = pd.to_datetime(df['Order Date'], format='%d/%m/%Y')
except:
    df['Order Date'] = pd.to_datetime(df['Order Date'])

try:
    df['Ship Date'] = pd.to_datetime(df['Ship Date'], format='%d/%m/%Y')
except:
    df['Ship Date'] = pd.to_datetime(df['Ship Date'])

# Создаем новые признаки
df['Ship Days'] = (df['Ship Date'] - df['Order Date']).dt.total_seconds() / (24 * 60 * 60)
df['Order Year'] = df['Order Date'].dt.year
df['Order Month'] = df['Order Date'].dt.month

# Кодируем категориальные признаки
categorical_columns = ['Ship Mode', 'Segment', 'Country', 'Region', 'Category', 'Sub-Category']
le = LabelEncoder()
for col in categorical_columns:
    df.loc[:, col + '_Encoded'] = le.fit_transform(df[col])

# Выбираем признаки для модели
features = ['Postal Code', 'Order Year', 'Order Month', 'Ship Days'] + \
          [col + '_Encoded' for col in categorical_columns]

# Подготовка данных для модели
X = df[features]
y = df['Sales']

# Разделяем данные на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Создаем и обучаем модель
model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
model.fit(X_train, y_train)

# Делаем предсказания
y_pred = model.predict(X_test)

# Вычисляем метрики
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Выводим метрики
print(f'MAE: {mae:.2f}')
print(f'MSE: {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'R²: {r2:.4f}')



MAE: 254.51
MSE: 647872.62
RMSE: 804.91
R²: 0.0307

Проверка преобразования дат:
  Order Date  Ship Date  Ship Days
0 2017-11-08 2017-11-11        3.0
1 2017-11-08 2017-11-11        3.0
2 2017-06-12 2017-06-16        4.0
3 2016-10-11 2016-10-18        7.0
4 2016-10-11 2016-10-18        7.0


Данные метрик неплохие, но попробуем подобрать оптимальные гиперпараметры

In [8]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Заполняем пропуски в Postal Code
df['Postal Code'].fillna(df['Postal Code'].median(), inplace=True)

# Преобразуем даты
df['Order Date'] = pd.to_datetime(df['Order Date'])
df['Ship Date'] = pd.to_datetime(df['Ship Date'])
df['Ship Days'] = (df['Ship Date'] - df['Order Date']).dt.days
df['Order Year'] = df['Order Date'].dt.year
df['Order Month'] = df['Order Date'].dt.month

# Кодируем категориальные признаки
categorical_columns = ['Ship Mode', 'Segment', 'Country', 'Region', 'Category', 'Sub-Category']
le = LabelEncoder()
for col in categorical_columns:
    df[col + '_Encoded'] = le.fit_transform(df[col])

# Выбираем признаки для модели
features = ['Postal Code', 'Order Year', 'Order Month', 'Ship Days'] + \
          [col + '_Encoded' for col in categorical_columns]

# Подготовка данных для модели
X = df[features]
y = df['Sales']

# Разделяем данные на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Определяем параметры для оптимизации
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'subsample': [0.8, 1.0]
}

# Создаем базовую модель
base_model = GradientBoostingRegressor(random_state=42)

# Выполняем поиск по сетке с перекрестной проверкой
grid_search = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring='neg_mean_squared_error',
    verbose=1
)

# Обучаем модель с поиском оптимальных параметров
grid_search.fit(X_train, y_train)

# Выводим лучшие параметры
print("\nЛучшие параметры:")
print(grid_search.best_params_)

# Получаем лучшую модель
best_model = grid_search.best_estimator_

# Делаем предсказания с использованием лучшей модели
y_pred = best_model.predict(X_test)

# Вычисляем метрики
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Выводим метрики
print("\nМетрики качества модели:")
print(f'MAE: {mae:.2f}')
print(f'MSE: {mse:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'R²: {r2:.4f}')



Fitting 5 folds for each of 216 candidates, totalling 1080 fits


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Postal Code'].fillna(df['Postal Code'].median(), inplace=True)



Лучшие параметры:
{'learning_rate': 0.1, 'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 50, 'subsample': 1.0}

Метрики качества модели:
MAE: 242.86
MSE: 561605.74
RMSE: 749.40
R²: 0.1598
