# ML Research 
Автор: Трефилов Яков Николаевич, R4197

In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.21-py3-none-any.whl.metadata (12 kB)
Collecting plotly (from catboost)
  Downloading plotly-6.5.0-py3-none-any.whl.metadata (8.5 kB)
Collecting narwhals>=1.15.1 (from plotly->catboost)
  Downloading narwhals-2.13.0-py3-none-any.whl.metadata (12 kB)
Downloading catboost-1.2.8-cp310-cp310-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m107.3 MB/s[0m  [33m0:00:00[0mm0:00:01[0m00:01[0m
[?25hDownloading graphviz-0.21-py3-none-any.whl (47 kB)
Downloading plotly-6.5.0-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m83.8 MB/s[0m  [33m0:00:00[0m
[?25hDownloading narwhals-2.13.0-py3-none-any.whl (426 kB)
Installing collected packages: narwhals, graphviz, plotly, catboost
[2K   [90m━━━━━━━━━━━━

In [2]:
import pandas as pd # Для загрузки и предобработки данных
import numpy as np # За компанию

import matplotlib.pyplot as plt # Для красивых визуализаций
import seaborn as sns # Для очень красивых визуализаций

from sklearn.model_selection import train_test_split # Для разбития выборки на test и val - тестовую и валидационную
from sklearn.preprocessing import (StandardScaler, # StandartScaler - для нормирования данных (среднее = 0, стандартное отклонение = 1)
                                   OneHotEncoder) # OneHotEncoder для one-hot encoding
from sklearn.decomposition import PCA # Метод главных компонент для удаления выбросов
from sklearn.cluster import KMeans # Для кластеризации класса
from sklearn.model_selection import GridSearchCV # GridSearch для подбора гиперпараметров
from sklearn.metrics import (accuracy_score, # accuracy метрика
                             f1_score, # f1 метрика
                             classification_report, # матрица смежности по precision, recall, f1-score
                             confusion_matrix, # Для построения матрицы смежности (по accuracy)
                             precision_score,
                             recall_score,
                             roc_auc_score
)
from sklearn.base import BaseEstimator, TransformerMixin


from sklearn.metrics import precision_recall_curve, auc # ---
from sklearn.pipeline import make_pipeline # Пайплайны для объединения предобработки и предсказания в одну модель

from catboost import CatBoostClassifier # catboost - гвоздь программы

import pickle # Для выгрузки модели

In [3]:
path = 'train.csv'

In [4]:
df = pd.read_csv(path, index_col='id')
df = df.iloc[:5000000]
df

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,Male,21,1,35.0,0,1-2 Year,Yes,65101.0,124.0,187,0
1,Male,43,1,28.0,0,> 2 Years,Yes,58911.0,26.0,288,1
2,Female,25,1,14.0,1,< 1 Year,No,38043.0,152.0,254,0
3,Female,35,1,1.0,0,1-2 Year,Yes,2630.0,156.0,76,0
4,Female,36,1,15.0,1,1-2 Year,No,31951.0,152.0,294,0
...,...,...,...,...,...,...,...,...,...,...,...
4999995,Female,74,1,50.0,0,1-2 Year,No,2630.0,124.0,146,0
4999996,Female,23,1,50.0,1,< 1 Year,No,35818.0,152.0,211,0
4999997,Female,35,1,37.0,1,1-2 Year,No,30307.0,152.0,226,0
4999998,Female,38,1,37.0,0,1-2 Year,Yes,28302.0,154.0,157,1


In [5]:
X = df.drop(columns='Response')
y = df['Response']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.5,
                                                  random_state=42)

data_mask = X_train.Annual_Premium < 100000
X_train = X_train[data_mask]
y_train = y_train[data_mask]

In [6]:
class Preprocess_catboost(BaseEstimator, TransformerMixin):
  def __init__(self):
    return

  def fit(self, X, y=None):
    X_train = X.copy()

    # Добавление Age_square
    self.X_train_age_mean = X_train['Age'].mean()
    X_train['Age_square'] = (X_train['Age'] - self.X_train_age_mean)**2


    # Переводим пол в числа
    self.gender_mapping = {'Male': 1, 'Female': 0}
    X_train['Gender_Code'] = X_train['Gender'].map(self.gender_mapping)

    # Кластеризуем регионы с учетом каналов продаж
    # Группируем по Region_Code и Policy_Sales_Channel
    region_stats = X_train.groupby(['Region_Code', 'Policy_Sales_Channel']).agg({
        'Gender_Code': 'mean',
        'Driving_License': 'mean',
        'Annual_Premium': 'mean',
        'Vintage': 'mean',
        'Age': 'mean',
        'Age_square': 'mean'
    }).reset_index()

    self.kmeans_region = KMeans(n_clusters=7, random_state=42)
    self.kmeans_region = self.kmeans_region.fit(region_stats.drop(['Region_Code', 'Policy_Sales_Channel'], axis=1))

    return self

  def transform(self, X):
    X_test = X.copy()
    int_columns = ['Region_Code', 'Policy_Sales_Channel']
    for col_name in int_columns:
      X_test[col_name] = X_test[col_name].astype('int64')

    X_test['Age_square'] = (X_test['Age'] - self.X_train_age_mean)**2
    X_test['Gender_Code'] = X_test['Gender'].map(self.gender_mapping)

    region_stats_test = X_test.groupby(['Region_Code', 'Policy_Sales_Channel']).agg({
        'Gender_Code': 'mean',
        'Driving_License': 'mean',
        'Annual_Premium': 'mean',
        'Vintage': 'mean',
        'Age': 'mean',
        'Age_square': 'mean'
    }).reset_index()
    region_stats_test['Region_Cluster'] = self.kmeans_region.predict(region_stats_test.drop(['Region_Code', 'Policy_Sales_Channel'], axis=1))
    region_cluster_map_test = region_stats_test.set_index(['Region_Code', 'Policy_Sales_Channel'])['Region_Cluster'].to_dict()
    X_test['Region_Cluster'] = X_test.set_index(['Region_Code', 'Policy_Sales_Channel']).index.map(region_cluster_map_test)

    X_test.drop(['Gender_Code'], axis=1, inplace=True)

    return X_test

In [7]:
categorical_features = ['Gender', 'Driving_License', 'Region_Code',
                        'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage',
                        'Policy_Sales_Channel', 'Region_Cluster']

In [8]:
# Более тщательный подбор гиперпараметров
catboost_model = CatBoostClassifier(
    cat_features=categorical_features,
    iterations=5000,
    learning_rate=0.05,
    depth=8,
    l2_leaf_reg=3,
    random_strength=0.5,  # добавить регуляризацию
    bagging_temperature=0.5,
    early_stopping_rounds=100,
    auto_class_weights='Balanced',  # автоматические веса классов
    verbose=100,
    random_state=42,
    used_ram_limit='6gb',  # Ограничение 6 ГБ
    task_type='CPU'
)

In [9]:
cat_boost_pipeline = make_pipeline(Preprocess_catboost(), catboost_model)

In [10]:
# Берём шаг препроцессинга из пайплайна
pre = cat_boost_pipeline.named_steps['preprocess_catboost']

# Фитим препроцессор руками
pre.fit(X_train)

# Трансформируем валидацию
X_val_proc = pre.transform(X_val)

In [11]:
cat_boost_pipeline.fit(X_train, y_train,
                       catboostclassifier__eval_set=(X_val_proc, y_val))

# Предсказание
y_pred = cat_boost_pipeline.predict(X_val)
f1 = f1_score(y_val, y_pred)
accuracy = accuracy_score(y_val, y_pred)
print(f"f1: {f1}\naccuracy: {accuracy}")

0:	learn: 0.6454706	test: 0.6454891	best: 0.6454891 (0)	total: 1.56s	remaining: 2h 9m 44s
100:	learn: 0.4086512	test: 0.4093172	best: 0.4093172 (100)	total: 2m 39s	remaining: 2h 9m 3s
200:	learn: 0.4042711	test: 0.4052753	best: 0.4052753 (200)	total: 5m 18s	remaining: 2h 6m 48s
300:	learn: 0.4018332	test: 0.4031866	best: 0.4031866 (300)	total: 7m 53s	remaining: 2h 3m 17s
400:	learn: 0.3999536	test: 0.4015741	best: 0.4015741 (400)	total: 10m 29s	remaining: 2h 24s
500:	learn: 0.3987372	test: 0.4005973	best: 0.4005973 (500)	total: 13m 9s	remaining: 1h 58m 7s
600:	learn: 0.3976246	test: 0.3997480	best: 0.3997480 (600)	total: 15m 47s	remaining: 1h 55m 35s
700:	learn: 0.3968199	test: 0.3992326	best: 0.3992326 (700)	total: 18m 27s	remaining: 1h 53m 8s
800:	learn: 0.3960746	test: 0.3987758	best: 0.3987758 (800)	total: 21m 5s	remaining: 1h 50m 32s
900:	learn: 0.3953652	test: 0.3983886	best: 0.3983886 (900)	total: 23m 46s	remaining: 1h 48m 10s
1000:	learn: 0.3947505	test: 0.3980994	best: 0.39809

CatBoost is using more CPU RAM (7.48GiB) than the limit (6GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(254877072) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(254877072) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(254877072) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(254877072) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(254877072) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(254877072) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(254877072) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(254877072) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(254877072) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(254877072) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage

f1: 0.4566333449493897
accuracy: 0.7299396


In [12]:
# Предсказания вероятностей (нужны для roc_auc)
y_proba = cat_boost_pipeline.predict_proba(X_val)[:, 1]

# Расчёт метрик
metrics = {
    "Accuracy": accuracy_score(y_val, y_pred),
    "Precision": precision_score(y_val, y_pred),
    "Recall": recall_score(y_val, y_pred),
    "F1-score": f1_score(y_val, y_pred),
    "ROC-AUC": roc_auc_score(y_val, y_proba)
}

# Красивый вывод в таблице
metrics_df = pd.DataFrame(metrics.items(), columns=["Metric", "Value"])
print(metrics_df)

      Metric     Value
0   Accuracy  0.729940
1  Precision  0.303438
2     Recall  0.922239
3   F1-score  0.456633
4    ROC-AUC  0.879028


In [13]:
y_proba = cat_boost_pipeline.predict_proba(X_val)[:, 1]

thresholds = np.linspace(0, 1, 200)
scores = [f1_score(y_val, (y_proba >= t).astype(int)) for t in thresholds]

best_t = thresholds[np.argmax(scores)]
best_f1 = max(scores)

print("Best threshold:", best_t)
print("Best F1:", best_f1)

Best threshold: 0.6934673366834171
Best F1: 0.49012460776346006


In [14]:
import pickle
import os

# Путь для сохранения (пример: в Google Drive)
out_path = 'catboost_pipeline.pkl'

# 1) Сохранение (используйте протокол HIGHEST)
with open(out_path, 'wb') as f:
    pickle.dump(cat_boost_pipeline, f, protocol=pickle.HIGHEST_PROTOCOL)

print(f"Pipeline saved to {out_path}")

# 2) Проверка: загрузка и сравнение предсказаний
with open(out_path, 'rb') as f:
    loaded_pipeline = pickle.load(f)

# Предсказание до и после (предполагаем, что y_pred было вычислено до сохранения)
y_pred_after = loaded_pipeline.predict(X_val)

# простая проверка — совпадают ли предсказания
print("Same predictions after load:", np.array_equal(y_pred, y_pred_after))

Pipeline saved to catboost_pipeline.pkl
Same predictions after load: True


In [15]:
import sklearn
import catboost
import json, sys

meta = {
  "features": list(X_train.columns),
  "categorical_features": categorical_features,
  "pandas": pd.__version__,
  "sklearn": sklearn.__version__,
  "catboost": catboost.__version__,
  "python": sys.version
}
with open('catboost_meta.json','w') as f: json.dump(meta, f)

In [16]:
# Выводим предсказания по новому порогу
y_pred_custom = (y_proba >= best_t).astype(int)

In [17]:
# Расчёт метрик
metrics = {
    "Accuracy": accuracy_score(y_val, y_pred_custom),
    "Precision": precision_score(y_val, y_pred_custom),
    "Recall": recall_score(y_val, y_pred_custom),
    "F1-score": f1_score(y_val, y_pred_custom),
    "ROC-AUC": roc_auc_score(y_val, y_proba)
}

# Красивый вывод в таблице
metrics_df = pd.DataFrame(metrics.items(), columns=["Metric", "Value"])
print(metrics_df)

      Metric     Value
0   Accuracy  0.815802
1  Precision  0.371642
2     Recall  0.719509
3   F1-score  0.490125
4    ROC-AUC  0.879028


In [2]:
import sys
sys.version

'3.10.13 (main, Mar 12 2024, 12:16:25) [GCC 12.2.0]'

In [3]:
pip freeze

alembic==1.17.2
annotated-doc==0.0.4
annotated-types==0.7.0
anyio==4.11.0
argon2-cffi==25.1.0
argon2-cffi-bindings==25.1.0
arrow==1.4.0
asttokens==3.0.1
async-lru==2.0.5
attrs==25.4.0
babel==2.17.0
beautifulsoup4==4.14.2
bleach==6.3.0
blinker==1.9.0
cachetools==6.2.2
catboost==1.2.8
certifi==2025.11.12
cffi==2.0.0
charset-normalizer==3.4.4
click==8.3.1
cloudpickle==3.1.2
comm==0.2.3
contourpy==1.3.2
cryptography==46.0.3
cycler==0.12.1
databricks-sdk==0.73.0
debugpy==1.8.17
decorator==5.2.1
defusedxml==0.7.1
docker==7.1.0
exceptiongroup==1.3.1
executing==2.2.1
fastapi==0.122.0
fastjsonschema==2.21.2
filelock==3.19.1
Flask==3.1.2
flask-cors==6.0.1
fonttools==4.60.1
fqdn==1.5.1
fsspec==2025.9.0
gitdb==4.0.12
GitPython==3.1.45
google-auth==2.43.0
graphene==3.4.3
graphql-core==3.2.7
graphql-relay==3.2.0
graphviz==0.21
greenlet==3.2.4
gunicorn==23.0.0
h11==0.16.0
httpcore==1.0.9
httpx==0.28.1
huey==2.5.4
idna==3.11
imbalanced-learn==0.14.0
importlib_metadata==8.7.0
ipykernel==7.1.0
ipython==