In [None]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

from main_graph import BaseGraph

In [None]:
# Instantiate the project-local BaseGraph helper and unpack the two objects
# returned by get_main_df(). Only df_install_main is used in the cells below.
# NOTE(review): despite its name, main_df holds a BaseGraph instance, not a DataFrame.
main_df = BaseGraph()
df_install_main,df_install_s_2 = main_df.get_main_df()

In [None]:
# Remove the 'Unnamed: 0' column (presumably a leftover index column from a
# CSV round-trip -- confirm against the loader).
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone would otherwise raise KeyError.
df_install_main = df_install_main.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Preview only the first rows instead of rendering the whole frame.
df_install_main.head()


In [None]:
# Print column names, dtypes and non-null counts to spot missing values
# before feature engineering.
df_install_main.info()

In [None]:
# Work on a copy so the columns added later do not modify df_install_main.
df = df_install_main.copy()

In [None]:
# Parse the three timestamp columns (day-first dates) so they can be subtracted.
df['install_time'] = pd.to_datetime(df['install_time'], dayfirst=True)
df['contributor_1_touch_time'] = pd.to_datetime(df['contributor_1_touch_time'], dayfirst=True)
df['contributor_2_touch_time'] = pd.to_datetime(df['contributor_2_touch_time'], dayfirst=True)

# Time-based features: hours elapsed between each contributor touch and the
# install, plus the weekday (0 = Monday) and the hour of the install.
# A missing touch time yields NaN hours.
df['hours_from_contrib_1'] = (df['install_time'] - df['contributor_1_touch_time']).dt.total_seconds() / 3600
df['hours_from_contrib_2'] = (df['install_time'] - df['contributor_2_touch_time']).dt.total_seconds() / 3600
df['install_weekday'] = df['install_time'].dt.weekday
df['install_hour'] = df['install_time'].dt.hour

# Columns CatBoost must treat as categorical.
cat_features = ['contributor_1', 'contributor_2']

# Features for CatBoost. .copy() makes X an independent frame, so the cast
# below does not write into a slice of df (no SettingWithCopyWarning).
X = df[['contributor_1', 'contributor_2', 'hours_from_contrib_1', 'hours_from_contrib_2', 'install_weekday', 'install_hour']].copy()

# CatBoost raises on NaN in categorical columns (values must be int or str),
# so missing contributors become an explicit 'missing' category and every
# value is cast to a string.
X[cat_features] = X[cat_features].astype(object).fillna('missing').astype(str)

# Target: the class label to predict.
y = df['source']

In [None]:
# Hold out 20% of the rows as the final test set, stratified on the target.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Split the remaining 80% into train and validation: 25% of it goes to
# validation (0.25 * 0.8 = 20% of all rows), leaving 60% for training.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_trainval,
    y_trainval,
    test_size=0.25,
    random_state=42,
    stratify=y_trainval,
)

# Check the sizes of the three parts.
print(*(part.shape for part in (X_train, X_valid, X_test)))

In [None]:
# cat_features is reused from the feature-building cell rather than redefined
# here, so the list given to CatBoost cannot drift from the one used to build X.
train_pool = Pool(X_train, y_train, cat_features=cat_features)
valid_pool = Pool(X_valid, y_valid, cat_features=cat_features)

# Multiclass gradient-boosting model.
model = CatBoostClassifier(
    iterations=1000,            # upper bound on the number of trees
    learning_rate=0.05,
    depth=6,
    loss_function='MultiClass',
    eval_metric='MultiClass',   # metric monitored on the validation pool
    early_stopping_rounds=50,   # stop once the eval metric has not improved for 50 iterations
    verbose=100                 # log every 100 iterations
)

# Fit on the training pool; the validation pool drives early stopping.
model.fit(train_pool, eval_set=valid_pool)

In [None]:
# Evaluate on the held-out test set, which took no part in training or
# early stopping.
# CatBoost typically returns multiclass labels as an (n_samples, 1) column;
# ravel() flattens it to 1-D so the metrics receive the same shape as y_test.
y_pred = model.predict(X_test).ravel()
print("Accuracy on test:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))