# Импорт библиотек

In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
from catboost import CatBoostClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, GroupKFold, TimeSeriesSplit
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')

# Импорт датасетов

In [None]:
# Load the training table from a Parquet file (path is relative to the working directory).
train = pd.read_parquet('train.parquet')

In [None]:
# Load the test table from a Parquet file (path is relative to the working directory).
test = pd.read_parquet('test.parquet')

# EDA

In [None]:
# Preview the first rows of the training table (rendered as the cell output).
train.head()

Unnamed: 0,feature_290,feature_28,feature_33,feature_325,feature_101,feature_240,feature_388,feature_348,feature_26,feature_471,...,feature_147,feature_464,feature_478,feature_281,feature_319,feature_50,feature_259,a6_flg,month_dt,product
0,,3.5302,1.745249,0.0,3.087599,-0.03987,-0.314415,-0.002717,,-1.286581,...,0.170569,0.315611,1.007309,1.155886,0.028282,-1.849085,-0.846905,0.0,2022-09-01,product_1
1,,0.839291,1.871767,0.0,0.57071,-0.047786,1.861265,,,-0.536268,...,,,,,,,,0.0,2022-09-01,product_1
2,,0.677735,2.00332,0.0,0.114468,-0.047786,0.630311,-0.003127,,-1.807696,...,-0.477683,-0.446283,0.486735,0.389396,-0.331142,0.072091,0.504234,1.0,2022-09-01,product_1
3,,-0.27769,,0.0,,-0.047786,1.540682,-0.002329,,,...,-0.77971,-0.737856,0.613304,0.655314,-0.137146,-0.792167,1.858767,1.0,2022-09-01,product_1
4,,1.397782,,0.017316,,-0.047786,,-0.002503,,,...,-1.062844,-1.02177,0.412071,0.500406,0.553212,-0.419345,1.244381,0.0,2022-09-01,product_1


In [None]:
# Per-column missing-value summary: absolute count and share of rows (in percent),
# sorted so that the emptiest columns come first.
null_counts = train.isnull().sum()
null_train = pd.DataFrame({
    'column_name': list(null_counts.index),
    'num_null': null_counts.values,
    'null_pct': (null_counts.values / len(train)) * 100,
}).sort_values(by='num_null', ascending=False)
# Show only the columns that actually contain missing values.
null_train.loc[null_train.num_null > 0].reset_index(drop=True)

Unnamed: 0,column_name,num_null,null_pct
0,feature_269,461126,100.000000
1,feature_231,461126,100.000000
2,feature_20,461126,100.000000
3,feature_110,461126,100.000000
4,feature_233,461126,100.000000
...,...,...,...
481,feature_165,47,0.010192
482,feature_334,47,0.010192
483,feature_106,47,0.010192
484,feature_6,47,0.010192


In [None]:
# Drop features whose values are missing in every training row.
is_fully_null = train.isnull().all()
columns_to_drop = [col for col in train.columns if is_fully_null[col]]

# The same column list is removed from both frames so their feature sets stay in sync.
test = test.drop(columns=columns_to_drop)
train = train.drop(columns=columns_to_drop)

print(f"Удалено пустых столбцов: {len(columns_to_drop)}")

Удалено пустых столбцов: 12


In [None]:
# Recompute the missing-value summary now that the fully empty columns are gone.
missing_per_column = train.isnull().sum()
null_train = pd.DataFrame({
    'column_name': list(missing_per_column.index),
    'num_null': missing_per_column.values,
    'null_pct': (missing_per_column.values / len(train)) * 100,
}).sort_values(by='num_null', ascending=False)
# Show only the columns that still contain missing values.
null_train.loc[null_train.num_null > 0].reset_index(drop=True)

Unnamed: 0,column_name,num_null,null_pct
0,feature_176,458324,99.392357
1,feature_122,443641,96.208195
2,feature_19,443641,96.208195
3,feature_446,442105,95.875097
4,feature_212,432373,93.764611
...,...,...,...
469,feature_111,47,0.010192
470,feature_380,47,0.010192
471,feature_277,47,0.010192
472,feature_361,47,0.010192


In [None]:
# Correlation of every remaining column with the target.
target_col = 'a6_flg'
columns_to_exclude = [target_col, 'product']

target = train[target_col]
candidate_features = train.drop(columns=columns_to_exclude)
# Series.corr is evaluated column by column against the target series.
correlations = candidate_features.apply(lambda feature: feature.corr(target))

# Rank by absolute correlation so strong negative relations are not buried.
corr_df = (
    pd.DataFrame({
        'feature': correlations.index,
        'correlation': correlations.values,
    })
    .assign(abs_correlation=lambda frame: frame['correlation'].abs())
    .sort_values('abs_correlation', ascending=False)
)

print(corr_df)

         feature  correlation  abs_correlation
170  feature_331     0.202995         0.202995
293  feature_308     0.193533         0.193533
23   feature_159     0.179699         0.179699
153   feature_55    -0.177400         0.177400
466   feature_73     0.174868         0.174868
..           ...          ...              ...
160  feature_176          NaN              NaN
187  feature_380          NaN              NaN
236  feature_277          NaN              NaN
323  feature_193          NaN              NaN
371  feature_165          NaN              NaN

[475 rows x 3 columns]


# Feature engineering

In [None]:
# add extra calendar features
def add_time_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with calendar features derived from ``month_dt``.

    Adds four columns: ``month``, ``year``, ``quarter`` and ``year_month``
    (``year * 100 + month``, e.g. 202209). The input frame is not modified.

    Args:
        df: frame containing a ``month_dt`` column that is either already
            datetime-like or holds values ``pd.to_datetime`` can parse.

    Returns:
        A new DataFrame with the four calendar columns appended.
    """
    df = df.copy()
    month_col = df['month_dt']
    # Parse text-like columns. Checking only ``dtype == 'object'`` misses the
    # pandas ``string`` dtype, for which the ``.dt`` accessor below would raise.
    if pd.api.types.is_object_dtype(month_col) or pd.api.types.is_string_dtype(month_col):
        df['month_dt'] = pd.to_datetime(month_col)

    calendar = df['month_dt'].dt
    df['month'] = calendar.month
    df['year'] = calendar.year
    df['quarter'] = calendar.quarter
    df['year_month'] = calendar.year * 100 + calendar.month

    return df

Предобработка + агрегации

In [None]:
# calendar features
train = add_time_features(train)
test = add_time_features(test)

# product as a categorical feature.
# Train and test are cast to ONE shared category list. Casting each frame
# independently with astype('category') derives the categories (and therefore
# the integer `.cat.codes`) from that frame alone, so the same product could
# receive different codes in train and test whenever their product sets differ.
product_dtype = pd.CategoricalDtype(
    pd.concat([train['product'], test['product']], ignore_index=True)
    .astype('category')
    .cat.categories
)
train['product'] = train['product'].astype(product_dtype)
test['product'] = test['product'].astype(product_dtype)

target_col = 'a6_flg'


def _target_stats(frame, key, prefix, target=target_col):
    """Mean / count / std of `target` per value of `key`, as `<prefix>_target_*` columns."""
    return (
        frame
        # observed=True: only groups present in `frame` get a row, so categories
        # that occur only in test do not produce empty groups.
        .groupby(key, observed=True)[target]
        .agg(['mean', 'count', 'std'])
        .add_prefix(f'{prefix}_target_')
    )


# target aggregates by product / month / year, computed on the training frame.
# NOTE(review): these statistics use the target of ALL training rows, including
# rows that later end up in validation folds — likely target leakage into the
# CV estimate; consider computing them inside each fold. TODO confirm impact.
prod_stats = _target_stats(train, 'product', 'product')
month_stats = _target_stats(train, 'month', 'month')
year_stats = _target_stats(train, 'year', 'year')

In [None]:
# Attach the training-derived aggregates to both frames.
# Left joins keep every row; keys absent from the aggregate table yield NaN.
for stats, key in (
    (prod_stats, 'product'),
    (month_stats, 'month'),
    (year_stats, 'year'),
):
    train = train.merge(stats, on=key, how='left')
    test = test.merge(stats, on=key, how='left')

In [None]:
# std is NaN for single-row groups (and for keys missing from the aggregates);
# replace those NaNs with zero in both frames.
std_columns = ['product_target_std', 'month_target_std', 'year_target_std']
for frame in (train, test):
    for col in std_columns:
        frame[col] = frame[col].fillna(0.0)

# frequency encoding of product (used by LGBM): how many training rows each product has
product_freq = train['product'].value_counts()
train['product_freq'] = train['product'].map(product_freq).astype('float32')
# products that do not map to a training count get frequency 0
test['product_freq'] = (
    test['product']
    .map(product_freq)
    .astype('float32')
    .fillna(0.0)
)


print("\nРазмерности после агрегаций:")
print("Train:", train.shape, "Test:", test.shape)


Размерности после агрегаций:
Train: (461126, 491) Test: (38874, 490)


In [None]:
# final feature matrices
y_full = train[target_col].values
X_full = train.drop(columns=[target_col, 'month_dt'])
# Select the test columns BY NAME in the training order instead of relying on
# test happening to share train's layout: the categorical feature is passed to
# CatBoost as a positional index below, so any column-order drift would silently
# point it at the wrong column. A feature missing from test raises KeyError here.
X_test_final = test[X_full.columns]

# positional indices of the categorical features for CatBoost
cat_features_cb = [X_full.columns.get_loc('product')]

print("\nКатфичи для CatBoost (по индексам):", cat_features_cb)
print("Тип product:", X_full['product'].dtype)


Катфичи для CatBoost (по индексам): [474]
Тип product: category


# KFold + OOF предсказания для стекинга

In [None]:
n_splits = 10
use_time_split = True  # several splitting approaches were tried here

if not use_time_split:
    # Plain stratified K-fold over the rows in their existing order.
    X_cv, y_cv = X_full, y_full

    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    split_iter = splitter.split(X_cv, y_cv)
else:
    # Sort rows by year_month, then split so that all rows of one year_month
    # land in the same fold.
    # NOTE(review): GroupKFold keeps groups intact but does not enforce
    # chronological order between training and validation folds.
    order = np.argsort(X_full['year_month'].values)
    X_cv = X_full.iloc[order].reset_index(drop=True)
    y_cv = y_full[order]

    groups = X_cv['year_month'].values

    splitter = GroupKFold(n_splits=n_splits)
    split_iter = splitter.split(X_cv, y_cv, groups=groups)

In [None]:
# Containers for out-of-fold predictions (one value per CV row) and for
# per-fold test predictions (one column per fold).
n_cv_rows, n_test_rows = len(X_cv), len(X_test_final)
oof_cb, oof_lgb = np.zeros(n_cv_rows), np.zeros(n_cv_rows)
test_pred_cb_folds, test_pred_lgb_folds = (
    np.zeros((n_test_rows, n_splits)),
    np.zeros((n_test_rows, n_splits)),
)

In [None]:
# LightGBM receives `product` as integer category codes. The encoded test matrix
# does not depend on the fold, so it is built once here instead of being copied
# and re-encoded on every iteration.
X_test_lgb = X_test_final.copy()
X_test_lgb['product'] = X_test_lgb['product'].cat.codes

for fold, (tr_idx, val_idx) in enumerate(split_iter):
    print(f"\n===== Fold {fold + 1}/{n_splits} =====")

    X_tr, X_val = X_cv.iloc[tr_idx], X_cv.iloc[val_idx]
    y_tr, y_val = y_cv[tr_idx], y_cv[val_idx]

    # Fold-specific LightGBM views with `product` replaced by its codes
    # (assign returns a new frame, X_tr / X_val stay categorical for CatBoost).
    X_tr_lgb = X_tr.assign(product=X_tr['product'].cat.codes)
    X_val_lgb = X_val.assign(product=X_val['product'].cat.codes)

    # --- CatBoost ---
    cb_model = CatBoostClassifier(
        iterations=2000,
        learning_rate=0.03,
        depth=6,
        l2_leaf_reg=3,
        bootstrap_type='Bernoulli',
        subsample=0.8,
        random_seed=42,
        eval_metric='AUC',
        verbose=False
    )

    # NOTE(review): early stopping monitors the same validation fold that the
    # OOF predictions are taken from, so OOF scores may be slightly optimistic.
    cb_model.fit(
        X_tr, y_tr,
        eval_set=(X_val, y_val),
        cat_features=cat_features_cb,
        early_stopping_rounds=200,
        verbose=False
    )

    # --- LightGBM ---
    # NOTE(review): no eval_metric is passed, so early stopping presumably uses
    # LightGBM's default metric for the objective rather than AUC; and
    # `subsample` presumably has no effect unless `subsample_freq` > 0 —
    # confirm against the LightGBM parameter docs.
    lgb_model = LGBMClassifier(
        n_estimators=2000,
        learning_rate=0.03,
        max_depth=6,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        verbosity=-1
    )

    lgb_model.fit(
        X_tr_lgb, y_tr,
        eval_set=[(X_val_lgb, y_val)],
        callbacks=[lgb.early_stopping(200, verbose=False)]
    )

    # Out-of-fold predictions: each CV row is scored by the model that did not train on it.
    oof_cb[val_idx] = cb_model.predict_proba(X_val)[:, 1]
    oof_lgb[val_idx] = lgb_model.predict_proba(X_val_lgb)[:, 1]

    # Test predictions of this fold's models go into column `fold`.
    test_pred_cb_folds[:, fold] = cb_model.predict_proba(X_test_final)[:, 1]
    test_pred_lgb_folds[:, fold] = lgb_model.predict_proba(X_test_lgb)[:, 1]


===== Fold 1/10 =====

===== Fold 2/10 =====

===== Fold 3/10 =====

===== Fold 4/10 =====

===== Fold 5/10 =====

===== Fold 6/10 =====

===== Fold 7/10 =====

===== Fold 8/10 =====

===== Fold 9/10 =====

===== Fold 10/10 =====


In [None]:
# OOF quality of the base models
print("\n===== OOF качество базовых моделей =====")
for label, oof_pred in (
    ("CatBoost OOF AUC:", oof_cb),
    ("LGBM    OOF AUC:", oof_lgb),
):
    print(label, roc_auc_score(y_cv, oof_pred))

# Meta-model inputs: one column per base model.
# Train side uses the OOF predictions; test side averages the per-fold test predictions.
train_meta = np.stack([oof_cb, oof_lgb], axis=1)
cb_test_mean = test_pred_cb_folds.mean(axis=1)
lgb_test_mean = test_pred_lgb_folds.mean(axis=1)
test_meta = np.stack([cb_test_mean, lgb_test_mean], axis=1)


===== OOF качество базовых моделей =====
CatBoost OOF AUC: 0.7415352824690697
LGBM    OOF AUC: 0.7424047462863042


In [None]:
# Second-level model: logistic regression over the two base-model OOF predictions.
meta_model = LogisticRegression(C=1.0, max_iter=1000, random_state=42)
meta_model.fit(train_meta, y_cv)

# Scored on the same rows the meta-model was fitted on, i.e. this AUC is
# in-sample with respect to the meta-model itself.
oof_meta_pred = meta_model.predict_proba(train_meta)[:, 1]
meta_auc = roc_auc_score(y_cv, oof_meta_pred)
print("\nMeta-model OOF AUC:", meta_auc)

# Learned coefficients: column 0 is CatBoost, column 1 is LightGBM.
print(f"Веса мета-модели:")
print(f"CatBoost: {meta_model.coef_[0][0]:.6f}")
print(f"LightGBM: {meta_model.coef_[0][1]:.6f}")
print(f"Intercept: {meta_model.intercept_[0]:.6f}")


Meta-model OOF AUC: 0.7433243626571141
Веса мета-модели:
CatBoost: 2.382565
LightGBM: 2.511691
Intercept: -2.436328


# Предсказания на test:

In [None]:
# Final test predictions: positive-class probability from the meta-model.
test_proba = meta_model.predict_proba(test_meta)
final_test_pred = test_proba[:, 1]
print("\nBlended test predictions: mean =", final_test_pred.mean())

# Compare the average predicted probability of each base model with the blend.
print(f"Средние предсказания:")
print(f"CatBoost: {test_pred_cb_folds.mean(axis=1).mean():.6f}")
print(f"LightGBM: {test_pred_lgb_folds.mean(axis=1).mean():.6f}")
print(f"Blending: {final_test_pred.mean():.6f}")


Blended test predictions: mean = 0.31506447655873726
Средние предсказания:
CatBoost: 0.320825
LightGBM: 0.312442
Blending: 0.315064


In [None]:
# Build the submission table: a positional row number plus the predicted probability.
submission = pd.DataFrame({'a6_flg': final_test_pred})
submission.insert(0, 'index', range(len(final_test_pred)))

# Write without the DataFrame's own index so the file holds exactly two columns.
submission.to_csv('submission.csv', index=False)
print(f"Результаты сохранены в submission.csv")
print(f"Shape: {submission.shape}")
print(submission.head(10))

Результаты сохранены в submission.csv
Shape: (38874, 2)
   index    a6_flg
0      0  0.245619
1      1  0.409190
2      2  0.170212
3      3  0.372140
4      4  0.225812
5      5  0.300418
6      6  0.828569
7      7  0.364433
8      8  0.165605
9      9  0.160692
