Позаимствовали агрегированные фичи у kdimon15. Спасибо! <br>
https://github.com/kdimon15/data-fusion-2024-baseline/

In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
import random
import warnings

warnings.simplefilter('ignore')

pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

random.seed(42)
np.random.seed(42)

In [2]:
main = pd.read_csv('../data/train.csv')

sample = pd.read_csv('../data/sample_submit_naive.csv').drop('predict', axis=1)
sample['target'] = -1

main = pd.concat([main, sample])

In [4]:
clients = pd.read_csv('../data/clients.csv')
report_dates = pd.read_csv('../data/report_dates.csv', parse_dates=['report_dt'])

transactions = pd.read_csv('../data/transactions.csv', parse_dates=['transaction_dttm'])
transactions = transactions.sort_values('transaction_dttm').reset_index(drop=True)

In [6]:
main = main.merge(clients, how='left', on='user_id')
main['employee_count_nm'] = LabelEncoder().fit_transform(main['employee_count_nm'].fillna('unknown'))

In [7]:
good_codes = transactions['mcc_code'].value_counts()
good_codes = good_codes[good_codes >= 10]

mcc_info = transactions[transactions.mcc_code.isin(good_codes)].pivot_table(
    index = 'user_id',
    values=['transaction_amt'],
    columns=['mcc_code'],
    aggfunc=['count', 'median', 'sum']
).fillna(0)
mcc_info.columns = ['main_' + '_'.join(map(str, x)) for x in mcc_info.columns]

count_cols = [x for x in mcc_info.columns if 'count' in x]
mcc_info['sum'] = mcc_info[count_cols].sum(axis=1)
for col in count_cols:
    mcc_info[f'{col}_norm'] = mcc_info[col] / mcc_info['sum']
mcc_info.drop('sum', axis=1, inplace=True)

main = main.merge(mcc_info, how='left', left_on='user_id', right_index=True)

In [9]:
df_more = transactions.merge(clients[['user_id', 'report']], how='left', on='user_id')
df_more = df_more.merge(report_dates, how='left', on='report')
df_more['days_to_report'] = (df_more['report_dt'] - df_more['transaction_dttm']).dt.days

for day_diff in [30, 1000]:
    currency_pivot = df_more[df_more['days_to_report'] < day_diff + 100].pivot_table(
        index='user_id',
        columns='currency_rk',
        values='transaction_amt',
        aggfunc=['sum', 'mean', 'median', 'count']
    ).fillna(0)
    currency_pivot.columns = [f'currency_daydiff_{day_diff}_{x[0]}_{x[1]}' for x in currency_pivot.columns]

    currency_pivot['sum'] = currency_pivot[[x for x in currency_pivot.columns if 'count' in x]].sum(axis=1)
    for x in range(4):
        currency_pivot[f'currency_daydiff_{day_diff}_count_{x}_norm'] = currency_pivot[f'currency_daydiff_{day_diff}_count_{x}'] / currency_pivot['sum']
    currency_pivot.drop('sum', axis=1, inplace=True)

    main = main.merge(currency_pivot, how='left', left_on='user_id', right_index=True)


    general_trans_info = df_more[df_more['days_to_report'] < day_diff + 100].groupby('user_id')['transaction_amt'].agg(['sum', 'count', 'median'])
    general_trans_info[['sum', 'count']] = general_trans_info[['sum', 'count']].fillna(0)
    general_trans_info.columns = [f'general_trans_info_{day_diff}_{x}' for x in general_trans_info]
    main = main.merge(general_trans_info, how='left', left_on='user_id', right_index=True)

    general_trans_info = df_more[(df_more['days_to_report']<day_diff + 100)&(df_more['transaction_amt']>0)].groupby('user_id')['transaction_amt'].agg(['sum', 'count', 'median'])
    general_trans_info[['sum', 'count']] = general_trans_info[['sum', 'count']].fillna(0)
    general_trans_info.columns = [f'positive_general_trans_info_{day_diff}_{x}' for x in general_trans_info]
    main = main.merge(general_trans_info, how='left', left_on='user_id', right_index=True)

    general_trans_info = df_more[(df_more['days_to_report']<day_diff + 100)&(df_more['transaction_amt']<0)].groupby('user_id')['transaction_amt'].agg(['sum', 'count', 'median'])
    general_trans_info[['sum', 'count']] = general_trans_info[['sum', 'count']].fillna(0)
    general_trans_info.columns = [f'negative_general_trans_info_{day_diff}_{x}' for x in general_trans_info]
    main = main.merge(general_trans_info, how='left', left_on='user_id', right_index=True)


# Анализируем кол-во транзакций в последние n дней / кол-во транзакций до последних n дней
for x in [5, 30]:
    prev = df_more[df_more['days_to_report'] > x + 100].groupby('user_id')['report'].agg(['count']).reset_index().rename({'count': f'num_transaction_before_{x}_days'}, axis=1)
    last = df_more[df_more['days_to_report'] <= x + 100].groupby('user_id')['report'].agg(['count']).reset_index().rename({'count': f'num_transaction_last_{x}_days'}, axis=1)

    main = main.merge(prev, how='left', on='user_id')
    main = main.merge(last, how='left', on='user_id')
    main[f'num_transaction_last_{x}_days'].fillna(0, inplace=True)
    main[f'num_transaction_before_{x}_days'].fillna(0, inplace=True)
    main[f'percent_last_{x}'] = main[f'num_transaction_last_{x}_days'] / main[f'num_transaction_before_{x}_days']

In [10]:
# Кол-во уникальных MCC кодов, валют, а также уникальных дней, в которые были транзакции
main = main.merge(df_more.groupby('user_id')['days_to_report'].nunique(), how='left', on='user_id').rename({'days_to_report': 'nunique_days'}, axis=1)
main = main.merge(df_more.groupby('user_id')['mcc_code'].nunique(), how='left', on='user_id').rename({'mcc_code': 'nunique_mcc_codes'}, axis=1)
main = main.merge(df_more.groupby('user_id')['currency_rk'].nunique(), how='left', on='user_id').rename({'currency_rk': 'nunique_currency'}, axis=1)

main

Unnamed: 0,user_id,target,time,report,employee_count_nm,bankemplstatus,customer_age,main_count_transaction_amt_10,main_count_transaction_amt_11,main_count_transaction_amt_12,main_count_transaction_amt_15,main_count_transaction_amt_16,main_count_transaction_amt_17,main_count_transaction_amt_18,main_count_transaction_amt_22,main_count_transaction_amt_23,main_count_transaction_amt_26,main_count_transaction_amt_28,main_count_transaction_amt_29,main_count_transaction_amt_31,main_count_transaction_amt_32,main_count_transaction_amt_33,main_count_transaction_amt_34,main_count_transaction_amt_39,main_count_transaction_amt_42,main_count_transaction_amt_44,main_count_transaction_amt_50,main_count_transaction_amt_51,main_count_transaction_amt_53,main_count_transaction_amt_54,main_count_transaction_amt_55,main_count_transaction_amt_56,main_count_transaction_amt_58,main_count_transaction_amt_59,main_count_transaction_amt_63,main_count_transaction_amt_65,main_count_transaction_amt_66,main_count_transaction_amt_72,main_count_transaction_amt_76,main_count_transaction_amt_77,main_count_transaction_amt_78,main_count_transaction_amt_81,main_count_transaction_amt_82,main_count_transaction_amt_85,main_count_transaction_amt_92,main_count_transaction_amt_95,main_count_transaction_amt_105,main_count_transaction_amt_111,main_count_transaction_amt_119,main_count_transaction_amt_122,...,currency_daydiff_30_count_1_norm,currency_daydiff_30_count_2_norm,currency_daydiff_30_count_3_norm,general_trans_info_30_sum,general_trans_info_30_count,general_trans_info_30_median,positive_general_trans_info_30_sum,positive_general_trans_info_30_count,positive_general_trans_info_30_median,negative_general_trans_info_30_sum,negative_general_trans_info_30_count,negative_general_trans_info_30_median,currency_daydiff_1000_sum_0,currency_daydiff_1000_sum_1,currency_daydiff_1000_sum_2,currency_daydiff_1000_sum_3,currency_daydiff_1000_mean_0,currency_daydiff_1000_mean_1,currency_daydiff_1000_mean_2,currency_daydiff_1000_mean_3,currency_daydiff_1000_median_0,currency_daydiff_1000_median_1,currency_daydiff_1000_median_2,currency_daydiff_1000_median_3,currency_daydiff_1000_count_0,currency_daydiff_1000_count_1,currency_daydiff_1000_count_2,currency_daydiff_1000_count_3,currency_daydiff_1000_count_0_norm,currency_daydiff_1000_count_1_norm,currency_daydiff_1000_count_2_norm,currency_daydiff_1000_count_3_norm,general_trans_info_1000_sum,general_trans_info_1000_count,general_trans_info_1000_median,positive_general_trans_info_1000_sum,positive_general_trans_info_1000_count,positive_general_trans_info_1000_median,negative_general_trans_info_1000_sum,negative_general_trans_info_1000_count,negative_general_trans_info_1000_median,num_transaction_before_5_days,num_transaction_last_5_days,percent_last_5,num_transaction_before_30_days,num_transaction_last_30_days,percent_last_30,nunique_days,nunique_mcc_codes,nunique_currency
0,3,0,77.0,2,4,0,3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,175726.502930,4.0,33163.771484,175726.502930,4.0,33163.771484,,,,0.000000,13706.416641,0.0,0.0,0.000000,1246.037876,0.0,0.0,0.000000,4549.455078,0.0,0.0,0.0,11.0,0.0,0.0,0.000000,1.000000,0.0,0.0,13706.416641,11,4549.455078,186108.229797,7.0,5386.999023,-172401.813156,4.0,-9175.519287,11.0,0.0,0.000000,7.0,4.0,0.571429,8,4,1
1,13,0,86.0,6,8,0,2,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,-5588.771484,2.0,-2794.385742,10805.421875,1.0,10805.421875,-16394.193359,1.0,-16394.193359,10772.799805,-135490.178955,0.0,0.0,10772.799805,-6451.913284,0.0,0.0,10772.799805,-10642.210938,0.0,0.0,1.0,21.0,0.0,0.0,0.045455,0.954545,0.0,0.0,-124717.379150,22,-10529.004883,128766.684326,8.0,10789.110840,-253484.063477,14.0,-16423.615234,22.0,0.0,0.000000,20.0,2.0,0.100000,18,4,2
2,37,0,89.0,5,1,0,2,4.0,0.0,0.0,2.0,1.0,0.0,1.0,5.0,0.0,0.0,7.0,0.0,0.0,4.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,-48048.763298,41.0,-267.658539,,,,-48048.763298,41.0,-267.658539,0.000000,-331859.599463,0.0,0.0,0.000000,-1053.522538,0.0,0.0,0.000000,-236.420776,0.0,0.0,0.0,315.0,0.0,0.0,0.000000,1.000000,0.0,0.0,-331859.599463,315,-236.420776,10738.788574,2.0,5369.394287,-342598.388037,313.0,-236.546936,313.0,2.0,0.006390,273.0,42.0,0.153846,130,28,1
3,41,0,57.0,1,4,0,2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,-8045.445801,2.0,-4022.722900,,,,-8045.445801,2.0,-4022.722900,0.000000,-108586.614166,0.0,0.0,0.000000,-6786.663385,0.0,0.0,0.000000,-6328.293701,0.0,0.0,0.0,16.0,0.0,0.0,0.000000,1.000000,0.0,0.0,-108586.614166,16,-6328.293701,,,,-108586.614166,16.0,-6328.293701,14.0,2.0,0.142857,14.0,2.0,0.142857,12,5,1
4,42,0,84.0,12,3,0,3,2.0,4.0,7.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,23617.963127,13.0,-187.755142,51152.999939,4.0,2277.169434,-27535.036812,9.0,-1037.949341,0.000000,11429.587215,0.0,0.0,0.000000,193.721817,0.0,0.0,0.000000,-321.756958,0.0,0.0,0.0,59.0,0.0,0.0,0.000000,1.000000,0.0,0.0,11429.587215,59,-321.756958,72779.679138,11.0,2706.099609,-61350.091923,48.0,-528.145752,49.0,10.0,0.204082,46.0,13.0,0.282609,38,20,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95995,561362,-1,,12,0,0,3,0.0,0.0,0.0,1.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,-9802.526207,14.0,-434.536697,,,,-9802.526207,14.0,-434.536697,0.000000,-71254.860472,0.0,0.0,0.000000,-719.746065,0.0,0.0,0.000000,-467.705963,0.0,0.0,0.0,99.0,0.0,0.0,0.000000,1.000000,0.0,0.0,-71254.860472,99,-467.705963,,,,-71254.860472,99.0,-467.705963,99.0,0.0,0.000000,84.0,15.0,0.178571,49,12,1
95996,561419,-1,,12,0,0,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,-6571.396210,6.0,-1069.419952,,,,-6571.396210,6.0,-1069.419952,0.000000,-3495.118294,0.0,0.0,0.000000,-48.543310,0.0,0.0,0.000000,-474.838638,0.0,0.0,0.0,72.0,0.0,0.0,0.000000,1.000000,0.0,0.0,-3495.118294,72,-474.838638,69579.185341,5.0,109.552872,-73074.303635,67.0,-542.066284,72.0,0.0,0.000000,65.0,7.0,0.107692,54,9,1
95997,561895,-1,,12,0,0,2,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,-179350.281706,19.0,-1134.675049,,,,-179350.281706,19.0,-1134.675049,0.000000,-717608.803839,0.0,0.0,0.000000,-18400.225739,0.0,0.0,0.000000,-1422.766357,0.0,0.0,0.0,39.0,0.0,0.0,0.000000,1.000000,0.0,0.0,-717608.803839,39,-1422.766357,,,,-717608.803839,39.0,-1422.766357,36.0,3.0,0.083333,19.0,20.0,1.052632,24,15,1
95998,561908,-1,,12,0,0,2,2.0,1.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,-88454.045708,20.0,-391.408051,54908.425781,1.0,54908.425781,-143362.471489,19.0,-460.255188,0.000000,778253.475967,0.0,0.0,0.000000,13897.383499,0.0,0.0,0.000000,-223.971062,0.0,0.0,0.0,56.0,0.0,0.0,0.000000,1.000000,0.0,0.0,778253.475967,56,-223.971062,936792.233398,10.0,53917.962891,-158538.757431,46.0,-315.139267,44.0,12.0,0.272727,36.0,20.0,0.555556,24,12,1


In [11]:
tmp = transactions.copy()
tmp['hour'] = tmp['transaction_dttm'].dt.hour
pivot_table = tmp.pivot_table(
    index='user_id',
    columns='hour',
    values='transaction_amt',
    aggfunc=['count', 'median']
).fillna(0)
pivot_table.columns = [f'hour_{x[0]}_{x[1]}' for x in pivot_table.columns]

count_cols = [x for x in pivot_table.columns if 'count' in x]
pivot_table['sum'] = pivot_table[count_cols].sum(axis=1)
for col in count_cols:
    pivot_table[f'{col}_norm'] = pivot_table[col] / pivot_table['sum']
pivot_table.drop('sum', axis=1, inplace=True)

main = main.merge(pivot_table, how='left', left_on='user_id', right_index=True)

In [12]:
cur = transactions.groupby('user_id')['transaction_dttm'].agg(['min', 'max']).reset_index()
cur = cur.merge(clients[['user_id', 'report']], how='left', on='user_id')
cur = cur.merge(report_dates, how='left', on='report')

cur['min_diff_dttm'] = (cur['report_dt'] - cur['min']).dt.days
cur['days_to_report'] = (cur['report_dt'] - cur['max']).dt.days
cur['max_min_diff_dttm'] = cur['days_to_report'] - cur['min_diff_dttm']

main = main.merge(cur[['user_id', 'min_diff_dttm','days_to_report','max_min_diff_dttm']], how='left', on='user_id')

In [13]:
main['trx_density'] = main['max_min_diff_dttm'] / main['general_trans_info_1000_count']
main['days_density'] = (main['max_min_diff_dttm'] + 1) / main['nunique_days']

In [None]:
embeddings_path = f'../output_embs/coles_emb.csv'
main_embs = pd.read_csv(embeddings_path)

wtte_embeddings_path = f'../output_embs/wtte_embs.csv'
wtte_embs = pd.read_csv(wtte_embeddings_path)
main_embs = main_embs.merge(wtte_embs, on='user_id')

In [19]:
cat_cols = ['customer_age', 'employee_count_nm', 'report']
main[cat_cols] = main[cat_cols].astype(str)

main = main.sort_values('user_id').reset_index(drop=True)
train = main[main.target != -1]
test = main[main.target == -1]

Отбор фичей вместе с отбором компонент эмбеддингов

In [20]:
train_with_embs = train.merge(main_embs, on='user_id')
test_with_embs = test.merge(main_embs, on='user_id')
train_with_embs.fillna(-999, inplace=True)
test_with_embs.fillna(-999, inplace=True)

In [21]:
from sklearn.model_selection import train_test_split
train_, test_ = train_test_split(train_with_embs, random_state=42, test_size=0.2)

Обучение модельки для того чтобы получить важные фичи

In [22]:
model = CatBoostRegressor(
    iterations = 1400,
    depth=3,
    learning_rate=0.03,
    cat_features = cat_cols,
    eval_metric='MSLE',
    thread_count=6,
    early_stopping_rounds=200,
    task_type="GPU",
)
model.fit(train_.drop(['user_id', 'target', 'time',], axis=1), train_['time'], verbose=100)

df_imp = pd.DataFrame({
    'name': train_.drop(['user_id', 'target', 'time',], axis=1).columns,
    'imp': model.get_feature_importance()
}).sort_values('imp', ascending=False)

df_imp = df_imp[df_imp['imp'] > 0.15] 

good_cols = df_imp['name'].tolist()

Default metric period is 5 because MSLE is/are not implemented for GPU
Metric MSLE is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.1759534	total: 29.8ms	remaining: 41.7s
100:	learn: 0.1622665	total: 896ms	remaining: 11.5s
200:	learn: 0.1601656	total: 1.81s	remaining: 10.8s
300:	learn: 0.1592005	total: 2.72s	remaining: 9.92s
400:	learn: 0.1584804	total: 3.56s	remaining: 8.88s
500:	learn: 0.1579344	total: 4.41s	remaining: 7.91s
600:	learn: 0.1574006	total: 5.25s	remaining: 6.98s
700:	learn: 0.1568992	total: 6.11s	remaining: 6.09s
800:	learn: 0.1564536	total: 6.95s	remaining: 5.2s
900:	learn: 0.1560776	total: 7.82s	remaining: 4.33s
1000:	learn: 0.1556474	total: 8.67s	remaining: 3.46s
1100:	learn: 0.1552361	total: 9.52s	remaining: 2.58s
1200:	learn: 0.1548850	total: 10.4s	remaining: 1.72s
1300:	learn: 0.1545214	total: 11.2s	remaining: 854ms
1399:	learn: 0.1541761	total: 12.1s	remaining: 0us


In [26]:
cat_cols = ['employee_count_nm', 'customer_age']

Обучение основных моделей на 5 Фолдах. Стратификация по report, возможно следует попробовать что нибудь другое:)

In [27]:
strat_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

X, y = train_.drop(['time',], axis=1), train_['time']
time_models = []

for train_index, valid_index in strat_kfold.split(train_, train_['target']):
    
    X_train, X_val = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_val = y.iloc[train_index], y.iloc[valid_index]

    model = CatBoostRegressor(
        iterations = 1500,
        depth=5,
        learning_rate=0.05,
        cat_features = cat_cols,
        early_stopping_rounds=400,
        random_seed = 42,
        eval_metric='MSLE',
        task_type="GPU",
    )

    model.fit(Pool(X_train[good_cols], y_train, cat_features=cat_cols),
              eval_set=Pool(X_val[good_cols], y_val, cat_features=cat_cols),
              verbose=100)
    time_models.append(model)

Default metric period is 5 because MSLE is/are not implemented for GPU
Metric MSLE is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.1777969	test: 0.1660498	best: 0.1660498 (0)	total: 19.8ms	remaining: 29.7s
100:	learn: 0.1608645	test: 0.1514337	best: 0.1514337 (100)	total: 1.6s	remaining: 22.1s
200:	learn: 0.1587968	test: 0.1504114	best: 0.1504114 (200)	total: 3.15s	remaining: 20.4s
300:	learn: 0.1571517	test: 0.1499788	best: 0.1499711 (299)	total: 4.75s	remaining: 18.9s
400:	learn: 0.1557983	test: 0.1497435	best: 0.1497345 (395)	total: 6.36s	remaining: 17.4s
500:	learn: 0.1545023	test: 0.1495814	best: 0.1495601 (463)	total: 7.96s	remaining: 15.9s
600:	learn: 0.1533756	test: 0.1495146	best: 0.1495146 (600)	total: 9.53s	remaining: 14.3s
700:	learn: 0.1523912	test: 0.1494483	best: 0.1494478 (696)	total: 11.1s	remaining: 12.6s
800:	learn: 0.1513677	test: 0.1493955	best: 0.1493931 (792)	total: 12.6s	remaining: 11s
900:	learn: 0.1504736	test: 0.1493685	best: 0.1493416 (885)	total: 14.2s	remaining: 9.42s
1000:	learn: 0.1496070	test: 0.1493127	best: 0.1493127 (1000)	total: 15.8s	remaining: 7.86s
1100:	learn: 0

Default metric period is 5 because MSLE is/are not implemented for GPU
Metric MSLE is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.1727365	test: 0.1871644	best: 0.1871644 (0)	total: 20.1ms	remaining: 30.1s
100:	learn: 0.1566842	test: 0.1697571	best: 0.1697571 (100)	total: 1.66s	remaining: 23s
200:	learn: 0.1547102	test: 0.1685795	best: 0.1685795 (200)	total: 3.29s	remaining: 21.3s
300:	learn: 0.1530929	test: 0.1680756	best: 0.1680756 (300)	total: 4.94s	remaining: 19.7s
400:	learn: 0.1517884	test: 0.1678101	best: 0.1678101 (400)	total: 6.57s	remaining: 18s
500:	learn: 0.1507010	test: 0.1677525	best: 0.1677359 (446)	total: 8.14s	remaining: 16.2s
600:	learn: 0.1496581	test: 0.1676744	best: 0.1676393 (582)	total: 9.72s	remaining: 14.5s
700:	learn: 0.1486306	test: 0.1676675	best: 0.1676393 (582)	total: 11.3s	remaining: 12.9s
800:	learn: 0.1476759	test: 0.1676778	best: 0.1676393 (582)	total: 13s	remaining: 11.4s
900:	learn: 0.1467130	test: 0.1676736	best: 0.1676393 (582)	total: 14.7s	remaining: 9.75s
1000:	learn: 0.1459098	test: 0.1676204	best: 0.1676126 (997)	total: 16.3s	remaining: 8.15s
1100:	learn: 0.145

Default metric period is 5 because MSLE is/are not implemented for GPU
Metric MSLE is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.1745178	test: 0.1796718	best: 0.1796718 (0)	total: 22.2ms	remaining: 33.2s
100:	learn: 0.1580492	test: 0.1634667	best: 0.1634667 (100)	total: 1.75s	remaining: 24.2s
200:	learn: 0.1558155	test: 0.1622765	best: 0.1622746 (199)	total: 3.44s	remaining: 22.3s
300:	learn: 0.1542328	test: 0.1618887	best: 0.1618887 (300)	total: 5.16s	remaining: 20.6s
400:	learn: 0.1528890	test: 0.1617171	best: 0.1617029 (396)	total: 6.88s	remaining: 18.9s
500:	learn: 0.1516387	test: 0.1616411	best: 0.1616275 (493)	total: 8.59s	remaining: 17.1s
600:	learn: 0.1504646	test: 0.1615239	best: 0.1615239 (600)	total: 10.3s	remaining: 15.4s
700:	learn: 0.1493632	test: 0.1614487	best: 0.1614474 (693)	total: 12s	remaining: 13.6s
800:	learn: 0.1482986	test: 0.1613493	best: 0.1613493 (800)	total: 13.7s	remaining: 11.9s
900:	learn: 0.1472443	test: 0.1612319	best: 0.1612319 (900)	total: 15.3s	remaining: 10.2s
1000:	learn: 0.1463340	test: 0.1611519	best: 0.1611513 (960)	total: 17s	remaining: 8.48s
1100:	learn: 0.1

Default metric period is 5 because MSLE is/are not implemented for GPU
Metric MSLE is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.1769559	test: 0.1690746	best: 0.1690746 (0)	total: 19.7ms	remaining: 29.6s
100:	learn: 0.1591815	test: 0.1560811	best: 0.1560811 (100)	total: 1.74s	remaining: 24.1s
200:	learn: 0.1569284	test: 0.1553209	best: 0.1553209 (200)	total: 3.45s	remaining: 22.3s
300:	learn: 0.1551285	test: 0.1549338	best: 0.1549316 (297)	total: 5.15s	remaining: 20.5s
400:	learn: 0.1537626	test: 0.1548378	best: 0.1548291 (365)	total: 6.91s	remaining: 18.9s
500:	learn: 0.1525335	test: 0.1547689	best: 0.1547431 (484)	total: 8.71s	remaining: 17.4s
600:	learn: 0.1514667	test: 0.1547397	best: 0.1547303 (595)	total: 10.4s	remaining: 15.6s
700:	learn: 0.1504087	test: 0.1547078	best: 0.1546705 (676)	total: 12.2s	remaining: 13.9s
800:	learn: 0.1494829	test: 0.1547038	best: 0.1546607 (779)	total: 13.9s	remaining: 12.1s
900:	learn: 0.1485965	test: 0.1545772	best: 0.1545750 (897)	total: 15.5s	remaining: 10.3s
1000:	learn: 0.1478733	test: 0.1546036	best: 0.1545600 (976)	total: 17.2s	remaining: 8.55s
1100:	learn:

Default metric period is 5 because MSLE is/are not implemented for GPU
Metric MSLE is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.1755165	test: 0.1756901	best: 0.1756901 (0)	total: 19.9ms	remaining: 29.8s
100:	learn: 0.1585822	test: 0.1611586	best: 0.1611586 (100)	total: 1.75s	remaining: 24.3s
200:	learn: 0.1563801	test: 0.1601570	best: 0.1601570 (200)	total: 3.48s	remaining: 22.5s
300:	learn: 0.1548022	test: 0.1598543	best: 0.1598520 (299)	total: 5.2s	remaining: 20.7s
400:	learn: 0.1536210	test: 0.1597368	best: 0.1597324 (387)	total: 6.89s	remaining: 18.9s
500:	learn: 0.1525367	test: 0.1597078	best: 0.1596909 (461)	total: 8.58s	remaining: 17.1s
600:	learn: 0.1514305	test: 0.1596789	best: 0.1596700 (575)	total: 10.2s	remaining: 15.3s
700:	learn: 0.1505850	test: 0.1596596	best: 0.1596393 (662)	total: 11.9s	remaining: 13.6s
800:	learn: 0.1496335	test: 0.1596123	best: 0.1596045 (760)	total: 13.6s	remaining: 11.9s
900:	learn: 0.1488428	test: 0.1595952	best: 0.1595695 (872)	total: 15.4s	remaining: 10.2s
1000:	learn: 0.1479673	test: 0.1596104	best: 0.1595695 (872)	total: 17.1s	remaining: 8.51s
1100:	learn: 

In [31]:
scores = []
frames_for_metamodel = []
models = []

for i, (train_index, valid_index) in enumerate(strat_kfold.split(train_, train_['target'])):
    train_with_embs_churn = train_.copy()
    train_with_embs_churn['new_time'] = time_models[i].predict(train_[good_cols])
    good_cols_churn = good_cols + ['new_time']
    X, y = train_with_embs_churn.drop(['time',], axis=1), train_['target']
    X_train, X_val = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_val = y.iloc[train_index], y.iloc[valid_index]

    model = CatBoostClassifier(
        iterations = 2500,
        depth=4,
        learning_rate=0.03,
        eval_metric='AUC',
        cat_features = cat_cols,
        early_stopping_rounds=400,
        task_type="GPU",
    )

    model.fit(Pool(X_train[good_cols_churn], y_train, cat_features=cat_cols),
              eval_set=Pool(X_val[good_cols_churn], y_val, cat_features=cat_cols),
              verbose=100)
    models.append(model)
    
    pred = model.predict_proba(X_val[good_cols_churn])[:, 1]
    frames_for_metamodel.append(pd.DataFrame({'user_id': X_val.user_id.values, 'pred_time': pred}))
    scores.append(metrics.roc_auc_score(y_val, pred))

print(np.mean(scores))
metadata = pd.concat(frames_for_metamodel, axis=0).reset_index(drop=True)

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6002763	best: 0.6002763 (0)	total: 31.2ms	remaining: 1m 17s
100:	test: 0.7511741	best: 0.7511741 (100)	total: 2.91s	remaining: 1m 9s
200:	test: 0.7558572	best: 0.7558921 (199)	total: 5.79s	remaining: 1m 6s
300:	test: 0.7577723	best: 0.7578166 (298)	total: 8.75s	remaining: 1m 3s
400:	test: 0.7584710	best: 0.7585066 (396)	total: 11.7s	remaining: 1m 1s
500:	test: 0.7598427	best: 0.7598427 (500)	total: 14.7s	remaining: 58.6s
600:	test: 0.7606794	best: 0.7606794 (600)	total: 17.7s	remaining: 56s
700:	test: 0.7609795	best: 0.7611614 (647)	total: 20.8s	remaining: 53.4s
800:	test: 0.7612635	best: 0.7613713 (795)	total: 23.8s	remaining: 50.4s
900:	test: 0.7619169	best: 0.7619228 (886)	total: 26.6s	remaining: 47.2s
1000:	test: 0.7621937	best: 0.7622318 (992)	total: 29.4s	remaining: 44s
1100:	test: 0.7622654	best: 0.7622821 (1096)	total: 32.3s	remaining: 41.1s
1200:	test: 0.7624866	best: 0.7625483 (1184)	total: 35.2s	remaining: 38.1s
1300:	test: 0.7627062	best: 0.7628767 (1268)	total: 

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6160697	best: 0.6160697 (0)	total: 29.9ms	remaining: 1m 14s
100:	test: 0.7634661	best: 0.7634661 (100)	total: 2.9s	remaining: 1m 8s
200:	test: 0.7685124	best: 0.7685414 (199)	total: 5.74s	remaining: 1m 5s
300:	test: 0.7703156	best: 0.7703156 (300)	total: 8.63s	remaining: 1m 3s
400:	test: 0.7704294	best: 0.7705578 (349)	total: 11.5s	remaining: 1m
500:	test: 0.7709096	best: 0.7709353 (477)	total: 14.3s	remaining: 57.2s
600:	test: 0.7706779	best: 0.7711200 (509)	total: 17.2s	remaining: 54.2s
700:	test: 0.7704222	best: 0.7711200 (509)	total: 20s	remaining: 51.3s
800:	test: 0.7703659	best: 0.7711200 (509)	total: 22.8s	remaining: 48.4s
900:	test: 0.7699825	best: 0.7711200 (509)	total: 25.8s	remaining: 45.8s
bestTest = 0.7711199522
bestIteration = 509
Shrink model to first 510 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5982572	best: 0.5982572 (0)	total: 30.3ms	remaining: 1m 15s
100:	test: 0.7541140	best: 0.7541518 (98)	total: 2.96s	remaining: 1m 10s
200:	test: 0.7618610	best: 0.7618610 (200)	total: 5.83s	remaining: 1m 6s
300:	test: 0.7644429	best: 0.7645193 (299)	total: 8.62s	remaining: 1m 2s
400:	test: 0.7646207	best: 0.7650103 (376)	total: 11.5s	remaining: 1m
500:	test: 0.7655461	best: 0.7655810 (490)	total: 14.5s	remaining: 57.7s
600:	test: 0.7655390	best: 0.7658744 (580)	total: 17.4s	remaining: 54.9s
700:	test: 0.7662399	best: 0.7662565 (698)	total: 20.5s	remaining: 52.6s
800:	test: 0.7664516	best: 0.7666271 (762)	total: 23.5s	remaining: 49.8s
900:	test: 0.7668462	best: 0.7668846 (898)	total: 26.4s	remaining: 46.8s
1000:	test: 0.7671015	best: 0.7671362 (920)	total: 29.3s	remaining: 43.8s
1100:	test: 0.7672448	best: 0.7673094 (1045)	total: 32.2s	remaining: 40.9s
1200:	test: 0.7674336	best: 0.7675496 (1182)	total: 35.1s	remaining: 38s
1300:	test: 0.7671384	best: 0.7675496 (1182)	total: 3

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5992469	best: 0.5992469 (0)	total: 24.6ms	remaining: 1m 1s
100:	test: 0.7265410	best: 0.7265410 (100)	total: 2.92s	remaining: 1m 9s
200:	test: 0.7317606	best: 0.7317606 (200)	total: 5.96s	remaining: 1m 8s
300:	test: 0.7345701	best: 0.7345793 (298)	total: 8.84s	remaining: 1m 4s
400:	test: 0.7362769	best: 0.7363341 (382)	total: 11.7s	remaining: 1m 1s
500:	test: 0.7374393	best: 0.7374393 (500)	total: 14.6s	remaining: 58.3s
600:	test: 0.7381889	best: 0.7382060 (596)	total: 17.3s	remaining: 54.8s
700:	test: 0.7391945	best: 0.7391945 (700)	total: 20.1s	remaining: 51.6s
800:	test: 0.7396207	best: 0.7396207 (800)	total: 23s	remaining: 48.8s
900:	test: 0.7399612	best: 0.7399871 (890)	total: 25.8s	remaining: 45.8s
1000:	test: 0.7401854	best: 0.7402571 (955)	total: 28.5s	remaining: 42.7s
1100:	test: 0.7405192	best: 0.7405564 (1094)	total: 31.2s	remaining: 39.7s
1200:	test: 0.7408019	best: 0.7408326 (1163)	total: 34s	remaining: 36.8s
1300:	test: 0.7408524	best: 0.7409928 (1280)	total: 3

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5908697	best: 0.5908697 (0)	total: 31ms	remaining: 1m 17s
100:	test: 0.7488512	best: 0.7488512 (100)	total: 2.96s	remaining: 1m 10s
200:	test: 0.7536393	best: 0.7536393 (200)	total: 5.81s	remaining: 1m 6s
300:	test: 0.7548836	best: 0.7548836 (300)	total: 8.73s	remaining: 1m 3s
400:	test: 0.7555639	best: 0.7555670 (399)	total: 11.6s	remaining: 1m
500:	test: 0.7557991	best: 0.7559799 (469)	total: 14.4s	remaining: 57.5s
600:	test: 0.7559902	best: 0.7563169 (569)	total: 17.2s	remaining: 54.3s
700:	test: 0.7560040	best: 0.7563169 (569)	total: 20.1s	remaining: 51.6s
800:	test: 0.7555795	best: 0.7563169 (569)	total: 22.9s	remaining: 48.6s
900:	test: 0.7558665	best: 0.7563169 (569)	total: 25.8s	remaining: 45.7s
bestTest = 0.7563169003
bestIteration = 569
Shrink model to first 570 iterations.
0.760144637683849


In [33]:
time =[time_models[i].predict(test_[good_cols]) for i in range(5)]
test_['new_time'] = np.mean(time, axis =0)
predict = np.zeros(len(test_))
for i in range(len(models)):
    predict += models[i].predict_proba(test_[good_cols_churn])[:, 1]
print(metrics.roc_auc_score(test_['target'], predict))

0.7748567409721092


In [34]:
test_pred = {'user_id': test_.user_id.values, 'pred_time': predict}

In [None]:
models = []

for i, (train_index, valid_index) in enumerate(strat_kfold.split(train_with_embs, train_with_embs['target'])):
    train_with_embs_churn = train_with_embs.copy()
    train_with_embs_churn['new_time'] = time_models[i].predict(train_with_embs[good_cols])
    good_cols_churn = good_cols + ['new_time']
    X, y = train_with_embs_churn.drop(['time',], axis=1), train_with_embs['target']
    X_train, X_val = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_val = y.iloc[train_index], y.iloc[valid_index]

    model = CatBoostClassifier(
        iterations = 2500,
        depth=4,
        learning_rate=0.03,
        eval_metric='AUC',
        cat_features = cat_cols,
        early_stopping_rounds=400,
        task_type="GPU",
    )

    model.fit(Pool(X_train[good_cols_churn], y_train, cat_features=cat_cols),
              eval_set=Pool(X_val[good_cols_churn], y_val, cat_features=cat_cols),
              verbose=100)
    models.append(model)

In [None]:
time =[time_models[i].predict(test_with_embs[good_cols]) for i in range(5)]
test_with_embs['new_time'] = np.mean(time, axis =0)
predict = np.zeros(len(test_with_embs))
for i in range(len(models)):
    predict += models[i].predict_proba(test_with_embs[good_cols_churn])[:, 1]
print(metrics.roc_auc_score(test_with_embs['target'], predict))

In [None]:
real_pred = {'user_id': test_with_embs.user_id.values, 'pred_time': predict}

metadata = pd.concat([metadata, test_pred, real_pred])
metadata['pred_time'] = metadata['pred_time']/5
metadata.to_csv('time_pred_meta.csv', index=False)