In [None]:
import os
# Process-wide settings, applied in this first cell before pandas/catboost
# are imported: numexpr thread limits and the CUDA device made visible.
os.environ.update({
    'NUMEXPR_MAX_THREADS': '15',
    'NUMEXPR_NUM_THREADS': '12',
    'CUDA_VISIBLE_DEVICES': '0',
})

In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
import random
import warnings
import copy
from sklearn.feature_extraction.text import TfidfVectorizer
# Suppress all warnings for the rest of the notebook.
warnings.simplefilter('ignore')

# Let pandas render up to 100 columns and 100 rows when displaying frames.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

# Seed both the stdlib and the NumPy global random generators.
random.seed(42)
np.random.seed(42)

Объединим всех пользователей (train и submit) в один датафрейм для создания фичей

In [None]:
# Labelled training rows.
main = pd.read_csv('../data/train.csv')

# Submission template: drop its 'predict' column and tag every row with
# target = -1 so these rows can be told apart from the training rows later.
sample = (
    pd.read_csv('../data/sample_submit_naive.csv')
    .drop(columns='predict')
    .assign(target=-1)
)

# Stack both parts; the original row labels of each part are kept as-is.
main = pd.concat([main, sample])

Откроем дополнительные файлы

In [None]:
# Client attributes and the report-date lookup (report_dt parsed as datetime).
clients = pd.read_csv('../data/clients.csv')
report_dates = pd.read_csv('../data/report_dates.csv', parse_dates=['report_dt'])

# Transactions, ordered chronologically with a fresh 0..n-1 index.
transactions = (
    pd.read_csv('../data/transactions.csv', parse_dates=['transaction_dttm'])
    .sort_values('transaction_dttm')
    .reset_index(drop=True)
)

In [None]:
# Treat each user's sequence of MCC codes as a text document and compute
# TF-IDF weights per code.
# `assign` returns a new frame, so `transactions` itself is left untouched.
df = transactions.assign(mcc_code=transactions['mcc_code'].astype(str))

# One space-separated string of MCC codes per user.
user_docs = df.groupby('user_id')['mcc_code'].apply(' '.join).reset_index()

# Tokens are runs of digits delimited by word boundaries.
vectorizer = TfidfVectorizer(token_pattern=r'\b\d+\b')
tfidf_matrix = vectorizer.fit_transform(user_docs['mcc_code'])

# Dense frame: one row per user_id (index), one column per token.
tfidf = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    index=user_docs['user_id'],
    columns=vectorizer.get_feature_names_out(),
)

In [None]:
# Attach the client attributes, then label-encode employee_count_nm
# (missing values are encoded as their own 'unknown' class).

main = pd.merge(main, clients, how='left', on='user_id')
employee_count_filled = main['employee_count_nm'].fillna('unknown')
main['employee_count_nm'] = LabelEncoder().fit_transform(employee_count_filled)

In [None]:
# Left-join the TF-IDF features; 'user_id' is a column in `main` and the
# index name of `tfidf`.
main = pd.merge(main, tfidf, how='left', on='user_id')

Для MCC-кодов, которые встречались не менее 10 раз, агрегируем информацию о транзакциях по пользователю и коду MCC

In [None]:
# Frequency of every MCC code over all transactions; keep codes seen >= 10 times.
good_codes = transactions['mcc_code'].value_counts()
good_codes = good_codes[good_codes >= 10]

# value_counts() stores the codes in the index and the frequencies in the
# values, so membership has to be tested against the index. Passing the
# Series itself to isin() would compare MCC codes with the frequencies.
frequent_mask = transactions['mcc_code'].isin(good_codes.index)

# Per user and MCC code: number, median and sum of transaction amounts.
mcc_info = transactions[frequent_mask].pivot_table(
    index='user_id',
    values=['transaction_amt'],
    columns=['mcc_code'],
    aggfunc=['count', 'median', 'sum']
).fillna(0)
# Flatten the (aggfunc, value, mcc_code) column tuples into 'main_<agg>_<value>_<code>'.
mcc_info.columns = ['main_' + '_'.join(map(str, x)) for x in mcc_info.columns]

# Share of each MCC code in the user's transaction count.
count_cols = [x for x in mcc_info.columns if 'count' in x]
mcc_total = mcc_info[count_cols].sum(axis=1)
mcc_shares = mcc_info[count_cols].div(mcc_total, axis=0).add_suffix('_norm')
mcc_info = pd.concat([mcc_info, mcc_shares], axis=1)

main = main.merge(mcc_info, how='left', left_on='user_id', right_index=True)

Сбор информации о транзакциях в каждой валюте за последние 30 дней, а также за весь промежуток

In [None]:
# Enrich every transaction with the client's report id and its report date,
# then compute the whole-day distance from the transaction to the report.
df_more = (
    transactions
    .merge(clients[['user_id', 'report']], how='left', on='user_id')
    .merge(report_dates, how='left', on='report')
)
time_to_report = df_more['report_dt'] - df_more['transaction_dttm']
df_more['days_to_report'] = time_to_report.dt.days


for day_diff in [30, 1000]:

    # Transactions that fall inside the current window.
    # NOTE(review): the cutoff is day_diff + 100, not day_diff; presumably the
    # report date lies ~100 days after the last observed transaction - confirm.
    in_window = df_more['days_to_report'] < day_diff + 100

    # Transaction amount statistics per currency.
    currency_pivot = df_more[in_window].pivot_table(
        index='user_id',
        columns='currency_rk',
        values='transaction_amt',
        aggfunc=['sum', 'mean', 'median', 'count']
    ).fillna(0)
    currency_pivot.columns = [f'currency_daydiff_{day_diff}_{x[0]}_{x[1]}' for x in currency_pivot.columns]

    # Share of each currency in the user's transaction count. The count
    # columns are discovered from the pivot instead of assuming currency ids
    # 0..3, so a currency missing from the window no longer raises KeyError;
    # the resulting names ('..._count_<currency>_norm') are unchanged.
    currency_count_cols = [c for c in currency_pivot.columns if 'count' in c]
    currency_total = currency_pivot[currency_count_cols].sum(axis=1)
    for col in currency_count_cols:
        currency_pivot[f'{col}_norm'] = currency_pivot[col] / currency_total

    main = main.merge(currency_pivot, how='left', left_on='user_id', right_index=True)

    # Sum / count / median of amounts over all, positive-only and
    # negative-only transactions in the window (merged in this order).
    amount_filters = {
        'general_trans_info': in_window,
        'positive_general_trans_info': in_window & (df_more['transaction_amt'] > 0),
        'negative_general_trans_info': in_window & (df_more['transaction_amt'] < 0),
    }
    for prefix, row_mask in amount_filters.items():
        general_trans_info = df_more[row_mask].groupby('user_id')['transaction_amt'].agg(['sum', 'count', 'median'])
        general_trans_info[['sum', 'count']] = general_trans_info[['sum', 'count']].fillna(0)
        general_trans_info.columns = [f'{prefix}_{day_diff}_{x}' for x in general_trans_info]
        main = main.merge(general_trans_info, how='left', left_on='user_id', right_index=True)


# Number of transactions in the last n days relative to the number before them.
# NOTE(review): the boundary is x + 100 days before the report date - confirm
# the meaning of the 100-day offset.
for x in [5, 30]:
    prev_col = f'num_transaction_before_{x}_days'
    last_col = f'num_transaction_last_{x}_days'

    prev = (
        df_more[df_more['days_to_report'] > x + 100]
        .groupby('user_id')['report'].agg(['count']).reset_index()
        .rename({'count': prev_col}, axis=1)
    )
    last = (
        df_more[df_more['days_to_report'] <= x + 100]
        .groupby('user_id')['report'].agg(['count']).reset_index()
        .rename({'count': last_col}, axis=1)
    )

    main = main.merge(prev, how='left', on='user_id')
    main = main.merge(last, how='left', on='user_id')

    # Assign the filled columns back: `main[col].fillna(0, inplace=True)` acts
    # on a temporary selection and leaves `main` unchanged under Copy-on-Write.
    main[[last_col, prev_col]] = main[[last_col, prev_col]].fillna(0)

    # A zero "before" count yields inf here (NaN for 0 / 0).
    main[f'percent_last_{x}'] = main[last_col] / main[prev_col]

In [None]:
# Per-user non-null counts of every column over negative-amount transactions.
# NOTE(review): `has_tr` is not referenced anywhere else in the visible notebook.
negative_amount = df_more['transaction_amt'] < 0
has_tr = df_more[negative_amount].groupby('user_id', as_index=False).count()

In [None]:
# Per user: number of distinct days_to_report values (days with transactions),
# distinct MCC codes and distinct currencies.
nunique_features = {
    'days_to_report': 'nunique_days',
    'mcc_code': 'nunique_mcc_codes',
    'currency_rk': 'nunique_currency',
}
for source_col, feature_name in nunique_features.items():
    distinct_per_user = df_more.groupby('user_id')[source_col].nunique()
    main = (
        main
        .merge(distinct_per_user, how='left', on='user_id')
        .rename({source_col: feature_name}, axis=1)
    )

# Replace every missing value accumulated so far with 0.
main = main.fillna(0)

Информация о количестве и размере транзакций в разрезе часов.

In [None]:
# Transactions with the hour of day extracted from the timestamp.
tmp = transactions.assign(hour=transactions['transaction_dttm'].dt.hour)

# Per user and hour: number of transactions and median amount.
pivot_table = tmp.pivot_table(
    index='user_id',
    columns='hour',
    values='transaction_amt',
    aggfunc=['count', 'median'],
).fillna(0)
pivot_table.columns = [f'hour_{agg}_{hour}' for agg, hour in pivot_table.columns]

# Share of each hour in the user's transaction count, stored as '<col>_norm'.
count_cols = [name for name in pivot_table.columns if 'count' in name]
hourly_total = pivot_table[count_cols].sum(axis=1)
hourly_share = pivot_table[count_cols].div(hourly_total, axis=0).add_suffix('_norm')
pivot_table = pd.concat([pivot_table, hourly_share], axis=1)

main = main.merge(pivot_table, how='left', left_on='user_id', right_index=True)

Фичи, основанные на временных отрезках

In [None]:
# First and last transaction timestamp per user, joined with the report date.
cur = transactions.groupby('user_id')['transaction_dttm'].agg(['min', 'max']).reset_index()
cur = cur.merge(clients[['user_id', 'report']], how='left', on='user_id')
cur = cur.merge(report_dates, how='left', on='report')

# Whole days from the first / last transaction to the report date.
cur['min_diff_dttm'] = (cur['report_dt'] - cur['min']).dt.days
cur['days_to_report'] = (cur['report_dt'] - cur['max']).dt.days

# Length of the user's activity span in days. The first transaction is at
# least as far from the report date as the last one, so the span is
# min_diff_dttm - days_to_report (non-negative). The reversed subtraction
# produced a value <= 0, which breaks the downstream
# (max_min_diff_dttm + 1) / nunique_days ratio.
cur['max_min_diff_dttm'] = cur['min_diff_dttm'] - cur['days_to_report']

main = main.merge(cur[['user_id', 'min_diff_dttm','days_to_report','max_min_diff_dttm']], how='left', on='user_id')

In [None]:
# Ratios built on max_min_diff_dttm: per transaction in the 1000-day window,
# and (shifted by one day) per distinct day with transactions.
activity_span = main['max_min_diff_dttm']
main['trx_density'] = activity_span.div(main['general_trans_info_1000_count'])
main['days_density'] = activity_span.add(1).div(main['nunique_days'])

In [None]:
# Preview the first rows of the assembled feature table.
main.head()

Подготовка данных к обучению модели

In [None]:
# Categorical features are cast to strings before being handed to CatBoost.
cat_cols = ['employee_count_nm', 'report']
main[cat_cols] = main[cat_cols].astype(str)

# Deterministic row order, then split on the target = -1 marker that was
# assigned to the submission rows.
main = main.sort_values('user_id').reset_index(drop=True)
is_submission_row = main['target'] == -1
train1 = main[~is_submission_row]
test1 = main[is_submission_row]

In [None]:
# Precomputed embeddings table read from CSV; it is joined to the feature
# frames on 'user_id' below.
embeddings_path = '../embeddings/wtte_coles_trx.csv'
main_embs = pd.read_csv(embeddings_path)

In [None]:
# Join the embeddings onto both parts. merge() defaults to an inner join, so
# rows whose user_id has no embedding row are dropped.
train1, test1 = (
    part.merge(main_embs, on='user_id')
    for part in (train1, test1)
)

In [None]:
# Labelled users listed in test_ids.csv form the local hold-out set (`test`);
# all other labelled users stay in `train`.
test_ids = pd.read_csv('../data/test_ids.csv')
in_holdout = train1['user_id'].isin(test_ids['user_id'])
train, test = train1.loc[~in_holdout], train1.loc[in_holdout]

Отбор фичей вместе с отбором компонент эмбеддингов

In [None]:
# Preliminary model used only to rank features by importance.
# The feature frame is built once and reused for fitting and for naming.
selection_features = train.drop(['user_id', 'target', 'time'], axis=1)

model = CatBoostClassifier(
    iterations = 1400,
    depth=5,
    learning_rate=0.03,
    task_type="GPU",
    eval_metric='AUC',
    cat_features = cat_cols,
    thread_count=6,
    # NOTE(review): no eval_set is passed to fit(), so early stopping
    # presumably never triggers here - confirm.
    early_stopping_rounds=200,
)
model.fit(selection_features, train['target'], verbose=100)


df_imp = pd.DataFrame({
    'name': selection_features.columns,
    'imp': model.get_feature_importance()
}).sort_values('imp', ascending=False)
# display(df_imp) # Uncomment to inspect the preliminary feature importances

df_imp = df_imp[df_imp['imp'] > 0.15] # Keep every feature whose importance exceeds 0.15

# Original note: statistical features are added here because using them for
# this training step would leak data.
# NOTE(review): no such features are added in the visible code.
good_cols = df_imp['name'].tolist()

# The later Pools are always built with cat_features=cat_cols, so every
# categorical column must survive the selection; otherwise Pool construction
# fails on a missing column.
good_cols += [col for col in cat_cols if col not in good_cols]

In [None]:
# Display the features that passed the importance threshold.
df_imp

Обучение основных моделей на 5 фолдах. Стратификация по target; возможно, следует попробовать что-нибудь другое :)

In [None]:
# 5-fold cross-validation on `train`, stratified by target, without shuffling
# (the sklearn default). Out-of-fold predictions are collected per user for
# the meta-model.
strat_kfold = StratifiedKFold(n_splits=5)

X, y = train.drop(['time', 'target'], axis=1), train['target']
scores = []
frames_for_metamodel = []
models = []
for train_index, valid_index in strat_kfold.split(train, train['target']):

    X_train, X_val = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_val = y.iloc[train_index], y.iloc[valid_index]

    model = CatBoostClassifier(
        iterations = 15000,
        depth=4,
        learning_rate=0.03,
        eval_metric='AUC',
        task_type="GPU",
        cat_features = cat_cols,
        early_stopping_rounds=400,
        random_seed=42
    )

    # The validation fold doubles as the early-stopping eval set.
    model.fit(Pool(X_train[good_cols], y_train, cat_features=cat_cols),
              eval_set=Pool(X_val[good_cols], y_val, cat_features=cat_cols),
              verbose=100)
    models.append(model)

    pred = model.predict_proba(X_val[good_cols])[:, 1]
    # Record each fold's AUC exactly once, so `scores` has one entry per fold.
    scores.append(metrics.roc_auc_score(y_val, pred))
    frames_for_metamodel.append(pd.DataFrame({'user_id': X_val.user_id.values, 'pred_agg_trx': pred}))

# Mean out-of-fold AUC.
print(np.mean(scores))
metadata = pd.concat(frames_for_metamodel, axis=0).reset_index(drop=True)

In [None]:
# Average the fold models' positive-class probabilities on the hold-out set.
predict = np.zeros(len(test))
for fold_model in models:
    predict += fold_model.predict_proba(test[good_cols])[:, 1]

# Divide by the actual number of models instead of a hard-coded 5, so the
# average stays correct if the number of folds changes.
test_pred = pd.DataFrame({'user_id': test.user_id.values, 'pred_agg_trx': predict / len(models)})

# AUC depends only on the ranking, so scoring the un-normalised sum is fine.
print(metrics.roc_auc_score(test['target'], predict))

In [None]:
# Second round: 5-fold CV over all labelled rows (`train1`), shuffled with a
# fixed seed and stratified by target. These models produce the submission
# predictions.
strat_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

X = train1.drop(['time', 'target'], axis=1)
y = train1['target']
scores, models = [], []

for train_index, valid_index in strat_kfold.split(train1, train1['target']):

    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_val, y_val = X.iloc[valid_index], y.iloc[valid_index]

    fit_pool = Pool(X_train[good_cols], y_train, cat_features=cat_cols)
    eval_pool = Pool(X_val[good_cols], y_val, cat_features=cat_cols)

    model = CatBoostClassifier(
        iterations=15000,
        depth=4,
        learning_rate=0.03,
        eval_metric='AUC',
        task_type="GPU",
        cat_features=cat_cols,
        early_stopping_rounds=400,
        random_seed=42,
    )
    # The validation fold doubles as the early-stopping eval set.
    model.fit(fit_pool, eval_set=eval_pool, verbose=100)
    models.append(model)

    pred = model.predict_proba(X_val[good_cols])[:, 1]
    fold_auc = metrics.roc_auc_score(y_val, pred)
    scores.append(fold_auc)

# Mean out-of-fold AUC.
print(np.mean(scores))

In [None]:
# Average the fold models' positive-class probabilities on the submission rows.
predict = np.zeros(len(test1))
for fold_model in models:
    predict += fold_model.predict_proba(test1[good_cols])[:, 1]

# Divide by the actual number of models instead of a hard-coded 5.
real_pred = pd.DataFrame({'user_id': test1.user_id.values, 'pred_agg_trx': predict / len(models)})

In [None]:
# Stack out-of-fold, hold-out and submission predictions into one table and
# write it for the meta-model. The former self-assignment of 'pred_agg_trx'
# was a no-op and has been removed.
# NOTE(review): 'td-idf' in the file name looks like a typo for 'tf-idf'; the
# path is left unchanged because other code may read it.
metadata = pd.concat([metadata, test_pred, real_pred])
metadata.to_csv('../predictions/agg_trx_td-idf.csv', index=False)