In [17]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import confusion_matrix

from src.utils import calculate_profit, train_val_test_split

## Construct the `profit` and `profitable` variables

In [18]:
# Load the funnel table and attach the two targets derived from it.
funnel = pd.read_csv('train_data/funnel.csv')
funnel['profit'] = calculate_profit(funnel)
# Binary target: 1 when the computed profit is strictly positive, else 0.
funnel['profitable'] = funnel['profit'].gt(0).astype(int)

# Three-way split of the funnel rows.
# NOTE(review): (0.8, 0.1, 0.1) is presumably the train/val/test fractions —
# confirm against `train_val_test_split` in src.utils.
train_funnel, val_funnel, test_funnel = train_val_test_split(
    funnel,
    (0.8, 0.1, 0.1),
)

## Construct balances

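For each client, I'll take the average balance in the last available month, and then, for each earlier month, the difference between that month's average balance and the last-month value.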

In [19]:
# Raw balance records; the feature cell below groups them by
# `client_id` and `month_end_dt`.
balances = pd.read_csv(filepath_or_buffer='train_data/balance.csv')

In [20]:
# Month that every balance feature is measured against.
LAST_MONTH = '2019-08-31'

# Sum up across all accounts: one row per (client_id, month_end_dt)
account_sums = balances.groupby(['client_id', 'month_end_dt'])[['avg_bal_sum_rur', 'max_bal_sum_rur', 'min_bal_sum_rur']].sum()

# Get average range (max - min) for all clients.
# `Series.mean(level=...)` was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the index level is the supported equivalent and also skips NaNs.
account_sums['range'] = account_sums['max_bal_sum_rur'] - account_sums['min_bal_sum_rur']
avg_range = account_sums['range'].groupby(level='client_id').mean()

# Get the average amount in the last month, indexed by client_id only.
# A boolean mask belongs in `.loc`; `droplevel` returns a new Series instead
# of overwriting the index of a slice in place.
is_last_month = account_sums.index.get_level_values('month_end_dt') == LAST_MONTH
last_month_avg = account_sums.loc[is_last_month, 'avg_bal_sum_rur'].droplevel('month_end_dt')

# Get differences from the average in the last month.
# `level='client_id'` makes the broadcast of the per-client Series across the
# (client_id, month_end_dt) MultiIndex explicit; clients without a last-month
# row end up with NaN differences.
account_sums['diff_last_month'] = account_sums['avg_bal_sum_rur'].sub(last_month_avg, level='client_id')
diffs = account_sums['diff_last_month'].reset_index()
# The last month's difference from itself carries no information, so drop it.
diffs = diffs[diffs['month_end_dt'] != LAST_MONTH]
# Naming `values` keeps the pivoted columns flat (one column per month).
diffs = diffs.pivot(index='client_id', columns='month_end_dt', values='diff_last_month')
diffs.columns = [f'balance_diff_{month}' for month in diffs.columns]

# Put together all balance features.
# `assign` builds a new frame (aligned on client_id), so `diffs` is not
# silently mutated through an alias.
balance_ft = diffs.assign(avg_range=avg_range, last_month_avg=last_month_avg)

In [21]:
# Summary statistics for every balance feature column (sanity check of the
# feature construction above).
balance_ft.describe()

Unnamed: 0,balance_diff_2018-09-30,balance_diff_2018-10-31,balance_diff_2018-11-30,balance_diff_2018-12-31,balance_diff_2019-01-31,balance_diff_2019-02-28,balance_diff_2019-03-31,balance_diff_2019-04-30,balance_diff_2019-05-31,balance_diff_2019-06-30,balance_diff_2019-07-31,avg_range,last_month_avg
count,16455.0,16710.0,17056.0,17412.0,17633.0,17942.0,18271.0,18423.0,18582.0,18605.0,18618.0,18619.0,18619.0
mean,87.98821,51.67319,46.710483,33.441822,17.955481,10.769981,4.449948,8.166042,5.37913,6.46509,4.41852,52.00072,-90.814759
std,1884.751453,1181.022633,1157.721772,950.131726,553.243603,547.045252,498.944701,476.696726,352.540639,324.753785,262.595122,172.862765,934.295467
min,-38052.0,-25178.0,-24428.0,-23909.0,-23319.0,-22554.0,-21500.0,-21311.0,-21300.0,-21454.0,-20211.0,0.0,-31173.0
25%,-5.0,-7.0,-8.0,-8.0,-10.0,-11.0,-13.0,-10.0,-8.0,-5.0,-2.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.833333,0.0
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.833333,7.0
max,108444.0,77117.0,77565.0,76899.0,11144.0,13059.0,12013.0,22115.0,11144.0,11144.0,5795.0,7792.083333,38052.0


## Construct final dataset

In [22]:
# Columns taken from the funnel table; they are dropped from the model
# features when the training pool is built.
LABEL_COLS = ['sale_flg', 'sale_amount', 'contacts', 'profit', 'profitable']

# Index the label columns by client so they line up with the balance features.
labels_all = funnel.set_index('client_id').loc[:, LABEL_COLS]

# Column-wise concatenation aligned on client_id.
# NOTE(review): this is an outer join — a client present in only one of the
# two frames is kept, with NaN in the columns coming from the other frame.
full_data = pd.concat((labels_all, balance_ft), axis='columns')

In [23]:
# Features are every non-label column of `full_data`; the target is the binary
# `profitable` flag.
# NOTE(review): the pool covers every row of `full_data` — the
# train/val/test split produced earlier is not applied here.
feature_frame = full_data.drop(columns=LABEL_COLS)
pool = Pool(data=feature_frame, label=full_data['profitable'])

## Construct model

In [24]:
# Hyper-parameters gathered in one place so they are easy to inspect or tweak.
# The model optimises Logloss and additionally reports Accuracy.
model_params = dict(
    iterations=1000,
    depth=10,
    loss_function='Logloss',
    custom_metric=['Accuracy'],
)
model = CatBoostClassifier(**model_params)

## Run training

In [25]:
# Train the classifier on `pool`. No separate evaluation set is passed, so the
# log below reports the training loss only.
model.fit(pool)

Learning rate set to 0.038183
0:	learn: 0.6627924	total: 86.6ms	remaining: 1m 26s
1:	learn: 0.6334424	total: 116ms	remaining: 57.8s
2:	learn: 0.6075997	total: 144ms	remaining: 47.9s
3:	learn: 0.5840026	total: 172ms	remaining: 42.7s
4:	learn: 0.5624774	total: 198ms	remaining: 39.4s
5:	learn: 0.5428611	total: 232ms	remaining: 38.4s
6:	learn: 0.5257402	total: 265ms	remaining: 37.6s
7:	learn: 0.5100365	total: 295ms	remaining: 36.6s
8:	learn: 0.4958940	total: 323ms	remaining: 35.6s
9:	learn: 0.4829738	total: 351ms	remaining: 34.8s
10:	learn: 0.4720082	total: 379ms	remaining: 34s
11:	learn: 0.4610312	total: 407ms	remaining: 33.5s
12:	learn: 0.4518615	total: 439ms	remaining: 33.4s
13:	learn: 0.4436490	total: 468ms	remaining: 33s
14:	learn: 0.4357691	total: 497ms	remaining: 32.6s
15:	learn: 0.4287402	total: 526ms	remaining: 32.4s
16:	learn: 0.4225362	total: 555ms	remaining: 32.1s
17:	learn: 0.4158764	total: 582ms	remaining: 31.8s
18:	learn: 0.4101609	total: 611ms	remaining: 31.6s
19:	learn: 0.

<catboost.core.CatBoostClassifier at 0x7f9a3956adf0>

In [26]:
# Model predictions for every row of `pool` — the same pool the model was
# fitted on. The short name `l` is kept because the evaluation cell below
# indexes `full_data` with it.
l = model.predict(pool)

## Evaluate profitability of solution

In [30]:
# Keep only the rows the model predicted as profitable and report their mean
# profit.
# NOTE(review): the predictions come from the same rows the model was trained
# on, so this is an in-sample figure rather than a held-out estimate.
predicted_profitable = l.astype(bool)
selected_data = full_data.loc[predicted_profitable]
calculate_profit(selected_data).mean()

46758.19825606125

In [31]:
# Baseline for comparison: mean profit over all rows of `full_data`, with no
# model-based selection.
full_data['profit'].mean()

2586.1902456042426