In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import confusion_matrix

from src.utils import calculate_profit, train_val_test_split

## Construct the `profit` and `profitable` variables

In [2]:
funnel = pd.read_csv('train_data/funnel.csv')
funnel['profit'] = calculate_profit(funnel)
funnel['profitable'] = (funnel['profit'] > 0).astype(int)

train_funnel, val_funnel, test_funnel = train_val_test_split(funnel, (0.8, 0.1, 0.1))

## Construct balances

I'll try to get last balance, and then difference from last balance for last few months

In [3]:
balances = pd.read_csv('train_data/balance.csv')

In [4]:
# Sum up across all accounts
account_sums = balances.groupby(['client_id', 'month_end_dt'])[['avg_bal_sum_rur', 'max_bal_sum_rur', 'min_bal_sum_rur']].sum()

# Get average range (max - min) for all clients
account_sums['range'] = account_sums['max_bal_sum_rur'] - account_sums['min_bal_sum_rur']
avg_range = account_sums['range'].mean(level='client_id', skipna=True)

# Get the average amount in the last month
last_month_avg = account_sums.iloc[account_sums.index.get_level_values('month_end_dt') == '2019-08-31']
last_month_avg = last_month_avg['avg_bal_sum_rur']
last_month_avg.index = last_month_avg.index.droplevel('month_end_dt')

# Get differences from the average in the last month
account_sums['diff_last_month'] = account_sums['avg_bal_sum_rur'] - last_month_avg
diffs = account_sums['diff_last_month'].reset_index().query('month_end_dt != "2019-08-31"')
diffs = diffs.pivot(index='client_id', columns='month_end_dt')

# Put together all balance features
diffs.columns = [f'balance_diff_{c[1]}' for c in diffs.columns]
balance_ft = diffs
balance_ft['avg_range'] = avg_range
balance_ft['last_month_avg'] = last_month_avg

In [5]:
balance_ft.describe()

Unnamed: 0,balance_diff_2018-09-30,balance_diff_2018-10-31,balance_diff_2018-11-30,balance_diff_2018-12-31,balance_diff_2019-01-31,balance_diff_2019-02-28,balance_diff_2019-03-31,balance_diff_2019-04-30,balance_diff_2019-05-31,balance_diff_2019-06-30,balance_diff_2019-07-31,avg_range,last_month_avg
count,16455.0,16710.0,17056.0,17412.0,17633.0,17942.0,18271.0,18423.0,18582.0,18605.0,18618.0,18619.0,18619.0
mean,87.98821,51.67319,46.710483,33.441822,17.955481,10.769981,4.449948,8.166042,5.37913,6.46509,4.41852,52.00072,-90.814759
std,1884.751453,1181.022633,1157.721772,950.131726,553.243603,547.045252,498.944701,476.696726,352.540639,324.753785,262.595122,172.862765,934.295467
min,-38052.0,-25178.0,-24428.0,-23909.0,-23319.0,-22554.0,-21500.0,-21311.0,-21300.0,-21454.0,-20211.0,0.0,-31173.0
25%,-5.0,-7.0,-8.0,-8.0,-10.0,-11.0,-13.0,-10.0,-8.0,-5.0,-2.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.833333,0.0
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.833333,7.0
max,108444.0,77117.0,77565.0,76899.0,11144.0,13059.0,12013.0,22115.0,11144.0,11144.0,5795.0,7792.083333,38052.0


## Construct client dataset

In [6]:
client = pd.read_csv('train_data/client.csv')

In [7]:
# Take out citizenship and job_type, they are useless
client_ft = client.set_index('client_id')[['gender', 'age', 'region', 'city', 'education']]

# Filter out cities and region to only those above 200, make them categorical (not numerical)
region_counts = client_ft['region'].value_counts(dropna=False)
top_regions = region_counts[region_counts > 200].index

city_counts = client_ft['city'].value_counts(dropna=False)
top_cities = city_counts[city_counts > 200].index

client_ft.loc[~ client_ft['city'].isin(top_cities), 'city'] = None
client_ft.loc[~ client_ft['region'].isin(top_regions), 'region'] = None

client_ft['city'] = client_ft['city'].astype('str')
client_ft['region'] = client_ft['region'].astype('str')

client_ft[['gender', 'education']] = client_ft[['gender', 'education']].fillna('nan')

In [8]:
client_ft.describe(include='all')

Unnamed: 0,gender,age,region,city,education
count,21498,21495.0,21498.0,21498.0,21498.0
unique,3,,29.0,17.0,8.0
top,F,,,,
freq,11116,,4461.0,13862.0,12218.0
mean,,43.097139,,,
std,,10.798968,,,
min,,21.0,,,
25%,,34.0,,,
50%,,42.0,,,
75%,,52.0,,,


## Construct final dataset

In [9]:
LABEL_COLS = ['sale_flg', 'sale_amount', 'contacts', 'profit', 'profitable']
labels_all = funnel.set_index('client_id')[LABEL_COLS]
full_data = pd.concat([
    labels_all,
    balance_ft,
    client_ft
], axis=1)

## Split into train, val, test

In [10]:
train, val, test = train_val_test_split(full_data, [0.8, 0.1, 0.1])
CAT_FEATURES = ['gender', 'region', 'city', 'education']


train_pool = Pool(
    data = train.drop(LABEL_COLS, axis=1),
    label = train['profitable'],
    cat_features=CAT_FEATURES
)

val_pool = Pool(data = val.drop(LABEL_COLS, axis=1), cat_features=CAT_FEATURES)
test_pool = Pool(data = test.drop(LABEL_COLS, axis=1), cat_features=CAT_FEATURES)

In [11]:
train

Unnamed: 0_level_0,sale_flg,sale_amount,contacts,profit,profitable,gender,age,region,city,education
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
-2488830281532074571,0,,2,-8000.00,0,F,65.0,15.0,,MIDDLE_PROFESSIONAL
7914417574231135424,0,,1,-4000.00,0,F,34.0,53.0,,
-7626311780150639240,0,,1,-4000.00,0,M,31.0,42.0,,
-3946603467009374110,0,,1,-4000.00,0,M,64.0,3.0,205.0,
3323016450605975186,0,,1,-4000.00,0,M,48.0,,,
...,...,...,...,...,...,...,...,...,...,...
2515539885248575892,0,,1,-4000.00,0,F,63.0,,,HIGHER_PROFESSIONAL
5572359620133684185,0,,1,-4000.00,0,M,37.0,28.0,,
-3363683058774926794,0,,3,-12000.00,0,M,30.0,25.0,33.0,
-2263929114370172357,1,20013.00,1,16013.00,1,F,45.0,,,HIGHER_PROFESSIONAL


## Construct model

In [12]:
model = CatBoostClassifier(
    iterations=1000,
    depth=10,
    loss_function='Logloss',
    custom_metric=['Accuracy'],
)

## Run training

In [13]:
model.fit(train_pool)

Learning rate set to 0.034712
0:	learn: 0.6239152	total: 62.4ms	remaining: 1m 2s
1:	learn: 0.5674945	total: 73.4ms	remaining: 36.6s
2:	learn: 0.5307583	total: 80.4ms	remaining: 26.7s
3:	learn: 0.4981069	total: 90.4ms	remaining: 22.5s
4:	learn: 0.4607091	total: 101ms	remaining: 20.2s
5:	learn: 0.4361164	total: 113ms	remaining: 18.7s
6:	learn: 0.4095739	total: 144ms	remaining: 20.5s
7:	learn: 0.3883541	total: 157ms	remaining: 19.5s
8:	learn: 0.3822626	total: 162ms	remaining: 17.8s
9:	learn: 0.3653959	total: 166ms	remaining: 16.5s
10:	learn: 0.3511528	total: 191ms	remaining: 17.2s
11:	learn: 0.3391617	total: 201ms	remaining: 16.6s
12:	learn: 0.3298603	total: 208ms	remaining: 15.8s
13:	learn: 0.3218783	total: 242ms	remaining: 17s
14:	learn: 0.3166560	total: 269ms	remaining: 17.7s
15:	learn: 0.3109013	total: 297ms	remaining: 18.3s
16:	learn: 0.3060087	total: 307ms	remaining: 17.7s
17:	learn: 0.3051283	total: 310ms	remaining: 16.9s
18:	learn: 0.3007845	total: 339ms	remaining: 17.5s
19:	learn

<catboost.core.CatBoostClassifier at 0x7fb3675627f0>

## Evaluate profitability of solution on val

In [14]:
pred_val = model.predict(val_pool)
bs_prof_val = val['profit'].mean()

selected_data_val = val[pred_val.astype(bool)]
model_prof_val = calculate_profit(selected_data_val).mean()

print(f'VAL | Baselin profit: {bs_prof_val:.2f}, model profit: {model_prof_val:.2f}')

VAL | Baselin profit: 3177.79, model profit: 22801.45


## Evaluate profitability of solution on test

In [15]:
pred_test = model.predict(test_pool)
bs_prof_test = test['profit'].mean()

selected_data_test = test[pred_test.astype(bool)]
model_prof_test = calculate_profit(selected_data_test).mean()

print(f'TEST | Baselin profit: {bs_prof_test:.2f}, model profit: {model_prof_test:.2f}')

TEST | Baselin profit: 2133.93, model profit: 19279.12


## Notes

1. Using just balance feature, I get 28876.93 on test
2. Using client features, I get on test 19279.12 on test
3. Using balance + client features, I get 26439.51 on test 🤔