<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Data" data-toc-modified-id="Data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Data</a></span></li><li><span><a href="#EDA" data-toc-modified-id="EDA-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>EDA</a></span></li><li><span><a href="#Feature-engineering" data-toc-modified-id="Feature-engineering-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Feature engineering</a></span></li><li><span><a href="#Models" data-toc-modified-id="Models-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Models</a></span><ul class="toc-item"><li><span><a href="#Log-reg" data-toc-modified-id="Log-reg-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Log reg</a></span></li><li><span><a href="#Catboost" data-toc-modified-id="Catboost-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Catboost</a></span></li></ul></li><li><span><a href="#Clusterization" data-toc-modified-id="Clusterization-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Clusterization</a></span><ul class="toc-item"><li><span><a href="#UMAP" data-toc-modified-id="UMAP-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>UMAP</a></span></li><li><span><a href="#K-Means" data-toc-modified-id="K-Means-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>K-Means</a></span></li></ul></li></ul></div>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

sns.set_style("whitegrid", {'axes.grid' : False})

# from tqdm.notebook import tqdm
from tqdm.auto import tqdm

## Data

In [2]:
df = pd.read_csv('data/transactions_train.csv')

In [3]:
df = df.sort_values(['client_id', 'trans_date'])

In [4]:
bins = pd.read_csv('data/train_target.csv')

In [5]:
df = pd.merge(df, bins, on='client_id')

In [6]:
print('before downsampling:\n', df.dtypes)
df['client_id'] = pd.to_numeric(df['client_id'], downcast='unsigned')
df['trans_date'] = pd.to_numeric(df['trans_date'], downcast='unsigned')
df['small_group'] = pd.to_numeric(df['small_group'], downcast='signed')
df['amount_rur'] = pd.to_numeric(df['amount_rur'], downcast='float')

print('after downsampling:\n',df.dtypes)

before downsampling:
 client_id        int64
trans_date       int64
small_group      int64
amount_rur     float64
bins             int64
dtype: object
after downsampling:
 client_id       uint16
trans_date      uint16
small_group      int16
amount_rur     float32
bins             int64
dtype: object


In [7]:
groups = pd.read_csv('data/small_group_description.csv')

In [8]:
df['weekday'] = df['trans_date'] % 7

## EDA

In [None]:
np.random.seed(228)

In [None]:
sns.boxplot(x=df['amount_rur'].sample(1000000))
plt.xscale('log')

In [None]:
sns.boxplot(x=df[df['small_group'] == 1]['amount_rur'])
plt.xscale('log')
plt.title('Supermarkets transaction amount')

In [9]:
tmp = df.groupby('small_group')[['amount_rur']].agg(['mean', 'count', 'max', 'min']).reset_index()
tmp.columns =['small_group_code', 'mean', 'count', 'max', 'min']
tmp = tmp.set_index('small_group_code')
tmp = tmp.join(groups, on='small_group_code', how='outer').reset_index(drop=True)
tmp = tmp.sort_values('count', ascending=False)
tmp

Unnamed: 0,mean,count,max,min,small_group,small_group_code
1,41.666897,8131374.0,22447.980469,0.002000,Сетевые супермаркеты и продуктовые магазины,1
11,21.339022,3202881.0,11515.068359,0.001000,Фастфуд и кафе,11
3,19.819042,2428984.0,24261.560547,0.001000,Оплата телефона и связи,3
15,21.617123,1427077.0,13318.199219,0.005000,Несетевые супермаркеты и продуктовые магазины,15
4,34.616478,1207604.0,38081.308594,0.001000,Аптеки,4
...,...,...,...,...,...,...
198,45.794998,1.0,45.794998,45.794998,Услуги починки (металлические изделия),199
199,126.484001,1.0,126.484001,126.484001,Телеграф,200
201,68.968002,1.0,68.968002,68.968002,Бетонные работы,203
202,,,,,Снегоходы,194


In [None]:
mask = df['small_group'] == 1
sns.boxplot(x=df[mask]['amount_rur'])
plt.xscale('log')

In [None]:
fig, axs = plt.subplots(figsize=(20, 4))

sns.barplot(x=list(range(len(tmp))), y=tmp['count'].sort_values(ascending=False));

In [None]:
quantiles = np.arange(0.1, 1.1, 0.1)
counts = [int(tmp['count'].quantile(i)) for i in quantiles]
quantiles = pd.DataFrame({'quantile': quantiles, 'count': counts})
quantiles

In [None]:
tmp = tmp.sort_values('mean', ascending=False)
tmp

In [None]:
tmp = tmp.sort_values('mean', ascending=False)
tmp[:30]

In [None]:
tmp = tmp.sort_values('max', ascending=False)
tmp

In [None]:
tmp = tmp.sort_values('min', ascending=False)
tmp

**drop transactions from categories with low number of samples**

keep transactions from categories > 0.7 quantile

In [10]:
df = df[df['small_group'].isin(set(tmp[tmp['count'] > 23829]['small_group_code']))]

## Feature engineering

basically, RFM

In [11]:
import gc

count of small group transactions

In [12]:
tmp = df.groupby(['client_id', 'small_group'])[['amount_rur']].count().reset_index()
tmp = tmp.pivot(index='client_id', columns='small_group', values='amount_rur')
tmp = tmp.fillna(0)

In [13]:
tmp.columns = [f'group{c}_cnt' for c in tmp.columns]

In [14]:
client_df = tmp

https://stackoverflow.com/questions/14507794/pandas-how-to-flatten-a-hierarchical-index-in-columns

count of transactions by weekday

In [15]:
tmp = df.groupby(['client_id', 'weekday'])[['amount_rur']].count().reset_index()
tmp = tmp.pivot(index='client_id', columns='weekday', values='amount_rur')
tmp = tmp.fillna(0)

In [16]:
tmp.columns = [f'weekday{c}_cnt' for c in tmp.columns]

In [17]:
client_df = client_df.join(tmp)

count of small group transactions by weekday

In [18]:
tmp = df.groupby(['client_id', 'weekday', 'small_group'])[['amount_rur']].count().reset_index()
tmp = tmp.pivot(index='client_id', columns=['small_group', 'weekday'], values='amount_rur')
tmp = tmp.fillna(0)

In [19]:
tmp.columns = [f'group{c1}_weekday{c2}_cnt' for c1, c2 in tmp.columns.values]

In [20]:
client_df = client_df.join(tmp)

mean by small group

In [21]:
tmp = df.groupby(['client_id', 'small_group'])[['amount_rur']].mean().reset_index()
tmp = tmp.pivot(index='client_id', columns='small_group', values='amount_rur')
tmp = tmp.fillna(0)

In [22]:
tmp.columns = [f'group{i}_mean' for i in tmp.columns]

In [23]:
client_df = client_df.join(tmp)

mean amount by weekday

In [24]:
tmp = df.groupby(['client_id', 'weekday'])[['amount_rur']].mean().reset_index()
tmp = tmp.pivot(index='client_id', columns='weekday', values='amount_rur')
tmp = tmp.fillna(0)

In [25]:
tmp.columns = [f'weekday{i}_mean' for i in tmp.columns]

In [26]:
client_df = client_df.join(tmp)

mean amount by small group and weekday

In [27]:
tmp = df.groupby(['client_id', 'small_group', 'weekday'])[['amount_rur']].mean().reset_index()
tmp = tmp.pivot(index='client_id', columns=['small_group', 'weekday'], values='amount_rur')
tmp = tmp.fillna(0)

In [28]:
tmp.columns = [f'group{c1}_weekday{c2}_mean' for c1, c2 in tmp.columns]

In [29]:
client_df = client_df.join(tmp)

max/min in each category

In [30]:
tmp = df.groupby(['client_id', 'small_group'])[['amount_rur']].max().reset_index()
tmp = tmp.pivot(index='client_id', columns='small_group', values='amount_rur')
tmp = tmp.fillna(0)

In [31]:
tmp.columns = [f'group{i}_max' for i in tmp.columns]

In [32]:
client_df = client_df.join(tmp)

In [33]:
tmp = df.groupby(['client_id', 'small_group'])[['amount_rur']].min().reset_index()
tmp = tmp.pivot(index='client_id', columns='small_group', values='amount_rur')
tmp = tmp.fillna(0)

In [34]:
tmp.columns = [f'group{i}_min' for i in tmp.columns]

In [35]:
client_df = client_df.join(tmp)

target

In [36]:
client_df = client_df.join(bins.set_index('client_id'))

In [38]:
client_df.to_csv('data/client_df.csv')

## Models

In [None]:
from catboost import CatBoostRegressor, CatBoostClassifier, Pool, cv
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [40]:
X = client_df.drop('bins', axis=1)
y = client_df.reset_index()['bins']

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.3, shuffle=False, random_state=228)

### Log reg

In [None]:
logit = LogisticRegression(multi_class='ovr', random_state=228, solver='liblinear')
logit.fit(X_train, y_train)

In [None]:
logit_train_pred = logit.predict_proba(X_train)
logit_valid_pred = logit.predict_proba(X_valid)

logit_train_pred = logit.predict(X_train)
logit_valid_pred = logit.predict(X_valid)

In [None]:
print(metrics.classification_report(y_valid, logit.predict(X_valid)))

In [None]:
cm=metrics.confusion_matrix(y_valid, logit_valid_pred )
cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
# for i, r in enumerate(cm_norm):
#     r[i] = 0

In [None]:
sns.heatmap(cm_norm, annot=True, cmap='magma')

### Catboost

In [None]:
train_pool = Pool(
    X_train, y_train,
)
val_pool = Pool(
    X_valid, y_valid,
)

In [None]:
params = {
    #'loss_function': 'Logloss',
    'loss_function':'MultiClass',
    'custom_metric': ['F1', 'Accuracy'],
    'iterations': 400,
    'random_seed': 228,
    'auto_class_weights': 'Balanced',
    #'early_stopping_rounds': 100,
    'use_best_model': True,
}

In [None]:
model = CatBoostClassifier(
    **params,
    depth=3,
    verbose=True,
    l2_leaf_reg=3,
    learning_rate=0.1
)

In [None]:
model.fit(
    train_pool,
    eval_set=val_pool,
    verbose=False, plot=True,
);

In [None]:
importances = [(col, imp) for (col, imp) in zip(X_train, model.get_feature_importance(train_pool))]
importances = pd.DataFrame(importances, columns=['column', 'importance'])
cb_imp = importances.copy()
importances = importances.sort_values('importance', ascending=False)

In [None]:
fig, axs = plt.subplots(figsize=(4, 6))

sns.barplot(x='importance',
            y='column',
            orient='h',
            color='green',
            data=importances.head(30))

In [None]:
y_pred = model.predict(X_valid)[:, 0]

In [None]:
print(metrics.classification_report(y_valid, y_pred))

In [None]:
cm=metrics.confusion_matrix(y_valid, y_pred)
cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
# for i, r in enumerate(cm_norm):
#     r[i] = 0

In [None]:
sns.heatmap(cm_norm, annot=True, cmap='magma')

## Clusterization

Кластеризация пользователей на основе handmade признаков обоснование выбора признаков с использованием EDA. Кластеризация пользователей любым способом, который кажется пригодным(с учетом особенностей данных). Визуализация категорий пользователей с использованием алгоритмов понижения размерности данных.

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
X = client_df.drop('bins', axis=1)

### UMAP

In [None]:
from umap import UMAP

In [None]:
X_scaled = StandardScaler().fit_transform(X)

In [None]:
umap = UMAP(
    n_components=2,
    init='random',
    min_dist=0.5,
    random_state=228)

In [None]:
# takes 20 seconds on my pc
X_umap = umap.fit_transform(X_scaled)

In [None]:
df_to_plot = pd.DataFrame(X_umap, columns=['x', 'y'])
df_to_plot['bins'] = client_df['bins'].astype(str)

In [None]:
fig, axs = plt.subplots(figsize=(10, 8))

sns.scatterplot(
    x='x', y='y', hue='bins',
    data=df_to_plot,
    alpha=0.3)

plt.title("UMAP embedding of the clients")

In [None]:
df_to_plot_umap = df_to_plot.copy()

### K-Means

In [None]:
from sklearn.cluster import KMeans

In [None]:
# 1 min 36 sec
inertia = []
ks = range(2, 10, 3)
for k in ks:
    print(k)
    kmeans = KMeans(n_clusters=k, random_state=1).fit(X_scaled)
    inertia.append(np.sqrt(kmeans.inertia_))

In [None]:
plt.plot(ks, inertia, marker='s');
plt.xlabel('$k$')
plt.ylabel('$J(C_k)$');

4 clusters

In [None]:
kmeans = KMeans(n_clusters=4, random_state=1).fit(X_scaled)

In [None]:
pd.Series(kmeans.labels_).value_counts()

In [None]:
# take umap
df_to_plot = pd.DataFrame(X_umap, columns=['x', 'y'])
df_to_plot['bins'] = client_df['bins'].astype(str)
df_to_plot['k-label'] = kmeans.labels_
df_to_plot['k-label'] = df_to_plot['k-label'].astype(str)

In [None]:
df_to_plot_not_scaled = df_to_plot.copy()

In [None]:
fig, axs = plt.subplots(figsize=(16, 8))

plt.subplot(1, 2, 1)

sns.scatterplot(
    x='x', y='y', hue='k-label',
    data=df_to_plot,
    alpha=0.2)
plt.title('UMAP representations of K-Means clusters')

plt.subplot(1, 2, 2)

plt.title('UMAP representations of age groups')

sns.scatterplot(
    x='x', y='y', hue='bins',
    data=df_to_plot_umap,
    alpha=0.2)

fig.tight_layout()

clusters are tight unlike age bins