# Colab setup

In [1]:
if 'google.colab' in str(get_ipython()):
    !pip install pytorch-lifestream

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Data load

In [2]:
import os

if not os.path.exists('data/transactions_train.csv'):
    ! mkdir -p data
    ! curl -OL https://storage.yandexcloud.net/datasouls-competitions/sirius/data.zip
    ! unzip -j -o data.zip '*.csv' -d data
    ! mv data.zip data/

## Setup

In [3]:
%load_ext autoreload
%autoreload 2

# import logging
import torch
import pytorch_lightning as pl
# import warnings

# warnings.filterwarnings('ignore')
# logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)

In [4]:
%ls -la data/

total 1059284
drwxr-xr-x 2 root root      4096 Mar 23 12:49 [0m[01;34m.[0m/
drwxr-xr-x 1 root root      4096 Mar 23 14:07 [01;34m..[0m/
-rw-r--r-- 1 root root 251394334 Mar 23 12:49 data.zip
-rw-r--r-- 1 root root       212 Oct 30  2019 ._small_group_description.csv
-rwxr-xr-x 1 root root     10694 Oct 30  2019 [01;32msmall_group_description.csv[0m*
-rw-r--r-- 1 root root       212 Oct 30  2019 ._test.csv
-rwxr-xr-x 1 root root    115609 Oct 30  2019 [01;32mtest.csv[0m*
-rw-r--r-- 1 root root       212 Oct 30  2019 ._train_target.csv
-rwxr-xr-x 1 root root    233306 Oct 30  2019 [01;32mtrain_target.csv[0m*
-rw-r--r-- 1 root root       212 Oct 30  2019 ._transactions_test.csv
-rwxr-xr-x 1 root root 333549487 Oct 30  2019 [01;32mtransactions_test.csv[0m*
-rw-r--r-- 1 root root       212 Oct 30  2019 ._transactions_train.csv
-rwxr-xr-x 1 root root 499354971 Oct 30  2019 [01;32mtransactions_train.csv[0m*


## Data preproccessing

In [5]:
import os
import pandas as pd

data_path = 'data/'

source_data = pd.read_csv(os.path.join(data_path, 'transactions_train.csv'))
source_data.head(2)

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017


In [6]:
initial_test = pd.read_csv(os.path.join(data_path, 'transactions_test.csv'))
initial_test.head()

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,46445,3,0,19.555
1,46445,3,1,27.774
2,46445,4,0,18.114
3,46445,4,1,22.183
4,46445,5,2,45.795


In [7]:
whole_dataset = pd.concat([source_data, initial_test])

In [8]:
from ptls.preprocessing import PandasDataPreprocessor

preprocessor = PandasDataPreprocessor(
    col_id='client_id',
    col_event_time='trans_date',
    event_time_transformation='none',
    cols_category=['small_group'],
    cols_numerical=['amount_rur'],
    return_records=True,
)

In [9]:
%%time

whole_dataset = preprocessor.fit_transform(whole_dataset.sample(frac=0.5, random_state=42))

CPU times: user 1min 13s, sys: 7.3 s, total: 1min 20s
Wall time: 1min 24s


In [10]:
import pickle

with open('preprocessor.p', 'wb') as f:
    pickle.dump(preprocessor, f)

In [11]:
source_data = preprocessor.fit_transform(source_data)

In [12]:
dataset = sorted(source_data, key=lambda x: x['client_id'])

In [13]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(dataset, test_size=0.2, random_state=42)

len(train), len(test)

(24000, 6000)

In [14]:
train[0].keys()

dict_keys(['client_id', 'trans_date', 'event_time', 'small_group', 'amount_rur'])

## Embedding training

Model training in our framework organised via pytorch-lightning (pl) framework.
The key parts of neural networks training in pl are: 

    * model (`pytorch_lightning.LightningModule`)
    * data loader (`torch.utils.data.DataLoader`)
    * trainer (`pytorch_lightning.Trainer`)
    
For futher details check https://pytorchlightning.ai/

### Model definition

In [15]:
from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule

trx_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={'amount_rur': 'identity'},
    embeddings={
        'trans_date': {'in': 800, 'out': 16},
        'small_group': {'in': 250, 'out': 16},
    },
)

seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=256,
    type='gru',
)

model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9),
)

### Data loader

In [16]:
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule

train_dl = PtlsDataModule(
    train_data=ColesDataset(
        MemoryMapDataset(
            data=whole_dataset,
            i_filters=[
                SeqLenFilter(min_seq_len=25),
            ],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=25,
            cnt_max=200,
        ),
    ),
    train_num_workers=16,
    train_batch_size=256,
)

### Trainer

In [17]:
import torch
import pytorch_lightning as pl

import logging

trainer = pl.Trainer(
    max_epochs=15,
    gpus=1 if torch.cuda.is_available() else 0,
    enable_progress_bar=False,
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True, used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


### Training 

In [18]:
%%time
print(f'logger.version = {trainer.logger.version}')
trainer.fit(model, train_dl)
print(trainer.logged_metrics)

  rank_zero_warn("You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.")


logger.version = 0


INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 240 K 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
240 K     Trainable params
0         Non-trainable params
240 K     Total params
0.962     Total estimated model params size (MB)


{'loss': tensor(93.3641), 'seq_len': tensor(111.9600)}
CPU times: user 11min 2s, sys: 51.5 s, total: 11min 54s
Wall time: 15min 48s


### Save sequence encoder for other experiments

In [None]:
torch.save(seq_encoder.state_dict(), "coles-emb.pt")

## Inference 

In [19]:
# embedding inference

from ptls.data_load.datasets import inference_data_loader

train_dl = inference_data_loader(train, num_workers=0, batch_size=256)
train_embeds = torch.vstack(trainer.predict(model, train_dl, ))

test_dl = inference_data_loader(test, num_workers=0, batch_size=256)
test_embeds = torch.vstack(trainer.predict(model, test_dl))

train_embeds.shape, test_embeds.shape

INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


(torch.Size([24000, 256]), torch.Size([6000, 256]))

In [20]:
# join target and embeddings

df_target = pd.read_csv(os.path.join(data_path, 'train_target.csv'))
df_target = df_target.set_index('client_id')
df_target.rename(columns={"bins": "target"}, inplace=True)

train_df = pd.DataFrame(data=train_embeds, columns=[f'embed_{i}' for i in range(train_embeds.shape[1])])
train_df['client_id'] = [x['client_id'] for x in train]
train_df = train_df.merge(df_target, how='left', on='client_id')

test_df = pd.DataFrame(data=test_embeds, columns=[f'embed_{i}' for i in range(test_embeds.shape[1])])
test_df['client_id'] = [x['client_id'] for x in test]
test_df = test_df.merge(df_target, how='left', on='client_id')

print(train_df.shape, test_df.shape)

(24000, 258) (6000, 258)


Obtained embeddings can be used as features for model training

For example:

In [21]:
from sklearn.ensemble import RandomForestClassifier

embed_columns = [x for x in train_df.columns if x.startswith('embed')]
x_train, y_train = train_df[embed_columns], train_df['target']
x_test, y_test = test_df[embed_columns], test_df['target']

clf = RandomForestClassifier()
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

0.588

In [22]:
import lightgbm

clf = lightgbm.LGBMClassifier(
    max_depth=6,
    learning_rate=0.02,
    n_estimators=500,
    objective = 'binary',
    subsample= 0.5,
    subsample_freq= 1,
    feature_fraction= 0.75,
    lambda_l1= 1,
    lambda_l2= 1,
    min_data_in_leaf= 50,
    random_state= 42,
    n_jobs= 8
)
clf.fit(x_train, y_train)
clf.score(x_test, y_test)



0.621