In [1]:
if 'google.colab' in str(get_ipython()):
    !pip install pytorch-lifestream

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import os

if not os.path.exists('data/transactions_train.csv'):
    ! mkdir -p data
    ! curl -OL https://storage.yandexcloud.net/di-datasets/rosbank-ml-contest-boosters.pro.zip
    ! unzip -j -o rosbank-ml-contest-boosters.pro.zip '*.csv' -d data
    ! mv rosbank-ml-contest-boosters.pro.zip data/

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 8979k  100 8979k    0     0  4296k      0  0:00:02  0:00:02 --:--:-- 4296k
Archive:  rosbank-ml-contest-boosters.pro.zip
  inflating: data/test.csv           
  inflating: data/train.csv          


In [3]:
%load_ext autoreload
%autoreload 2

import torch
import pytorch_lightning as pl

In [4]:
import os
import pandas as pd

data_path = 'data/'

source_data = pd.read_csv(os.path.join(data_path, 'train.csv'))
source_data['TRDATETIME'] =  pd.to_datetime(source_data['TRDATETIME'], format='%d%b%y:%H:%M:%S')
# source_data = source_data.sort_values(by='TRDATETIME', ascending=True).reset_index()
# source_data['TRDATETIME'] = source_data.index
# source_data.drop(columns=['index'], inplace=True)
source_data.head(2)

Unnamed: 0,PERIOD,cl_id,MCC,channel_type,currency,TRDATETIME,amount,trx_category,target_flag,target_sum
0,01/10/2017,0,5200,,810,2017-10-21 00:00:00,5023.0,POS,0,0.0
1,01/10/2017,0,6011,,810,2017-10-12 12:24:07,20000.0,DEPOSIT,0,0.0


In [5]:
initial_test = pd.read_csv(os.path.join(data_path, 'test.csv'))
initial_test['TRDATETIME'] =  pd.to_datetime(initial_test['TRDATETIME'], format='%d%b%y:%H:%M:%S')
# initial_test = initial_test.sort_values(by='TRDATETIME', ascending=True).reset_index()
# initial_test['TRDATETIME'] = initial_test.index
# initial_test.drop(columns=['index'], inplace=True)
initial_test.head(2)

Unnamed: 0,PERIOD,cl_id,MCC,channel_type,currency,TRDATETIME,amount,trx_category
0,01/04/2017,2,5814,,810,2017-04-18,153.0,POS
1,01/03/2017,2,5812,,810,2017-03-10,388.08,POS


In [6]:
whole_dataset = pd.concat([
    source_data.drop(columns=[
        'target_flag', 'target_sum'
    ]), initial_test]).sort_values(by='TRDATETIME', ascending=True).reset_index()
whole_dataset['TRDATETIME'] = whole_dataset.index
whole_dataset.drop(columns=['index'], inplace=True)
whole_dataset.head(2)

Unnamed: 0,PERIOD,cl_id,MCC,channel_type,currency,TRDATETIME,amount,trx_category
0,01/10/2016,1290,5411,type2,810,0,2465.0,POS
1,01/10/2016,7689,5921,type1,810,1,119.78,POS


In [7]:
from ptls.preprocessing import PandasDataPreprocessor

preprocessor = PandasDataPreprocessor(
    col_id='cl_id',
    col_event_time='TRDATETIME',
    event_time_transformation='none',
    cols_category=['PERIOD', 'MCC', 'channel_type', 'currency', 'trx_category'],
    cols_numerical=['amount'],
    return_records=True,
)

In [8]:
%%time

whole_dataset = preprocessor.fit_transform(whole_dataset.sample(frac=0.4, random_state=42))

CPU times: user 11.5 s, sys: 240 ms, total: 11.7 s
Wall time: 18.9 s


In [9]:
import pickle

with open('preprocessor.p', 'wb') as f:
    pickle.dump(preprocessor, f)

In [10]:
initial_train = source_data.sort_values(by='TRDATETIME', ascending=True).reset_index()
initial_train['TRDATETIME'] = initial_train.index
target = initial_train[['cl_id', 'target_flag', 'target_sum']]
initial_train.drop(columns=['index', 'target_flag', 'target_sum'], inplace=True)
initial_train = preprocessor.fit_transform(initial_train)

In [11]:
dataset = sorted(initial_train, key=lambda x: x['cl_id'])

In [12]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(dataset, test_size=0.2, random_state=42)

len(train), len(test)

(4000, 1000)

In [13]:
train[0].keys()

dict_keys(['cl_id', 'TRDATETIME', 'event_time', 'PERIOD', 'MCC', 'channel_type', 'currency', 'trx_category', 'amount'])

In [14]:
from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule

trx_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={'amount': 'identity'},
    embeddings={
        'TRDATETIME': {'in': 800, 'out': 16},
        'MCC': {'in': 250, 'out': 16},
        'channel_type': {'in': 250, 'out': 16},
        'currency': {'in': 250, 'out': 16},
        'PERIOD': {'in': 250, 'out': 16},
        'trx_category': {'in': 250, 'out': 16}
    },
)

seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=256,
    type='gru',
)

model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9),
)

In [15]:
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule

train_dl = PtlsDataModule(
    train_data=ColesDataset(
        MemoryMapDataset(
            data=whole_dataset,
            i_filters=[
                SeqLenFilter(min_seq_len=25),
            ],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=25,
            cnt_max=200,
        ),
    ),
    train_num_workers=16,
    train_batch_size=256,
)

In [16]:
import torch
import pytorch_lightning as pl

import logging

trainer = pl.Trainer(
    max_epochs=15,
    gpus=1 if torch.cuda.is_available() else 0,
    enable_progress_bar=False,
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True, used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [17]:
%%time
print(f'logger.version = {trainer.logger.version}')
trainer.fit(model, train_dl)
print(trainer.logged_metrics)

  rank_zero_warn("You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.")


logger.version = 5


INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 305 K 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
305 K     Trainable params
0         Non-trainable params
305 K     Total params
1.223     Total estimated model params size (MB)
  rank_zero_warn(


{'loss': tensor(116.8565), 'seq_len': tensor(42.6269)}
CPU times: user 1min 21s, sys: 19.8 s, total: 1min 41s
Wall time: 3min 16s


In [18]:
# embedding inference

from ptls.data_load.datasets import inference_data_loader

train_dl = inference_data_loader(train, num_workers=0, batch_size=256)
train_embeds = torch.vstack(trainer.predict(model, train_dl, ))

test_dl = inference_data_loader(test, num_workers=0, batch_size=256)
test_embeds = torch.vstack(trainer.predict(model, test_dl))

train_embeds.shape, test_embeds.shape

INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


(torch.Size([4000, 256]), torch.Size([1000, 256]))

In [19]:
# join target and embeddings

df_target = target.set_index('cl_id')

train_df = pd.DataFrame(data=train_embeds, columns=[f'embed_{i}' for i in range(train_embeds.shape[1])])
train_df['cl_id'] = [x['cl_id'] for x in train]
train_df = train_df.merge(df_target, how='left', on='cl_id')

test_df = pd.DataFrame(data=test_embeds, columns=[f'embed_{i}' for i in range(test_embeds.shape[1])])
test_df['cl_id'] = [x['cl_id'] for x in test]
test_df = test_df.merge(df_target, how='left', on='cl_id')

print(train_df.shape, test_df.shape)

(392818, 259) (97695, 259)


In [20]:
from sklearn.ensemble import RandomForestClassifier

embed_columns = [x for x in train_df.columns if x.startswith('embed')]
x_train, y_train = train_df[embed_columns], train_df['target_flag']
x_test, y_test = test_df[embed_columns], test_df['target_flag']

clf = RandomForestClassifier()
clf.fit(x_train, y_train)
print(clf.score(x_test, y_test))

from sklearn.metrics import roc_auc_score 
print(roc_auc_score(y_test, clf.predict_proba(x_test)[:, 1]))

0.7554224883566201
0.808473324012574


In [21]:
import lightgbm

clf = lightgbm.LGBMClassifier(
    max_depth=6,
    learning_rate=0.02,
    n_estimators=500,
    objective = 'binary',
    subsample= 0.5,
    subsample_freq= 1,
    feature_fraction= 0.75,
    lambda_l1= 1,
    lambda_l2= 1,
    min_data_in_leaf= 50,
    random_state= 42,
    n_jobs= 8
)
clf.fit(x_train, y_train)
print(clf.score(x_test, y_test))

from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_test, clf.predict_proba(x_test)[:, 1]))

0.7503249910435539
0.8153928736891062


In [22]:
# from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.metrics import mean_squared_error

# est = GradientBoostingRegressor(
#     n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0,
#     loss='squared_error'
#     ).fit(x_train, y_train)
# mean_squared_error(y_test, est.predict(x_test))

In [23]:
# import numpy as np
# from sklearn.svm import SVR
# import matplotlib.pyplot as plt

# svr_rbf = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1).fit(x_train, y_train)
# print(mean_squared_error(y_test, svr_rbf.predict(x_test)))

# svr_lin = SVR(kernel='linear', C=100, gamma='auto').fit(x_train, y_train)
# print(mean_squared_error(y_test, svr_lin.predict(x_test)))

# svr_poly = SVR(kernel='poly', C=100, gamma='auto', degree=3, epsilon=0.1, coef0=1).fit(x_train, y_train)
# print(mean_squared_error(y_test, svr_poly.predict(x_test)))