In [1]:
if 'google.colab' in str(get_ipython()):
    !pip install pytorch-lifestream

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch-lifestream
  Downloading pytorch-lifestream-0.5.2.tar.gz (150 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.3/150.3 KB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting hydra-core>=1.1.2
  Downloading hydra_core-1.3.2-py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.5/154.5 KB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
Collecting transformers==4.*
  Downloading transformers-4.27.2-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m101.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytorch-lightning==1.6.*
  Dow

In [2]:
import os
import pandas as pd
import torch

In [3]:
if not os.path.exists('data/transactions.csv'):
    ! mkdir -p data
    ! curl -OL https://storage.yandexcloud.net/datasouls-ods/materials/0433a4ca/transactions.zip
    ! unzip -j -o transactions.zip '*.csv' -d data
    ! mv transactions.zip data/

if not os.path.exists('data/education.csv'):
    ! mkdir -p data
    ! curl -OL https://storage.yandexcloud.net/datasouls-ods/materials/e756bf99/train.csv
    #! unzip -j -o transactions.zip '*.csv' -d data
    ! mv train.csv data/


print(f'Loaded csv files')

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  250M  100  250M    0     0  20.3M      0  0:00:12  0:00:12 --:--:-- 21.8M
Archive:  transactions.zip
  inflating: data/transactions.csv   
  inflating: data/._transactions.csv  
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  307k  100  307k    0     0   296k      0  0:00:01  0:00:01 --:--:--  296k
Loaded csv files


In [4]:
def trx_types(df):
    df['mcc_code'] = df['mcc_code'].astype(str)
    df['currency_rk'] = df['currency_rk'].astype(str)
    df['event_time'] = pd.to_datetime(df['transaction_dttm']).astype(int) / 1e9
    return df[['user_id', 'event_time', 'mcc_code', 'currency_rk', 'transaction_amt']]


def click_types(df):
    df['event_time'] = pd.to_datetime(df['timestamp']).astype(int) / 1e9
    # df = pd.merge(df, click_categories, on='cat_id')
    df['cat_id'] = df['cat_id'].astype(str)
    return df[['user_id', 'event_time', 'cat_id', 'new_uid']]


In [5]:

data_path = 'data/'

In [6]:
%%time
target = pd.read_csv(os.path.join(data_path, 'train.csv'))

CPU times: user 8.45 ms, sys: 4.93 ms, total: 13.4 ms
Wall time: 18.8 ms


# Transactions preprocessing

In [7]:
%%time
transactions = pd.read_csv(os.path.join(data_path, 'transactions.csv'))
transactions = transactions.dropna()
transactions = trx_types(transactions)

CPU times: user 42.7 s, sys: 6.82 s, total: 49.6 s
Wall time: 49.4 s


In [8]:
transactions.head(2)

Unnamed: 0,user_id,event_time,mcc_code,currency_rk,transaction_amt
0,000932580e404dafbecd5916d4640938,1596442000.0,5411,48,-361.0723
1,000932580e404dafbecd5916d4640938,1596591000.0,5499,48,-137.31398


In [9]:
from ptls.preprocessing import PandasDataPreprocessor

preprocessor_trx = PandasDataPreprocessor(
    col_id='user_id',
    col_event_time='event_time',
    event_time_transformation='none',
    cols_category=["mcc_code", "currency_rk"],
    cols_numerical=["transaction_amt"],
    return_records=True,
)

In [10]:

%%time
dataset_transactions = preprocessor_trx.fit_transform(transactions)
print(f'Transactions features prepared')

Transactions features prepared
CPU times: user 47.5 s, sys: 8.89 s, total: 56.4 s
Wall time: 57.3 s


In [11]:
del preprocessor_trx
del transactions

# Encoding

In [12]:
dataset_transactions = sorted(dataset_transactions, key=lambda x: x['user_id'])

In [13]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(dataset_transactions, test_size=0.2, random_state=42)

len(train), len(test)

(18026, 4507)

In [14]:
train[0].keys()

dict_keys(['user_id', 'event_time', 'mcc_code', 'currency_rk', 'transaction_amt'])

### Model definition

In [16]:
!pip install pyhocon

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyhocon
  Downloading pyhocon-0.3.60.tar.gz (158 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 KB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyhocon
  Building wheel for pyhocon (setup.py) ... [?25l[?25hdone
  Created wheel for pyhocon: filename=pyhocon-0.3.60-py3-none-any.whl size=20883 sha256=cb9e2b3b17a957adff3f1236cdedadbd6095b7a81cb325356f5c6969d0ec349a
  Stored in directory: /root/.cache/pip/wheels/f5/f4/9a/e4bc73f243333e1e0da7d62cff5e60cf258e37936f496538ad
Successfully built pyhocon
Installing collected packages: pyhocon
Successfully installed pyhocon-0.3.60


In [23]:
from functools import partial
from ptls.nn import AggFeatureSeqEncoder
from ptls.frames.coles import CoLESModule
from pyhocon import ConfigFactory

# seq_encoder = AggFeatureSeqEncoder(
#     numeric_values={'transaction_amt': 'identity'},
#     embeddings={
#         'currency_rk': {'in': 100},
#         'mcc_code': {'in': 200},
#     },
#     was_logified = True,
#     log_scale_factor = 1,
# )

def get_conf():
    params = {
        'numeric_values': {
            'transaction_amt': {'identity'},
        },
        'embeddings': {
            'currency_rk': {'in': 100},
            'mcc_code': {'in': 200}
        },
        'was_logified': True,
        'log_scale_factor': 1.0,
    }
    return ConfigFactory.from_dict(params)

seq_encoder = AggFeatureSeqEncoder(**get_conf())

model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9),
)

In [31]:
from ptls.frames.supervised import SequenceToTarget


In [24]:
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule

train_dl = PtlsDataModule(
    train_data=ColesDataset(
        MemoryMapDataset(
            data=train,
            i_filters=[
                SeqLenFilter(min_seq_len=25),
            ],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=25,
            cnt_max=200,
        ),
    ),
    train_num_workers=16,
    train_batch_size=256,
)

In [25]:
import torch
import pytorch_lightning as pl

import logging

trainer = pl.Trainer(
    max_epochs=15,
    gpus=1 if torch.cuda.is_available() else 0,
    enable_progress_bar=False,
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True, used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
# %%time
# print(f'logger.version = {trainer.logger.version}')
# trainer.fit(model, train_dl)
# print(trainer.logged_metrics)

In [None]:
# torch.save(seq_encoder.state_dict(), "coles-emb.pt")

In [32]:
# embedding inference

from ptls.data_load.datasets import inference_data_loader

train_dl = inference_data_loader(train, num_workers=0, batch_size=256)
train_embeds = torch.vstack(trainer.predict(model, train_dl, ))

test_dl = inference_data_loader(test, num_workers=0, batch_size=256)
test_embeds = torch.vstack(trainer.predict(model, test_dl))

train_embeds.shape, test_embeds.shape

INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


(torch.Size([18026, 906]), torch.Size([4507, 906]))

In [33]:
# join target and embeddings

df_target = pd.read_csv(os.path.join(data_path, 'train.csv'))
df_target.rename(columns={"bank": "user_id"}, inplace=True)
df_target = df_target.set_index('user_id')
df_target.rename(columns={"higher_education": "target"}, inplace=True)

train_df = pd.DataFrame(data=train_embeds, columns=[f'embed_{i}' for i in range(train_embeds.shape[1])])
train_df['user_id'] = [x['user_id'] for x in train]
# train_df = train_df.merge(df_target, how='left', on='user_id')
train_df = train_df.merge(df_target, on='user_id')

test_df = pd.DataFrame(data=test_embeds, columns=[f'embed_{i}' for i in range(test_embeds.shape[1])])
test_df['user_id'] = [x['user_id'] for x in test]
# test_df = test_df.merge(df_target, how='left', on='user_id')
test_df = test_df.merge(df_target, on='user_id')
print(train_df.shape, test_df.shape)

(6838, 908) (1671, 908)


In [35]:

from sklearn.ensemble import RandomForestClassifier

embed_columns = [x for x in train_df.columns if x.startswith('embed')]
x_train, y_train = train_df[embed_columns], train_df['target']
x_test, y_test = test_df[embed_columns], test_df['target']

clf = RandomForestClassifier()
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

0.7803710353081987

In [36]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
y_pred = clf.predict(x_test)
print("accuracy score:", accuracy_score(y_test, y_pred))
print("precision score:", precision_score(y_test, y_pred))
print("recall score:", recall_score(y_test, y_pred))
print("f1 score:", f1_score(y_test, y_pred))
print("roc auc_score:", roc_auc_score(y_test, y_pred))

accuracy score: 0.7803710353081987
precision score: 0.7943358057990559
recall score: 0.95
f1 score: 0.8652221814175541
roc auc_score: 0.6211716937354987
