In [1]:
import pandas as pd
import numpy as np
import os

## Data preprocessing

In [2]:
# Load the four input tables; the paths are relative to the current working directory.
transactions_df, clients_df, train_df, reports_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/transactions.csv',
        '../data/clients.csv',
        '../data/train.csv',
        '../data/report_dates.csv',
    )
)

In [3]:
# Add `days_to_report`: whole days between each transaction and the report
# date attached to the transaction's client.
transactions_df['transaction_dttm'] = pd.to_datetime(transactions_df['transaction_dttm'])
report_dates = pd.read_csv('../data/report_dates.csv', parse_dates=['report_dt'])

# The result below is copied back row-by-row, so the merges must not multiply
# rows. validate='many_to_one' makes pandas raise MergeError if `user_id`
# (clients) or `report` (report dates) is duplicated on the right-hand side,
# instead of silently producing misaligned values.
df_ = transactions_df.merge(
    clients_df[['user_id', 'report']],
    how='left',
    on='user_id',
    validate='many_to_one',
)
df_ = df_.merge(report_dates, how='left', on='report', validate='many_to_one')

# A left merge keeps the left frame's row order but resets the index, so the
# values are assigned positionally rather than by index label.
transactions_df['days_to_report'] = (
    (df_['report_dt'] - df_['transaction_dttm']).dt.days.to_numpy()
)

In [4]:
# Add the number of days and hours elapsed since each client's first
# transaction and since that client's previous transaction.

# Work on a clean 0..n-1 index so the per-client results computed on a sorted
# copy below can be aligned back to the rows they belong to.
transactions_df = transactions_df.reset_index(drop=True)

# Timestamp of the earliest transaction of the same client, broadcast to every row.
first_tr = transactions_df.groupby('user_id')['transaction_dttm'].transform('min')
since_first = transactions_df['transaction_dttm'] - first_tr

# "Previous transaction" only makes sense within one client and in
# chronological order. A plain .diff() over the whole frame would subtract
# timestamps of unrelated clients and depend on the file's row order.
# The result keeps the original index labels, so assigning it to a column
# below re-aligns it with the unsorted frame.
since_prev = (
    transactions_df
    .sort_values(['user_id', 'transaction_dttm'])
    .groupby('user_id')['transaction_dttm']
    .diff()
)

one_day = np.timedelta64(1, 'D')
one_hour = np.timedelta64(1, 'h')

# Day features are truncated to whole days; a client's first transaction has
# no predecessor and gets 0.
transactions_df['days_from_first_tr'] = (since_first / one_day).astype('int')
transactions_df['days_from_prev_tr'] = (since_prev / one_day).fillna(0).astype('int')

# Hour features stay fractional.
transactions_df['hours_from_first_tr'] = since_first / one_hour
transactions_df['hours_from_prev_tr'] = (since_prev / one_hour).fillna(0)

In [5]:
# Encode the day of week as 1 (Monday) .. 7 (Sunday) and add a weekend flag.
days_of_week = {'Monday': 1,
                'Tuesday': 2,
                'Wednesday': 3,
                'Thursday': 4,
                'Friday': 5,
                'Saturday': 6,
                'Sunday': 7
               }

# Series.map translates all names in one pass and returns a new Series.
# The previous `df[col].replace(..., inplace=True)` loop mutated a temporary
# column object, which pandas warns about and which has no effect once
# Copy-on-Write is enabled.
transactions_df['day_of_week'] = (
    transactions_df['transaction_dttm'].dt.day_name().map(days_of_week)
)

# 1 for Saturday/Sunday, 0 otherwise.
transactions_df["is_day_off"] = transactions_df['day_of_week'].isin([6, 7]).astype(int)

In [6]:
# Columns handed to the preprocessor as categorical features.
cat_cols_ = [
    'mcc_code',
    'currency_rk',
    'day_of_week',
    'is_day_off',
]

# Columns handed to the preprocessor as numerical features.
num_cols_ = [
    'transaction_amt',
    'days_from_first_tr',
    'days_from_prev_tr',
    'hours_from_first_tr',
    'hours_from_prev_tr',
]

In [7]:
from ptls.preprocessing import PandasDataPreprocessor

# Preprocessor configuration for the transaction table.
# NOTE(review): 'dt_to_timestamp' presumably converts the datetime column to a
# numeric timestamp, and return_records=True presumably yields one record per
# `col_id` value - confirm against the ptls documentation.
trx_preprocessor = PandasDataPreprocessor(
    # Identity and ordering of events.
    col_id='user_id',
    col_event_time='transaction_dttm',
    event_time_transformation='dt_to_timestamp',
    # Feature columns, grouped by kind.
    cols_numerical=num_cols_,
    cols_category=cat_cols_,
    # Output format.
    return_records=True,
)


libgomp: Invalid value for environment variable OMP_NUM_THREADS

libgomp: Invalid value for environment variable OMP_NUM_THREADS


In [None]:
%%time

# Fit the preprocessor on the full transaction table and transform it in the
# same call; `dataset` is the input of both the training and inference loaders.
dataset = trx_preprocessor.fit_transform(transactions_df)

In [None]:
%load_ext autoreload
%autoreload 2
import torch
import pytorch_lightning as pl
from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule
import lion_pytorch
from ptls.frames.coles.losses import SoftmaxLoss
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule
from ptls.frames.inference_module import InferenceModule
from ptls.data_load.utils import collate_feature_dict

# Keyword arguments unpacked into TrxEncoder when the sequence encoder is built.
# Key order inside 'numeric_values' and 'embeddings' is kept as-is because the
# encoder may lay out features in that order.
trx_encoder_params = {
    'embeddings_noise': 0.003,
    # Numerical features and the transformation name applied to each.
    'numeric_values': {
        'transaction_amt': 'identity',
        'days_from_prev_tr': 'identity',
    },
    # Categorical features: 'in' / 'out' sizes of each embedding table.
    'embeddings': {
        'currency_rk': {'in': 5, 'out': 8},
        'day_of_week': {'in': 8, 'out': 8},
        'mcc_code': {'in': 333, 'out': 16},
    },
}

# Training source: the preprocessed records, filtered with SeqLenFilter(min_seq_len=20).
coles_train_source = MemoryMapDataset(
    data=dataset,
    i_filters=[SeqLenFilter(min_seq_len=20)],
)

# Splitter configuration used by ColesDataset (5 slices, cnt_min=20, cnt_max=150).
slice_splitter = SampleSlices(split_count=5, cnt_min=20, cnt_max=150)

# Lightning data module that feeds CoLES training.
train_dl = PtlsDataModule(
    train_data=ColesDataset(coles_train_source, splitter=slice_splitter),
    train_num_workers=8,
    train_batch_size=512,
)

# Inference runs over every record in `dataset` (no sequence-length filter).
inference_dataset = MemoryMapDataset(data=dataset)

# Fixed order (shuffle=False) so repeated inference runs iterate the records
# in the same sequence.
inference_dl = torch.utils.data.DataLoader(
    inference_dataset,
    batch_size=128,
    shuffle=False,
    num_workers=8,
    collate_fn=collate_feature_dict,
)

def get_coles_embeddings(random_seed):
    """Train one CoLES encoder and export the embeddings it produces.

    Builds a GRU sequence encoder from the module-level ``trx_encoder_params``,
    trains it for one epoch on the module-level ``train_dl``, saves the encoder
    weights to ``../models/coles-model{random_seed}.pt`` and writes the
    inference output for ``inference_dl`` to
    ``../embeddings/coles_{random_seed}.csv``.

    This function does not seed any random number generator itself; seed
    before calling it (e.g. ``pl.seed_everything(random_seed)``) if the run
    must be reproducible.

    Args:
        random_seed: Identifier of the run. Used in both output file names and
            passed to ``InferenceModule`` as ``model_out_name=f'emb_{random_seed}'``.

    Returns:
        pandas.DataFrame: The concatenated inference output, identical to what
        is written to the CSV file.
    """
    # Same device choice for training and inference, so the function also
    # runs on machines without CUDA.
    n_gpus = 1 if torch.cuda.is_available() else 0

    # Create the output folders before training: a missing directory would
    # otherwise raise only after the training epoch has finished.
    os.makedirs('../models', exist_ok=True)
    os.makedirs('../embeddings', exist_ok=True)

    seq_encoder = RnnSeqEncoder(
        trx_encoder=TrxEncoder(**trx_encoder_params),
        hidden_size=1024,
        type='gru',
    )

    model = CoLESModule(
        seq_encoder=seq_encoder,
        optimizer_partial=partial(lion_pytorch.Lion, lr=0.0001),
        lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.9),
        loss=SoftmaxLoss(),
    )

    trainer = pl.Trainer(
        max_epochs=1,
        gpus=n_gpus,
        enable_progress_bar=False,
    )
    print(f'logger.version = {trainer.logger.version}')
    trainer.fit(model, train_dl)

    # Name the checkpoint after the function argument, not after a global
    # loop variable that happens to exist in the calling cell.
    torch.save(model.seq_encoder.state_dict(), f"../models/coles-model{random_seed}.pt")

    inference_module = InferenceModule(
        model=seq_encoder,
        pandas_output=True,
        drop_seq_features=True,
        model_out_name=f'emb_{random_seed}',
    )

    # predict() returns one frame per batch; stack them into a single table.
    predict = pl.Trainer(gpus=n_gpus).predict(inference_module, inference_dl)
    full_predict = pd.concat(predict, axis=0)
    full_predict.to_csv(f'../embeddings/coles_{random_seed}.csv', index=False)
    return full_predict

In [None]:
# Train five CoLES models, one per seed 1..5. All global RNGs are seeded right
# before each run, and the seed is passed on so the outputs are tagged with it.
for i in range(1, 6):
    pl.seed_everything(i)
    get_coles_embeddings(i)