In [1]:
import pandas as pd
import numpy as np
import os
from wtte_lib.wtte_data_preprocessing import data_pipeline

In [2]:
# Raw competition tables.
transactions_df = pd.read_csv('../data/transactions.csv')
clients_df = pd.read_csv('../data/clients.csv')
train_df = pd.read_csv('../data/train.csv')
reports_df = pd.read_csv('../data/report_dates.csv')

# Parse the timestamp once and store it as whole seconds since the Unix epoch.
# The dtype is spelled 'int64' because plain 'int' is platform dependent and a
# datetime column cannot be cast to a 32-bit integer.
transactions_df['transaction_dttm'] = (
    pd.to_datetime(transactions_df['transaction_dttm']).astype('int64') // 10**9
)
# Shift the category codes up by one.
# NOTE(review): presumably this frees code 0 for padding / "no transaction" - confirm.
transactions_df['mcc_code'] += 1
# Constant column; summed per timestep later on, it yields the number of transactions.
transactions_df['ones'] = 1

In [3]:
# How each feature is collapsed when several transactions fall into one timestep.
timestep_aggregation_dict = {
    'ones': 'sum',
    'mcc_code': 'max',
    'transaction_amt': 'mean',
    'currency_rk': 'max',
}

# Attach every client's report id to their transactions.
transactions_df_ = transactions_df.merge(clients_df[['user_id', 'report']])

# Run the WTTE preprocessing pipeline separately for each report.
train_data = []
for report in reports_df.report.values:
    data_ = transactions_df_.loc[transactions_df_['report'] == report].copy().reset_index()
    df_ = data_pipeline(
        data_,
        id_col='user_id',
        infer_seq_endtime=False,
        abs_time_col='transaction_dttm',
        column_names=list(timestep_aggregation_dict),
        timestep_aggregation_dict=timestep_aggregation_dict,
    )
    train_data.append(df_)

In [4]:
# Overwrite the last timestep of every sequence with -1 in all four feature channels
# (in place, on the arrays stored in train_data). Because -1 is non-zero, the target
# construction below always finds an event at the final position.
for data in train_data:
    data[0][:, -1, :] = np.array([-1, -1, -1, -1])

data_lst = []
for data in train_data:
    # Replace NaNs with 0. The fill value must be given as `nan=`: the second positional
    # parameter of np.nan_to_num is `copy`, so `np.nan_to_num(arr, 0)` means copy=False
    # and rewrites `arr` in place. The default copy=True already returns a new array.
    x_ = np.nan_to_num(data[0], nan=0.0)
    # One DataFrame per sequence; columns 0..3 are the four feature channels.
    x_lst = [pd.DataFrame(x_[i]) for i in range(len(x_))]
    for df in x_lst:
        # Steps with a non-zero value in column 2 are events (target 0); every other
        # step starts out missing and is filled in by the loop below.
        df['target'] = df[2].map(lambda x: 0 if x else None)
        target = df.target.values
        # Positions of the event steps.
        indices = np.where(~np.isnan(target))[0]
        # The final event position is moved one step further, so gaps in front of the
        # last step are measured to one past the end of the sequence.
        # NOTE(review): confirm this offset is intended.
        indices[-1] += 1
        idx = 0
        for i, tgt in enumerate(target):
            if np.isnan(tgt):
                # Distance from step i to the next event position.
                target[i] = indices[idx] - i
            else:
                # Passed an event: later gaps count down to the following one.
                idx += 1
        df['target'] = target
        # Replace the sentinel row: all features 0, target 1.
        df.loc[df.index[-1]] = [0, 0, 0, 0, 1]
    for i in range(len(x_lst)):
        # NOTE(review): data[2][i] is presumably the id of sequence i as returned by
        # data_pipeline - confirm.
        x_lst[i]['user_id'] = [data[2][i]] * x_lst[i].shape[0]
    data_lst.append(pd.concat(x_lst, axis=0))

In [5]:
# Stack the per-report frames, name the positional feature columns, and number each
# user's rows 1..n so that the row order can serve as the event-time column.
dataset = (
    pd.concat(data_lst, axis=0)
    .rename(columns={0: 'ones', 1: 'mcc_code', 2: 'transaction_amt', 3: 'currency_rk'})
    .assign(trx_dt=lambda frame: frame.groupby('user_id').cumcount() + 1)
)

In [6]:
from ptls.preprocessing import PandasDataPreprocessor

# Preprocessor configuration: user_id identifies a sequence, trx_dt orders its events
# (used as-is, no time transformation), two categorical and two numerical features.
preprocessor = PandasDataPreprocessor(**{
    'col_id': 'user_id',
    'col_event_time': 'trx_dt',
    'event_time_transformation': 'none',
    'cols_category': ['mcc_code', 'currency_rk'],
    'cols_numerical': ['transaction_amt', 'ones'],
    'return_records': True,
})


libgomp: Invalid value for environment variable OMP_NUM_THREADS

libgomp: Invalid value for environment variable OMP_NUM_THREADS


In [7]:
# Fit the preprocessor on the full dataset and transform it in the same pass.
# NOTE(review): with return_records=True this presumably yields a list of per-user
# record dicts - confirm against the ptls documentation.
tr_dataset = preprocessor.fit_transform(dataset)

# Model

In [8]:
%load_ext autoreload
%autoreload 2
import torch
import pytorch_lightning as pl
from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule
import lion_pytorch
from ptls.frames.coles.losses import SoftmaxLoss
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule
from ptls.frames.inference_module import InferenceModule
from ptls.data_load.utils import collate_feature_dict
from pytorch_lightning.loggers import TensorBoardLogger

# Keyword arguments shared by every TrxEncoder built in this notebook.
trx_encoder_params = {
    'embeddings_noise': 0.003,
    # Numerical features and the scaler name applied to each of them.
    'numeric_values': {
        'transaction_amt': 'identity',
        'ones': 'identity',
    },
    # Categorical features: 'in' is the embedding table size, 'out' the embedding width.
    'embeddings': {
        'currency_rk': {'in': 5, 'out': 4},
        'mcc_code': {'in': 333, 'out': 8},
    },
}

# Training records, passed through SeqLenFilter(min_seq_len=10).
coles_records = MemoryMapDataset(
    data=tr_dataset,
    i_filters=[SeqLenFilter(min_seq_len=10)],
)
# Splitter handed to ColesDataset to cut sub-sequences out of every record.
coles_splitter = SampleSlices(split_count=5, cnt_min=20, cnt_max=150)

# Lightning data module used for CoLES training (no validation data is supplied).
train_dl = PtlsDataModule(
    train_data=ColesDataset(coles_records, splitter=coles_splitter),
    train_batch_size=512,
    train_num_workers=8,
    valid_batch_size=512,
    valid_num_workers=8,
)

# Unfiltered view of all records, iterated in order (no shuffling) for inference.
inference_dataset = MemoryMapDataset(data=tr_dataset)

inference_dl = torch.utils.data.DataLoader(
    inference_dataset,
    batch_size=128,
    shuffle=False,
    num_workers=8,
    collate_fn=collate_feature_dict,
)

def get_wtte_coles_embeddings(random_seed):
    """Train one CoLES encoder and export its weights and embeddings.

    Builds a GRU sequence encoder on top of a ``TrxEncoder``, trains it for one epoch
    on the module-level ``train_dl``, saves the encoder weights to
    ``../models/coles-wtte-model{random_seed}.pt`` and writes the embeddings predicted
    for the module-level ``inference_dl`` to
    ``../embeddings/wtte_coles_{random_seed}.csv``.

    The function does not seed any random generator itself; the caller is expected to
    do that before calling it. ``random_seed`` is used only to label the outputs
    (logger name, checkpoint file, embedding column prefix, CSV file).

    Parameters
    ----------
    random_seed
        Value used to tag every output produced by this run.

    Returns
    -------
    None
    """
    # Same device choice for training and prediction: one GPU if available, else CPU.
    n_gpus = 1 if torch.cuda.is_available() else 0

    seq_encoder = RnnSeqEncoder(
        trx_encoder=TrxEncoder(**trx_encoder_params),
        hidden_size=800,
        type='gru',
    )

    model = CoLESModule(
        seq_encoder=seq_encoder,
        optimizer_partial=partial(lion_pytorch.Lion, lr=0.0001),
        lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=5, gamma=0.9),
        loss=SoftmaxLoss(),
    )

    trainer = pl.Trainer(
        # Outputs are labelled with the argument rather than a global loop variable,
        # so the function also works when called outside a `for i in ...` loop.
        logger=TensorBoardLogger('lightning_logs', name=f'coles_{random_seed}'),
        max_epochs=1,
        gpus=n_gpus,
        enable_progress_bar=False,
    )
    trainer.fit(model, train_dl)
    torch.save(model.seq_encoder.state_dict(), f"../models/coles-wtte-model{random_seed}.pt")

    # `seq_encoder` is the same object that was just trained inside `model`.
    inference_module = InferenceModule(
        model=seq_encoder,
        pandas_output=True,
        drop_seq_features=True,
        model_out_name=f'emb_wtte_coles_{random_seed}')

    predict = pl.Trainer(gpus=n_gpus).predict(inference_module, inference_dl)
    full_predict = pd.concat(predict, axis=0)
    full_predict.to_csv(f'../embeddings/wtte_coles_{random_seed}.csv', index=False)

In [9]:
# Train and export one model per seed (1..5). The global seed is set before each call
# because the encoder is constructed inside get_wtte_coles_embeddings.
for i in range(1, 6):
    pl.seed_everything(i)
    get_wtte_coles_embeddings(i)

Global seed set to 1
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn("You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.")


logger.version = 18


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | SoftmaxLoss     | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 2.0 M 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
2.0 M     Trainable params
0         Non-trainable params
2.0 M     Total params
7.848     Total estimated model params size (MB)
  rank_zero_warn(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

Global seed set to 2
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn("You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | SoftmaxLoss     | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 2.0 M 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
2.0 M     Trainable params
0         Non-trainable params
2.0 M     Total params
7.848     Total estimated model params size (MB)


logger.version = 20


  rank_zero_warn(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

Global seed set to 3
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn("You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | SoftmaxLoss     | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 2.0 M 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
2.0 M     Trainable params
0         Non-trainable params
2.0 M     Total params
7.848     Total estimated model params size (MB)


logger.version = 22


  rank_zero_warn(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

Global seed set to 4
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn("You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | SoftmaxLoss     | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 2.0 M 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
2.0 M     Trainable params
0         Non-trainable params
2.0 M     Total params
7.848     Total estimated model params size (MB)


logger.version = 24


  rank_zero_warn(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

Global seed set to 5
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn("You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | SoftmaxLoss     | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 2.0 M 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
2.0 M     Trainable params
0         Non-trainable params
2.0 M     Total params
7.848     Total estimated model params size (MB)


logger.version = 26


  rank_zero_warn(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: 0it [00:00, ?it/s]

In [10]:
import tqdm
from ptls.data_load.datasets import inference_data_loader
# Rebuild the encoder with the architecture used for training, then restore the
# weights stored in the seed-1 checkpoint file.
seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=800,
    type='gru',
)
# map_location='cpu' makes the load independent of the device the checkpoint was saved
# from; pooling_inference moves the module to its target device afterwards.
seq_encoder.load_state_dict(
    torch.load('../models/coles-wtte-model1.pt', map_location='cpu')
)
def pooling_inference(seq_encoder, dl, device='cuda:0', seq_len=182, step=30):
    """Pool per-transaction encoder outputs into fixed-size feature vectors.

    For every batch, the ``trx_encoder`` payload is reduced along dimension 1 with
    max, min, mean and std over a series of windows. Window ``k`` starts at offset
    ``k * step`` and runs to the end of the batch, for all offsets in
    ``range(0, seq_len, step)``. The four statistics of every window are concatenated
    along the feature dimension.

    The encoder is switched to eval mode for the duration of the call. A freshly
    constructed ``torch.nn.Module`` is in training mode, where stochastic layers
    (such as the embedding noise configured for the TrxEncoder) stay active, which
    would make the exported features non-deterministic. The previous mode is
    restored before returning.

    Parameters
    ----------
    seq_encoder
        Module exposing a ``trx_encoder`` sub-module whose output has a ``payload``
        tensor. It is moved to ``device``.
    dl
        Iterable of batches; each batch must support ``.to(device)``.
    device
        Device used for inference.
    seq_len, step
        Define the window start offsets ``range(0, seq_len, step)``. The defaults
        reproduce the original hard-coded ``range(0, 182, 30)``.
        NOTE(review): every batch presumably needs more timesteps than the largest
        offset, otherwise the reductions run on an empty slice - confirm.

    Returns
    -------
    list of torch.Tensor
        One 2-D tensor per batch, left on ``device``.
    """
    seq_encoder.to(device)
    was_training = seq_encoder.training
    seq_encoder.eval()
    X = []
    try:
        with torch.no_grad():
            for batch in tqdm.tqdm(dl):
                x = seq_encoder.trx_encoder(batch.to(device)).payload
                features_all = []
                for start in range(0, seq_len, step):
                    tail = x[:, start:]
                    features_all.append(torch.cat([
                        torch.max(tail, dim=1)[0],
                        torch.min(tail, dim=1)[0],
                        torch.mean(tail, dim=1),
                        torch.std(tail, dim=1),
                    ], dim=1))
                X.append(torch.cat(features_all, dim=1))
    finally:
        # Leave the module in the mode the caller handed it over in.
        seq_encoder.train(was_training)
    return X

In [11]:
# Pool the transaction-level encoder outputs for every record and stack all batches.
dl = inference_data_loader(tr_dataset, batch_size=2048, num_workers=0)
df_ab = torch.vstack(pooling_inference(seq_encoder, dl)).cpu().numpy()

# One column per pooled feature, named emb_wtte_coles_trx_<position>.
df_embeds = pd.DataFrame(df_ab).add_prefix('emb_wtte_coles_trx_')
# User ids are taken from the records in their original order (aligned on row position).
df_embeds['user_id'] = pd.DataFrame(tr_dataset)['user_id']
df_embeds.to_csv('../embeddings/wtte_coles_trx.csv', index=False)

47it [00:09,  4.79it/s]
