In [None]:
import os
import pickle
import tqdm
import numpy as np
import pandas as pd
import torch
import torchmetrics
import logging
import pytorch_lightning as pl
from copy import deepcopy
from functools import partial
from omegaconf import DictConfig
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from pytorch_lightning.loggers import TensorBoardLogger
import warnings
warnings.filterwarnings("ignore")

from ptls.nn import TrxEncoder, RnnSeqEncoder, Head, L2NormEncoder
from ptls.data_load.utils import collate_feature_dict
from ptls.data_load.datasets import MemoryMapDataset, AugmentationDataset
from ptls.data_load.padded_batch import PaddedBatch
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.preprocessing import PandasDataPreprocessor
from ptls.frames import PtlsDataModule
from ptls.frames.coles import CoLESModule, ColesDataset
from ptls.frames.coles.losses import SoftmaxLoss
from ptls.frames.coles.metric import BatchRecallTopK
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames.inference_module import InferenceModule
from ptls.data_load.augmentations import RandomSlice, DropoutTrx

## Data preprocessing

In [None]:
# Read the four raw CSV tables in a single pass; the order of the paths matches
# the order of the names on the left-hand side.
transactions_df, clients_df, train_df, reports_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/transactions.csv',
        '../data/clients.csv',
        '../data/train.csv',
        '../data/report_dates.csv',
    )
)

In [None]:
# Parse the transaction timestamps, then look up every transaction's report date
# (transaction -> client's `report` key -> `report_dt`) and store the number of
# whole days between the transaction and that date.
transactions_df['transaction_dttm'] = pd.to_datetime(transactions_df['transaction_dttm'])
report_dates = pd.read_csv('../data/report_dates.csv', parse_dates=['report_dt'])
df_ = (
    transactions_df
    .merge(clients_df[['user_id', 'report']], how='left', on='user_id')
    .merge(report_dates, how='left', on='report')
)
# NOTE(review): the assignment aligns `df_` with `transactions_df` by index, which
# assumes the left merges neither add nor reorder rows — confirm `user_id` and
# `report` are unique in the lookup tables.
transactions_df['days_to_report'] = (df_['report_dt'] - df_['transaction_dttm']).dt.days

In [None]:
# Encode the day of week as 1 (Monday) .. 7 (Sunday) and add a weekend flag.
days_of_week = {'Monday': 1,
                'Tuesday': 2,
                'Wednesday': 3,
                'Thursday': 4,
                'Friday': 5,
                'Saturday': 6,
                'Sunday': 7
               }

# A single vectorised `map` replaces the loop of chained
# `transactions_df[col].replace(..., inplace=True)` calls: that pattern mutates a
# column selection, which does not write back to the frame under pandas
# copy-on-write, and it scanned the column once per weekday.
transactions_df['day_of_week'] = transactions_df['transaction_dttm'].dt.day_name().map(days_of_week)

# 1 for Saturday/Sunday (codes 6 and 7), 0 otherwise.
transactions_df["is_day_off"] = transactions_df['day_of_week'].isin([6, 7]).astype(int)

In [None]:
# Add the number of days / hours elapsed since each user's first transaction and
# since the same user's previous transaction.

# Time since the user's first transaction. `transform('min')` returns a Series
# aligned with the frame, so no merge and no temporary `first_tr` column is needed.
since_first_tr = (
    transactions_df['transaction_dttm']
    - transactions_df.groupby('user_id')['transaction_dttm'].transform('min')
)

# Time since the previous transaction of the SAME user. A plain `.diff()` over the
# whole frame would subtract the timestamp of whatever row happens to sit above,
# mixing different users and depending on row order. The difference is therefore
# taken inside each user's chronologically sorted history; the result keeps the
# original index labels, so the column assignments below align it back to the
# (unsorted) frame.
# NOTE(review): alignment relies on `transactions_df` having a unique index
# (true for a frame that comes straight from `read_csv`) — confirm if rows were
# filtered or concatenated in between.
since_prev_tr = (
    transactions_df
    .sort_values(['user_id', 'transaction_dttm'])
    .groupby('user_id')['transaction_dttm']
    .diff()
)

# Day features are truncated to whole days; a user's first transaction has no
# predecessor, so its gap is set to 0.
transactions_df['days_from_first_tr'] = (since_first_tr / np.timedelta64(1, 'D')).astype('int')
transactions_df['days_from_prev_tr'] = (since_prev_tr / np.timedelta64(1, 'D')).fillna(0).astype('int')

# Hour features stay fractional.
transactions_df['hours_from_first_tr'] = since_first_tr / np.timedelta64(1, 'h')
transactions_df['hours_from_prev_tr'] = (since_prev_tr / np.timedelta64(1, 'h')).fillna(0)

# Hour of day of the transaction (0-23).
transactions_df['hour'] = transactions_df['transaction_dttm'].dt.hour

In [None]:
# Names of the categorical transaction features.
cat_cols_ = ['mcc_code', 'currency_rk', 'day_of_week', 'is_day_off', 'hour']

# Names of the numerical transaction features.
num_cols_ = [
    'transaction_amt',
    'days_from_first_tr',
    'days_from_prev_tr',
    'hours_from_first_tr',
    'hours_from_prev_tr',
]

In [None]:
# Preprocessor that groups transactions by `user_id`, uses `transaction_dttm`
# (converted with the 'dt_to_timestamp' transformation) as the event time, and
# handles the listed categorical and numerical columns.
trx_preprocessor = PandasDataPreprocessor(
    col_id='user_id',
    col_event_time='transaction_dttm',
    event_time_transformation='dt_to_timestamp',
    cols_category=['mcc_code', 'currency_rk', 'day_of_week', 'is_day_off', 'hour'],
    cols_numerical=[
        'transaction_amt',
        'days_from_first_tr', 'days_from_prev_tr',
        'hours_from_first_tr', 'hours_from_prev_tr',
        'days_to_report',
    ],
    return_records=False,
)

In [None]:
%%time

# Fit the preprocessor on all transactions and transform them in one step.
# The result is used as a DataFrame further down (`.merge`, `.to_dict`).
tr_dataset = trx_preprocessor.fit_transform(transactions_df)

In [None]:
# Attach the labels (`target`, `time`) to the preprocessed sequences; the inner
# merge keeps only users present in `train_df`.
df = tr_dataset.merge(train_df[['user_id', 'target', 'time']], on='user_id')

# Users listed in test_ids.csv form the hold-out part, everyone else is training data.
test_ids = pd.read_csv('../data/test_ids.csv')
is_test_user = df['user_id'].isin(test_ids['user_id'])
df_train = df.loc[~is_test_user]
df_test = df.loc[is_test_user]

## CoLES pretraining

In [None]:
# Hold out 10% of the preprocessed rows for CoLES validation. The split is taken
# from the full `tr_dataset` (no labels are used here) with a fixed seed.
coles_train_df, coles_valid_df = train_test_split(tr_dataset, random_state=42, test_size=0.1)

In [None]:
# Training part: records go through DropoutTrx(0.1) augmentation and are then
# split into sub-sequences by SampleSlices(5, 10, 80).
coles_train_data = ColesDataset(
    AugmentationDataset(
        coles_train_df.to_dict(orient='records'),
        f_augmentations=[DropoutTrx(0.1)],
    ),
    splitter=SampleSlices(5, 10, 80),
)

# Validation part: same splitter settings, no augmentation.
coles_valid_data = ColesDataset(
    coles_valid_df.to_dict(orient='records'),
    splitter=SampleSlices(5, 10, 80),
)

# Both loaders use batches of 256, 8 workers and drop the last incomplete batch.
coles_dataset = PtlsDataModule(
    train_data=coles_train_data,
    valid_data=coles_valid_data,
    train_batch_size=256, train_num_workers=8, train_drop_last=True,
    valid_batch_size=256, valid_num_workers=8, valid_drop_last=True,
)

In [None]:
# Settings for the transaction encoder: a scaler name per numerical feature and
# an embedding table size ('in') / width ('out') per categorical feature.
trx_encoder_params = {
    'numeric_values': {
        'transaction_amt': 'log',
        'days_from_first_tr': 'identity',
        'days_from_prev_tr': 'log',
        'hours_from_first_tr': 'identity',
        'hours_from_prev_tr': 'log',
        'days_to_report': 'log',
    },
    'embeddings': {
        'currency_rk': {'in': 5, 'out': 16},
        'day_of_week': {'in': 8, 'out': 16},
        'mcc_code': {'in': 330, 'out': 32},
        'is_day_off': {'in': 4, 'out': 2},
        'hour': {'in': 30, 'out': 4},
    },
    'use_batch_norm_with_lens': True,
}

# GRU sequence encoder with an 800-dimensional hidden state on top of the
# transaction encoder.
seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    type='gru',
    hidden_size=800,
)

# seq_encoder.load_state_dict(torch.load(os.path.join(PATH, 'model/seq_encoder.pt')))

In [None]:
# Optimiser / scheduler factories: AdamW (lr 1e-3, weight decay 1e-4) with the
# learning rate multiplied by 0.9 every 10 scheduler steps.
coles_optimizer_partial = partial(torch.optim.AdamW, lr=1e-3, weight_decay=1e-4)
coles_scheduler_partial = partial(torch.optim.lr_scheduler.StepLR, step_size=10, gamma=0.9)

# CoLES module around `seq_encoder`, trained with SoftmaxLoss and validated with
# BatchRecallTopK(K=4).
coles_module = CoLESModule(
    seq_encoder=seq_encoder,
    head=Head(use_norm_encoder=True),
    loss=SoftmaxLoss(),
    validation_metric=BatchRecallTopK(K=4),
    optimizer_partial=coles_optimizer_partial,
    lr_scheduler_partial=coles_scheduler_partial,
)

In [None]:
# Trainer for the CoLES pretraining stage: 50 epochs, gradients clipped by value
# at 0.2. The GPU is requested only when CUDA is actually available — the same
# guard the fine-tuning trainers in this notebook use — so the cell also runs on
# CPU-only machines.
trainer = pl.Trainer(
    max_epochs=50,
    gpus=1 if torch.cuda.is_available() else 0,
    enable_progress_bar=True,
    gradient_clip_algorithm='value',
    gradient_clip_val=0.2
)

In [None]:
# Run CoLES pretraining; `coles_module` wraps `seq_encoder`, whose weights are
# saved in the next cell.
trainer.fit(coles_module, coles_dataset)

In [None]:
# Persist the pretrained encoder weights; the fine-tuning folds reload this file.
# `torch.save` does not create missing folders, so make sure the target directory
# exists instead of losing the result of the training run.
os.makedirs('../models', exist_ok=True)
torch.save(seq_encoder.state_dict(), '../models/seq_encoder-0.1.pt')

## Supervised fine-tuning

In [None]:
class SeqToTargetDataset(torch.utils.data.Dataset):
    """Dataset wrapper whose ``collate_fn`` returns ``(batch, target, time)``.

    Items are passed through unchanged; the target (stored under
    ``target_col_name``) and the ``'time'`` field are pulled out of the collated
    batch in :meth:`collate_fn`.

    Args:
        data: indexable / iterable collection of feature dicts.
        target_col_name: payload key that holds the target.
        target_dtype: ``None`` (keep the collated dtype), a ``torch.dtype``, or
            the name of one as a string (e.g. ``'float32'``).
        *args, **kwargs: forwarded to the parent constructor.
    """

    def __init__(self,
                 data,
                 target_col_name,
                 target_dtype=None,
                 *args, **kwargs,
                 ):
        super().__init__(*args, **kwargs)

        self.data = data
        self.target_col_name = target_col_name
        # A plain string is resolved to the torch attribute of that name.
        self.target_dtype = (
            getattr(torch, target_dtype) if type(target_dtype) is str else target_dtype
        )

    def __len__(self):
        """Number of records in the wrapped data."""
        return len(self.data)

    def __getitem__(self, item):
        """Return record ``item`` of the wrapped data as is."""
        return self.data[item]

    def __iter__(self):
        """Iterate over the wrapped records in their stored order."""
        yield from self.data

    def collate_fn(self, padded_batch):
        """Collate a list of records and split off the target and time fields.

        Returns:
            ``(padded_batch, target, time)``. The target key is removed from the
            batch payload; the ``'time'`` key is left in place.
        """
        padded_batch = collate_feature_dict(padded_batch)
        payload = padded_batch.payload

        target = payload[self.target_col_name]
        time = payload['time']
        del payload[self.target_col_name]

        # Cast only when a dtype was requested.
        if self.target_dtype is not None:
            target = target.to(dtype=self.target_dtype)

        return padded_batch, target, time


class SeqToTargetIterableDataset(SeqToTargetDataset, torch.utils.data.IterableDataset):
    """Iterable-style flavour of :class:`SeqToTargetDataset`."""

In [None]:
class SequenceToTarget(pl.LightningModule):
    """Supervised module: sequence encoder + target head + optional time head.

    A batch is ``(x, y, t)``. The encoder output is fed to ``head`` (target
    prediction, trained with ``loss``) and, when given, to ``head_time`` whose
    output is regressed on ``t / 100`` with a mean squared error added to the
    main loss with weight 0.1.

    Args:
        seq_encoder: module that maps a batch to one embedding per sequence.
        head: module applied to the embedding; ``None`` returns the embedding.
        head_time: auxiliary regression head; ``None`` disables the time loss.
        loss: callable ``loss(prediction, target)``.
        metric_list: a metric, a list of metrics, or a dict / DictConfig
            ``name -> metric``; ``None`` means no metrics.
        optimizer_partial: callable ``parameters -> optimizer``.
        lr_scheduler_partial: callable ``optimizer -> scheduler``; ``None``
            trains without a scheduler.
        pretrained_lr: ``None`` (one lr for everything), ``'freeze'`` (encoder
            parameters get ``requires_grad = False``) or a learning rate used for
            the encoder only.
        train_update_n_steps: stored as a hyperparameter; not used by this class.
    """

    def __init__(
        self,
        seq_encoder: torch.nn.Module,
        head: torch.nn.Module=None,
        head_time: torch.nn.Module=None,
        loss: torch.nn.Module=None,
        metric_list: torchmetrics.Metric=None,
        optimizer_partial=None,
        lr_scheduler_partial=None,
        pretrained_lr=None,
        train_update_n_steps=None
    ):
        super().__init__()

        # Modules and factories are kept as attributes, not as hyperparameters.
        self.save_hyperparameters(ignore=[
            'seq_encoder', 'head', 'head_time', 'loss',
            'metric_list', 'optimizer_partial', 'lr_scheduler_partial'
        ])
        self.seq_encoder = seq_encoder
        self.head = head
        self.head_time = head_time
        self.loss = loss

        # Normalise `metric_list` to a list of (name, metric) pairs.
        if metric_list is None:
            metric_list = []
        elif isinstance(metric_list, (dict, DictConfig)):
            metric_list = [(k, v) for k, v in metric_list.items()]
        else:
            if not isinstance(metric_list, list):
                metric_list = [metric_list]
            metric_list = [(m.__class__.__name__, m) for m in metric_list]

        # Independent copies so train and validation states never mix.
        self.train_metrics = torch.nn.ModuleDict([(name, deepcopy(mc)) for name, mc in metric_list])
        self.valid_metrics = torch.nn.ModuleDict([(name, deepcopy(mc)) for name, mc in metric_list])

        self.optimizer_partial = optimizer_partial
        self.lr_scheduler_partial = lr_scheduler_partial

    def forward(self, x):
        """Return ``(target_prediction, time_prediction)`` for a batch.

        ``x`` may be a ``(batch, extra)`` tuple; the second element is ignored.
        ``time_prediction`` is ``None`` when no ``head_time`` was configured.
        """
        if isinstance(x, tuple):
            x, _ = x

        x = self.seq_encoder(x)

        y_h = self.head(x) if self.head is not None else x
        t_h = self.head_time(x) if self.head_time is not None else None

        return y_h, t_h

    def training_step(self, batch, _):
        """Compute the training loss and update the training metrics."""
        x, y, t = batch
        y_h, t_h = self(x)

        loss = self.loss(y_h, y)
        self.log('loss/train_loss', loss)
        for name, mf in self.train_metrics.items():
            mf(y_h, y)

        if t_h is None:
            return loss

        # Flatten both sides before subtracting: a (B, 1) head output minus a (B,)
        # target would broadcast to a (B, B) matrix and average the error over
        # every pair of samples instead of per sample.
        loss_t = (t_h.reshape(-1) - t.reshape(-1) / 100.0).pow(2).mean()
        self.log('loss/loss_time', loss_t)
        return loss + 0.1 * loss_t

    def training_epoch_end(self, outputs):
        """Log and reset the accumulated training metrics."""
        for name, mf in self.train_metrics.items():
            self.log(f'{name}/train', mf.compute(), prog_bar=False)
        for name, mf in self.train_metrics.items():
            mf.reset()

    def validation_step(self, batch, _):
        """Log the validation loss and update the validation metrics."""
        x, y, t = batch
        y_h, t_h = self(x)
        self.log('loss/valid', self.loss(y_h, y))
        for name, mf in self.valid_metrics.items():
            mf(y_h, y)

    def validation_epoch_end(self, outputs):
        """Log and reset the accumulated validation metrics."""
        for name, mf in self.valid_metrics.items():
            self.log(f'{name}/valid', mf.compute(), prog_bar=True)
        for name, mf in self.valid_metrics.items():
            mf.reset()

    def configure_optimizers(self):
        """Build the optimizer (and scheduler) according to ``pretrained_lr``."""
        pretrained_lr = self.hparams.pretrained_lr

        if pretrained_lr is None:
            parameters = self.parameters()
        elif pretrained_lr == 'freeze':
            for p in self.seq_encoder.parameters():
                p.requires_grad = False
            parameters = self.parameters()
        else:
            # The encoder gets its own learning rate; every configured head uses
            # the lr predefined in `self.optimizer_partial`. `head_time` must be
            # listed too, otherwise its weights would never be updated.
            parameters = [
                {'params': self.seq_encoder.parameters(), 'lr': pretrained_lr},
            ]
            for head_module in (self.head, self.head_time):
                if head_module is not None:
                    parameters.append({'params': head_module.parameters()})

        optimizer = self.optimizer_partial(parameters)
        if self.lr_scheduler_partial is None:
            return optimizer
        scheduler = self.lr_scheduler_partial(optimizer)
        return [optimizer], [scheduler]

In [None]:
def inference(model, dl, device='cuda:0'):
    """Score every batch of ``dl`` and return the class-1 probability per sample.

    Args:
        model: module whose call returns ``(class_scores, time_prediction)``;
            ``class_scores`` has one column per class (log-probabilities or raw
            logits both work).
        dl: iterable of ``(x, target, time)`` batches; only ``x`` is used and it
            must provide ``.to(device)``.
        device: device to run on. A CUDA device silently falls back to the CPU
            when CUDA is not available.

    Returns:
        1-D CPU tensor with the probability of class 1 for every sample, in
        loader order (empty tensor for an empty loader).
    """
    if str(device).startswith('cuda') and not torch.cuda.is_available():
        device = 'cpu'

    model.to(device)

    # Switch to eval mode for scoring (dropout off, batch-norm statistics frozen)
    # and restore the previous mode afterwards.
    was_training = model.training
    model.eval()
    scores = []
    try:
        with torch.no_grad():
            for batch in tqdm.tqdm(dl, position=0, leave=True):
                x, _, _ = batch
                y_h, _ = model(x.to(device))
                scores.append(y_h.cpu())
    finally:
        model.train(was_training)

    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    if not scores:
        return torch.empty(0)

    # Normalise over the CLASS dimension of each row. A softmax over dim 0 of the
    # class-1 column would normalise across samples instead, producing values that
    # depend on how many (and which) samples were scored together.
    return torch.softmax(torch.vstack(scores), dim=1)[:, 1]

In [None]:
# Stratified 5-fold splitter used by the fine-tuning loops in this notebook.
skf = StratifiedKFold(n_splits=5)
# One deep copy of the trained module per fold.
models = []
# One frame of validation predictions per fold.
predictions_5folds = []

In [None]:
# 5-fold supervised fine-tuning on the training users. Every fold restarts from
# the pretrained CoLES encoder weights, trains a fresh pair of heads, scores its
# own validation part and keeps a deep copy of the trained module.
os.makedirs('../models', exist_ok=True)  # torch.save does not create missing folders

for i, (train_index, test_index) in enumerate(skf.split(df_train.drop(columns=['target']), df_train['target'])):
    train_, test_ = df_train.iloc[train_index], df_train.iloc[test_index]

    dataset_train = train_.to_dict(orient='records')
    dataset_test = test_.to_dict(orient='records')

    sup_dataset = PtlsDataModule(
        train_data=SeqToTargetDataset(
            AugmentationDataset(
                dataset_train,
                f_augmentations=[
                    DropoutTrx(0.1),
                ],
            ),
            target_col_name='target',
            target_dtype=torch.long,
        ),
        valid_data=SeqToTargetDataset(
            dataset_test,
            target_col_name='target',
            target_dtype=torch.long,
        ),
        train_batch_size=256,
        train_num_workers=8,
        train_drop_last=True,

        valid_batch_size=256,
        valid_num_workers=8,
        valid_drop_last=True
    )

    # `seq_encoder` is one shared object: reset it to the pretrained weights so
    # the fold does not continue from the previous fold's fine-tuned state.
    seq_encoder.load_state_dict(torch.load('../models/seq_encoder-0.1.pt'))

    sup_module = SequenceToTarget(
        seq_encoder=seq_encoder,
        head=torch.nn.Sequential(
            torch.nn.Linear(seq_encoder.embedding_size, 512),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.2),
            torch.nn.Linear(512, 2),
            torch.nn.LogSoftmax(dim=1),
        ),
        head_time=torch.nn.Sequential(
            torch.nn.Linear(seq_encoder.embedding_size, 1),
        ),
        loss=torch.nn.NLLLoss(),
        metric_list=torchmetrics.AUROC(num_classes=2),
        optimizer_partial=partial(torch.optim.AdamW, lr=1e-3, weight_decay=0.0),
        lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=1, gamma=0.2),
    )

    trainer = pl.Trainer(
        logger=TensorBoardLogger('lightning_logs', name=f'CoLES-supervised-agg_{i}'),
        max_epochs=6,
        gpus=1 if torch.cuda.is_available() else 0,
        enable_progress_bar=True,
        gradient_clip_algorithm='norm',
        gradient_clip_val=0.18
    )

    trainer.fit(sup_module, sup_dataset)

    torch.save(sup_module.state_dict(), f"../models/sup_modules-kfold.{i}.pt")

    predictions_test = test_[["user_id"]].copy()

    # Unshuffled loader over the whole validation part (nothing dropped), so the
    # scores line up row by row with `test_`.
    dataset = SeqToTargetDataset(
        data=dataset_test,
        target_col_name='target',
    )

    dl = torch.utils.data.DataLoader(
        dataset=dataset,
        collate_fn=dataset.collate_fn,
        shuffle=False,
        batch_size=512,
        num_workers=4,
    )

    # Store plain floats rather than a torch tensor in the DataFrame column.
    predictions_test["sp"] = inference(sup_module, dl).numpy()

    predictions_5folds.append(predictions_test)

    print(12*"-")
    print(f"AUROC; fold {i}:", roc_auc_score(test_["target"].values, predictions_test["sp"]))
    print(12*"-")

    # Deep copy: the next iteration overwrites the shared encoder weights.
    models.append(deepcopy(sup_module))

In [None]:
# Stack the per-fold validation predictions into one out-of-fold frame and score
# it against the labels. The cell rebinds `predictions_5folds` from a list to a
# DataFrame; the guard keeps it re-runnable (`pd.concat` rejects a bare DataFrame).
if isinstance(predictions_5folds, list):
    predictions_5folds = pd.concat(predictions_5folds, axis=0)
temp = predictions_5folds.merge(train_df[["user_id", "target"]], on="user_id")
print(12*"-")
print("AUROC; out-of-fold (all folds):", roc_auc_score(temp["target"].values, temp["sp"].values))
print(12*"-")

In [None]:
# Score the hold-out users with every fold model and average the scores.
# The labels are blanked in the copy fed to the models; `df_test` keeps the real
# targets for the AUROC below.
dataset_test = df_test.copy()
dataset_test[["target", "time"]] = None
dataset_test = dataset_test.to_dict(orient='records')

dataset = SeqToTargetDataset(
    data=dataset_test,
    target_col_name='target',
)

dl = torch.utils.data.DataLoader(
    dataset=dataset,
    collate_fn=dataset.collate_fn,
    shuffle=False,
    batch_size=512,
    num_workers=4,
)

predictions_test = df_test[["user_id"]].copy()

# Iterate over the models that actually exist instead of assuming exactly five,
# and store plain floats rather than torch tensors in the frame.
for i, fold_model in enumerate(models):
    predictions_test[f"sp_{i}"] = inference(fold_model, dl).numpy()

# Average the per-model columns selected by name, not by position.
fold_score_cols = [col for col in predictions_test.columns if col.startswith("sp_")]
predictions_test["sp"] = predictions_test[fold_score_cols].mean(axis=1)

print(12*"-")
print("AUROC; hold-out test (mean of fold models):", roc_auc_score(df_test["target"].values, predictions_test["sp"].values))
print(12*"-")

# Out-of-fold train predictions followed by the ensembled hold-out predictions.
train_test_predictions = pd.concat([predictions_5folds, predictions_test[["user_id", "sp"]]], axis=0)
#train_test_predictions.to_csv("sp-preds_train-test.csv", index=False)

---

In [None]:
# Preprocessed sequences of the users listed in the sample submission file.
sample_submit = pd.read_csv("../data/sample_submit_naive.csv")
sbmt_df = tr_dataset.merge(sample_submit[['user_id']], on='user_id')
# These users get empty label columns so the records still carry the
# `target` / `time` keys that `SeqToTargetDataset.collate_fn` reads.
sbmt_df[["target", "time"]] = None

In [None]:
# Display the submission frame for a visual check.
sbmt_df

In [None]:
# Second round of 5-fold fine-tuning, this time on ALL labelled users (`df`,
# i.e. train + hold-out), to build the models that score the submission users.
sbmt_models = []
for i, (train_index, test_index) in enumerate(skf.split(df.drop(columns=['target']), df["target"])):
    train_, test_ = df.iloc[train_index], df.iloc[test_index]

    dataset_train = train_.to_dict(orient='records')
    dataset_test = test_.to_dict(orient='records')

    sup_dataset = PtlsDataModule(
        train_data=SeqToTargetDataset(
            AugmentationDataset(
                dataset_train,
                f_augmentations=[
                    DropoutTrx(0.1),
                ],
            ),
            target_col_name='target',
            target_dtype=torch.long,
        ),
        valid_data=SeqToTargetDataset(
            dataset_test,
            target_col_name='target',
            target_dtype=torch.long,
        ),
        train_batch_size=256,
        train_num_workers=8,
        train_drop_last=True,

        valid_batch_size=256,
        valid_num_workers=8,
        valid_drop_last=True
    )

    # `seq_encoder` is one shared object: reset it to the pretrained weights so
    # the fold does not continue from the previous fold's fine-tuned state.
    seq_encoder.load_state_dict(torch.load('../models/seq_encoder-0.1.pt'))

    sup_module = SequenceToTarget(
        seq_encoder=seq_encoder,
        head=torch.nn.Sequential(
            torch.nn.Linear(seq_encoder.embedding_size, 512),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.2),
            torch.nn.Linear(512, 2),
            torch.nn.LogSoftmax(dim=1),
        ),
        head_time=torch.nn.Sequential(
            torch.nn.Linear(seq_encoder.embedding_size, 1),
        ),
        loss=torch.nn.NLLLoss(),
        metric_list=torchmetrics.AUROC(num_classes=2),
        optimizer_partial=partial(torch.optim.AdamW, lr=4e-4, weight_decay=0.0),
        lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=1, gamma=0.2),
    )

    trainer = pl.Trainer(
        logger=TensorBoardLogger('lightning_logs', name=f'CoLES-supervised-agg-sbmt_{i}'),
        max_epochs=12,
        gpus=1 if torch.cuda.is_available() else 0,
        enable_progress_bar=True,
        gradient_clip_algorithm='norm',
        gradient_clip_val=0.2
    )

    trainer.fit(sup_module, sup_dataset)

    # Unshuffled loader over the whole validation part (nothing dropped), so the
    # scores line up row by row with `test_`.
    dataset = SeqToTargetDataset(
        data=dataset_test,
        target_col_name='target',
    )

    dl = torch.utils.data.DataLoader(
        dataset=dataset,
        collate_fn=dataset.collate_fn,
        shuffle=False,
        batch_size=512,
        num_workers=4,
    )

    # A dedicated name keeps the hold-out `predictions_test` frame built earlier
    # in the notebook from being overwritten by a bare score vector.
    fold_valid_scores = inference(sup_module, dl).numpy()

    print(12*"-")
    print(f"AUROC; fold {i}:", roc_auc_score(test_["target"].values, fold_valid_scores))
    print(12*"-")

    # Deep copy: the next iteration overwrites the shared encoder weights.
    sbmt_models.append(deepcopy(sup_module))

In [None]:
# Score the submission users with every model from the submission fold loop.
dataset_sbmt = sbmt_df.copy()
dataset_sbmt[["target", "time"]] = None
dataset_sbmt = dataset_sbmt.to_dict(orient='records')

dataset = SeqToTargetDataset(
    data=dataset_sbmt,
    target_col_name='target',
)

dl = torch.utils.data.DataLoader(
    dataset=dataset,
    collate_fn=dataset.collate_fn,
    shuffle=False,
    batch_size=512,
    num_workers=4,
)

predictions_sbmt = sbmt_df[["user_id"]].copy()

# Iterate over the models that actually exist instead of assuming exactly five,
# and store plain floats rather than torch tensors in the frame.
for i, fold_model in enumerate(sbmt_models):
    predictions_sbmt[f"sp_{i}"] = inference(fold_model, dl).numpy()

predictions_sbmt

In [None]:
# Average the per-model score columns. They are selected by name (`sp_<i>`)
# rather than by position, so the result does not depend on the column layout or
# on an `sp` column left over from a previous run of this cell.
fold_score_cols = [col for col in predictions_sbmt.columns if col.startswith("sp_")]
predictions_sbmt["sp"] = predictions_sbmt[fold_score_cols].mean(axis=1)
predictions_sbmt

In [None]:
# Submission-user predictions first, then the train (out-of-fold) and hold-out
# predictions collected in `train_test_predictions`.
final_prediction = pd.concat([predictions_sbmt[["user_id", "sp"]], train_test_predictions])

In [None]:
# Write the combined predictions; create the output folder first because
# `to_csv` fails when the parent directory is missing.
os.makedirs("../predictions", exist_ok=True)
final_prediction.to_csv("../predictions/sup-preds.csv", index=False)