In [None]:
import pandas as pd
import numpy as np
import os
from wtte_lib.wtte_data_preprocessing import data_pipeline

In [None]:
# Load the raw tables; paths are relative to the notebook's working directory.
transactions_df = pd.read_csv('../data/transactions.csv')
clients_df = pd.read_csv('../data/clients.csv')
train_df = pd.read_csv('../data/train.csv')
reports_df = pd.read_csv('../data/report_dates.csv')

# Parse the timestamps once and keep them as whole seconds since the Unix epoch.
# int64 is requested explicitly: the bare 'int' alias is platform dependent and
# datetime64 values cannot be cast to a 32-bit integer.
# NOTE(review): the division by 10**9 assumes nanosecond resolution — confirm
# for the installed pandas version.
transaction_times = pd.to_datetime(transactions_df['transaction_dttm'])
transactions_df['transaction_dttm'] = transaction_times.astype('int64') // 10**9

# Shift every MCC code up by one.
# NOTE(review): presumably this keeps 0 free for padding — confirm.
transactions_df['mcc_code'] += 1

# Constant column; summed per timestep it yields the number of transactions
# (see the 'ones': 'sum' entry of the aggregation dict in the next cell).
transactions_df['ones'] = 1


In [None]:
# Aggregation function per feature column, handed to `data_pipeline`.
timestep_aggregation_dict = {
    'ones': 'sum',
    'mcc_code': 'max',
    'transaction_amt': 'mean',
    'currency_rk': 'max',
}
feature_columns = ["ones", 'mcc_code', 'transaction_amt', 'currency_rk']

# Attach each client's report id to their transactions (merge on shared columns).
transactions_df_ = transactions_df.merge(clients_df[['user_id', 'report']])

# One `data_pipeline` result per report, in the order the reports are listed.
train_data = []
for report in reports_df.report.values:
    belongs_to_report = transactions_df_['report'] == report
    data_ = transactions_df_[belongs_to_report].copy().reset_index()
    df_ = data_pipeline(
        data_,
        id_col='user_id',
        infer_seq_endtime=False,
        abs_time_col='transaction_dttm',
        column_names=feature_columns,
        timestep_aggregation_dict=timestep_aggregation_dict,
    )
    train_data.append(df_)

In [None]:
# Overwrite the final timestep of every sequence with -1 in all four feature
# channels, so that each sequence ends on a non-zero (i.e. "event") row.
for data in train_data:
    data[0][:, -1, :] = np.array([-1, -1, -1, -1])

data_lst = []
for data in train_data:
    # `nan` has to be passed by keyword: the second positional argument of
    # np.nan_to_num is `copy`, so the former `np.nan_to_num(data[0], 0)`
    # silently rewrote data[0] in place instead of choosing the fill value.
    x_ = np.nan_to_num(data[0], nan=0.0)
    x_lst = [pd.DataFrame(x_[i]) for i in range(len(x_))]
    for i, df in enumerate(x_lst):
        # Column 2 is transaction_amt (order of `column_names` given to
        # data_pipeline). Non-zero amount -> event (target 0); otherwise the
        # target is left missing and filled in below.
        df['target'] = df[2].map(lambda x: 0 if x else None)
        # Private, writable copy; it is written back to the frame afterwards.
        target = df['target'].to_numpy(copy=True)
        indices = np.where(~np.isnan(target))[0]
        # The last event is the sentinel row; moving its index one further
        # makes the trailing gap count to the position just past the end.
        indices[-1] += 1
        idx = 0
        for pos, tgt in enumerate(target):
            if np.isnan(tgt):
                # number of steps until the next event
                target[pos] = indices[idx] - pos
            else:
                idx += 1
        df['target'] = target
        # Replace the sentinel row: zero features, target of 1.
        df.loc[df.index[-1]] = [0, 0, 0, 0, 1]
        # data[2] is indexed per sequence and supplies the user id.
        df['user_id'] = [data[2][i]] * df.shape[0]
    data_lst.append(pd.concat(x_lst, axis=0))

In [None]:
# Stack all reports, name the positional feature columns, then add a 1-based
# per-user row counter (`trx_dt`) and an all-zero `is_censored` column.
dataset = (
    pd.concat(data_lst, axis=0)
    .rename(columns={0: 'ones', 1: 'mcc_code', 2: 'transaction_amt', 3: 'currency_rk'})
    .assign(
        trx_dt=lambda frame: frame.groupby('user_id').cumcount() + 1,
        is_censored=0,
    )
)

In [None]:
from ptls.preprocessing import PandasDataPreprocessor

# Column roles for the ptls preprocessor, kept in a dict so they can be inspected.
preprocessor_kwargs = {
    'col_id': 'user_id',
    'col_event_time': 'trx_dt',
    'event_time_transformation': 'none',
    'cols_category': ['mcc_code', 'currency_rk'],
    'cols_numerical': ['transaction_amt', 'ones', 'target', 'is_censored'],
    'return_records': True,
}
preprocessor = PandasDataPreprocessor(**preprocessor_kwargs)

In [None]:
# Fit the preprocessor on `dataset` and transform it in one call. The result is
# consumed below as a sequence of per-user dict-like records (the preprocessor
# was created with return_records=True).
tr_dataset = preprocessor.fit_transform(dataset)

In [None]:
def return_train_part(tr_dict, split_at=150):
    """Return a copy of the records keeping only the leading part of each sequence.

    Args:
        tr_dict: sequence of dict-like records. The ``'user_id'`` entry is
            copied unchanged; every other value must support slicing.
        split_at: number of leading elements to keep from each sliced value.
            Defaults to 150, the split point used throughout this notebook.

    Returns:
        A new list of new dicts; the input records are not modified.
    """
    return [
        {
            key: value if key == 'user_id' else value[:split_at]
            for key, value in record.items()
        }
        for record in tr_dict
    ]
# Training split: the leading part of every user's sequence.
train = return_train_part(tr_dataset)

In [None]:
def return_valid_part(tr_dict, split_at=150):
    """Return a copy of the records keeping only the trailing part of each sequence.

    Args:
        tr_dict: sequence of dict-like records. The ``'user_id'`` entry is
            copied unchanged; every other value must support slicing.
        split_at: number of leading elements to drop from each sliced value.
            Defaults to 150, the split point used throughout this notebook.

    Returns:
        A new list of new dicts; the input records are not modified. Values
        shorter than ``split_at`` become empty slices.
    """
    return [
        {
            key: value if key == 'user_id' else value[split_at:]
            for key, value in record.items()
        }
        for record in tr_dict
    ]
# Validation split: whatever remains of every user's sequence after the training part.
valid = return_valid_part(tr_dataset)

In [None]:
from math import log
import torch
from torch import nn

# Machine epsilon of float32, used as the default numerical guard below.
EPS = torch.finfo(torch.float32).eps


def log_likelihood_discrete(tte, uncensored, alpha, beta, epsilon=EPS):
    """Element-wise log-likelihood of a discretised Weibull distribution.

    Args:
        tte: time-to-event values.
        uncensored: multiplier applied to the event term (1 keeps it, 0 drops it).
        alpha: Weibull scale parameter.
        beta: Weibull shape parameter.
        epsilon: small constant added to ``tte`` and used to keep the
            logarithm's argument away from zero.

    Returns:
        ``uncensored * log(exp(H(tte + 1) - H(tte)) - (1 - epsilon)) - H(tte + 1)``
        where ``H(t) = (t / alpha) ** beta``.
    """
    cum_hazard_now = ((tte + epsilon) / alpha) ** beta
    cum_hazard_next = ((tte + 1.0) / alpha) ** beta
    log_event_term = torch.log(
        torch.exp(cum_hazard_next - cum_hazard_now) - (1.0 - epsilon)
    )
    return uncensored * log_event_term - cum_hazard_next


def log_likelihood_continuous(tte, uncensored, alpha, beta, epsilon=EPS):
    """Element-wise log-likelihood of a continuous Weibull distribution.

    Args:
        tte: time-to-event values.
        uncensored: multiplier applied to the event term (1 keeps it, 0 drops it).
        alpha: Weibull scale parameter.
        beta: Weibull shape parameter; must be a tensor (``torch.log`` is applied).
        epsilon: small constant added to ``tte`` before dividing by ``alpha``.

    Returns:
        ``uncensored * (log(beta) + beta * log(y)) - y ** beta`` with
        ``y = (tte + epsilon) / alpha``.
    """
    scaled_time = (tte + epsilon) / alpha
    log_event_term = torch.log(beta) + beta * torch.log(scaled_time)
    cum_hazard = scaled_time ** beta
    return uncensored * log_event_term - cum_hazard


def weibull_censored_nll_loss(
    inputs: torch.Tensor,
    targets: torch.Tensor,
    discrete: bool = False,
    reduction: str = "mean",
    clip_prob=1e-6,
    divisor: float = 256,
):
    """Negative Weibull log-likelihood for (possibly censored) time-to-event data.

    Args:
        inputs: predicted parameters; ``inputs[..., 0]`` is alpha and
            ``inputs[..., 1]`` is beta.
        targets: labels; ``targets[..., 0]`` is the time-to-event and
            ``targets[..., 1]`` the uncensored flag.
        discrete: use the discrete likelihood instead of the continuous one.
        reduction: ``"mean"`` or ``"sum"`` reduces over the last remaining
            dimension. Any other value leaves the per-element values
            unreduced (no error is raised for unknown names).
        clip_prob: accepted for interface compatibility; currently unused.
        divisor: constant the reduced value is divided by. Defaults to 256,
            the value that used to be hard-coded here; pass 1 to get the
            plain mean/sum. Ignored when no reduction is applied.

    Returns:
        The negated (and optionally reduced and rescaled) log-likelihood.
    """
    alpha = inputs[..., 0]
    beta = inputs[..., 1]
    tte = targets[..., 0]
    uncensored = targets[..., 1]

    likelihood = log_likelihood_discrete if discrete else log_likelihood_continuous
    log_likelihoods = likelihood(tte, uncensored, alpha, beta)

    reducer = {"mean": torch.mean, "sum": torch.sum}.get(reduction)
    if reducer is not None:
        log_likelihoods = reducer(log_likelihoods, dim=-1) / divisor
    return -1.0 * log_likelihoods


class WeibullCensoredNLLLoss(nn.Module):
    """Module wrapper that stores the options for ``weibull_censored_nll_loss``."""

    def __init__(self, discrete: bool = False, reduction: str = "mean", clip_prob=1e-6):
        """Remember the loss options; they are forwarded unchanged on every call."""
        super().__init__()
        self.discrete, self.reduction, self.clip_prob = discrete, reduction, clip_prob

    def forward(
        self,
        inputs: torch.tensor,
        target: torch.tensor,
    ):
        """Evaluate the loss for ``inputs`` against ``target`` with the stored options."""
        return weibull_censored_nll_loss(
            inputs,
            target,
            discrete=self.discrete,
            reduction=self.reduction,
            clip_prob=self.clip_prob,
        )


class WeibullActivation(nn.Module):
    """Turn a two-channel raw output into positive Weibull parameters.

    Channel 0 of the last dimension becomes ``alpha = init_alpha * exp(raw)``;
    channel 1 becomes ``beta = max_beta * sigmoid(raw)``, with the sigmoid
    clamped to ``[epsilon, 1 - epsilon]``.
    """

    def __init__(self, init_alpha: float = 1.0, max_beta: float = 5.0, epsilon: float = EPS):
        super().__init__()
        # Kept as plain 0-dim tensors (not registered buffers).
        self.init_alpha = torch.tensor(init_alpha)
        self.max_beta = torch.tensor(max_beta)
        self.epsilon = epsilon

    def forward(self, x: torch.tensor):
        """Return a tensor shaped like ``x[..., :2]`` holding ``(alpha, beta)``."""
        raw_alpha, raw_beta = x[..., 0], x[..., 1]

        alpha = self.init_alpha * torch.exp(raw_alpha)

        if self.max_beta > 1.05:
            # With this shift a raw value of 0 maps to beta == 1, because
            # sigmoid(-log(max_beta - 1)) == 1 / max_beta.
            raw_beta = raw_beta - torch.log(self.max_beta - 1.0)

        squashed = torch.clamp(
            torch.sigmoid(raw_beta), min=self.epsilon, max=1.0 - self.epsilon
        )
        beta = self.max_beta * squashed

        return torch.stack([alpha, beta], dim=-1)

# Model

In [None]:
import logging
from copy import deepcopy

import pandas as pd
import pytorch_lightning as pl
import torch
import torchmetrics
from omegaconf import DictConfig

from ptls.data_load.padded_batch import PaddedBatch

logger = logging.getLogger(__name__)


class SequenceToTarget(pl.LightningModule):
    """Lightning module: sequence encoder -> optional head -> optional activation.

    The loss is evaluated only on the positions selected by the batch's
    ``seq_len_mask``.

    Args:
        seq_encoder: module called on the input batch; its result must expose
            a ``payload`` tensor.
        head: optional module applied to the encoder payload.
        activation1: optional module applied after the head.
        loss: callable ``loss(prediction, target)``.
        metric_list: ``None``, a single metric, a list of metrics, or a
            dict / ``DictConfig`` of ``name -> metric``. Deep copies are stored
            in ``train_metrics`` and ``valid_metrics``.
        optimizer_partial: callable building the optimizer from parameters.
        lr_scheduler_partial: optional callable building a scheduler from the
            optimizer.
        pretrained_lr: ``None`` (single parameter group), ``'freeze'`` (encoder
            parameters get ``requires_grad = False``), or a learning rate used
            for the encoder parameter group.
        train_update_n_steps: stored as a hyperparameter; not used by this class.
    """

    def __init__(self,
                 seq_encoder: torch.nn.Module,
                 head: torch.nn.Module=None,
                 activation1=None,
                 loss: torch.nn.Module=None,
                 metric_list: torchmetrics.Metric=None,
                 optimizer_partial=None,
                 lr_scheduler_partial=None,
                 pretrained_lr=None,
                 train_update_n_steps=None,
                 ):
        super().__init__()

        self.save_hyperparameters(ignore=[
            'seq_encoder', 'head', 'loss', 'metric_list', 'optimizer_partial', 'lr_scheduler_partial', 'activation1'])
        self.activation = activation1
        self.seq_encoder = seq_encoder
        self.head = head
        self.loss = loss

        if metric_list is None:
            # No metrics requested: keep the dicts empty instead of
            # registering a placeholder entry named 'NoneType'.
            metric_list = []
        elif isinstance(metric_list, (dict, DictConfig)):
            metric_list = [(k, v) for k, v in metric_list.items()]
        else:
            if not isinstance(metric_list, list):
                metric_list = [metric_list]
            metric_list = [(m.__class__.__name__, m) for m in metric_list]

        self.train_metrics = torch.nn.ModuleDict([(name, deepcopy(mc)) for name, mc in metric_list])
        self.valid_metrics = torch.nn.ModuleDict([(name, deepcopy(mc)) for name, mc in metric_list])

        self.optimizer_partial = optimizer_partial
        self.lr_scheduler_partial = lr_scheduler_partial
        self.val_loss = []

    def forward(self, x):
        """Encode ``x`` and apply the head and activation when they are set."""
        x = self.seq_encoder(x)
        x = x.payload
        if self.head is not None:
            x = self.head(x)
        if self.activation is not None:
            x = self.activation(x)
        return x

    def training_step(self, batch, _):
        """Compute and log the loss over the valid (masked-in) positions."""
        x, y = batch
        y_h = self(x)
        mask = x.seq_len_mask.bool()
        # Boolean indexing flattens the selected positions; unsqueeze(0)
        # restores a leading dimension for the loss.
        loss = self.loss(y_h[mask].unsqueeze(0), y.payload[mask].unsqueeze(0)).mean()
        # NOTE(review): the logged batch_size is a fixed 256, independent of
        # the actual batch — confirm this is intended.
        self.log('train_loss', loss, batch_size = 256)
        return loss


    def validation_step(self, batch, _):
        """Log the loss with every valid position treated as uncensored."""
        x, y = batch
        y_h = self(x)
        mask = x.seq_len_mask.bool()
        # Boolean indexing returns a copy, so the batch itself is not modified.
        y_valid = y.payload[mask].unsqueeze(0)
        # Set the flag channel (last-dimension index 1, the one the loss reads
        # as `uncensored`) for all positions. The former `y_valid[:, 1] = 1`
        # overwrote both channels of the second position instead.
        y_valid[..., 1] = 1
        loss = self.loss(y_h[mask].unsqueeze(0), y_valid).mean()
        self.log('val_loss', loss, batch_size = 256)

    def configure_optimizers(self):
        """Build the optimizer and, when configured, the LR scheduler."""
        if self.hparams.pretrained_lr is not None:
            if self.hparams.pretrained_lr == 'freeze':
                for p in self.seq_encoder.parameters():
                    p.requires_grad = False
                logger.info('Created optimizer with frozen encoder')
                parameters = self.parameters()
            else:
                parameters = [
                    {'params': self.seq_encoder.parameters(), 'lr': self.hparams.pretrained_lr},
                ]
                if self.head is not None:
                    # use predefined lr from `self.optimizer_partial`
                    parameters.append({'params': self.head.parameters()})
                logger.info('Created optimizer with two lr groups')
        else:
            parameters = self.parameters()

        optimizer = self.optimizer_partial(parameters)
        if self.lr_scheduler_partial is None:
            # A scheduler is optional; return the optimizer on its own.
            return optimizer
        scheduler = self.lr_scheduler_partial(optimizer)
        return [optimizer], [scheduler]



In [None]:
from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.supervised import SeqToTargetDataset
from ptls.frames.supervised.metrics import R_squared
# Width of the GRU state; the head's Linear layer must accept the same width.
HIDDEN_SIZE = 16

# Per-feature encoding: numeric features with their scaling, categorical
# features with their embedding table size ('in') and dimension ('out').
trx_encoder_params = {
    'numeric_values': {
        'transaction_amt': 'log',
        'ones': 'identity',
    },
    'embeddings': {
        'currency_rk': {'in': 5, 'out': 2},
        'mcc_code': {'in': 500, 'out': 16},
    },
}

trx_encoder = TrxEncoder(**trx_encoder_params)
seq_encoder = RnnSeqEncoder(
    trx_encoder=trx_encoder,
    hidden_size=HIDDEN_SIZE,
    type='gru',
    is_reduce_sequence=False,
)

# Two outputs per timestep: the raw values fed to WeibullActivation.
weibull_head = torch.nn.Sequential(torch.nn.ReLU(), torch.nn.Linear(HIDDEN_SIZE, 2),)
weibull_activation = WeibullActivation(init_alpha=7., max_beta=4.0)
weibull_loss = WeibullCensoredNLLLoss(discrete=True)

model = SequenceToTarget(
    seq_encoder=seq_encoder,
    head=weibull_head,
    activation1=weibull_activation,
    loss=weibull_loss,
    optimizer_partial=partial(torch.optim.AdamW, lr=0.0001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9),
)

# Data

In [None]:
import torch
from ptls.data_load.padded_batch import PaddedBatch
import numpy as np
import torch
from collections import defaultdict
from functools import reduce

from ptls.data_load.feature_dict import FeatureDict
def collate_feature_dict(batch):
    """Merge a list of feature dicts into one ``PaddedBatch``.

    Values are gathered per key across the records. Tensor features are
    padded with ``pad_sequence(batch_first=True)``; lists of numpy arrays are
    passed through as lists; everything else is converted with ``np.array``
    and, for integer / float dtypes, turned into a long / float tensor.
    Sequence lengths are taken from the first key of the first record that
    ``FeatureDict.is_seq_feature`` accepts.
    """
    gathered = defaultdict(list)
    for record in batch:
        for name, value in record.items():
            gathered[name].append(value)
        # After each record all keys seen so far must hold the same number
        # of values, i.e. no record may add or omit a key.
        assert len({len(values) for values in gathered.values()}) <= 1

    seq_col = next(k for k, v in batch[0].items() if FeatureDict.is_seq_feature(k, v))
    lengths = torch.LongTensor([len(rec[seq_col]) for rec in batch])

    collated = {}
    for name, values in gathered.items():
        first = values[0]
        if type(first) is torch.Tensor:
            collated[name] = torch.nn.utils.rnn.pad_sequence(values, batch_first=True)
            continue
        if type(first) is np.ndarray:
            collated[name] = values  # list of arrays[object]
            continue
        as_array = np.array(values)
        dtype_kind = as_array.dtype.kind
        if dtype_kind == 'i':
            collated[name] = torch.from_numpy(as_array).long()
        elif dtype_kind == 'f':
            collated[name] = torch.from_numpy(as_array).float()
        else:
            collated[name] = as_array
    return PaddedBatch(collated, lengths)

class SeqToTargetDataset(torch.utils.data.Dataset):
    """Dataset wrapper that derives censoring labels and splits off the targets.

    Args:
        data: indexable / iterable collection of feature dicts.
        target_col_name: name of the target key, or a two-item list
            ``[time_to_event_key, flag_key]``.
        target_dtype: optional dtype (or its name as a ``torch`` attribute)
            the collated target is cast to.
    """

    def __init__(self,
                 data,
                 target_col_name,
                 target_dtype=None,
                 *args, **kwargs,
                 ):
        super().__init__(*args, **kwargs)

        self.data = data
        self.target_col_name = target_col_name
        if type(target_dtype) is str:
            self.target_dtype = getattr(torch, target_dtype)
        else:
            self.target_dtype = target_dtype

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        feature_arrays = self.data[item]
        return self.sencore_last(feature_arrays)

    def __iter__(self):
        for feature_arrays in self.data:
            yield self.sencore_last(feature_arrays)

    def sencore_last(self, feature_arrays):
        """Censor the tail of a record that follows its last observed event.

        Positions up to and including the last occurrence of the minimum
        target value keep their target and get flag 1. Later positions get
        ``target - target[-1]`` and flag 0.

        The input record is left untouched and a shallow copy is returned.
        Records served from memory are reused between reads, and rewriting
        them in place made a second read flag every position with 1. When
        ``target_col_name`` is not a list the record is returned unchanged.
        """
        if not isinstance(self.target_col_name, list):
            return feature_arrays

        tte_col, flag_col = self.target_col_name[0], self.target_col_name[1]
        target = feature_arrays[tte_col].to(torch.int32)
        # index of the last position holding the minimum target value
        last_tr_idx = int(torch.nonzero(target == target.min()).max())
        observed_len = last_tr_idx + 1

        new_target = torch.cat((target[:observed_len], target[observed_len:] - target[-1]))
        new_is_sencored = torch.cat(
            (torch.ones(observed_len), torch.zeros(len(target) - observed_len))
        )

        record = dict(feature_arrays)
        record[tte_col] = new_target
        record[flag_col] = new_is_sencored
        return record

    def collate_fn(self, padded_batch):
        """Collate records and return ``(features, targets)`` as two PaddedBatch objects."""
        padded_batch = collate_feature_dict(padded_batch)
        if isinstance(self.target_col_name, list):
            targets = []
            for col in self.target_col_name:
                targets.append(padded_batch.payload[col])
                del padded_batch.payload[col]
            # stack on dim 1, then move the target axis last
            # (assumes each collated target is (batch, time) — TODO confirm)
            target = torch.stack(targets, dim = 1).permute(0, 2, 1)
        else:
            target = padded_batch.payload[self.target_col_name]
            del padded_batch.payload[self.target_col_name]
        if self.target_dtype is not None:
            target = target.to(dtype=self.target_dtype)
        return padded_batch, PaddedBatch(target, padded_batch.seq_lens)


class SeqToTargetIterableDataset(SeqToTargetDataset, torch.utils.data.IterableDataset):
    """``SeqToTargetDataset`` that additionally derives from ``IterableDataset``."""

In [None]:
from ptls.data_load.augmentations import RandomSlice
class NewRandomSlice(RandomSlice):
    """``RandomSlice`` with its own rule for what counts as a sequence feature.

    Here ``'event_time'`` and every numpy array / torch tensor value is
    treated as a sequence and therefore sliced.
    NOTE(review): presumably this exists so that label columns such as
    ``'target'`` are sliced together with the features — confirm against the
    base class rule.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @staticmethod
    def is_seq_feature(k: str, x):
        """Return True for ``'event_time'`` or any ndarray / Tensor value."""
        return k == 'event_time' or type(x) in (np.ndarray, torch.Tensor)

    @staticmethod
    def seq_indexing(d, ix):
        """Index every sequence feature of ``d`` with ``ix``; copy the rest as is."""
        indexed = {}
        for name, value in d.items():
            if NewRandomSlice.is_seq_feature(name, value):
                indexed[name] = value[ix]
            else:
                indexed[name] = value
        return indexed

    @staticmethod
    def get_seq_len(d):
        """Length of ``'event_time'`` when present, else of the first sequence feature."""
        if 'event_time' in d:
            return len(d['event_time'])
        first_seq = next(v for k, v in d.items() if NewRandomSlice.is_seq_feature(k, v))
        return len(first_seq)

In [None]:
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames import PtlsDataModule
from ptls.data_load.datasets import AugmentationDataset
# Training records are randomly sliced (NewRandomSlice(20, 150)) on every read;
# validation records are served as they are.
train_source = AugmentationDataset(
    data=MemoryMapDataset(
        data=train,
        i_filters=[],
    ),
    f_augmentations=[NewRandomSlice(20, 150)],
)
valid_source = MemoryMapDataset(
    data=valid,
    i_filters=[],
)

# Both splits use the same pair of label columns: [time-to-event, flag].
train_dl = PtlsDataModule(
    train_data=SeqToTargetDataset(train_source, target_col_name=['target', 'is_censored']),
    valid_data=SeqToTargetDataset(valid_source, target_col_name=['target', 'is_censored']),
    train_num_workers=0,
    train_batch_size=12000,
    valid_num_workers=0,
    valid_batch_size=12000,
)

In [None]:
# Use a single GPU when CUDA is available, otherwise train on CPU.
n_gpus = 1 if torch.cuda.is_available() else 0
trainer = pl.Trainer(max_epochs=40, gpus=n_gpus, enable_progress_bar=True)

In [None]:
%%time
# Print the logger version before the run starts.
print(f'logger.version = {trainer.logger.version}')
# Train and validate using the splits held by the `train_dl` data module.
trainer.fit(model, train_dl)
# Metrics logged by the training / validation steps.
print(trainer.logged_metrics)

In [None]:
%%time
import tqdm
from ptls.data_load.datasets import inference_data_loader
import numpy as np

def pooling_inference(model, dl, device='cuda:0'):
    """Pool the transaction-encoder output of every batch over dimension 1.

    For each batch the max, min, mean and std over dimension 1 of
    ``model.seq_encoder.trx_encoder(batch).payload`` are concatenated along
    dimension 1.

    The model is switched to eval mode for the duration of the call, so any
    normalisation or dropout layers behave deterministically, and its
    previous mode is restored afterwards.

    Args:
        model: object exposing ``seq_encoder.trx_encoder``; moved to ``device``.
        dl: iterable of batches supporting ``.to(device)``.
        device: device used for the model and the batches.

    Returns:
        List with one feature tensor per batch (left on ``device``).
    """
    model.to(device)
    was_training = model.training
    model.eval()
    X = []
    try:
        with torch.no_grad():
            for batch in tqdm.tqdm(dl):
                x = model.seq_encoder.trx_encoder(batch.to(device)).payload
                out_max = torch.max(x, dim=1)[0]
                out_min = torch.min(x, dim=1)[0]
                out_mean = torch.mean(x, dim=1)
                out_std = torch.std(x, dim=1)
                X.append(torch.cat([out_max, out_min, out_mean, out_std], dim=1))
    finally:
        model.train(was_training)
    return X

def embed_inference(model, dl, device='cuda:0'):
    """Build per-user features from the head output of every batch.

    The head output ``x`` is summarised with max / min / mean / std over
    dimension 1 on:

    * five consecutive windows of 30 steps covering positions 0..149;
    * seven tail windows ``x[:, i:]`` for ``i = 150, 155, ..., 180``.

    The output at the last position, ``x[:, -1]``, is appended. The tail
    windows require padded batches longer than 180 steps, otherwise a window
    is empty.

    The model is switched to eval mode for the duration of the call, so any
    normalisation or dropout layers behave deterministically, and its
    previous mode is restored afterwards.

    Args:
        model: object exposing ``seq_encoder`` and ``head``; moved to ``device``.
        dl: iterable of batches supporting ``.to(device)``.
        device: device used for the model and the batches.

    Returns:
        List with one feature tensor per batch (left on ``device``).
    """

    def _pool(window):
        # max, min, mean and std over dimension 1, concatenated along dimension 1
        return torch.cat([
            torch.max(window, dim=1)[0],
            torch.min(window, dim=1)[0],
            torch.mean(window, dim=1),
            torch.std(window, dim=1),
        ], dim=1)

    model.to(device)
    was_training = model.training
    model.eval()
    X = []
    try:
        with torch.no_grad():
            for batch in tqdm.tqdm(dl):
                hidden = model.seq_encoder(batch.to(device)).payload
                x = model.head(hidden)
                pooled = [_pool(x[:, i:i + 30]) for i in range(0, 150, 30)]
                pooled += [_pool(x[:, i:]) for i in range(150, 182, 5)]
                pooled.append(x[:, -1])
                X.append(torch.cat(pooled, dim=1))
    finally:
        model.train(was_training)
    return X

In [None]:
# Make sure the flag is still False before inference (it was also passed as
# False when the encoder was built); embed_inference slices the encoder output
# along dimension 1.
model.seq_encoder.is_reduce_sequence = False

In [None]:
# Pick the device the same way the trainer cell does, so the notebook also
# runs on machines without CUDA (embed_inference defaults to 'cuda:0').
inference_device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
dl = inference_data_loader(tr_dataset, num_workers=0, batch_size=512)
# One row of features per record of tr_dataset, stacked across batches.
df_ab = torch.vstack(embed_inference(model, dl, device=inference_device)).cpu().numpy()

In [None]:
df_embeds = pd.DataFrame(df_ab, columns=[f"ab_emb_{e}" for e in range(df_ab.shape[1])])
# Read the ids straight from the records instead of building a DataFrame of
# all features just to take one column. A length mismatch with the embedding
# rows now raises instead of silently producing NaN ids.
df_embeds['user_id'] = [record['user_id'] for record in tr_dataset]

# Create the output directory on first use so the export does not fail.
embeddings_path = '../embeddings/wtte_rnn.csv'
os.makedirs(os.path.dirname(embeddings_path), exist_ok=True)
df_embeds.to_csv(embeddings_path, index=False)