based on https://arxiv.org/pdf/2002.08232.pdf

In [1]:
!pip install pytorch-metric-learning > /dev/null

# probably not needed
# although can try knn
!pip install faiss-gpu > /dev/null

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

sns.set_style("whitegrid", {'axes.grid' : False})

# from tqdm.notebook import tqdm
from tqdm.auto import tqdm

# cuda

In [3]:
import torch
torch.cuda.is_available()

True

In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Assuming that we are on a CUDA machine, this should print a CUDA device:

print(device)

cuda:0


# Data

In [5]:
from pathlib import Path

In [6]:
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [7]:
data_path = Path('/content/gdrive/MyDrive/courses/hse/third year/project/data age group/data.zip')

In [8]:
! cp '{data_path}' .

In [9]:
!ls

data.zip  gdrive  sample_data


In [10]:
!unzip data.zip

Archive:  data.zip
   creating: data/
  inflating: data/test.csv           
   creating: __MACOSX/
   creating: __MACOSX/data/
  inflating: __MACOSX/data/._test.csv  
  inflating: data/small_group_description.csv  
  inflating: __MACOSX/data/._small_group_description.csv  
  inflating: data/train_target.csv   
  inflating: __MACOSX/data/._train_target.csv  
  inflating: data/transactions_train.csv  
  inflating: __MACOSX/data/._transactions_train.csv  
  inflating: data/transactions_test.csv  
  inflating: __MACOSX/data/._transactions_test.csv  


## Dataloader

In [11]:
import torch

In [12]:
def normalize(col):
    return (col - col.mean())/col.std()

def process_transactions(df):
    df = df.copy()
    df['weekday'] = df['trans_date'] % 7
    df.drop(['trans_date'], axis=1, inplace=True)
    for c in ['amount_rur', 'small_group', 'weekday']:
        df[c] = normalize(df[c])
    return df

In [13]:
import cv2
import os
from pathlib import Path

In [14]:
NUM_OF_SUBSEQUENCES = 5 # number of sequences per person
SUBSEQUENCE_LENGTH = 90 # should be random??? mean number of transactions per sample is 90

class AgeDataset(torch.utils.data.Dataset):
    def __init__(self, root=Path('./data/')):
        super().__init__()
        raw_df = pd.read_csv(root/'transactions_train.csv')
        raw_df = raw_df.head(2000000) # testing
        self.df = process_transactions(raw_df)

        target_df = pd.read_csv(root/'train_target.csv')
        self.target_df = target_df

        self.clients = self.df['client_id'].unique()

        client_to_seq = {}
        # build client_to_sequence
        print("building client to sequence")
        for c in tqdm(self.clients):
            sequence = self.get_sequence(c)
            client_to_seq[c] = sequence
        self.client_to_seq = client_to_seq

    def __getitem__(self, idx: int):
        # output shape: (NUM_OF_SUBSEQUENCES x SUBSEQUENCE_LENGTH x 3, client_id)
        # idx = client_id
        # should select N sequences during batch
        # for each N produce K subsequences

        client_id = self.clients[idx]
        sequence = self.client_to_seq[client_id]

        subsequences = torch.zeros((NUM_OF_SUBSEQUENCES, SUBSEQUENCE_LENGTH, 3))
        seq_len = len(sequence)
        for i in range(NUM_OF_SUBSEQUENCES):
            start_index = np.random.randint(0, seq_len-SUBSEQUENCE_LENGTH+1)
            end_index = start_index + SUBSEQUENCE_LENGTH
            subseq = torch.from_numpy(sequence[start_index:end_index])
            subsequences[i] = subseq

        return subsequences, client_id

    def __len__(self):
        return len(self.clients)
    

    def get_sequence(self, client_id):
        sequence = self.df[self.df['client_id'] == client_id]
        sequence = sequence.drop(['client_id'], axis=1)
        sequence = sequence.to_numpy()
        return sequence

# Net

https://arxiv.org/pdf/1911.02496.pdf

In [15]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [16]:
class Encoder(nn.Module):
    def __init__(self, input_dim, embedding_dim):
        super(Encoder, self).__init__()
        self.input_dim = input_dim
        self.embedding_dim = embedding_dim

        self.event_encoder = nn.Sequential(
            nn.Linear(input_dim, 24),
            nn.Sigmoid(),
            nn.BatchNorm1d(24),
            nn.Linear(24, 24),
            nn.Sigmoid(),
            nn.BatchNorm1d(24),
            nn.Linear(24, embedding_dim),
            nn.Sigmoid(),
        )

        # receives 
        self.sequence_encoder = nn.Sequential(
            nn.LSTM(embedding_dim, embedding_dim, batch_first=False)
        )


    def forward(self, x):
        # receives BATCH_SIZE*NUM_OF_SEQUENCES*SUBSEQUENCE_LENGTH*input_dim
        x = x.view(-1, self.input_dim)
        x = self.event_encoder(x)
        x = x.view(-1, SUBSEQUENCE_LENGTH, self.embedding_dim)

        # receives BATCH_SIZE*SUBSEQUENCE_LENGTH*embedding_dim
        x = torch.transpose(x, 0, 1) # so that its (seq_len, batch, input_size)
        x = self.sequence_encoder(x)[0][0]

        return x

# Train embedding

In [17]:
from pytorch_metric_learning import losses, miners, distances, reducers, samplers

In [18]:
BATCH_SIZE = 64 # 64 unique persons
SAMPLES_PER_CLASS = 1 # drawing 64 classes per batch

EMBEDDING_DIM = 32
LR = 0.002
NUM_EPOCHS = 150

In [19]:
dataset = AgeDataset()

building client to sequence


HBox(children=(FloatProgress(value=0.0, max=2292.0), HTML(value='')))




In [20]:
targets = dataset.clients

In [21]:
sampler = samplers.MPerClassSampler(targets, SAMPLES_PER_CLASS, batch_size=BATCH_SIZE, length_before_new_iter=BATCH_SIZE * 100) # 100 batches per epoch

In [22]:
dataloader = torch.utils.data.DataLoader(
    dataset, batch_size=BATCH_SIZE,
    num_workers=0,
    sampler=sampler,
)

In [23]:
# check
dataiter = iter(dataloader)
sequences, labels = dataiter.next()

# should be BATCH_SIZExNUM_OF_SEQUNCESxSUBSEQUENCE_LENGTHx(num_features)
sequences.shape

torch.Size([64, 5, 90, 3])

In [24]:
encoder = Encoder(3, embedding_dim=EMBEDDING_DIM)
encoder.to(device);

In [25]:
optimizer = optim.Adam(encoder.parameters(), lr=LR)

In [26]:
distance = distances.CosineSimilarity()
reducer = reducers.ThresholdReducer(low = 0)
loss_func = losses.TripletMarginLoss(margin = 0.4, distance = distance, reducer = reducer)
mining_func = miners.TripletMarginMiner(margin = 0.4, distance = distance, type_of_triplets = "semihard")

In [27]:
encoder.train()

for epoch in tqdm(range(NUM_EPOCHS)):
    for batch_idx, (sequences, labels) in enumerate(dataloader):
        data, labels = sequences.to(device), labels.to(device)

        labels = torch.repeat_interleave(labels, NUM_OF_SUBSEQUENCES)

        embeddings = encoder(data)
        indices_tuple = mining_func(embeddings, labels)
        loss = loss_func(embeddings, labels, indices_tuple)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # if batch_idx % 50 == 0:
        #     print("Epoch {} Iteration {}: Loss = {}, Number of mined triplets = {}".format(epoch, batch_idx, loss, mining_func.num_triplets))
    print("Epoch {} Loss = {}".format(epoch, loss.item()))

HBox(children=(FloatProgress(value=0.0, max=150.0), HTML(value='')))

Epoch 0 Loss = 0.22396230697631836
Epoch 1 Loss = 0.2269313484430313
Epoch 2 Loss = 0.22890253365039825
Epoch 3 Loss = 0.22736293077468872
Epoch 4 Loss = 0.22982054948806763
Epoch 5 Loss = 0.22959378361701965
Epoch 6 Loss = 0.2292843759059906
Epoch 7 Loss = 0.22917674481868744
Epoch 8 Loss = 0.22767266631126404
Epoch 9 Loss = 0.22618207335472107
Epoch 10 Loss = 0.22538763284683228
Epoch 11 Loss = 0.22381441295146942
Epoch 12 Loss = 0.2183837890625
Epoch 13 Loss = 0.22646164894104004
Epoch 14 Loss = 0.21862339973449707
Epoch 15 Loss = 0.219980850815773
Epoch 16 Loss = 0.21996517479419708
Epoch 17 Loss = 0.21875426173210144
Epoch 18 Loss = 0.22144804894924164
Epoch 19 Loss = 0.22289039194583893
Epoch 20 Loss = 0.2187654823064804
Epoch 21 Loss = 0.22296865284442902
Epoch 22 Loss = 0.22091491520404816
Epoch 23 Loss = 0.21482014656066895
Epoch 24 Loss = 0.2198544293642044
Epoch 25 Loss = 0.21616433560848236
Epoch 26 Loss = 0.2170671820640564
Epoch 27 Loss = 0.21939699351787567
Epoch 28 Loss

# Predict

In [28]:
m = encoder.to('cpu').eval()

In [29]:
# check
dataiter = iter(dataloader)
sequences, labels = dataiter.next()

# should be BATCH_SIZExNUM_OF_SEQUNCESxSUBSEQUENCE_LENGTHx(num_features)
sequences.shape

torch.Size([64, 5, 90, 3])

In [30]:
embeddings = torch.zeros((1, EMBEDDING_DIM))[1:]

for client_id in tqdm(sorted(dataset.clients)):
    s = dataset.client_to_seq[client_id]
    s = torch.from_numpy(s).type(torch.FloatTensor)

    with torch.no_grad():
        encoded_events = m.event_encoder(s)
        encoded_events = encoded_events.view(1, -1, EMBEDDING_DIM)
        encoded_events = torch.transpose(encoded_events, 0, 1) # so that its (seq_len, batch, input_size)
        embedding = m.sequence_encoder(encoded_events)[0][0]
        embeddings = torch.cat((embeddings, embedding))


embeddings.shape

HBox(children=(FloatProgress(value=0.0, max=2292.0), HTML(value='')))




torch.Size([2292, 32])

In [31]:
X = embeddings.numpy()
y = dataset.target_df[dataset.target_df['client_id'].isin(dataset.clients)].sort_values('client_id')['bins'].to_numpy()

In [32]:
!pip install catboost > /dev/null

In [33]:
from catboost import CatBoostClassifier, Pool
from catboost import cv
from sklearn.model_selection import train_test_split

In [34]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=228, stratify=y)

train_pool = Pool(X_train, y_train)
test_pool = Pool(X_test, y_test)

In [35]:
model = CatBoostClassifier(
    iterations=100,
    verbose=True,
    eval_metric='Accuracy',
)

In [36]:
model.fit(train_pool, eval_set=test_pool)

Learning rate set to 0.262452
0:	learn: 0.4723127	test: 0.4398943	best: 0.4398943 (0)	total: 80.4ms	remaining: 7.96s
1:	learn: 0.4833876	test: 0.4438573	best: 0.4438573 (1)	total: 105ms	remaining: 5.16s
2:	learn: 0.4859935	test: 0.4412153	best: 0.4438573 (1)	total: 130ms	remaining: 4.2s
3:	learn: 0.4905537	test: 0.4451783	best: 0.4451783 (3)	total: 155ms	remaining: 3.72s
4:	learn: 0.4970684	test: 0.4372523	best: 0.4451783 (3)	total: 183ms	remaining: 3.47s
5:	learn: 0.5009772	test: 0.4359313	best: 0.4451783 (3)	total: 214ms	remaining: 3.36s
6:	learn: 0.5146580	test: 0.4464993	best: 0.4464993 (6)	total: 240ms	remaining: 3.19s
7:	learn: 0.5218241	test: 0.4425363	best: 0.4464993 (6)	total: 265ms	remaining: 3.05s
8:	learn: 0.5211726	test: 0.4398943	best: 0.4464993 (6)	total: 292ms	remaining: 2.95s
9:	learn: 0.5211726	test: 0.4425363	best: 0.4464993 (6)	total: 316ms	remaining: 2.85s
10:	learn: 0.5250814	test: 0.4385733	best: 0.4464993 (6)	total: 341ms	remaining: 2.76s
11:	learn: 0.5296417	te

<catboost.core.CatBoostClassifier at 0x7fa0105eee80>

In [37]:
sum(np.squeeze(model.predict(X_test)) == y_test) / len(y_test)

0.4464993394980185