<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Arch" data-toc-modified-id="Arch-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Arch</a></span></li><li><span><a href="#Embedding-dim" data-toc-modified-id="Embedding-dim-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Embedding dim</a></span></li><li><span><a href="#Cat-embedding-dim" data-toc-modified-id="Cat-embedding-dim-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Cat embedding dim</a></span></li><li><span><a href="#Num-observations" data-toc-modified-id="Num-observations-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Num observations</a></span></li></ul></div>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default theme first, then switch to the "whitegrid" style
# with the 'axes.grid' setting overridden to False.
sns.set()

sns.set_style("whitegrid", {'axes.grid' : False})

from tqdm.auto import tqdm

In [2]:
# Load IPython's autoreload extension in mode 2 so that edits to the local
# `code.*` modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import torch
# Cell output: whether PyTorch can see a CUDA device on this machine.
torch.cuda.is_available()

True

In [4]:
# Run on the first CUDA device when one is visible, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Echo the selection so the cell output records which device was used.
print(device)

cuda:0


In [5]:
from pytorch_metric_learning import losses, miners, distances, reducers, samplers
import torch.optim as optim
from sklearn.model_selection import train_test_split
import torch.nn as nn
import random
import lightgbm as lgb

In [6]:
from code.dataloader import AgeGroupMLDataset, AgeGroupClfDataset
from code.encoder_gru import Encoder
from code.decoder import Decoder
from code.classifier import Classifier
from code.utils import train_ml_model, train_classifier

In [7]:
# --- batching / sub-sequence sampling ---
BATCH_SIZE = 64  # BATCH_SIZE unique persons
NUM_OF_SUBSEQUENCES, SUBSEQUENCE_LENGTH = 5, 90

# --- optimisation ---
LR = 0.002
NUM_EPOCHS = 50

# --- model dimensions ---
EMBEDDING_DIM = 256        # size of the sequence embedding
num_input_dim = 4          # numerical input width passed to Encoder/Classifier
cat_vocab_sizes = [204]    # one vocabulary size per categorical input
cat_embedding_dim = 102    # embedding width for categorical inputs

# --- data ---
NUM_OBS = 30000            # forwarded to AgeGroupMLDataset(num_observations=...)

In [8]:
# (name, recurrent module) pairs compared in the "Arch" experiment. Each module
# takes the numerical features concatenated with the categorical embedding as
# input and produces an EMBEDDING_DIM-wide hidden state.
# NOTE(review): the modules are instantiated once here, so any cell that plugs
# them into an encoder and trains it mutates these very objects.
arches = tuple(
    (
        name,
        rnn_cls(
            num_input_dim + cat_embedding_dim,
            EMBEDDING_DIM,
            batch_first=False,
        ),
    )
    for name, rnn_cls in (('GRU', nn.GRU), ('LSTM', nn.LSTM))
)

In [9]:
# Dataset used for metric-learning training of the encoder.
# NOTE(review): `num_observations` presumably caps how many observations are
# loaded — confirm in code.dataloader.
dataset = AgeGroupMLDataset(num_observations=NUM_OBS)

In [10]:
# Project-defined loader step; presumably builds or loads the client -> indices
# mapping the dataset needs before it can be iterated — TODO confirm.
dataset.load_client_to_indices()

In [11]:
# Dataset used to train and validate the classifier on top of the encoder
# (constructed with its default arguments).
clfdataset = AgeGroupClfDataset()

In [12]:
# Same project-defined loader step as for the metric-learning dataset —
# presumably prepares the client -> indices mapping; TODO confirm.
clfdataset.load_client_to_indices()

In [13]:
# Seed every random number generator used in this notebook (Python's `random`,
# PyTorch, NumPy) with 0 so that runs are repeatable.
random.seed(0)
torch.manual_seed(0)
np.random.seed(0)

In [14]:
# Loader for metric-learning training: batches of BATCH_SIZE items drawn from
# `dataset` in order, loaded in the main process (num_workers=0).
dataloader = torch.utils.data.DataLoader(
    dataset, batch_size=BATCH_SIZE,
    num_workers=0,
)

# Labels of the classification dataset; they drive the stratified split below
# and are read again by later cells. (The former `targets = dataset.targets`
# assignment was overwritten here before ever being read, so it was dropped.)
targets = clfdataset.targets

# Stratified 70/30 split over positional indices into `clfdataset`.
train_idx, test_idx = train_test_split(
    np.arange(len(targets)),
    test_size=0.3,
    shuffle=True,
    stratify=targets,
    random_state=228,
)

# Each sampler visits only its own subset of indices, in random order.
train_sampler = torch.utils.data.SubsetRandomSampler(train_idx)
test_sampler = torch.utils.data.SubsetRandomSampler(test_idx)

trainloader = torch.utils.data.DataLoader(
    clfdataset, batch_size=BATCH_SIZE,
    sampler=train_sampler)
# Held-out part of the split; passed to train_classifier as validation data.
testloader = torch.utils.data.DataLoader(
    clfdataset, batch_size=BATCH_SIZE,
    sampler=test_sampler)

## Arch

In [None]:
# Architecture comparison: for each (name, recurrent module) pair in `arches`,
# train the encoder with a triplet loss, then fit the classifier decoder on top
# of the frozen encoder. Both learning-curve figures are saved under plots/.
for (arch, cell) in tqdm(arches):
    print(arch)
    LR = 0.002

    # --- stage 1: metric-learning training of the encoder ---
    encoder = Encoder(
        numerical_input_dim=num_input_dim,
        cat_vocab_sizes=cat_vocab_sizes,
        cat_embedding_dim=cat_embedding_dim,
        embedding_dim=EMBEDDING_DIM,
    )
    # Swap in the recurrent module under test.
    # NOTE(review): `cell` is the very object stored in `arches`; re-running
    # this cell without rebuilding `arches` resumes from trained weights.
    encoder.sequence_encoder = cell
    encoder.to(device)
    encoder.train()  # explicit, as in the other experiment loops
    optimizer = optim.Adam(encoder.parameters(), lr=LR)

    distance = distances.CosineSimilarity()
    reducer = reducers.ThresholdReducer(low = 0) # basically, returns average
    loss_func = losses.TripletMarginLoss(margin = 0.4, distance = distance, reducer = reducer)
    mining_func = miners.TripletMarginMiner(margin = 0.4, distance = distance, type_of_triplets = "semihard")

    train_losses = train_ml_model(
        encoder, NUM_EPOCHS, dataloader, NUM_OF_SUBSEQUENCES,
        mining_func, loss_func, optimizer)

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(train_losses, label='train')
    ax.set_xlabel('iter')
    ax.set_ylabel('loss')
    fig.savefig(f'plots/ML_{arch}_{EMBEDDING_DIM}_{NUM_OBS}_{NUM_EPOCHS}.png')
    # Show the figure now and release it, so that open figures do not pile up
    # across iterations.
    plt.show()
    plt.close(fig)

    # --- stage 2: train the classifier decoder on the frozen encoder ---
    SCHEDULER_EPOCHS = 2

    classifier = Classifier(
        numerical_input_dim=num_input_dim,
        cat_vocab_sizes=cat_vocab_sizes,
        cat_embedding_dim=cat_embedding_dim,
        embedding_dim=EMBEDDING_DIM
    )
    classifier.encoder = encoder
    classifier.freeze_encoder()
    classifier.to(device)

    # Only the decoder's parameters are handed to the optimizer.
    optimizer = optim.Adam(classifier.decoder.parameters(), lr=LR)
    criterion = nn.CrossEntropyLoss()
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        patience=SCHEDULER_EPOCHS,
    )

    train_losses, train_accuracy, val_losses, val_accuracy = train_classifier(
        classifier, NUM_EPOCHS, trainloader, testloader,
        optimizer, criterion, scheduler,
        enable_train_mode = lambda: classifier.decoder.train(),
        enable_test_mode = lambda: classifier.decoder.eval(),
    )

    # Create both panels up front. Calling plt.subplot(1, 2, k) on a figure that
    # already holds the full-size Axes from plt.subplots() leaves that empty
    # Axes underneath the panels on Matplotlib versions that no longer
    # auto-remove overlapping Axes.
    fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(12, 6))

    ax_loss.plot(train_losses, label='train')
    ax_loss.plot(val_losses, label='validation')
    ax_loss.set_xlabel('iter')
    ax_loss.set_ylabel('loss')
    ax_loss.legend()

    ax_acc.plot(train_accuracy, label='train')
    ax_acc.plot(val_accuracy, label='validation')
    ax_acc.set_xlabel('iter')
    ax_acc.set_ylabel('accuracy')
    ax_acc.legend()

    fig.savefig(f'plots/clfdec_{arch}_{EMBEDDING_DIM}_{NUM_OBS}_{NUM_EPOCHS}.png')
    plt.show()
    plt.close(fig)

In [None]:
# Collect unreachable Python objects, then ask PyTorch to release the CUDA
# memory it is caching but no longer using, before the next experiment.
import gc
gc.collect()
torch.cuda.empty_cache() 

## Embedding dim

In [None]:
# Embedding-size sweep: for every size in `dims`, train a fresh encoder with a
# triplet loss, fit the classifier decoder on the frozen encoder, and record
# the last validation accuracy in `accs` (one entry per size, same order).
arch = 'GRU'

dims = [32, 64, 128, 256, 512, 1024]
accs = []

# The loop variable is deliberately NOT the global EMBEDDING_DIM: iterating
# over the config constant itself would leave it at the last swept value for
# every later cell.
for embedding_dim in tqdm(dims):
    print(embedding_dim)
    LR = 0.002

    # --- stage 1: metric-learning training of the encoder ---
    encoder = Encoder(
        numerical_input_dim=num_input_dim,
        cat_vocab_sizes=cat_vocab_sizes,
        cat_embedding_dim=cat_embedding_dim,
        embedding_dim=embedding_dim,
    )
    encoder.to(device)
    encoder.train()
    optimizer = optim.Adam(encoder.parameters(), lr=LR)

    distance = distances.CosineSimilarity()
    reducer = reducers.ThresholdReducer(low = 0) # basically, returns average
    loss_func = losses.TripletMarginLoss(margin = 0.4, distance = distance, reducer = reducer)
    mining_func = miners.TripletMarginMiner(margin = 0.4, distance = distance, type_of_triplets = "semihard")

    train_losses = train_ml_model(
        encoder, NUM_EPOCHS, dataloader, NUM_OF_SUBSEQUENCES,
        mining_func, loss_func, optimizer)

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(train_losses, label='train')
    ax.set_xlabel('iter')
    ax.set_ylabel('loss')
    fig.savefig(f'plots/ML_{arch}_{embedding_dim}_{NUM_OBS}_{NUM_EPOCHS}.png')
    # Show the figure now and release it, so that open figures do not pile up
    # across iterations.
    plt.show()
    plt.close(fig)

    # --- stage 2: train the classifier decoder on the frozen encoder ---
    SCHEDULER_EPOCHS = 2

    classifier = Classifier(
        numerical_input_dim=num_input_dim,
        cat_vocab_sizes=cat_vocab_sizes,
        cat_embedding_dim=cat_embedding_dim,
        embedding_dim=embedding_dim
    )
    classifier.encoder = encoder
    classifier.freeze_encoder()
    classifier.to(device)

    # Only the decoder's parameters are handed to the optimizer.
    optimizer = optim.Adam(classifier.decoder.parameters(), lr=LR)
    criterion = nn.CrossEntropyLoss()
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        patience=SCHEDULER_EPOCHS,
    )

    train_losses, train_accuracy, val_losses, val_accuracy = train_classifier(
        classifier, NUM_EPOCHS, trainloader, testloader,
        optimizer, criterion, scheduler,
        enable_train_mode = lambda: classifier.decoder.train(),
        enable_test_mode = lambda: classifier.decoder.eval(),
    )

    # Create both panels up front. Calling plt.subplot(1, 2, k) on a figure that
    # already holds the full-size Axes from plt.subplots() leaves that empty
    # Axes underneath the panels on Matplotlib versions that no longer
    # auto-remove overlapping Axes.
    fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(12, 6))

    ax_loss.plot(train_losses, label='train')
    ax_loss.plot(val_losses, label='validation')
    ax_loss.set_xlabel('iter')
    ax_loss.set_ylabel('loss')
    ax_loss.legend()

    ax_acc.plot(train_accuracy, label='train')
    ax_acc.plot(val_accuracy, label='validation')
    ax_acc.set_xlabel('iter')
    ax_acc.set_ylabel('accuracy')
    ax_acc.legend()

    fig.savefig(f'plots/clfdec_{arch}_{embedding_dim}_{NUM_OBS}_{NUM_EPOCHS}.png')
    plt.show()
    plt.close(fig)

    # Score for this size = validation accuracy after the final epoch.
    accs.append(val_accuracy[-1])

In [None]:
# Final validation accuracy as a function of the embedding size; drawn on the
# current Axes and written to plots/.
ax = plt.gca()
ax.plot(dims, accs)
ax.set_xlabel('Embedding dimension')
ax.set_ylabel('accuracy')
plt.savefig(f'plots/clfdec_{arch}_embedding_to_acc_{NUM_OBS}_{NUM_EPOCHS}.png')

## Cat embedding dim

In [None]:
# Categorical-embedding-size sweep: for every size in `dims`, train a fresh
# encoder with a triplet loss, fit the classifier decoder on the frozen
# encoder, and record the last validation accuracy in `accs`.
arch = 'GRU'

EMBEDDING_DIM = 256
NUM_EPOCHS=20
dims = [20, 40, 60, 80, 100]
accs = []

# The loop variable is deliberately NOT the global `cat_embedding_dim`:
# iterating over the config value itself would leave it at 100 (instead of the
# configured value) for every later experiment.
for cat_dim in tqdm(dims):
    print(cat_dim)
    LR = 0.002

    # --- stage 1: metric-learning training of the encoder ---
    encoder = Encoder(
        numerical_input_dim=num_input_dim,
        cat_vocab_sizes=cat_vocab_sizes,
        cat_embedding_dim=cat_dim,
        embedding_dim=EMBEDDING_DIM,
    )
    encoder.to(device)
    encoder.train()
    optimizer = optim.Adam(encoder.parameters(), lr=LR)

    distance = distances.CosineSimilarity()
    reducer = reducers.ThresholdReducer(low = 0) # basically, returns average
    loss_func = losses.TripletMarginLoss(margin = 0.4, distance = distance, reducer = reducer)
    mining_func = miners.TripletMarginMiner(margin = 0.4, distance = distance, type_of_triplets = "semihard")

    train_losses = train_ml_model(
        encoder, NUM_EPOCHS, dataloader, NUM_OF_SUBSEQUENCES,
        mining_func, loss_func, optimizer)

    # This sweep only displays its figures (titled with the swept value); it
    # does not write them to disk.
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(train_losses, label='train')
    ax.set_xlabel('iter')
    ax.set_ylabel('loss')
    ax.set_title(f'{cat_dim}')
    # Show the figure now and release it, so that open figures do not pile up
    # across iterations.
    plt.show()
    plt.close(fig)

    # --- stage 2: train the classifier decoder on the frozen encoder ---
    SCHEDULER_EPOCHS = 2

    classifier = Classifier(
        numerical_input_dim=num_input_dim,
        cat_vocab_sizes=cat_vocab_sizes,
        cat_embedding_dim=cat_dim,
        embedding_dim=EMBEDDING_DIM
    )
    classifier.encoder = encoder
    classifier.freeze_encoder()
    classifier.to(device)

    # Only the decoder's parameters are handed to the optimizer.
    optimizer = optim.Adam(classifier.decoder.parameters(), lr=LR)
    criterion = nn.CrossEntropyLoss()
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        patience=SCHEDULER_EPOCHS,
    )

    train_losses, train_accuracy, val_losses, val_accuracy = train_classifier(
        classifier, NUM_EPOCHS, trainloader, testloader,
        optimizer, criterion, scheduler,
        enable_train_mode = lambda: classifier.decoder.train(),
        enable_test_mode = lambda: classifier.decoder.eval(),
    )

    # Create both panels up front. Calling plt.subplot(1, 2, k) on a figure that
    # already holds the full-size Axes from plt.subplots() leaves that empty
    # Axes underneath the panels on Matplotlib versions that no longer
    # auto-remove overlapping Axes.
    fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(12, 6))

    ax_loss.plot(train_losses, label='train')
    ax_loss.plot(val_losses, label='validation')
    ax_loss.set_xlabel('iter')
    ax_loss.set_ylabel('loss')
    ax_loss.legend()

    ax_acc.plot(train_accuracy, label='train')
    ax_acc.plot(val_accuracy, label='validation')
    ax_acc.set_xlabel('iter')
    ax_acc.set_ylabel('accuracy')
    ax_acc.legend()

    plt.show()
    plt.close(fig)

    # Score for this size = validation accuracy after the final epoch.
    accs.append(val_accuracy[-1])

  0%|          | 0/5 [00:00<?, ?it/s]

20


  0%|          | 0/20 [00:00<?, ?it/s]

In [None]:
# Final validation accuracy as a function of the categorical embedding size;
# drawn on the current Axes and written to plots/.
ax = plt.gca()
ax.plot(dims, accs)
ax.set_xlabel('categorical embedding dimension')
ax.set_ylabel('accuracy')
plt.savefig(f'plots/clfdec_{arch}_cat_embedding_to_acc_{NUM_OBS}_{NUM_EPOCHS}.png')

In [None]:
# Rebuild both datasets from scratch. Unlike the first construction, the
# metric-learning dataset is created without `num_observations`, i.e. with the
# class default — presumably the full data, TODO confirm. The next experiment
# truncates `.targets` of both objects in place, so it needs fresh copies.
dataset = AgeGroupMLDataset()
dataset.load_client_to_indices()
clfdataset = AgeGroupClfDataset()
clfdataset.load_client_to_indices()

## Num observations

In [None]:
# Training-set-size sweep: for each (NUM_OBS, NUM_EPOCHS) pair, truncate both
# datasets to the first NUM_OBS targets, rebuild the loaders and the split,
# train a fresh encoder plus classifier decoder, and record the last validation
# accuracy in `accs`.
import gc

arch = 'GRU'

EMBEDDING_DIM = 256
nums_epochs = [300, 200, 200, 100, 100, 75, 50]
nums_obs = [300, 600, 1300, 2700, 5400, 10800, 21600]
# Largest subset first: the targets are truncated IN PLACE below, so every
# later (smaller) subset must be a prefix of the previous one.
nums_epochs = nums_epochs[::-1]
nums_obs = nums_obs[::-1]
accs = []

# zip() has no length, so the total is passed explicitly; without it the
# progress bar cannot show how many runs remain.
for NUM_OBS, NUM_EPOCHS in tqdm(zip(nums_obs, nums_epochs), total=len(nums_obs)):
    # Release memory held by the previous iteration's models.
    gc.collect()
    torch.cuda.empty_cache()
    print(NUM_OBS, NUM_EPOCHS)

    # NOTE(review): in-place truncation makes this cell non-idempotent — the
    # datasets must be reloaded before it is re-run.
    dataset.targets = dataset.targets[:NUM_OBS]
    clfdataset.targets = clfdataset.targets[:NUM_OBS]

    # Same seeds for every subset size so the runs are comparable.
    torch.manual_seed(0)
    random.seed(0)
    np.random.seed(0)

    dataloader = torch.utils.data.DataLoader(
        dataset, batch_size=BATCH_SIZE,
        num_workers=0,
    )

    # Stratified 70/30 split over the truncated classification targets.
    train_idx, test_idx = train_test_split(
        np.arange(len(clfdataset.targets)),
        test_size=0.3,
        shuffle=True,
        stratify=clfdataset.targets,
        random_state=228,
    )

    train_sampler = torch.utils.data.SubsetRandomSampler(train_idx)
    test_sampler = torch.utils.data.SubsetRandomSampler(test_idx)

    trainloader = torch.utils.data.DataLoader(
        clfdataset, batch_size=BATCH_SIZE,
        sampler=train_sampler)
    testloader = torch.utils.data.DataLoader(
        clfdataset, batch_size=BATCH_SIZE,
        sampler=test_sampler)

    LR = 0.002

    # --- stage 1: metric-learning training of the encoder ---
    encoder = Encoder(
        numerical_input_dim=num_input_dim,
        cat_vocab_sizes=cat_vocab_sizes,
        cat_embedding_dim=cat_embedding_dim,
        embedding_dim=EMBEDDING_DIM,
    )
    encoder.to(device)
    encoder.train()
    optimizer = optim.Adam(encoder.parameters(), lr=LR)

    distance = distances.CosineSimilarity()
    reducer = reducers.ThresholdReducer(low = 0) # basically, returns average
    loss_func = losses.TripletMarginLoss(margin = 0.4, distance = distance, reducer = reducer)
    mining_func = miners.TripletMarginMiner(margin = 0.4, distance = distance, type_of_triplets = "semihard")

    train_losses = train_ml_model(
        encoder, NUM_EPOCHS, dataloader, NUM_OF_SUBSEQUENCES,
        mining_func, loss_func, optimizer)

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(train_losses, label='train')
    ax.set_xlabel('iter')
    ax.set_ylabel('loss')
    fig.savefig(f'plots/ML_{arch}_{EMBEDDING_DIM}_{NUM_OBS}_{NUM_EPOCHS}.png')
    # Show the figure now and release it, so that open figures do not pile up
    # across iterations.
    plt.show()
    plt.close(fig)

    # --- stage 2: train the classifier decoder on the frozen encoder ---
    SCHEDULER_EPOCHS = 2

    classifier = Classifier(
        numerical_input_dim=num_input_dim,
        cat_vocab_sizes=cat_vocab_sizes,
        cat_embedding_dim=cat_embedding_dim,
        embedding_dim=EMBEDDING_DIM
    )
    classifier.encoder = encoder
    classifier.freeze_encoder()
    classifier.to(device)

    # Only the decoder's parameters are handed to the optimizer.
    optimizer = optim.Adam(classifier.decoder.parameters(), lr=LR)
    criterion = nn.CrossEntropyLoss()
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        patience=SCHEDULER_EPOCHS,
    )

    train_losses, train_accuracy, val_losses, val_accuracy = train_classifier(
        classifier, NUM_EPOCHS, trainloader, testloader,
        optimizer, criterion, scheduler,
        enable_train_mode = lambda: classifier.decoder.train(),
        enable_test_mode = lambda: classifier.decoder.eval(),
    )

    # Create both panels up front. Calling plt.subplot(1, 2, k) on a figure that
    # already holds the full-size Axes from plt.subplots() leaves that empty
    # Axes underneath the panels on Matplotlib versions that no longer
    # auto-remove overlapping Axes.
    fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(12, 6))

    ax_loss.plot(train_losses, label='train')
    ax_loss.plot(val_losses, label='validation')
    ax_loss.set_xlabel('iter')
    ax_loss.set_ylabel('loss')
    ax_loss.legend()

    ax_acc.plot(train_accuracy, label='train')
    ax_acc.plot(val_accuracy, label='validation')
    ax_acc.set_xlabel('iter')
    ax_acc.set_ylabel('accuracy')
    ax_acc.legend()

    fig.savefig(f'plots/clfdec_{arch}_{EMBEDDING_DIM}_{NUM_OBS}_{NUM_EPOCHS}.png')
    plt.show()
    plt.close(fig)

    # Score for this subset size = validation accuracy after the final epoch.
    accs.append(val_accuracy[-1])

In [None]:
# Embedding sizes 2**5 .. 2**11 and one hard-coded accuracy per size.
# NOTE(review): the accuracies are literals, presumably copied from an earlier
# run of the embedding-size sweep (which itself stops at 1024) — confirm where
# the 2048 value comes from.
dims = [2 ** exponent for exponent in range(5, 12)]

accs = [
    0.5453333258628845,   # 32
    0.558555543422699,    # 64
    0.558222234249115,    # 128
    0.5707777738571167,   # 256
    0.5681111216545105,   # 512
    0.5681111216545105,   # 1024
    0.45866668224334717,  # 2048
]

In [None]:
# Accuracy vs. embedding size on a log2 x-axis with one tick per swept size.
# NOTE(review): the file name embeds whatever NUM_OBS / NUM_EPOCHS currently
# hold, which need not be the settings that produced `accs` — confirm.
ax = plt.gca()
ax.plot(dims, accs)
ax.set_xlabel('Embedding dimension')
ax.set_xscale('log', base=2)
ax.set_xticks(dims)
ax.set_ylabel('accuracy')
plt.savefig(f'plots/clfdec_{arch}_embedding_to_acc_{NUM_OBS}_{NUM_EPOCHS}.png')

In [None]:
# Architecture tag interpolated into the plot file names below.
arch = 'GRU'

In [None]:
# Switch to the plain "whitegrid" style for the following plots; unlike the
# setup at the top of the notebook, no 'axes.grid' override is passed here.
sns.set_style("whitegrid")

In [None]:
# Accuracy vs. embedding size: line plus point markers, log2 x-axis with one
# tick per swept size.
# NOTE(review): this writes to the same file name as the previous plotting
# cell and therefore overwrites its output.
ax = plt.gca()
ax.plot(dims, accs)
ax.scatter(dims, accs)
ax.set_xlabel('Embedding size')
ax.set_xscale('log', base=2)
ax.set_xticks(dims)
ax.set_ylabel('accuracy')
plt.savefig(f'plots/clfdec_{arch}_embedding_to_acc_{NUM_OBS}_{NUM_EPOCHS}.png')

In [None]:
# Put the encoder into evaluation mode before extracting embeddings. `encoder`
# is whichever model the most recently executed training loop left bound; the
# trailing ';' suppresses the module repr in the cell output.
encoder.eval();

In [None]:
def _embed_in_index_order(indices):
    """Embed `clfdataset[indices]` with `encoder`, keeping the order of `indices`.

    Row i of the result is the embedding of sample `indices[i]`, so labels
    looked up as `targets[indices]` line up with the rows. Iterating the
    `SubsetRandomSampler` loaders instead would visit the samples in a random
    order and break that pairing.

    Returns a tensor on `device`; its first dimension is `len(indices)` and its
    second is assumed to be EMBEDDING_DIM (the encoder's output width).
    """
    ordered_loader = torch.utils.data.DataLoader(
        torch.utils.data.Subset(clfdataset, indices),
        batch_size=BATCH_SIZE,
        shuffle=False,
    )
    # Start with an empty (0, EMBEDDING_DIM) chunk so an empty index set still
    # yields a well-formed tensor, then concatenate once at the end instead of
    # re-allocating the growing matrix on every batch.
    chunks = [torch.zeros((0, EMBEDDING_DIM), device=device)]
    with torch.no_grad():
        for (sequences, _labels) in ordered_loader:
            # Each batch carries the numerical and the categorical inputs.
            n, c = sequences[0].to(device), sequences[1].to(device)
            chunks.append(encoder(n, c))
    return torch.cat(chunks)


embeddings_train = _embed_in_index_order(train_idx)
# The test embeddings were previously concatenated into a stray `embeddings`
# variable, which left `embeddings_test` empty.
embeddings_test = _embed_in_index_order(test_idx)

In [None]:
# Bring both embedding matrices back to host memory for the CPU-side models.
embeddings_train, embeddings_test = embeddings_train.cpu(), embeddings_test.cpu()

In [None]:
# Features are the encoder embeddings; labels are looked up by split index.
# NOTE(review): this pairing is only valid if row i of embeddings_train /
# embeddings_test was computed for sample train_idx[i] / test_idx[i], and if
# `targets` is still the label array those indices were drawn from — confirm
# both against the cells that produced them.
X_train, X_test = embeddings_train, embeddings_test
y_train, y_test = targets[train_idx], targets[test_idx]

In [None]:
# LightGBM takes array-like inputs (NumPy, pandas, ...), not torch tensors, so
# convert explicitly. np.asarray works both for a CPU tensor and for an array.
train_data = lgb.Dataset(np.asarray(X_train), label=np.asarray(y_train))

In [None]:
from catboost import CatBoostClassifier, Pool
from catboost import cv
from sklearn.model_selection import train_test_split

In [None]:
# Pool the embeddings and labels prepared above into one feature matrix and
# one label vector, then re-split them (stratified, 33% held out) for CatBoost.
# NOTE(review): `X` and `y` were not defined anywhere in this notebook, so the
# split raised NameError on a fresh kernel. Rebuilding them from the existing
# train/test arrays is an assumption about the intended data — confirm.
X = np.concatenate((np.asarray(X_train), np.asarray(X_test)))
y = np.concatenate((np.asarray(y_train), np.asarray(y_test)))

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=228, stratify=y)

train_pool = Pool(X_train, y_train)
test_pool = Pool(X_test, y_test)

In [None]:
# Gradient-boosted classifier on top of the embeddings: 500 shallow (depth 3)
# trees, accuracy as the tracked metric, balanced class weights, fixed seed.
# NOTE(review): learning_rate=0.5 is unusually large — confirm it is intended.
model = CatBoostClassifier(
    # model capacity
    depth=3,
    iterations=500,
    learning_rate=0.5,
    # class imbalance handling and evaluation
    auto_class_weights='Balanced',
    eval_metric='Accuracy',
    # reproducibility / logging
    random_state=228,
    verbose=True,
)

In [None]:
# Fit on the training pool and evaluate on the held-out pool during training;
# the per-iteration metrics are read back from `model.evals_result_` below.
model.fit(train_pool, eval_set=test_pool)

In [None]:
# Per-iteration accuracy recorded by CatBoost for the training ('learn') and
# evaluation ('validation') pools.
history = model.evals_result_
train_acc = history['learn']['Accuracy']
valid_acc = history['validation']['Accuracy']

ax = plt.gca()
ax.plot(train_acc, label='train')
ax.plot(valid_acc, label='valid')
ax.set_xlabel('iter')
ax.set_ylabel('accuracy')
ax.legend();

In [None]:
# Hold-out accuracy: the fraction of predictions that equal the true label.
np.mean(np.squeeze(model.predict(X_test)) == y_test)