In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

from pathlib import Path
import pandas as pd
import pickle
import numpy as np
import shutil
from tqdm.notebook import tqdm

import torch.utils.data
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator, Engine
from ignite.metrics import Accuracy, Loss

from src.models import InsiderClassifier, LSTM_Encoder
from src.params import get_params
from src.dataset import CertDataset, create_data_loaders
from src.trainer import *

%load_ext autoreload
%autoreload 2

# TODO:

* Переместить деление на трейн-тест
* Считать точность и лосс только по непаддинговым токенам


In [7]:
# Locations used on a previous machine, kept for reference:
# output_dir = Path(r'C:\Users\Mvideo\Google Drive\Datasets\CERT_output')
# answers_dir = Path(r"C:/Users/Mvideo/Downloads/answers")

# Machine-specific data locations for this run.
output_dir = Path(r'C:\Users\admin\Google Drive\Datasets\CERT_output')
answers_dir = Path(r"C:\Users\admin\Google Drive\Datasets\CERT\answers")
main_answers_file = answers_dir / "insiders.csv"

# Fail fast when either data folder is missing on this machine.
for required_dir in (output_dir, answers_dir):
    assert required_dir.is_dir()

# Logs and checkpoints of a run live under sub-folders named after the run.
run_name = 'lstm/final2-nll'
log_dir = output_dir / 'logs' / run_name
checkpoint_dir = output_dir / 'checkpoints' / run_name

# An earlier revision asserted that these folders did not exist yet; the
# current behaviour is to wipe leftovers of a previous run with the same name.
for stale_dir in (log_dir, checkpoint_dir):
    if stale_dir.is_dir():
        shutil.rmtree(stale_dir)

In [8]:
# Build the action sequences and their targets from the aggregated pickle and
# the insiders answer file, forwarding min_length=50 and max_length=200.
actions, targets = CertDataset.prepare_dataset(
    output_dir / 'aggregated.pkl',
    main_answers_file,
    min_length=50,
    max_length=200,
)

# Train

TODO:
* Эксперименты с LR Scheduling

In [9]:
# Wrap the prepared sequences in the project's dataset class and build the
# train/validation loaders (validation_split=0.3, fixed random_seed so the
# split is the same on every run).
cert_dataset = CertDataset(actions, targets)
train_loader, val_loader = create_data_loaders(cert_dataset, validation_split=0.3, random_seed=0, batch_size=256)

# Project-wide hyper-parameters; the model cell reads params['model'][...].
params = get_params()

# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# machine without CUDA instead of failing when tensors are moved.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [10]:
# Model, loss and optimizer. NLLLoss expects log-probabilities as input.
# NOTE(review): the notebook's TODO says loss/accuracy should be computed on
# non-padding tokens only -- presumably padding is still included here; confirm.
lstm_encoder = LSTM_Encoder(params['model']['lstm_encoder'])
criterion = nn.NLLLoss()
optimizer = optim.Adam(lstm_encoder.parameters())

# Training engine: checkpoint_every / tensorboard_every are forwarded to the
# project helper together with the log and checkpoint locations.
train_engine = create_supervised_trainer_lstm(
    lstm_encoder, optimizer, criterion, device=device,
    prepare_batch=prepare_batch_lstm,
    log_dir=log_dir.as_posix(),
    checkpoint_dir=checkpoint_dir,
    checkpoint_every=500,
    tensorboard_every=10,
)

# Validation engine sharing the same model, criterion and log directory.
val_engine = create_supervised_evaluator_lstm(
    lstm_encoder, device=device,
    prepare_batch=prepare_batch_lstm,
    metrics={},
    criterion=criterion,
    log_dir=log_dir.as_posix(),
)


def _run_validation(epoch, banner):
    """Print ``banner``, tag ``val_engine`` with ``epoch`` and run it on ``val_loader``.

    Args:
        epoch: Training epoch stored on ``val_engine.train_epoch`` before the run.
        banner: Text printed before the validation pass starts.
    """
    print(banner)
    val_engine.train_epoch = epoch
    val_engine.run(val_loader)


# The two handlers previously shared one name (the second ``def`` shadowed the
# first in the namespace); distinct names keep both inspectable.
@train_engine.on(Events.STARTED)
def run_initial_validation(trainer):
    """Validate once before any training step, recorded as epoch 0."""
    _run_validation(0, 'Initial validation run:')


@train_engine.on(Events.EPOCH_COMPLETED)
def run_epoch_validation(trainer):
    """Validate after every finished training epoch.

    Args:
        trainer: The engine that fired the event; its ``state.epoch`` is used
            instead of reaching for the global ``train_engine``.
    """
    _run_validation(trainer.state.epoch, 'Validation run:')


In [11]:
# Train for 10 epochs. Handlers registered on train_engine for the STARTED and
# EPOCH_COMPLETED events run the validation engine, which is why validation
# output is interleaved with the training progress bars below.
# The value returned by run() is the last expression, so it is displayed as the cell output.
train_engine.run(train_loader, max_epochs=10)

Initial validation run:


HBox(children=(FloatProgress(value=0.0, max=130.0), HTML(value='')))


Validation Results - Avg loss: 4.208617, Accuracy: 0.000169


HBox(children=(FloatProgress(value=0.0, max=304.0), HTML(value='')))


Validation run:


HBox(children=(FloatProgress(value=0.0, max=130.0), HTML(value='')))


Validation Results - Avg loss: 0.276628, Accuracy: 0.943530


HBox(children=(FloatProgress(value=0.0, max=304.0), HTML(value='')))


Validation run:


HBox(children=(FloatProgress(value=0.0, max=130.0), HTML(value='')))


Validation Results - Avg loss: 0.191461, Accuracy: 0.958307


HBox(children=(FloatProgress(value=0.0, max=304.0), HTML(value='')))


Validation run:


HBox(children=(FloatProgress(value=0.0, max=130.0), HTML(value='')))


Validation Results - Avg loss: 0.158132, Accuracy: 0.965107


HBox(children=(FloatProgress(value=0.0, max=304.0), HTML(value='')))


Validation run:


HBox(children=(FloatProgress(value=0.0, max=130.0), HTML(value='')))


Validation Results - Avg loss: 0.144232, Accuracy: 0.967514


HBox(children=(FloatProgress(value=0.0, max=304.0), HTML(value='')))


Validation run:


HBox(children=(FloatProgress(value=0.0, max=130.0), HTML(value='')))


Validation Results - Avg loss: 0.137375, Accuracy: 0.968337


HBox(children=(FloatProgress(value=0.0, max=304.0), HTML(value='')))


Validation run:


HBox(children=(FloatProgress(value=0.0, max=130.0), HTML(value='')))


Validation Results - Avg loss: 0.132151, Accuracy: 0.968692


HBox(children=(FloatProgress(value=0.0, max=304.0), HTML(value='')))


Validation run:


HBox(children=(FloatProgress(value=0.0, max=130.0), HTML(value='')))


Validation Results - Avg loss: 0.128030, Accuracy: 0.968885


HBox(children=(FloatProgress(value=0.0, max=304.0), HTML(value='')))


Validation run:


HBox(children=(FloatProgress(value=0.0, max=130.0), HTML(value='')))


Validation Results - Avg loss: 0.123955, Accuracy: 0.969337


HBox(children=(FloatProgress(value=0.0, max=304.0), HTML(value='')))


Validation run:


HBox(children=(FloatProgress(value=0.0, max=130.0), HTML(value='')))


Validation Results - Avg loss: 0.120761, Accuracy: 0.970025


HBox(children=(FloatProgress(value=0.0, max=304.0), HTML(value='')))


Validation run:


HBox(children=(FloatProgress(value=0.0, max=130.0), HTML(value='')))


Validation Results - Avg loss: 0.118030, Accuracy: 0.970681


State:
	iteration: 3040
	epoch: 10
	epoch_length: 304
	max_epochs: 10
	output: <class 'dict'>
	batch: <class 'dict'>
	metrics: <class 'dict'>
	dataloader: <class 'torch.utils.data.dataloader.DataLoader'>
	seed: 12

# Prediction exploration

TODO:
- decode class numbers

In [12]:
# Take the first batch the validation loader yields, for manual inspection.
batch = next(iter(val_loader))

In [15]:
# Split the batch with the same prepare function that is passed to the engines.
# No device is given here; a later cell moves x to the GPU explicitly.
# NOTE(review): x is presumably the model input and y the per-step target -- confirm in src.trainer.
x, y = prepare_batch_lstm(batch)

In [19]:
# Use evaluation mode while exploring predictions: the model contains dropout
# layers (p=0.5), and in training mode they stay active, which makes the
# predictions inspected below change from one call to the next.
# eval() returns the module, so the cell still displays the model summary.
lstm_encoder.eval()

LSTM_Encoder(
  (lstm_encoder): LSTM(64, 40, num_layers=3, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5)
  (decoder): Linear(in_features=40, out_features=64, bias=True)
  (log_softmax): LogSoftmax()
)

In [38]:
# Target class index at every position of the batch element with index 2
# (argmax over the last axis of y).
torch.argmax(y, dim=2)[2]

tensor([54, 54, 54, 54, 54, 54, 54, 54, 62, 54, 54, 44,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0])

In [39]:
# Predicted class index at every position of the batch element with index 2,
# to compare against the targets shown above.
# - no_grad(): this is pure inference, so no autograd graph is built.
# - device: reuse the notebook-level setting instead of hard-coding 'cuda'.
# - the model emits log-probabilities; exp() is monotonic, so taking argmax
#   directly on them selects the same class without the extra pass.
with torch.no_grad():
    predicted_classes = lstm_encoder(x.to(device)).argmax(dim=2)
predicted_classes[2]

tensor([54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0], device='cuda:0')

In [37]:
# Distinct class indices the model predicts anywhere in this batch -- a quick
# check of how many of the output classes it actually uses.
# Inference only, so no autograd graph is built; the notebook-level device is
# reused instead of a hard-coded 'cuda'; argmax is taken on the
# log-probabilities directly because exp() does not change the ordering.
with torch.no_grad():
    batch_predictions = lstm_encoder(x.to(device)).argmax(dim=2)
batch_predictions.unique()

tensor([ 0, 38, 43, 44, 48, 54, 59, 60], device='cuda:0')