In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

from pathlib import Path
import pandas as pd
import pickle
import numpy as np
import shutil
# from tqdm import tqdm_notebook as tqdm

import torch.utils.data
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator, Engine
from ignite.metrics import Accuracy, Loss

from src.models import InsiderClassifier, LSTM_Encoder
from src.params import get_params
from src.dataset import CertDataset, create_data_loaders
from src.cnn_trainer import *

%load_ext autoreload
%autoreload 2

In [30]:
# output_dir = Path(r'C:\Users\Mvideo\Google Drive\Datasets\CERT_output')
# answers_dir = Path(r"C:/Users/Mvideo/Downloads/answers")

output_dir = Path(r'C:\Users\admin\Google Drive\Datasets\CERT_output')
answers_dir = Path(r"C:\Users\admin\Google Drive\Datasets\CERT\answers")
main_answers_file = answers_dir / "insiders.csv"

lstm_checkpoint = output_dir / 'checkpoints/lstm/final2-nll/final_model_3040.pth'
assert(lstm_checkpoint.is_file())

run_name = 'cnn/test6'
log_dir = output_dir / 'logs' / run_name
checkpoint_dir = output_dir / 'checkpoints' / run_name

# assert(not log_dir.is_dir())
# assert(not checkpoint_dir.is_dir())

if log_dir.is_dir():
    shutil.rmtree(log_dir)
if checkpoint_dir.is_dir():
    shutil.rmtree(checkpoint_dir)

In [3]:
actions, targets = CertDataset.prepare_dataset(output_dir / 'aggregated.pkl', main_answers_file, min_length=50, max_length=200)

In [4]:
cert_dataset = CertDataset(actions, targets)
train_loader, val_loader = create_data_loaders(cert_dataset, validation_split=0.3, random_seed=0, batch_size=128)

params = get_params()

device = 'cuda'

In [5]:
anomalous_num = targets.sum()
overall_num = targets.shape[0]
non_anomalous_num = overall_num - anomalous_num

weight_beta = 0.1
anomalous_weight = 1 + (non_anomalous_num - anomalous_num) / (weight_beta * non_anomalous_num)
anomalous_weight

In [35]:
weight = torch.tensor([1, anomalous_weight]).to(device).float()

tensor([ 1, 10], device='cuda:0')

In [38]:
model = InsiderClassifier(params['model'], lstm_checkpoint)
# criterion = nn.BCELoss()
criterion = nn.CrossEntropyLoss(weight=torch.tensor([1,10]).float().to(device))
optimizer = optim.Adam(model.parameters())

train_engine = create_supervised_trainer(
                                        model, optimizer, criterion, device=device,
                                        prepare_batch=prepare_batch,
                                        log_dir=(output_dir / 'logs' / 'cnn' / run_name).as_posix(),
                                        checkpoint_dir=output_dir / 'checkpoints' / 'cnn' / run_name
                                       )

val_engine = create_supervised_evaluator(
        model, device=device,
        prepare_batch=prepare_batch,
        metrics={},
        criterion=criterion,
        log_dir=log_dir.as_posix(),
)

@train_engine.on(Events.STARTED)
def log_training_results(trainer):
    print('Initial validation run:')
    val_engine.train_epoch = 0
    val_engine.run(val_loader)

@train_engine.on(Events.EPOCH_COMPLETED)
def log_training_results(trainer):
    print('Validation run:')
    val_engine.train_epoch = train_engine.state.epoch
    val_engine.run(val_loader)


In [None]:
train_engine.run(train_loader, max_epochs=3)

Initial validation run:


HBox(children=(FloatProgress(value=0.0, max=260.0), HTML(value='')))


Validation Results - Avg loss: 0.585796, Accuracy: 0.996334


HBox(children=(FloatProgress(value=0.0, max=607.0), HTML(value='')))


Validation run:


HBox(children=(FloatProgress(value=0.0, max=260.0), HTML(value='')))


Validation Results - Avg loss: 0.115570, Accuracy: 0.996334


HBox(children=(FloatProgress(value=0.0, max=607.0), HTML(value='')))

In [37]:
%debug

> [1;32mc:\anaconda3\envs\python3.6-torch\lib\site-packages\torch\nn\functional.py[0m(1871)[0;36mnll_loss[1;34m()[0m
[1;32m   1869 [1;33m                         .format(input.size(0), target.size(0)))
[0m[1;32m   1870 [1;33m    [1;32mif[0m [0mdim[0m [1;33m==[0m [1;36m2[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m-> 1871 [1;33m        [0mret[0m [1;33m=[0m [0mtorch[0m[1;33m.[0m[0m_C[0m[1;33m.[0m[0m_nn[0m[1;33m.[0m[0mnll_loss[0m[1;33m([0m[0minput[0m[1;33m,[0m [0mtarget[0m[1;33m,[0m [0mweight[0m[1;33m,[0m [0m_Reduction[0m[1;33m.[0m[0mget_enum[0m[1;33m([0m[0mreduction[0m[1;33m)[0m[1;33m,[0m [0mignore_index[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m   1872 [1;33m    [1;32melif[0m [0mdim[0m [1;33m==[0m [1;36m4[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m   1873 [1;33m        [0mret[0m [1;33m=[0m [0mtorch[0m[1;33m.[0m[0m_C[0m[1;33m.[0m[0m_nn[0m[1;33m.[0m[0mnll_loss2d[0m[1;

ipdb>  weight


tensor([ 1, 10], device='cuda:0')


ipdb>  exit


In [None]:
class_weights

# Prediction exploration

In [17]:
model.eval()

InsiderClassifier(
  (lstm_encoder): LSTM_Encoder(
    (lstm_encoder): LSTM(64, 40, num_layers=3, batch_first=True, dropout=0.5)
    (dropout): Dropout(p=0.5)
    (decoder): Linear(in_features=40, out_features=64, bias=True)
    (log_softmax): LogSoftmax()
  )
  (sigmoid): Sigmoid()
  (cnn_classifier): CNN_Classifier(
    (conv1): Conv2d(1, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (maxpool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (conv2): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (maxpool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (linear): Linear(in_features=32000, out_features=2, bias=True)
    (softmax): LogSoftmax()
  )
)

In [18]:
it = iter(val_loader)
for batch in it:
    x, y = prepare_batch(batch)
    if y.sum() > 0:
        break

In [19]:
ind = (y==1).nonzero()[0]
ind

tensor([61])

In [26]:
with torch.no_grad():
    res = model(x.to(device))[ind]
F.softmax(res)

  This is separate from the ipykernel package so we can avoid doing imports until


tensor([[0.9976, 0.0024]], device='cuda:0')