In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

from pathlib import Path
import pandas as pd
import pickle
import numpy as np
from tqdm import tqdm_notebook as tqdm

import torch.utils.data
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator, Engine
from ignite.metrics import Accuracy, Loss

from src.batcher import CertBatcher
from src.models import InsiderClassifier, LSTM_Encoder
from src.params import get_params

%load_ext autoreload
%autoreload 2

In [2]:
# output_dir = Path(r'C:\Users\Mvideo\Google Drive\Datasets\CERT_output')
# answers_dir = Path(r"C:/Users/Mvideo/Downloads/answers")

output_dir = Path(r'C:\Users\admin\Google Drive\Datasets\CERT_output')
answers_dir = Path(r"C:\Users\admin\Google Drive\Datasets\CERT\answers")

main_answers_file = answers_dir / "insiders.csv"

In [3]:
df = pd.read_pickle(output_dir / 'aggregated.pkl')
df = df.reset_index().dropna()
df.head()

Unnamed: 0,user,day,action_id
2,AAE0190,2010-01-04,"[61, 57, 54, 54, 54, 62, 62, 62, 54, 62, 62, 6..."
3,AAE0190,2010-01-05,"[61, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 5..."
4,AAE0190,2010-01-06,"[61, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 5..."
5,AAE0190,2010-01-07,"[61, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 5..."
6,AAE0190,2010-01-08,"[61, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 6..."


In [4]:
MIN_LENGTH = 50
MAX_LENGTH = 200

In [5]:
main_df = pd.read_csv(main_answers_file)
main_df = main_df[main_df['dataset'] == 4.2].drop(['dataset', 'details'], axis=1)

main_df['start'] = pd.to_datetime(main_df['start'], format='%m/%d/%Y %H:%M:%S')
main_df['end'] = pd.to_datetime(main_df['end'], format='%m/%d/%Y %H:%M:%S')

df = df.merge(main_df, left_on='user', right_on='user', how='left')
df['malicious'] = (df.day >= df.start) & (df.day <= df.end)
df = df.drop(['start', 'end', 'day', 'user'], axis=1)

df['action_length'] = df.action_id.apply(len)

df = df[df.action_length < MIN_LENGTH]

df['action_id'] = df.action_id.apply(lambda x: x[:MAX_LENGTH])
df['action_id'] = df.action_id.apply(lambda x: x + [0] * (MAX_LENGTH - len(x)))

In [6]:
df.head(20)

Unnamed: 0,action_id,scenario,malicious,action_length
346,"[61, 54, 54, 54, 54, 54, 54, 54, 54, 54, 62, 5...",2.0,False,34
347,"[61, 54, 54, 54, 54, 54, 54, 54, 54, 54, 58, 6...",2.0,False,39
348,"[61, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 5...",2.0,False,40
349,"[61, 54, 54, 54, 58, 48, 54, 59, 54, 54, 63, 5...",2.0,False,39
350,"[61, 54, 54, 54, 54, 54, 54, 54, 54, 54, 58, 5...",2.0,False,45
351,"[61, 54, 54, 54, 54, 54, 54, 54, 63, 54, 58, 5...",2.0,False,36
352,"[61, 54, 54, 54, 54, 63, 54, 54, 54, 58, 48, 5...",2.0,False,38
353,"[61, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 5...",2.0,False,37
354,"[61, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 5...",2.0,False,41
355,"[61, 62, 54, 54, 54, 54, 54, 54, 55, 54, 54, 5...",2.0,False,37


In [7]:
actions = np.vstack(df.action_id.values)
malicious = df.malicious.values

# Train

In [8]:
batcher = CertBatcher(actions, malicious)

params = get_params()

device = 'cuda'

In [9]:
from src.trainer import *
from ignite.metrics import Accuracy, Loss, RunningAverage, Metric
from ignite.contrib.handlers.tensorboard_logger import *
from ignite.contrib.handlers import ProgressBar

%load_ext autoreload
%autoreload 2

lstm_encoder = LSTM_Encoder(params['lstm_encoder'])
lstm_encoder.to(device)
lstm_encoder.train()

criterion = nn.BCELoss()

optimizer = optim.Adam(lstm_encoder.parameters())

# engine = create_supervised_trainer(lstm_encoder, optimizer, criterion, device='cuda',
#                                    metrics={'loss': Loss(criterion)})

engine = create_supervised_trainer(lstm_encoder, optimizer, criterion, device=device,
                                  prepare_batch=prepare_batch_lstm)

RunningAverage(output_transform=lambda x: x).attach(engine, 'average_loss')

pbar = ProgressBar(persist=True)
pbar.attach(engine, ['average_loss'])

log_dir = 'output/'
tb_logger = TensorboardLogger(log_dir=log_dir)

tb_logger.attach(
    engine,
    log_handler=OutputHandler(
        tag="training", output_transform=lambda loss: {"batchloss": loss}, metric_names="all"
    ),
    event_name=Events.ITERATION_COMPLETED(every=1),
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload




In [None]:
# @engine.on(Events.ITERATION_COMPLETED(every=50))
# def log_training_loss_every_50_iterations(engine):
#     print(f"{engine.state.epoch} / {engine.state.max_epochs} : {engine.state.iteration} - loss: {engine.state.output:.6f}")

# @trainer.on(Events.EPOCH_COMPLETED)
# def log_training_results(trainer):
#     evaluator.run(train_loader)
#     metrics = evaluator.state.metrics
#     print("Training Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
#           .format(trainer.state.epoch, metrics['accuracy'], metrics['nll']))

# @trainer.on(Events.EPOCH_COMPLETED)
# def log_validation_results(trainer):
#     evaluator.run(val_loader)
#     metrics = evaluator.state.metrics
#     print("Validation Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
#           .format(trainer.state.epoch, metrics['accuracy'], metrics['nll']))

engine.run(batcher, max_epochs=2)

HBox(children=(IntProgress(value=0, max=6932), HTML(value='')))