In [1]:
import os
import pickle
import shutil
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import OneHotEncoder
from torch.nn import functional as F
from tqdm import tqdm

from src.dataset import CertDataset, create_data_loaders
from src.models import InsiderClassifier, LSTM_Encoder
from src.params import get_params

In [2]:
# Dataset locations. The defaults are the machine-specific paths this notebook
# was originally run with; set the CERT_OUTPUT_DIR / CERT_ANSWERS_DIR
# environment variables to run it elsewhere without editing this cell.
output_dir = Path(os.environ.get(
    'CERT_OUTPUT_DIR', r'C:\Users\admin\Google Drive\Datasets\CERT_output'))
answers_dir = Path(os.environ.get(
    'CERT_ANSWERS_DIR', r"C:\Users\admin\Google Drive\Datasets\CERT\answers"))
main_answers_file = answers_dir / "insiders.csv"

# Weights of the pretrained LSTM encoder. Fail early with a readable message
# instead of a bare AssertionError if the checkpoint is missing.
lstm_checkpoint = output_dir / 'checkpoints/lstm/final2-nll/final_model_3040.pth'
if not lstm_checkpoint.is_file():
    raise FileNotFoundError('LSTM checkpoint not found: {}'.format(lstm_checkpoint))

In [3]:
# Build `actions` and `targets` from the aggregated pickle and the answers file,
# passing 50 and 200 as the length bounds to the project helper.
aggregated_file = output_dir / 'aggregated.pkl'
actions, targets = CertDataset.prepare_dataset(
    aggregated_file,
    main_answers_file,
    min_length=50,
    max_length=200,
)

# Подготовка LSTM-векторов

In [4]:
# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch = 256  # batch size handed to the data loaders below

cert_dataset = CertDataset(actions, targets)
# 30% of the data is held out for validation; the seed is fixed at 0,
# presumably so that the split is reproducible between runs.
train_loader, val_loader = create_data_loaders(
    cert_dataset, validation_split=0.3, random_seed=0, batch_size=batch
)

params = get_params()

In [5]:
lstm_encoder = LSTM_Encoder(params['model']['lstm_encoder'])

# Use the GPU when one is present, otherwise fall back to the CPU so that
# `map_location` below is always a usable device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# The encoder is only used to produce features here. Assigning
# `module.requires_grad = False` merely creates a plain attribute on the
# module and freezes nothing, so disable gradients on every parameter.
for encoder_param in lstm_encoder.parameters():
    encoder_param.requires_grad_(False)
lstm_encoder.eval()
# Kept as the last expression so the cell still displays the key-matching report.
lstm_encoder.load_state_dict(
    torch.load(lstm_checkpoint, map_location=torch.device(device)), strict=True
)

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [6]:
def prepare_batch(batch, device=None, non_blocking=None, num_classes=64, train=True):
    """One-hot encode the action indices of a batch.

    Args:
        batch: mapping with an 'actions' tensor of integer class indices and a
            'targets' entry.
        device: optional device to move the actions to before encoding;
            ``None`` leaves the tensor on its current device.
        non_blocking: forwarded to ``Tensor.to`` when ``device`` is given;
            ``None`` is treated as ``False``.
        num_classes: size of the one-hot dimension (previously ignored in
            favour of a hard-coded 64).
        train: unused; kept so the signature stays unchanged.

    Returns:
        Tuple ``(actions, targets)``: ``actions`` is a float tensor with a
        trailing dimension of size ``num_classes``; ``targets`` is
        ``batch['targets']`` passed through untouched.
    """
    actions = batch['actions']

    if device is not None:
        actions = actions.to(device, non_blocking=bool(non_blocking))

    # F.one_hot requires int64 indices.
    actions = F.one_hot(actions.to(torch.int64), num_classes=num_classes).float()

    return actions, batch['targets']

In [7]:
def prepare_encoded_dataset(loader, model_encoder, device=None):
    """Run every batch of ``loader`` through ``model_encoder`` and stack the results.

    Args:
        loader: iterable of batches accepted by ``prepare_batch``.
        model_encoder: callable applied to the one-hot encoded actions; must
            return a tensor.
        device: optional device forwarded to ``prepare_batch``; ``None``
            (the default) leaves the inputs on their current device.

    Returns:
        Tuple ``(vecs, all_targets)`` of NumPy arrays concatenated along
        axis 0, or ``(None, None)`` when the loader yields no batches.
    """
    vec_chunks = []
    target_chunks = []

    # Inference only: no_grad avoids building an autograd graph per batch.
    with torch.no_grad():
        for batch in tqdm(loader):
            actions, targets = prepare_batch(batch, device=device)

            result = model_encoder(actions)
            vec_chunks.append(result.detach().cpu().numpy())
            target_chunks.append(targets.detach().cpu().numpy())

    if not vec_chunks:
        return None, None

    # Concatenate once at the end; doing it inside the loop re-copies the
    # whole accumulated array on every batch.
    vecs = np.concatenate(vec_chunks, axis=0)
    all_targets = np.concatenate(target_chunks, axis=0)

    return vecs, all_targets

In [8]:
# Encode the validation split with the pretrained LSTM encoder, then display
# the shapes of the resulting arrays.
val_vecs, val_targets = prepare_encoded_dataset(val_loader, lstm_encoder)
val_vecs.shape, val_targets.shape

100%|████████████████████████████████████████| 130/130 [05:38<00:00,  2.61s/it]


((33276, 200, 40), (33276,))

In [9]:
# Encode the training split with the pretrained LSTM encoder, then display
# the shapes of the resulting arrays.
train_vecs, train_targets = prepare_encoded_dataset(train_loader, lstm_encoder)
train_vecs.shape, train_targets.shape

100%|████████████████████████████████████████| 304/304 [20:18<00:00,  4.01s/it]


((77644, 200, 40), (77644,))

In [13]:
# Sanity check: shapes of the encoded arrays and their targets for both splits.
train_vecs.shape, val_vecs.shape, train_targets.shape, val_targets.shape

((77644, 200, 40), (33276, 200, 40), (77644,), (33276,))

In [11]:
# Persist the encoded splits and their targets next to the LSTM checkpoint so
# the encoding step does not have to be repeated.
cache_dir = lstm_checkpoint.parent
for file_name, array in (
    ('train_vecs.npy', train_vecs),
    ('val_vecs.npy', val_vecs),
    ('train_targets.npy', train_targets),
    ('val_targets.npy', val_targets),
):
    np.save(cache_dir / file_name, array)

In [12]:
# Reload the cached encodings and targets from the checkpoint directory.
cache_dir = lstm_checkpoint.parent

train_vecs, val_vecs = (
    np.load(cache_dir / file_name)
    for file_name in ('train_vecs.npy', 'val_vecs.npy')
)
train_targets, val_targets = (
    np.load(cache_dir / file_name)
    for file_name in ('train_targets.npy', 'val_targets.npy')
)

In [14]:
# Confirm that the reloaded arrays have the same shapes as the ones that were saved.
train_vecs.shape, val_vecs.shape, train_targets.shape, val_targets.shape

((77644, 200, 40), (33276, 200, 40), (77644,), (33276,))

# Тестируем классификаторы

In [15]:
from sklearn.svm import SVC

In [17]:
# Support-vector classifier with a linear kernel.
# NOTE(review): the scikit-learn docs state that SVC fit time scales at least
# quadratically with the number of samples and suggest LinearSVC or
# SGDClassifier for large datasets. With ~77k training rows this may be
# impractical; confirm before relying on it.
svm_model = SVC(kernel='linear')

In [19]:
# Flatten each sample's 200x40 encoding into a single 8000-long row, because
# the classifier's fit() takes a 2-D (n_samples, n_features) array. The reshape
# is idempotent: re-running the cell on an already flattened array is a no-op.
train_vecs = train_vecs.reshape(train_vecs.shape[0], -1)
train_vecs.shape

(77644, 8000)

In [None]:
# Fit the SVM on the flattened training encodings.
# NOTE(review): this cell has no execution count in the saved notebook, so the
# fit apparently never ran to completion. Confirm.
svm_model.fit(train_vecs, train_targets)

In [71]:
# Number of training labels; should equal the number of rows in train_vecs.
train_targets.shape

(77644,)

In [62]:
# NOTE(review): the recorded output below is the pre-flatten 3-D shape, so this
# cell was executed out of order relative to the reshape cell above.
train_vecs.shape

(77644, 200, 40)