In [22]:
import os
import json
import pickle
from glob import glob
from itertools import product

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, classification_report

import torch
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import DataLoader, random_split, TensorDataset
from pytorch_lightning.callbacks import ModelCheckpoint

In [26]:
def get_X_y(DATA_SOURCE: str,
            model: str,
            day: str,
            train_test: str,
            fold: int,
            label_to_idx: dict,
            ip_set: list = None):
    """Load one representation CSV and return its features and encoded labels.

    The file read is
    ``data/2022/input/reps/out/k3/{DATA_SOURCE}/{train_test}/{model}_{day}_fold0{fold}.csv``
    and must contain a ``src_ip`` column and a ``label`` column; every other
    column is treated as a feature.

    Args:
        DATA_SOURCE: Sub-directory naming the data source.
        model: Base-model prefix of the CSV file name.
        day: Day token of the CSV file name.
        train_test: Split sub-directory (the callers in this file pass
            ``"train"`` or ``"test"``).
        fold: Fold number appended to ``fold0`` in the file name.
        label_to_idx: Mapping from label string to integer class index.
        ip_set: Collection (list or set) of ``src_ip`` values to keep.
            ``None`` keeps every row.

    Returns:
        ``(X, y)`` where ``X`` is the feature matrix as a NumPy array with rows
        ordered by ``src_ip`` and ``y`` is the list of class indices for the
        same rows.

    Raises:
        KeyError: If a label in the file is missing from ``label_to_idx``.
    """
    path = f"data/2022/input/reps/out/k3/{DATA_SOURCE}/{train_test}/{model}_{day}_fold0{fold}.csv"

    # Rows are ordered by IP so that matrices loaded from different files can
    # be stacked column-wise by the caller. A stable sort keeps rows sharing
    # an IP in file order instead of an arbitrary one.
    df = pd.read_csv(path).sort_values(by="src_ip", kind="mergesort")

    # Keep only the requested IPs; `None` means "no filtering" (passing None
    # straight to `isin` would raise a TypeError).
    if ip_set is not None:
        df = df[df["src_ip"].isin(ip_set)]

    y = [label_to_idx[label] for label in df["label"]]
    X = df.drop(columns=["src_ip", "label"]).values

    return X, y

def load_train_test(sources: list,
                    models: list,
                    day: str,
                    fold: int,
                    label_to_idx: dict,
                    inter_set_train: list,
                    inter_set_test: list):
    """Build the stacked train/test feature matrices for one day and fold.

    For every ``(source, model)`` pair, in ``itertools.product`` order, the
    train and test CSVs are loaded through ``get_X_y`` and their feature
    matrices are concatenated column-wise.

    Args:
        sources: Data source names.
        models: Base-model names.
        day: Day token forwarded to ``get_X_y``.
        fold: Fold number forwarded to ``get_X_y``.
        label_to_idx: Mapping from label string to class index.
        inter_set_train: IPs kept in the training files.
        inter_set_test: IPs kept in the test files.

    Returns:
        ``(X_train, X_test, y_train, y_test)``. The label lists are the ones
        returned for the last ``(source, model)`` pair.
        NOTE(review): this presumes every pair yields the same rows in the
        same order after filtering — confirm against the input data.
    """
    train_blocks = []
    test_blocks = []

    for src, base_model in product(sources, models):
        features, y_train = get_X_y(
            src, base_model, day, "train", fold, label_to_idx, inter_set_train
        )
        train_blocks.append(features)

        features, y_test = get_X_y(
            src, base_model, day, "test", fold, label_to_idx, inter_set_test
        )
        test_blocks.append(features)

    # One feature block per (source, model) pair, placed side by side.
    return np.hstack(train_blocks), np.hstack(test_blocks), y_train, y_test
    
class DataHandler:
    """Loads the data of one (day, fold) pair and wraps it in DataLoaders.

    After ``build_data_loaders()`` the instance exposes:

    * ``train_loader`` / ``val_loader`` / ``test_loader`` — the DataLoaders,
    * ``y_test`` — test labels as returned by ``load_train_test``,
    * ``dim`` — number of feature columns of the training matrix,
    * ``n_labels`` — number of entries in ``label_to_idx``.
    """

    def __init__(self,
                 sources: list,
                 base_models: list,
                 day: str,
                 fold: int,
                 label_to_idx: dict,
                 inter_set_train: list,
                 inter_set_test: list,
                 batch_size: int = 64,
                 num_workers: int = 4,
                 train_fraction: float = 0.9,
                 seed: int = None) -> None:
        """Store the configuration; no data is read until ``build_data_loaders``.

        Args:
            sources: Data source names.
            base_models: Base-model names.
            day: Day token of the input files.
            fold: Fold number of the input files.
            label_to_idx: Mapping from label string to class index.
            inter_set_train: IPs kept in the training files.
            inter_set_test: IPs kept in the test files.
            batch_size: Batch size of all three loaders.
            num_workers: Worker processes of all three loaders.
            train_fraction: Share of the training data kept for training; the
                remainder becomes the validation set.
            seed: Optional seed for the train/validation split. ``None`` uses
                torch's global random state.

        Raises:
            ValueError: If ``train_fraction`` is not in ``(0, 1]``.
        """
        if not 0.0 < train_fraction <= 1.0:
            raise ValueError(f"train_fraction must be in (0, 1], got {train_fraction}")

        self.sources = sources
        self.base_models = base_models
        self.day = day
        self.fold = fold
        self.label_to_idx = label_to_idx
        self.inter_set_train = inter_set_train
        self.inter_set_test = inter_set_test
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.train_fraction = train_fraction
        self.seed = seed

    def build_data_loaders(self) -> None:
        """Read the data and create the train, validation and test loaders.

        Raises:
            ValueError: If the training split would be empty.
        """
        X_train, X_test, y_train, y_test = load_train_test(
            self.sources,
            self.base_models,
            self.day,
            self.fold,
            self.label_to_idx,
            self.inter_set_train,
            self.inter_set_test
        )

        self.y_test = y_test
        self.dim = X_train.shape[1]
        self.n_labels = len(self.label_to_idx)

        train_set = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.long))
        test_set = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.long))

        # Split the training data into training and validation subsets.
        train_len = int(self.train_fraction * len(train_set))
        val_len = len(train_set) - train_len
        if train_len == 0:
            raise ValueError(
                f"Training split is empty ({len(train_set)} samples, "
                f"train_fraction={self.train_fraction})"
            )

        # A dedicated generator makes the split repeatable without touching
        # the global random state; without a seed the global state is used.
        split_kwargs = {}
        if self.seed is not None:
            split_kwargs["generator"] = torch.Generator().manual_seed(self.seed)
        train_set, val_set = random_split(train_set, [train_len, val_len], **split_kwargs)

        # Only the training loader shuffles.
        loader_kwargs = {"batch_size": self.batch_size, "num_workers": self.num_workers}
        self.train_loader = DataLoader(train_set, shuffle=True, **loader_kwargs)
        self.val_loader = DataLoader(val_set, **loader_kwargs)
        self.test_loader = DataLoader(test_set, **loader_kwargs)

class NeuralNet(pl.LightningModule):
    """Feed-forward classifier with a single hidden layer.

    Layout: ``Linear(input_dim, hidden_dim) -> ReLU -> Linear(hidden_dim, output_dim)``.
    ``forward`` returns the raw outputs of the last linear layer; the loss is
    cross-entropy.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, lr=1e-3):
        super().__init__()
        # Stores the constructor arguments with the module; the notebook later
        # calls `load_from_checkpoint` with only the checkpoint path.
        self.save_hyperparameters()

        # Network layers and loss.
        self.layer1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(hidden_dim, output_dim)
        self.criterion = nn.CrossEntropyLoss()
        self.lr = lr

    def forward(self, x):
        """Return the class logits for a batch of feature vectors."""
        hidden = self.relu(self.layer1(x))
        return self.layer2(hidden)

    def training_step(self, batch, batch_idx):
        """Return the cross-entropy loss of one training batch."""
        inputs, targets = batch
        return self.criterion(self(inputs), targets)

    def validation_step(self, batch, batch_idx):
        """Compute the loss of one validation batch and log it as ``val_loss``."""
        inputs, targets = batch
        val_loss = self.criterion(self(inputs), targets)
        self.log_dict({"val_loss": val_loss}, prog_bar=True, on_epoch=True)

    def predict_step(self, batch, batch_idx):
        """Return the index of the highest logit for every sample in the batch."""
        inputs, _ = batch
        return torch.argmax(self(inputs), dim=1)

    def configure_optimizers(self):
        """Return AdamW together with a linearly decaying learning-rate schedule."""
        optimizer = optim.AdamW(self.parameters(), lr=self.lr)

        def linear_decay(epoch):
            # Multiplier applied to the base learning rate: 1 at step 0,
            # shrinking linearly with the trainer's `max_epochs`.
            return 1 - epoch / self.trainer.max_epochs

        return [optimizer], [LambdaLR(optimizer, lr_lambda=linear_decay)]

def get_intersec(strat: dict, day: str, sources: list, fold: int) -> tuple:
    """Return the IPs shared by every source for one day and fold.

    ``strat[day][source][fold]`` is read as a pair whose element ``0`` holds
    the training IPs and element ``1`` the test IPs.

    Args:
        strat: Stratification structure indexed as ``strat[day][source][fold]``.
        day: Day to look up.
        sources: Source names; the intersection is taken across all of them
            (previously only the first two were considered).
        fold: Fold to look up.

    Returns:
        ``(train_ips, test_ips)`` as two sets.

    Raises:
        IndexError: If ``sources`` is empty.
    """
    first = strat[day][sources[0]][fold]
    train_ips = set(first[0])
    test_ips = set(first[1])

    # Narrow both sets with each remaining source.
    for source in sources[1:]:
        current = strat[day][source][fold]
        train_ips.intersection_update(current[0])
        test_ips.intersection_update(current[1])

    return train_ips, test_ips
    

In [24]:
# Data sources whose per-day representations are loaded and combined.
SOURCES = [
    "darknet",
    "honeypot",
]

# Base models; each one contributes a block of feature columns per source.
BASE_MODELS = [
    "features",
    "idarkvec",
    "igcngru_features",
]

In [27]:
# Number of folds evaluated for every day.
N_FOLDS: int = 2
# Maximum number of training epochs of each run.
MAX_EPOCHS: int = 3

In [None]:
# Class labels, kept in sorted order so every label gets a stable index.
probs_cols = sorted((
    "censys",
    "driftnet",
    "internetcensus",
    "intrinsec",
    "ipip",
    "mirai",
    "onyphe",
    "rapid7",
    "securitytrails",
    "shadowserver",
    "shodan",
    "u_mich",
    "unk_bruteforcer",
    "unk_exploiter",
    "unk_spammer",
    "unknown",
))

# Label -> position in the sorted label list.
label_to_idx = dict(zip(probs_cols, range(len(probs_cols))))

In [None]:
# Day tokens taken from the darknet idarkvec fold-0 test files (the field
# before the trailing `_fold00.csv`); only the first two, in sorted order, are kept.
days = sorted(
    path.split("/")[-1].split("_")[-2]
    for path in glob("data/2022/input/reps/out/k3/darknet/test/idarkvec*_fold00.csv")
)[:2]

In [None]:
# Stratification file: later indexed as strat[day][source][fold].
with open("data/2022/input/skf/stratification/stratification.json") as strat_file:
    strat = json.loads(strat_file.read())

In [None]:
# Seed applied before every (day, fold) run so that the train/validation
# split, the weight initialisation and the batch shuffling repeat on re-runs.
SEED = 42
# Root directory for the per-run checkpoints and the pickled predictions.
OUTPUT_ROOT = "data/2022/output/nn/1"

# predictions[day][fold] -> {"y_test": true labels, "y_hat": predicted labels}
predictions = {}
# range() keeps `fold` a plain int, so the pickled dictionary keys do not
# depend on NumPy scalar types.
for day, fold in product(days, range(N_FOLDS)):

    pl.seed_everything(SEED, workers=True)

    # Getting intersection sets.
    train_ips, test_ips = get_intersec(strat, day, SOURCES, fold)

    # Loading data.
    data = DataHandler(SOURCES, BASE_MODELS, day, fold, label_to_idx, train_ips, test_ips)
    data.build_data_loaders()

    # Setting network's size: the hidden layer is as wide as the input.
    input_dim = data.dim
    hidden_dim = input_dim
    output_dim = data.n_labels

    # Checkpoint directory of this run.
    output = f"{OUTPUT_ROOT}/{data.day}/{data.fold}/"
    os.makedirs(output, exist_ok=True)

    # Keep only the checkpoint with the lowest validation loss.
    checkpoint_callbacker = ModelCheckpoint(
        monitor="val_loss",
        dirpath=output,
        filename="model",
        save_top_k=1,
        mode="min"
    )

    # Define model, trainer, and train the model
    model = NeuralNet(
        input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim, lr=1e-3
    )
    trainer = pl.Trainer(max_epochs=MAX_EPOCHS, callbacks=[checkpoint_callbacker])
    trainer.fit(model, data.train_loader, data.val_loader)

    # Load the best checkpoint
    best_model_path = checkpoint_callbacker.best_model_path
    best_model = NeuralNet.load_from_checkpoint(best_model_path)

    # Prediction: one tensor per batch, concatenated and moved to the CPU.
    y_hat = trainer.predict(best_model, dataloaders=data.test_loader)
    y_hat = torch.cat(y_hat).cpu().numpy()

    macro_f1 = 100 * f1_score(data.y_test, y_hat, average='macro')
    print(F"DAY: {day} / FOLD: {fold} - M-F1: {macro_f1:.2f}")
    predictions.setdefault(day, {})[fold] = {
        "y_test": data.y_test,
        "y_hat": y_hat
    }
    # NOTE(review): this clears the terminal right after the score is printed
    # and does not clear notebook cell output — confirm it is still wanted.
    os.system("clear")

# The root may not exist yet when the loop above did not run at all.
output = OUTPUT_ROOT
os.makedirs(output, exist_ok=True)
with open(f"{output}/preds.pkl", "wb") as fd:
    pickle.dump(predictions, fd)