# Runing stacking for each fold and then by each day.

In [17]:
import os
import json
import pickle
import numpy as np
import pandas as pd
from glob import glob
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from joblib import dump
from tqdm import tqdm
from imblearn.under_sampling import RandomUnderSampler

from sys import path
path.append("..")
from src.utils import get_classifier

In [30]:
SOURCES = ["darknet", "honeypot"]
DATA_SOURCE = "darknet"
CASE = "4" if DATA_SOURCE == "darknet" else "5"

strategies = ["igcngru_features", "idarkvec", "features"]
STRATS_SUFIX = "-".join(sorted(strategies))

strategies.sort()
k_n = "k3"
data_dir = "../data/2022/input/stacking_predictions/out"
base_output = f"../data/2022/output/{DATA_SOURCE}/{STRATS_SUFIX}/{CASE}"
n_folds = 10
classifier = "lr"

In [19]:
df = pd.read_csv(f"../data/2022/input/stacking_predictions/out/k3/{DATA_SOURCE}/test/idarkvec_20221021_fold00.csv")
df.y_true.unique()

array(['censys', 'mirai', 'unk_bruteforcer', 'unk_spammer', 'driftnet',
       'shodan', 'internetcensus', 'onyphe', 'securitytrails', 'ipip',
       'intrinsec', 'shadowserver', 'u_mich', 'unk_exploiter'],
      dtype=object)

In [20]:
df.head()

Unnamed: 0.1,Unnamed: 0,src_ip,censys,driftnet,internetcensus,intrinsec,ipip,mirai,onyphe,rapid7,securitytrails,shadowserver,shodan,u_mich,unk_bruteforcer,unk_exploiter,unk_spammer,unknown,y_true
0,12982,167.94.138.102,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,censys
1,12983,167.94.138.146,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,censys
2,12984,118.40.8.149,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,mirai
3,12985,103.126.245.10,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,mirai
4,12986,193.142.146.35,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,unk_bruteforcer


In [21]:
with open("../data/2022/input/skf/stratification/stratification.json", "r") as fd:
    splits = json.load(fd)

In [22]:
probs_cols = [
    "censys",
    "driftnet",
    "internetcensus",
    "intrinsec",
    "ipip",
    "mirai",
    "onyphe",
    "rapid7",
    "securitytrails",
    "shadowserver",
    "shodan",
    "u_mich",
    "unk_bruteforcer",
    "unk_exploiter",
    "unk_spammer",
    "unknown"
]
probs_cols.sort()

label_to_idx = {l: idx for idx, l in enumerate(probs_cols)}

In [23]:
probs_cols

['censys',
 'driftnet',
 'internetcensus',
 'intrinsec',
 'ipip',
 'mirai',
 'onyphe',
 'rapid7',
 'securitytrails',
 'shadowserver',
 'shodan',
 'u_mich',
 'unk_bruteforcer',
 'unk_exploiter',
 'unk_spammer',
 'unknown']

In [24]:
def get_strat_name(file_path):

    file_name = os.path.basename(file_path)
    tokens = file_name.split("_")
    tokens.pop()
    tokens.pop()
    return '_'.join(tokens)

In [32]:
def load_probs(data_dir,
               strategies,
               target_source,
               sources,
               splits,
               k_n,
               day,
               fold,
               train_test,
               probs_cols):

    train_test_id = 0 if train_test == "train" else 1

    # Getting the intersection.
    if train_test == "test":
        ip_set_1 = set(splits[day][sources[0]][fold][train_test_id])
        ip_set_2 = set(splits[day][sources[1]][fold][train_test_id])
        ip_set = list(ip_set_1.intersection(ip_set_2))

    X = []
    y = None
    for strat in strategies:
        df = pd.read_csv(
            f"{data_dir}/{k_n}/{target_source}/{train_test}/{strat}_{day}_fold0{fold}.csv"
        )
        
        if train_test == "test":
            df = df[df.src_ip.isin(ip_set)]
        X.append(df[probs_cols].values)
        
        if y is None:
            if train_test == "test":
                y = df.y_true.values
    return np.hstack(X), df.y_true.values

In [33]:
X_train, y_train = load_probs(data_dir, strategies, SOURCES, splits, k_n, '20221021', 0, "train", probs_cols)
X_test, y_test = load_probs(data_dir, strategies, SOURCES, splits, k_n, '20221021', 0, "test", probs_cols)

ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 12982 and the array at index 3 has size 17639

In [10]:
clf = LogisticRegression(max_iter=500, n_jobs=-1)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)

In [11]:
pd.DataFrame(classification_report(y_test, preds, output_dict=True, zero_division=0.0)).T

Unnamed: 0,precision,recall,f1-score,support
censys,1.0,0.818182,0.9,11.0
driftnet,1.0,1.0,1.0,25.0
internetcensus,0.956522,0.956522,0.956522,23.0
intrinsec,1.0,1.0,1.0,1.0
ipip,1.0,1.0,1.0,1.0
mirai,1.0,1.0,1.0,804.0
onyphe,1.0,1.0,1.0,9.0
securitytrails,1.0,1.0,1.0,2.0
shadowserver,1.0,1.0,1.0,29.0
shodan,0.5,0.333333,0.4,3.0


In [12]:
days = sorted([ f.split('/')[-1].split('_')[-2] for f in glob(f"{data_dir}/{k_n}/{DATA_SOURCE}/test/idarkvec*_fold00.csv") ])
days

['20221021',
 '20221022',
 '20221023',
 '20221024',
 '20221025',
 '20221026',
 '20221027',
 '20221028',
 '20221029',
 '20221030',
 '20221031']

### In the following cells we run the stacking for each fold and also take F1 by fold.

In [34]:
strats_posfix = '-'.join(sorted(strategies))
reporte = {}
for day in days:
    reporte[day] = {}
    for fold in np.arange(10):
        X_train, y_train = load_probs(data_dir, strategies, k_n, DATA_SOURCE, day, fold, "train", probs_cols)
        X_test, y_test = load_probs(data_dir, strategies, k_n, DATA_SOURCE, day, fold, "test", probs_cols)
        clf = get_classifier(classifier)

        # applying undersampling.
        n = pd.value_counts(y_train).values[2]
        us = RandomUnderSampler(sampling_strategy={"unknown": n})
        X_train, y_train = us.fit_resample(X_train, y_train)

        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)
        probas = clf.predict_proba(X_test)
        reporte[day][fold] = {}
        reporte[day][fold]["y"] = y_test
        reporte[day][fold]["preds"] = preds
        output_dir = f"{base_output}/stacking_data/{day}/{fold}"
        os.makedirs(output_dir, exist_ok=True)
        np.savez(f"{output_dir}/data.npz",
                X_train=X_train,
                X_test=X_test,
                y_train=y_train,
                y_test=y_test,
                probas=probas)
        dump(clf, f"{output_dir}/{classifier}.joblib")

output_dir = f"{base_output}/report"
os.makedirs(output_dir, exist_ok=True)
with open(f"{output_dir}/{k_n}.pkl", 'wb') as fd:
    pickle.dump(reporte, fd)

In [35]:
pd.value_counts(y_train)

mirai              6559
unk_bruteforcer    1346
unknown            1346
unk_spammer         521
shadowserver        247
driftnet            246
internetcensus      197
censys              123
unk_exploiter        53
shodan               26
ipip                 18
securitytrails       16
intrinsec            14
Name: count, dtype: int64

In [36]:
class_scores = {}
for label in tqdm(probs_cols):
    class_scores[label] = []
    for day in reporte:
        y, preds = [], []
        for fold in reporte[day]:
            y.append(reporte[day][fold]["y"])
            preds.append(reporte[day][fold]["preds"])

        y = np.hstack(y)
        preds = np.hstack(preds)

        scores = classification_report(
            y, preds, labels=np.unique(y), zero_division=0, output_dict=True
        )
        if label in scores:
            class_scores[label].append(scores[label]["f1-score"])

100%|██████████| 16/16 [00:51<00:00,  3.23s/it]


In [37]:
for label in probs_cols:
    v = np.trunc(np.mean(class_scores[label])* 100) / 100
    print(f"{label};{v}")

censys;0.93
driftnet;0.99
internetcensus;0.98
intrinsec;0.74
ipip;0.61
mirai;0.99
onyphe;0.98
rapid7;0.99
securitytrails;1.0
shadowserver;0.99
shodan;0.78
u_mich;0.94
unk_bruteforcer;0.95
unk_exploiter;0.9
unk_spammer;0.83
unknown;nan


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
