# Runing stacking for each fold and then by each day.

In [1]:
import os
import json
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
from glob import glob
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from joblib import dump
from tqdm import tqdm
from imblearn.under_sampling import RandomUnderSampler

from sys import path
path.append("..")
from src.utils import get_classifier, load_pickle

In [2]:
SOURCES = ["darknet", "honeypot"]
SOURCES.sort()
SOURCES_SUFIX = "darknet-honeypot"
CASE = "1"
strategies = ["igcngru_features", "idarkvec", "features"]
STRATS_SUFIX = "-".join(sorted(strategies))

strategies.sort()
k_n = "k3"
data_dir = "../data/2022/input/stacking_predictions/out"
base_output = f"../data/2022/output/{SOURCES_SUFIX}/{STRATS_SUFIX}/{CASE}"
n_folds = 10
classifier = "lr"

In [3]:
df = pd.read_csv(f"../data/2022/input/stacking_predictions/out/k3/darknet/test/idarkvec_20221021_fold00.csv")
df.y_true.unique()

array(['censys', 'mirai', 'unk_bruteforcer', 'unk_spammer', 'driftnet',
       'shodan', 'internetcensus', 'onyphe', 'securitytrails', 'ipip',
       'intrinsec', 'shadowserver', 'u_mich', 'unk_exploiter'],
      dtype=object)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,src_ip,censys,driftnet,internetcensus,intrinsec,ipip,mirai,onyphe,rapid7,securitytrails,shadowserver,shodan,u_mich,unk_bruteforcer,unk_exploiter,unk_spammer,unknown,y_true
0,12982,167.94.138.102,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,censys
1,12983,167.94.138.146,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,censys
2,12984,118.40.8.149,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,mirai
3,12985,103.126.245.10,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,mirai
4,12986,193.142.146.35,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,unk_bruteforcer


In [5]:
with open("../data/2022/input/skf/stratification/stratification.json", 'r') as fd:
    splits = json.load(fd)

In [6]:
splits.keys(), splits["20221021"].keys(), splits["20221021"]["honeypot"][0][1][:5]

(dict_keys(['20221021', '20221022', '20221023', '20221024', '20221025', '20221026', '20221027', '20221028', '20221029', '20221030', '20221031']),
 dict_keys(['darknet', 'honeypot']),
 ['117.187.173.104',
  '222.185.146.149',
  '117.196.109.167',
  '170.106.173.40',
  '123.130.210.226'])

In [7]:
probs_cols = [
    "censys",
    "driftnet",
    "internetcensus",
    "intrinsec",
    "ipip",
    "mirai",
    "onyphe",
    "rapid7",
    "securitytrails",
    "shadowserver",
    "shodan",
    "u_mich",
    "unk_bruteforcer",
    "unk_exploiter",
    "unk_spammer",
    "unknown"
]
probs_cols.sort()

label_to_idx = {l: idx for idx, l in enumerate(probs_cols)}

In [8]:
probs_cols

['censys',
 'driftnet',
 'internetcensus',
 'intrinsec',
 'ipip',
 'mirai',
 'onyphe',
 'rapid7',
 'securitytrails',
 'shadowserver',
 'shodan',
 'u_mich',
 'unk_bruteforcer',
 'unk_exploiter',
 'unk_spammer',
 'unknown']

In [9]:
def get_strat_name(file_path):

    file_name = os.path.basename(file_path)
    tokens = file_name.split("_")
    tokens.pop()
    tokens.pop()
    return '_'.join(tokens)

In [10]:
def load_probs(data_dir,
               strategies,
               sources,
               splits,
               k_n,
               day,
               fold,
               train_test,
               probs_cols):

    train_test_id = 0 if train_test == "train" else 1

    # Getting the intersection.
    ip_set_1 = set(splits[day][sources[0]][fold][train_test_id])
    ip_set_2 = set(splits[day][sources[1]][fold][train_test_id])
    ip_set = list(ip_set_1.intersection(ip_set_2))

    X = []
    y = None
    for source in sources:
        for strat in strategies:
            df = pd.read_csv(
                f"{data_dir}/{k_n}/{source}/{train_test}/{strat}_{day}_fold0{fold}.csv"
            )
            df = df[df.src_ip.isin(ip_set)].sort_values(by=["src_ip"])
            X.append(df[probs_cols].values)
            if y is None:
                y = df.y_true.values

    return np.hstack(X), df.y_true.values

In [11]:
X_train, y_train = load_probs(data_dir, strategies, SOURCES, splits, k_n, '20221021', 0, "train", probs_cols)
print("*")
X_test, y_test = load_probs(data_dir, strategies, SOURCES, splits, k_n, '20221021', 0, "test", probs_cols)

*


In [12]:
X_train.shape

(10691, 96)

In [13]:
clf = LogisticRegression(max_iter=500, n_jobs=-1)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)

In [14]:
pd.DataFrame(classification_report(y_test, preds, output_dict=True, zero_division=0.0)).T

Unnamed: 0,precision,recall,f1-score,support
censys,1.0,1.0,1.0,11.0
driftnet,1.0,1.0,1.0,25.0
internetcensus,1.0,1.0,1.0,23.0
intrinsec,1.0,1.0,1.0,1.0
ipip,1.0,1.0,1.0,1.0
mirai,1.0,1.0,1.0,689.0
onyphe,1.0,1.0,1.0,9.0
securitytrails,1.0,1.0,1.0,2.0
shadowserver,1.0,1.0,1.0,29.0
shodan,0.666667,0.666667,0.666667,3.0


In [15]:
days = sorted([ f.split('/')[-1].split('_')[-2] for f in glob(f"{data_dir}/{k_n}/darknet/test/idarkvec*_fold00.csv") ])
days

['20221021',
 '20221022',
 '20221023',
 '20221024',
 '20221025',
 '20221026',
 '20221027',
 '20221028',
 '20221029',
 '20221030',
 '20221031']

### In the following cells we run the stacking for each fold and also take F1 by fold.

In [16]:
reporte = {}
for day in days:
    reporte[day] = {}
    for fold in np.arange(10):
        X_train, y_train = load_probs(data_dir, strategies, SOURCES, splits, k_n, day, fold, "train", probs_cols)
        X_test, y_test = load_probs(data_dir, strategies, SOURCES, splits, k_n, day, fold, "test", probs_cols)
        clf = get_classifier(classifier)

        # applying undersampling.
        n = pd.value_counts(y_train).values[2]
        us = RandomUnderSampler(sampling_strategy={"unknown": n, "mirai": n}, random_state=42)
        #X_train, y_train = us.fit_resample(X_train, y_train)
        us_idxs, y_train = us.fit_resample(np.arange(X_train.shape[0]).reshape(-1, 1), y_train)
        us_idxs = us_idxs.reshape(-1)
        X_train = X_train[us_idxs]

        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)
        probas = clf.predict_proba(X_test)
        reporte[day][fold] = {}
        reporte[day][fold]["y"] = y_test
        reporte[day][fold]["preds"] = preds
        output_dir = f"{base_output}/stacking_data/{day}/{fold}"
        os.makedirs(output_dir, exist_ok=True)
        np.savez(f"{output_dir}/data.npz",
                X_train=X_train,
                X_test=X_test,
                y_train=y_train,
                y_test=y_test,
                probas=probas,
                us_idxs=us_idxs)
        dump(clf, f"{output_dir}/{classifier}.joblib")

output_dir = f"{base_output}/report"
os.makedirs(output_dir, exist_ok=True)
with open(f"{output_dir}/{k_n}.pkl", 'wb') as fd:
    pickle.dump(reporte, fd)

In [17]:
report_path = f"{base_output}/report/{k_n}.pkl"
reporte = load_pickle(report_path)

In [18]:
class_scores = {}
for label in tqdm(probs_cols + ["macro avg"]):
    class_scores[label] = []
    for day in reporte:
        y, preds = [], []
        for fold in reporte[day]:
            y.append(reporte[day][fold]["y"])
            preds.append(reporte[day][fold]["preds"])

        y = np.hstack(y)
        preds = np.hstack(preds)

        scores = classification_report(
            y, preds, labels=np.unique(y), zero_division=0, output_dict=True
        )
        if label in scores:
            class_scores[label].append(scores[label]["f1-score"])

100%|██████████| 17/17 [00:27<00:00,  1.63s/it]


In [19]:
for label in probs_cols + ["macro avg"]:
    v = np.trunc(np.mean(class_scores[label])* 100) / 100
    print(f"{label};{v}")

censys;0.99
driftnet;0.99
internetcensus;0.99
intrinsec;0.97
ipip;0.99
mirai;0.99
onyphe;0.99
rapid7;1.0
securitytrails;1.0
shadowserver;1.0
shodan;0.85
u_mich;0.98
unk_bruteforcer;0.89
unk_exploiter;0.53
unk_spammer;0.87
unknown;nan
macro avg;0.93


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [20]:
for label in probs_cols + ["macro avg"]:
    v = np.trunc(np.std(class_scores[label]) * 100) / 100
    print(f"{label};{v}")

censys;0.0
driftnet;0.0
internetcensus;0.0
intrinsec;0.02
ipip;0.01
mirai;0.0
onyphe;0.0
rapid7;0.0
securitytrails;0.0
shadowserver;0.0
shodan;0.02
u_mich;0.02
unk_bruteforcer;0.01
unk_exploiter;0.3
unk_spammer;0.01
unknown;nan
macro avg;0.02


  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
