# Running stacking for each fold and then for each day.

In [3]:
import os
import pickle
import numpy as np
import pandas as pd
from glob import glob
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from joblib import dump

from src.utils import get_classifier

In [28]:
# Base models whose predictions are combined. Kept sorted so the order is
# stable wherever the list is reused (e.g. in the output folder name).
strategies = sorted([
    "igcngru_features",
    "idarkvec",
    "gcn_features",
    "igcn_features",
    "gcngru_features",
])

# Experiment configuration.
k_n = 'k3'                                            # sub-folder of data_dir holding the prediction CSVs
data_dir = "stacking_predictions/out"                 # root of the per-strategy prediction files
input_stacking_dir = "stacking_data/majority_voting"  # root under which the vote matrices are saved
n_folds = 10
classifier = "lr"

In [29]:
# Peek at one training prediction file to see which labels occur in `y_true`.
# The path is built from the config cell so it follows `data_dir` / `k_n`
# instead of repeating them as a literal.
df = pd.read_csv(f"{data_dir}/{k_n}/train/gcn_20211221_fold00.csv")
df.y_true.unique()

array(['censys', 'mirai', 'unknown', 'unk_bruteforcer', 'shodan',
       'netsystems', 'securitytrails', 'driftnet', 'unk_spammer',
       'internetcensus', 'onyphe', 'intrinsec', 'unk_exploiter',
       'shadowserver'], dtype=object)

In [30]:
# Class labels, one probability column per label in the prediction CSVs.
# Sorted so that a column index always refers to the same label.
probs_cols = sorted([
    'mirai',
    'unk_bruteforcer',
    'unk_spammer',
    'shadowserver',
    'driftnet',
    'internetcensus',
    'censys',
    'rapid7',
    'onyphe',
    'netsystems',
    'shodan',
    'unk_exploiter',
    'securitytrails',
    'intrinsec',
    'unknown',
])

# Reverse lookup: class label -> position in `probs_cols`.
label_to_idx = dict(zip(probs_cols, range(len(probs_cols))))

In [31]:
# Show the sorted label list; a label's position here is the class index
# produced by the argmax over these columns.
probs_cols

['censys',
 'driftnet',
 'internetcensus',
 'intrinsec',
 'mirai',
 'netsystems',
 'onyphe',
 'rapid7',
 'securitytrails',
 'shadowserver',
 'shodan',
 'unk_bruteforcer',
 'unk_exploiter',
 'unk_spammer',
 'unknown']

In [32]:
def get_strat_name(file_path):
    """Return the strategy part of a prediction file name.

    The base name of ``file_path`` is split on ``"_"`` and its last two
    tokens are discarded; the remaining tokens are joined back with ``"_"``.
    For ``gcn_features_20211221_fold00.csv`` this gives ``gcn_features``.

    Raises
    ------
    IndexError
        If the base name has fewer than two ``"_"``-separated tokens.
    """
    pieces = os.path.basename(file_path).split("_")
    # Strip the two trailing tokens one at a time, so a name that is too
    # short fails instead of silently returning something.
    for _ in range(2):
        del pieces[-1]
    return "_".join(pieces)

In [33]:
def load_preds(data_dir, strategies, k_n, day, fold, train_test, probs_cols):
    """Load the hard predictions of the selected strategies for one day/fold.

    Files matching ``{data_dir}/{k_n}/{train_test}/*_{day}_fold{NN}.csv`` are
    visited in strategy-name order. For each file whose strategy name is in
    ``strategies``, rows with ``y_true == 'unknown'`` are dropped and the
    ``probs_cols`` columns are reduced to the index of the largest value per
    row.

    Parameters
    ----------
    data_dir : str
        Root folder of the prediction files.
    strategies : collection of str
        Strategy names to keep (compared against ``get_strat_name``).
    k_n : str
        Sub-folder of ``data_dir``.
    day : str
        Day token embedded in the file name.
    fold : int
        Fold number; zero-padded to two digits in the file name.
    train_test : str
        Split sub-folder, e.g. ``"test"``.
    probs_cols : list of str
        Columns to take the argmax over; the result indexes into this list.

    Returns
    -------
    X : numpy.ndarray
        One row per kept sample, one column per loaded strategy file; each
        entry is an index into ``probs_cols``.
    y : numpy.ndarray
        The ``y_true`` values of the kept samples.

    Raises
    ------
    ValueError
        If no file of a requested strategy is found, or if the loaded files
        do not share the same ``y_true`` sequence (the votes would not refer
        to the same samples).
    """
    # Zero-pad the fold number so folds >= 10 resolve to "fold10", not "fold010".
    pattern = f"{data_dir}/{k_n}/{train_test}/*_{day}_fold{int(fold):02d}.csv"
    # Stable sort by strategy name: column order of X does not depend on the
    # order in which glob happens to list the files.
    files = sorted(glob(pattern), key=get_strat_name)

    X = []
    y_true = None
    for file_path in files:
        if get_strat_name(file_path) not in strategies:
            continue
        df = pd.read_csv(file_path)
        df = df[df.y_true != 'unknown']
        labels = df.y_true.values
        # All strategies must describe the same samples in the same order,
        # otherwise a row of X would mix votes for different samples.
        if y_true is not None and not pd.Series(labels).equals(pd.Series(y_true)):
            raise ValueError(
                f"y_true in {file_path} does not match the previously loaded "
                f"strategy files for day={day}, fold={fold}"
            )
        y_true = labels
        X.append(df[probs_cols].values.argmax(axis=1))

    if not X:
        raise ValueError(
            f"no prediction file for the requested strategies matches {pattern}"
        )

    return np.vstack(X).T, y_true

In [34]:
# Try the loader on a single day and fold of the test split.
X_test, y_test = load_preds(
    data_dir,
    strategies,
    k_n,
    day='20211221',
    fold=0,
    train_test="test",
    probs_cols=probs_cols,
)

In [35]:
# Rows are samples, columns are the loaded strategies (one vote each).
X_test.shape

(405, 5)

In [36]:
def _most_voted(votes):
    """Return the class index that occurs most often in one row of votes.

    On a tie ``argmax`` picks the first maximum, i.e. the lowest class index.
    """
    return np.bincount(votes).argmax()


# Majority vote across strategies for every sample.
most_frequent_per_row = np.apply_along_axis(_most_voted, 1, X_test)
most_frequent_per_row

array([ 4,  4, 13,  4,  0, 10, 13,  0,  0,  4,  2, 11,  0,  1,  0,  1,  0,
        0, 14,  4,  8,  0, 13,  2,  5,  5,  1, 14,  0, 13,  2,  1,  4,  4,
        4, 10,  4,  4,  2,  0,  0,  1,  4, 14,  0,  1,  1, 11,  2,  0,  0,
        0,  1,  0,  2,  0,  1,  4,  4,  8,  4,  4, 13,  4,  4, 13,  2,  4,
        4, 11,  1,  1,  4,  4,  2,  5,  2,  1, 11, 10,  1, 13,  1,  4, 13,
        4,  1, 11,  4,  2,  4,  1,  4,  2,  9,  4,  9, 11,  4,  9,  4,  9,
        4, 14,  9,  9,  4,  4,  9,  4,  9,  4,  4,  9,  1,  9,  4,  0,  1,
        0, 14,  1,  4,  4,  4, 11,  9,  9,  1,  1,  4,  4,  1,  4,  4,  4,
        4,  4,  4,  4,  1,  4,  4,  4,  4,  4, 14,  4,  4,  6,  4,  9, 11,
       11,  9,  4,  1,  9,  9,  1,  4, 13,  4,  9,  4,  9,  4,  4,  4,  4,
        4, 11,  9,  4,  4,  4,  1,  2,  9,  9,  4,  9,  2,  4,  9,  4, 14,
        4,  9, 14,  4,  4,  3,  4,  9, 14,  4,  4,  4,  4,  4,  4,  1,  4,
        4,  4,  4,  4,  2,  4,  6,  4,  4,  1, 11,  4,  6,  2,  4,  4,  4,
        9,  2,  4,  4,  4

In [37]:
# Day tokens (e.g. "20211221") of every idarkvec fold-0 test file, sorted.
# Built from `data_dir` so it follows the config cell, de-duplicated in case
# the wildcard matches more than one file for the same day.
days = sorted({
    os.path.basename(f).split('_')[-2]
    for f in glob(f"{data_dir}/{k_n}/test/idarkvec*_fold00.csv")
})

### In the following cells we run the stacking for each fold and also compute the F1 score per fold.

In [46]:
# Folder-name suffix identifying the set of combined strategies.
strats_posfix = '-'.join(sorted(strategies))

# reporte[day][fold] -> {"y": true labels, "preds": majority-vote labels}
reporte = {}
for day in days:
    reporte[day] = {}
    # Iterate over the configured number of folds rather than a literal 10.
    for fold in np.arange(n_folds):
        X_test, y_test = load_preds(data_dir, strategies, k_n, day, fold, "test", probs_cols)
        # Majority vote per sample; ties go to the lowest class index.
        argmax = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=1, arr=X_test)
        # Map the winning class indices back to label names.
        preds = np.array([ probs_cols[c] for c in argmax ])
        reporte[day][fold] = {}
        reporte[day][fold]["y"] = y_test
        reporte[day][fold]["preds"] = preds
        # Persist the vote matrix and the labels for this day/fold.
        output_dir = f"{input_stacking_dir}/{day}/{strats_posfix}/{fold}"
        os.makedirs(output_dir, exist_ok=True)
        np.savez(f"{output_dir}/data.npz",
                X_test=X_test,
                y_test=y_test)

# Save the labels and predictions of every day/fold in a single pickle.
output_dir = f"reports/stacking-v-0.3/{strats_posfix}"
os.makedirs(output_dir, exist_ok=True)
with open(f"{output_dir}/{k_n}.pkl", 'wb') as fd:
    pickle.dump(reporte, fd)


In [47]:
# f1_points[day][label] -> mean F1 of `label` over the folds of `day` whose
# classification report contains that label.
f1_points = {}
for day, folds in reporte.items():
    # Build each fold's report once and reuse it for every label; the report
    # does not depend on the label being aggregated.
    fold_scores = [
        classification_report(
            fold_data['y'],
            fold_data["preds"],
            labels=np.unique(fold_data['y']),
            zero_division=0,
            output_dict=True,
        )
        for fold_data in folds.values()
    ]

    f1_points[day] = {}
    for label in probs_cols:
        label_f1 = [scores[label]['f1-score'] for scores in fold_scores if label in scores]
        # Labels absent from every fold of this day get no entry at all.
        if label_f1:
            f1_points[day][label] = np.mean(label_f1)


In [48]:
# Labels as rows, days as columns; NaN where a label has no entry for a day.
f1_table = pd.DataFrame(f1_points)
f1_table

Unnamed: 0,20211221,20211222,20211223,20211224,20211225,20211226,20211227,20211228,20211229,20211230,20211231
censys,0.951265,0.899467,0.933484,0.910398,0.908167,0.926455,0.924428,0.924865,0.932935,0.915937,0.929622
driftnet,0.966097,0.969696,0.975099,0.980692,0.969296,0.986499,0.990345,0.980692,0.982722,0.982722,0.986424
internetcensus,0.933755,0.962827,0.957435,0.953458,0.948529,0.96194,0.962564,0.933847,0.971003,0.939335,0.9557
intrinsec,0.866667,,,,,,0.516667,0.766667,0.966667,,
mirai,0.980956,0.981879,0.978586,0.987246,0.983614,0.981363,0.9868,0.986907,0.981472,0.980472,0.980413
netsystems,0.951429,0.929401,0.988889,0.985714,0.963492,,0.979798,1.0,0.968889,0.972222,1.0
onyphe,0.960599,0.956725,0.956725,0.953947,0.944307,0.978947,0.966563,0.866667,,,
securitytrails,1.0,1.0,1.0,0.98,1.0,1.0,1.0,,1.0,1.0,1.0
shadowserver,0.975903,0.97485,0.97837,0.973177,0.970823,0.980256,0.972834,0.959858,0.971904,0.965234,0.972709
shodan,0.755,0.832381,0.872381,0.88127,0.842381,0.812222,0.803095,0.822857,0.814603,0.754603,0.819524


In [49]:
# Raw mapping: day -> label -> mean F1 over folds.
f1_points

{'20211221': {'censys': 0.951264623121736,
  'driftnet': 0.9660967079945992,
  'internetcensus': 0.9337553433309808,
  'intrinsec': 0.8666666666666666,
  'mirai': 0.9809561412345253,
  'netsystems': 0.9514285714285714,
  'onyphe': 0.9605994152046783,
  'securitytrails': 1.0,
  'shadowserver': 0.9759026213680672,
  'shodan': 0.7550000000000001,
  'unk_bruteforcer': 0.6430927533736268,
  'unk_exploiter': 0.6611111111111112,
  'unk_spammer': 0.5143031838068708},
 '20211222': {'censys': 0.8994666473627573,
  'driftnet': 0.969695919006209,
  'internetcensus': 0.9628273922528197,
  'mirai': 0.9818786845332846,
  'netsystems': 0.9294011544011545,
  'onyphe': 0.9567251461988304,
  'securitytrails': 1.0,
  'shadowserver': 0.9748502211717331,
  'shodan': 0.8323809523809522,
  'unk_bruteforcer': 0.6400518358794491,
  'unk_exploiter': 0.0,
  'unk_spammer': 0.44349544612626995},
 '20211223': {'censys': 0.9334842118665648,
  'driftnet': 0.9750985444976248,
  'internetcensus': 0.9574353433681164,
  '

In [51]:
# Order in which the per-label means are printed
# (presumably the row order of the article's table - confirm).
article_order_cols = [
    "mirai",
    "unk_bruteforcer",
    "unk_spammer",
    "shadowserver",
    "driftnet",
    "internetcensus",
    "censys",
    "rapid7",
    "onyphe",
    "netsystems",
    "shodan",
    "unk_exploiter",
    "securitytrails",
    "intrinsec",
]

# For every label, print the mean of its daily F1 over the days that have one.
for label in article_order_cols:
    mean = [day_scores[label] for day_scores in f1_points.values() if label in day_scores]
    m = np.mean(mean)
    print(m)


0.9827007654973986
0.6616769416449007
0.45016721695306394
0.9723563054876795
0.9791167441159515
0.9527632479549897
0.9233656984448476
0.9813537422858617
0.9480601780185759
0.9739834054834053
0.8191197691197692
0.07070707070707072
0.998
0.7791666666666666
