In [27]:
import numpy as np
import pandas as pd
from glob import glob
from sklearn.metrics import f1_score

In [43]:
# Names of the stacking strategies to evaluate. Each name is the prefix of the
# prediction files read later in the notebook.
strategies = [
    # plain variants
    "gcn", "igcn", "gcngru", "igcngru",
    # "_features" variants
    "gcn_features", "igcn_features", "gcngru_features", "igcngru_features",
    # idarkvec
    "idarkvec",
]

# Sub-directory of stacking_predictions/out/ that the prediction files are read from.
k_n = "k7"

In [44]:
# Load one prediction file to inspect its schema. The directory follows `k_n`
# so the sample comes from the same run that the rest of the notebook reads
# (the path previously hard-coded "k3" while `k_n` was set to a different value).
df = pd.read_csv(f"stacking_predictions/out/{k_n}/test/idarkvec_20211224_fold02.csv")

In [45]:
# Show the column names of the prediction file loaded above.
df.columns

Index(['mirai', 'unk_bruteforcer', 'unk_spammer', 'shadowserver', 'driftnet',
       'internetcensus', 'censys', 'rapid7', 'onyphe', 'netsystems', 'shodan',
       'unk_exploiter', 'securitytrails', 'intrinsec', 'unknown', 'y_true'],
      dtype='object')

In [46]:
def f1(df: pd.DataFrame, class_names=None):
    """Compute the macro F1 and the per-class F1 of one prediction table.

    The predicted class of each row is the column holding the highest value
    among ``class_names``; it is compared with the class name stored in the
    ``y_true`` column of the same row.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain one numeric column per entry of ``class_names`` and a
        ``y_true`` column with the true class name of each row.
    class_names : sequence of str, optional
        Class columns to score, in the order used for the per-class result.
        Defaults to the 15 class columns listed in the body.

    Returns
    -------
    macro : float
        F1 averaged with ``average="macro"`` over all ``class_names``.
    per_class : np.ndarray
        One F1 value per class, in ``class_names`` order.

    Raises
    ------
    KeyError
        If ``y_true`` or a class column is missing from ``df``, or if
        ``y_true`` contains a value that is not one of ``class_names``.
    """
    if class_names is None:
        class_names = ['mirai',
                       'unk_bruteforcer',
                       'unk_spammer',
                       'shadowserver',
                       'driftnet',
                       'internetcensus',
                       'censys',
                       'rapid7',
                       'onyphe',
                       'netsystems',
                       'shodan',
                       'unk_exploiter',
                       'securitytrails',
                       'intrinsec',
                       'unknown']
    else:
        class_names = list(class_names)

    # Map each class name to its integer class id (its position in class_names).
    label_to_idx = {name: idx for idx, name in enumerate(class_names)}

    true_labels = df["y_true"]
    # Fail with the offending values spelled out instead of a bare KeyError
    # raised from the middle of the conversion below.
    unknown = sorted({str(label) for label in true_labels.unique()
                      if label not in label_to_idx})
    if unknown:
        raise KeyError(f"y_true contains labels that are not class columns: {unknown}")

    # Selecting the class columns already leaves y_true out and returns a new
    # frame, so no separate drop or deep copy is needed.
    scores = df[class_names]

    # True labels (strings) converted to integer class ids.
    y = [label_to_idx[label] for label in true_labels]
    # Predicted class id = position of the highest score in each row.
    preds = scores.to_numpy().argmax(axis=1)

    # Passing every class id as `labels` keeps classes that do not occur in
    # this table in the result; zero_division=0 scores them 0 without warning.
    class_ids = np.arange(len(class_names))
    macro = f1_score(y, preds, labels=class_ids, average="macro", zero_division=0)
    per_class = f1_score(y, preds, labels=class_ids, average=None, zero_division=0)
    return macro, per_class

In [47]:
# Days that have predictions: the day is the second-to-last "_"-separated
# token in the file name of each idarkvec fold-00 file.
fold00_pattern = f"stacking_predictions/out/{k_n}/test/idarkvec*_fold00.csv"
days = sorted(
    path.rsplit('/', 1)[-1].rsplit('_', 2)[-2]
    for path in glob(fold00_pattern)
)
days

['20211221',
 '20211222',
 '20211223',
 '20211224',
 '20211225',
 '20211226',
 '20211227',
 '20211228',
 '20211229',
 '20211230',
 '20211231']

In [48]:
# Score every (strategy, day, fold) prediction file once, storing what f1()
# returns for it:
#   macros[strategy][day][fold] -> {"macro": macro F1, "class": per-class F1}
n_folds = 10  # prediction files are named fold00 ... fold09

macros = {}
for strat in strategies:
    print(strat)
    macros[strat] = {}
    for day in days:
        macros[strat][day] = {}
        # range() gives plain-int fold keys; lookups with numpy integers of the
        # same value still match because they hash and compare equal.
        for fold in range(n_folds):
            # Zero-pad through the format spec so the file name stays correct
            # for two-digit fold numbers as well.
            file_path = f"stacking_predictions/out/{k_n}/test/{strat}_{day}_fold{fold:02d}.csv"
            # Dedicated name so the notebook-level `df` loaded in an earlier
            # cell is not overwritten by this loop.
            fold_df = pd.read_csv(file_path)
            macro, macro_by_class = f1(fold_df)
            macros[strat][day][fold] = {"macro": macro, "class": macro_by_class}
        

gcn
igcn
gcngru
igcngru
gcn_features
igcn_features
gcngru_features
igcngru_features
idarkvec


In [49]:
# One row per strategy: the per-class F1 averaged over every (day, fold) pair,
# followed by the average of the macro F1 over the same pairs.
mean_macros = []
for strat in strategies:
    print(strat)
    runs = [macros[strat][day][fold] for day in days for fold in np.arange(10)]

    # Stack the per-class arrays row-wise and average down the rows.
    class_means = np.mean(np.vstack([run["class"] for run in runs]), axis=0).tolist()
    macro_mean = np.mean([run["macro"] for run in runs])

    mean_macros.append(class_means + [macro_mean])

gcn
igcn
gcngru
igcngru
gcn_features
igcn_features
gcngru_features
igcngru_features
idarkvec


In [50]:
# Column labels for the summary table: the 15 class names, in the same order
# as the per-class F1 values, followed by the averaged macro F1.
cols = [
    "mirai", "unk_bruteforcer", "unk_spammer", "shadowserver", "driftnet",
    "internetcensus", "censys", "rapid7", "onyphe", "netsystems",
    "shodan", "unk_exploiter", "securitytrails", "intrinsec", "unknown",
    "avg",
]

In [51]:
# `mean_macros` holds one row per entry of `strategies`, in the same order.
# Label the rows with the strategy names so that, after transposing, each
# column is headed by its strategy instead of a positional index.
pd.DataFrame(mean_macros, columns=cols, index=strategies).T

Unnamed: 0,0,1,2,3,4,5,6,7,8
mirai,0.607614,0.757512,0.659057,0.713164,0.977489,0.97768,0.852523,0.978485,0.980889
unk_bruteforcer,0.080782,0.529657,0.532095,0.532998,0.588122,0.58002,0.597799,0.623806,0.590288
unk_spammer,0.1161,0.186258,0.293664,0.17951,0.416431,0.400754,0.362436,0.421894,0.406541
shadowserver,0.119218,0.49256,0.622357,0.47348,0.885714,0.884548,0.938195,0.94472,0.955068
driftnet,0.789895,0.713749,0.858997,0.888571,0.883829,0.859148,0.966588,0.967075,0.968092
internetcensus,0.237994,0.214791,0.624131,0.400133,0.552419,0.492125,0.885715,0.908859,0.991456
censys,0.639451,0.59843,0.650917,0.684255,0.884234,0.876895,0.912288,0.914308,0.944643
rapid7,0.325795,0.267003,0.325637,0.303015,0.307011,0.302263,0.252814,0.349976,0.363636
onyphe,0.019421,0.007546,0.03602,0.019167,0.662536,0.662864,0.677765,0.668071,0.705671
netsystems,0.0,0.031169,0.119026,0.016364,0.861006,0.833204,0.697113,0.839191,0.407543
