In [88]:
import pickle
import numpy as np
import pandas as pd
import scipy.stats as stats
import scikit_posthocs as sp
from sklearn.metrics import classification_report

In [294]:
# Run identifier; it is interpolated into the report and prediction paths read by later cells.
k_n = "k7"

# Names of the base strategies; each one is part of a prediction CSV file name.
strategies = [
    "gcn", "igcn", "gcngru", "igcngru",
    "gcn_features", "igcn_features", "gcngru_features", "igcngru_features",
    "idarkvec",
]

In [295]:
# Load the pickled stacking report for the run selected by `k_n`.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open report files that come from a trusted source.
with open(f"reports/stacking-v-0.2/{k_n}.pkl", "rb") as fd:
    stacking_reporte = pickle.load(fd)

In [296]:
# Top-level keys of the stacking report in ascending order (used as the `day` index below).
days = sorted(stacking_reporte.keys())

In [297]:
def f1(df: pd.DataFrame):
    """Turn a frame of per-class probabilities into a classification report.

    The predicted class of each row is the name of the probability column
    holding the largest value. The report is restricted to the classes that
    actually occur in ``df.y_true``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a ``y_true`` column and one column per class name
        listed below.

    Returns
    -------
    dict
        Output of ``classification_report(..., output_dict=True)``: one entry
        per class present in ``y_true`` plus the aggregated averages.
    """
    class_columns = [
        'mirai', 'unk_bruteforcer', 'unk_spammer', 'shadowserver', 'driftnet',
        'internetcensus', 'censys', 'rapid7', 'onyphe', 'netsystems',
        'shodan', 'unk_exploiter', 'securitytrails', 'intrinsec', 'unknown',
    ]

    y_true = df["y_true"]
    # Predicted label = column name with the highest probability in each row.
    y_pred = df[class_columns].idxmax(axis=1)

    return classification_report(
        y_true,
        y_pred,
        labels=np.unique(y_true),
        output_dict=True,
        zero_division=0.0,
    )

In [298]:
# For every base strategy, collect one classification report per (day, fold)
# test-prediction file; reports are ordered by day first, then by fold 0-9.
strat_scores = {}
for strat in strategies:
    strat_scores[strat] = [
        f1(pd.read_csv(f"stacking_predictions/out/{k_n}/test/{strat}_{day}_fold0{fold}.csv"))
        for day in days
        for fold in range(10)
    ]
            

In [299]:
# Inspect which entries (classes and averages) the first "gcn" report contains.
strat_scores["gcn"][0].keys()

dict_keys(['censys', 'driftnet', 'internetcensus', 'intrinsec', 'mirai', 'netsystems', 'onyphe', 'securitytrails', 'shadowserver', 'shodan', 'unk_bruteforcer', 'unk_exploiter', 'unk_spammer', 'micro avg', 'macro avg', 'weighted avg'])

In [300]:
# One classification report per (day, fold) for the stacking model, in the
# same day-then-fold order used for the base strategies.
stacking_scores = []
for day in days:
    for fold in range(10):
        fold_result = stacking_reporte[day][fold]
        y_true, y_pred = fold_result['y'], fold_result['preds']
        stacking_scores.append(
            classification_report(
                y_true,
                y_pred,
                labels=np.unique(y_true),
                output_dict=True,
                zero_division=0.0,
            )
        )

In [301]:
# Inspect which entries (classes and averages) the first stacking report contains.
stacking_scores[0].keys()

dict_keys(['censys', 'driftnet', 'internetcensus', 'intrinsec', 'mirai', 'netsystems', 'onyphe', 'securitytrails', 'shadowserver', 'shodan', 'unk_bruteforcer', 'unk_exploiter', 'unk_spammer', 'micro avg', 'macro avg', 'weighted avg'])

# Kruskal-Wallis test & PostHoc Test

In [302]:
def kruskal_posthoc(strat_points, alpha, target, strategies):
    """Compare models with a Kruskal-Wallis test followed by Dunn's post-hoc test.

    Parameters
    ----------
    strat_points : list of sequences of float
        One sequence of scores per model, ordered like ``strategies`` with the
        stacking model's scores as the last entry.
    alpha : float
        Significance threshold, applied to the Kruskal-Wallis p-value and to
        the pairwise post-hoc p-values.
    target : str
        Name of the evaluated label; only used (upper-cased) in the printed
        messages.
    strategies : sequence of str
        Names of the base models. It is not modified; "stacking" is appended
        to a copy to name the last group.

    Returns
    -------
    tuple
        ``(None, None)`` when the Kruskal-Wallis test is not significant.
        ``(posthoc, NAME)`` when the model with the highest mean score differs
        significantly from every other model (``NAME`` is upper-cased).
        ``(posthoc, "TIE")`` otherwise.
    """
    strat_labels = [*strategies, "stacking"]
    _, p_value = stats.kruskal(*strat_points)

    # `not (p < alpha)` also routes a NaN p-value to the "no difference" branch.
    if not (p_value < alpha):
        print(f"{target.upper()}.\tThe Kruskal-Wallis test is not statistically significant, suggesting no significant differences among the models.")
        return None, None

    # Pairwise comparisons to identify which models differ from each other.
    # NOTE(review): no p-value adjustment is requested here, so the pairwise
    # p-values are uncorrected for multiple comparisons - confirm this is intended.
    posthoc = sp.posthoc_dunn(strat_points)

    # Index of the first model with the highest mean score. Unlike a running
    # maximum seeded with 0, this always yields a valid index, even when no
    # mean is strictly positive.
    mean_scores = [sum(scores) / len(scores) for scores in strat_points]
    best_idx = max(range(len(mean_scores)), key=mean_scores.__getitem__)

    # The best model wins only if exactly one entry of its p-value row is
    # non-significant; presumably that entry is its comparison with itself
    # (the diagonal of the post-hoc matrix) - TODO confirm.
    if np.sum(posthoc.values[best_idx] >= alpha) == 1:
        best_model = strat_labels[best_idx].upper()
        print(f"{target.upper()}.\tThe Kruskal-Wallis test is statistically significant, indicating differences among the models. The best-performing model is: {best_model}")
        return posthoc, best_model
    return posthoc, "TIE"


In [303]:
# Significance level passed to kruskal_posthoc for the statistical tests below.
alpha = 0.05

In [304]:

# Report entries to compare across models: individual class names followed by
# the "macro avg" aggregate.
cols = [
    "mirai", "unk_bruteforcer", "unk_spammer", "shadowserver", "driftnet",
    "internetcensus", "censys", "rapid7", "onyphe", "netsystems",
    "shodan", "unk_exploiter", "securitytrails", "intrinsec",
    "macro avg",
]

In [305]:
# Run the significance analysis once per report entry in `cols`.
posthocs, best_model = [], []
for label in cols:
    # One list of F1 scores per base strategy; reports that lack this label are skipped.
    strat_points = []
    for strat in strategies:
        strat_points.append(
            [report[label]["f1-score"] for report in strat_scores[strat] if label in report]
        )
    # The stacking scores go last, matching the label order built inside kruskal_posthoc.
    strat_points.append(
        [report[label]["f1-score"] for report in stacking_scores if label in report]
    )
    posthoc_result, winner = kruskal_posthoc(strat_points, alpha, label, strategies)
    posthocs.append(posthoc_result)
    best_model.append(winner)

UNK_BRUTEFORCER.	The Kruskal-Wallis test is statistically significant, indicating differences among the models. The best-performing model is: STACKING
UNK_SPAMMER.	The Kruskal-Wallis test is statistically significant, indicating differences among the models. The best-performing model is: STACKING
SHADOWSERVER.	The Kruskal-Wallis test is statistically significant, indicating differences among the models. The best-performing model is: STACKING
DRIFTNET.	The Kruskal-Wallis test is statistically significant, indicating differences among the models. The best-performing model is: STACKING
INTRINSEC.	The Kruskal-Wallis test is statistically significant, indicating differences among the models. The best-performing model is: STACKING
MACRO AVG.	The Kruskal-Wallis test is statistically significant, indicating differences among the models. The best-performing model is: STACKING


In [306]:
# Shallow copy of the per-strategy reports, extended with the stacking reports
# under the "stacking" key; the original containers are left untouched.
full_scores = dict(strat_scores)
full_scores["stacking"] = list(stacking_scores)

In [307]:
# Mean F1 score per model and report entry, rounded to 4 decimals. Reports that
# do not contain a given entry are left out of that entry's mean.
strat_means = {
    strat: [
        np.around(
            np.mean([report[label]["f1-score"] for report in reports if label in report]),
            decimals=4,
        )
        for label in cols
    ]
    for strat, reports in full_scores.items()
}



In [308]:
# One column per model, one row per entry of `cols` (same order).
f1_table = pd.DataFrame(strat_means)

In [309]:
# Add the label names as the first column. DataFrame.insert raises ValueError
# when the column already exists, so guard it to keep this cell re-runnable.
if "Labels" not in f1_table.columns:
    f1_table.insert(loc=0, column="Labels", value=cols)
# Outcome of the significance analysis for each label (model name, "TIE" or None).
f1_table["BestModel"] = best_model
f1_table

Unnamed: 0,Labels,gcn,igcn,gcngru,igcngru,gcn_features,igcn_features,gcngru_features,igcngru_features,idarkvec,stacking,BestModel
0,mirai,0.6076,0.7575,0.6591,0.7132,0.9775,0.9777,0.8525,0.9785,0.9809,0.9822,TIE
1,unk_bruteforcer,0.0808,0.5297,0.5321,0.533,0.5881,0.58,0.5978,0.6238,0.5903,0.6538,STACKING
2,unk_spammer,0.1161,0.1863,0.2937,0.1795,0.4164,0.4008,0.3624,0.4219,0.4065,0.5045,STACKING
3,shadowserver,0.1192,0.4926,0.6224,0.4735,0.8857,0.8845,0.9382,0.9447,0.9551,0.9883,STACKING
4,driftnet,0.7899,0.7137,0.859,0.8886,0.8838,0.8591,0.9666,0.9671,0.9681,0.9995,STACKING
5,internetcensus,0.238,0.2148,0.6241,0.4001,0.5524,0.4921,0.8857,0.9089,0.9915,0.9982,TIE
6,censys,0.6395,0.5984,0.6509,0.6843,0.8842,0.8769,0.9123,0.9143,0.9446,0.9351,TIE
7,rapid7,0.8959,0.7343,0.8955,0.8333,0.8443,0.8312,0.6952,0.9624,1.0,0.9991,TIE
8,onyphe,0.0267,0.0104,0.0495,0.0264,0.911,0.9114,0.9319,0.9186,0.9703,0.9848,TIE
9,netsystems,0.0,0.0343,0.1309,0.018,0.9471,0.9165,0.7668,0.9231,0.4483,0.9673,TIE


In [310]:
# Toy data: the integers 1 through 21.
values = np.arange(1, 22)
values

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21])

In [311]:
# First group: the first 10 values.
g1 = values[:10]
g1

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [312]:
# Second group: the remaining 11 values.
g2 = values[10:]
g2

array([11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21])

In [313]:
# Unweighted average of the two group means.
(np.mean(g1) + np.mean(g2))/2

10.75

In [314]:
# Mean over all values; it differs from the average of the group means
# because g1 and g2 have different sizes (10 vs. 11 elements).
np.mean(values)

11.0