In [11]:
import numpy as np
import pandas as pd
from glob import glob
from sklearn.metrics import classification_report, f1_score

In [12]:
# Data source to evaluate; used as a path component when building the predictions directory.
DATA_SOURCE = "honeypot"
# Strategies to compare; each name is the file-name prefix of that strategy's prediction CSVs.
strats = [
    "igcngru_features",
    "idarkvec",
    "features",
]

In [13]:
# Directory containing the test-split prediction CSVs for the selected data source.
data_dir = f"../data/2022/input/stacking_predictions/out/k3/{DATA_SOURCE}/test"
# The day is the second "_"-separated token of each file name; the idarkvec fold-00
# files are used to enumerate the available days, returned in ascending order.
days = sorted(
    path.rsplit('/', 1)[-1].split('_')[1]
    for path in glob(f"{data_dir}/idarkvec_*_fold00.csv")
)
days

['20221021',
 '20221022',
 '20221023',
 '20221024',
 '20221025',
 '20221026',
 '20221027',
 '20221028',
 '20221029',
 '20221030',
 '20221031']

In [14]:
# Load one prediction file (idarkvec, first day, fold 00) to inspect its layout.
# Reuses `data_dir` instead of repeating the directory literal so the preview cannot
# drift away from the directory the evaluation loop reads from.
df = pd.read_csv(f"{data_dir}/idarkvec_20221021_fold00.csv")

In [15]:
# Keep only the rows whose ground-truth label is not "unknown" (as an independent copy).
df = df.loc[df["y_true"].ne("unknown")].copy()

In [16]:
# Preview the first five rows of the filtered frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,src_ip,censys,driftnet,internetcensus,intrinsec,ipip,mirai,onyphe,rapid7,securitytrails,shadowserver,shodan,u_mich,unk_bruteforcer,unk_exploiter,unk_spammer,unknown,y_true
0,17639,117.187.173.104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.666667,unk_spammer
1,17640,222.185.146.149,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,mirai
2,17641,117.196.109.167,0.0,0.0,0.0,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,mirai
3,17642,170.106.173.40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,unk_spammer
4,17643,123.130.210.226,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,mirai


In [17]:
# Names of the per-class probability columns in the prediction CSVs, in sorted order.
# The position of a name in this list is what an argmax over these columns maps back to.
prob_cols = sorted([
    "censys", "driftnet", "internetcensus", "intrinsec",
    "ipip", "mirai", "onyphe", "rapid7",
    "securitytrails", "shadowserver", "shodan", "u_mich",
    "unk_bruteforcer", "unk_exploiter", "unk_spammer", "unknown",
])

In [18]:
# Collect per-class F1 scores for every (strategy, day, fold) prediction file.
# Layout: metrics[strategy][class_name] -> list of F1 scores, with one entry for each
# file in which that class occurs in y_true (classes absent from a file add nothing).
# NOTE: this loop rebinds the notebook-level `df`, `preds`, `y_true` and `report`.
# NOTE(review): the preview cell drops rows with y_true == "unknown", but no such filter
# is applied to the files scored here - confirm that this is intended.
metrics = {}
for strat in strats:
    metrics[strat] = {}
    for day in days:
        for fold in range(10):
            # Zero-pad through the format spec so a two-digit fold index would still
            # produce a valid "foldNN" file name (a literal "fold0" prefix would not).
            df = pd.read_csv(f"{data_dir}/{strat}_{day}_fold{fold:02d}.csv")
            # Predicted label = column holding the highest probability; on ties argmax
            # returns the first such column in prob_cols order.
            preds = df[prob_cols].values.argmax(axis=1)
            preds = [prob_cols[c] for c in preds]
            y_true = df.y_true.values
            # Restrict the report to labels present in y_true; zero_division=0 scores
            # undefined precision/recall as 0 instead of emitting a warning.
            report = classification_report(
                y_true,
                preds,
                labels=np.unique(y_true),
                output_dict=True,
                zero_division=0,
            )
            for c in prob_cols:
                if c in report:
                    metrics[strat].setdefault(c, []).append(report[c]["f1-score"])

In [19]:
# Tabulate the collected scores: one column per strategy, one row per class;
# each cell holds that pair's list of F1 scores.
scores = pd.DataFrame.from_dict(metrics, orient="columns")
scores

Unnamed: 0,igcngru_features,idarkvec,features
censys,"[0.5454545454545455, 0.4761904761904762, 0.476...","[0.8799999999999999, 0.7826086956521738, 0.879...","[0.7692307692307693, 0.7500000000000001, 0.750..."
driftnet,"[0.7179487179487181, 0.72, 0.7272727272727272,...","[0.9577464788732395, 0.9577464788732395, 0.944...","[0.8857142857142857, 0.8529411764705883, 0.845..."
internetcensus,"[0.7142857142857143, 0.8372093023255814, 0.590...","[1.0, 0.9787234042553191, 1.0, 0.9387755102040...","[0.9545454545454545, 0.8749999999999999, 0.930..."
intrinsec,"[0.0, 0.6666666666666666, 0.6666666666666666, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, ..."
ipip,"[0.6666666666666666, 0.4, 0.8, 0.6666666666666...","[0.0, 0.0, 0.0, 0.6666666666666666, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
mirai,"[0.9816465652857892, 0.9785002621919244, 0.982...","[0.9880643487285936, 0.984919396775871, 0.9832...","[0.9741424802110819, 0.9763779527559054, 0.980..."
onyphe,"[0.6666666666666666, 0.8421052631578948, 0.705...","[0.9523809523809523, 1.0, 0.9523809523809523, ...","[0.8421052631578948, 0.7999999999999999, 0.736..."
securitytrails,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.5, 1.0, ...","[0.8, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
shadowserver,"[0.6818181818181819, 0.6818181818181819, 0.571...","[1.0, 1.0, 0.9824561403508771, 1.0, 1.0, 1.0, ...","[0.7391304347826086, 0.7659574468085107, 0.723..."
shodan,"[0.8, 1.0, 0.5, 0.5, 0.0, 0.8, 0.4, 0.4, 0.8, ...","[0.4, 0.6666666666666666, 0.8, 0.0, 0.0, 1.0, ...","[0.8571428571428571, 1.0, 0.8, 0.5, 0.66666666..."


In [20]:
def mean(values):
    """Return the arithmetic mean of ``values`` truncated (not rounded) to two decimals.

    Args:
        values: Sequence of numbers (or a scalar / NaN) accepted by ``np.mean``.

    Returns:
        The mean with everything past the second decimal dropped, as a NumPy float.
        NaN input yields NaN.
    """
    scaled = np.mean(values) * 100
    # Binary floating point can leave the scaled value a hair below the intended
    # integer (0.29 * 100 == 28.999999999999996), which a bare trunc would turn into
    # 0.28. Snapping to 8 decimals first removes that noise without turning the
    # truncation into rounding for genuine third-decimal digits.
    return np.trunc(np.round(scaled, 8)) / 100
    
# Collapse every cell's list of F1 scores into a single truncated mean.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map, so use
# `map` when this pandas version provides it and fall back to `applymap` otherwise.
_apply_elementwise = scores.map if hasattr(scores, "map") else scores.applymap
result_df = _apply_elementwise(mean)
result_df.sort_index()

Unnamed: 0,igcngru_features,idarkvec,features
censys,0.68,0.91,0.76
driftnet,0.79,0.96,0.92
internetcensus,0.75,0.97,0.89
intrinsec,0.37,0.79,0.12
ipip,0.63,0.22,0.08
mirai,0.98,0.98,0.96
onyphe,0.83,0.97,0.84
rapid7,0.91,0.99,0.9
securitytrails,0.96,0.99,1.0
shadowserver,0.67,0.99,0.67
