In [15]:
import json
import numpy as np
import pandas as pd
from glob import glob
from sklearn.metrics import classification_report, f1_score

In [16]:
# Prediction strategies to evaluate; each one names a family of per-fold CSVs.
strats = [
    "igcngru_features",
    "idarkvec",
    "features",
]
# Data sources whose per-fold IP sets are intersected before scoring.
SOURCES = [
    "darknet",
    "honeypot",
]
# Source whose prediction files are read.
TARGET_SOURCE = "darknet"

In [17]:
# Directory with the per-fold test predictions of the target source.
data_dir = f"../data/2022/input/stacking_predictions/out/k3/{TARGET_SOURCE}/test"
# The day is the second underscore-separated token of each fold-00 idarkvec
# file name; collect those tokens in ascending order.
days = sorted(
    path.split('/')[-1].split('_')[1]
    for path in glob(f"{data_dir}/idarkvec_*_fold00.csv")
)
days

['20221021',
 '20221022',
 '20221023',
 '20221024',
 '20221025',
 '20221026',
 '20221027',
 '20221028',
 '20221029',
 '20221030',
 '20221031']

In [18]:
# Load the stratification splits. They are indexed later in this notebook as
# splits[day][source][fold][1].
# JSON is UTF-8 by specification, so state the encoding explicitly rather than
# relying on the platform's default text encoding.
with open(
    "../data/2022/input/skf/stratification/stratification.json",
    'r',
    encoding="utf-8",
) as fd:
    splits = json.load(fd)

In [19]:
# Load a single prediction file (idarkvec, day 20221021, fold 00) to inspect
# its layout. The path is built from `data_dir` so that the predictions
# directory is defined in one place only; the resulting path is unchanged.
df = pd.read_csv(f"{data_dir}/idarkvec_20221021_fold00.csv")

In [20]:
# Drop the rows whose ground-truth label is "unknown"; copy so the result is an
# independent frame rather than a view of the loaded one.
df = df.loc[df["y_true"].ne("unknown")].copy()

In [21]:
# Preview the first five rows of the filtered sample frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,src_ip,censys,driftnet,internetcensus,intrinsec,ipip,mirai,onyphe,rapid7,securitytrails,shadowserver,shodan,u_mich,unk_bruteforcer,unk_exploiter,unk_spammer,unknown,y_true
0,12982,167.94.138.102,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,censys
1,12983,167.94.138.146,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,censys
2,12984,118.40.8.149,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,mirai
3,12985,103.126.245.10,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,mirai
4,12986,193.142.146.35,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,unk_bruteforcer


In [22]:
# Names of the per-class score columns in the prediction files, kept in
# alphabetical order so that a column position maps to a stable class name.
prob_cols = sorted((
    "censys", "driftnet", "internetcensus", "intrinsec",
    "ipip", "mirai", "onyphe", "rapid7",
    "securitytrails", "shadowserver", "shodan", "u_mich",
    "unk_bruteforcer", "unk_exploiter", "unk_spammer", "unknown",
))

In [23]:
def get_intersectin(splits, sources, day, fold):
    """Return the IPs present in every source's split for one day and fold.

    Parameters
    ----------
    splits : nested container
        Indexed as ``splits[day][source][fold][1]``; that entry must be an
        iterable of hashable items (IP strings in this notebook).
        NOTE(review): index ``1`` is presumably the test side of the fold --
        confirm against the code that wrote the stratification file.
    sources : sequence of str
        Source names to intersect. Any number >= 1 is accepted; with a single
        source its own (de-duplicated) IPs are returned.
    day : str
        Key selecting the day inside ``splits``.
    fold : int
        Position of the fold inside ``splits[day][source]``.

    Returns
    -------
    list
        The common IPs, in arbitrary order.

    Raises
    ------
    ValueError
        If ``sources`` is empty (an intersection over no sets is undefined).
    """
    if len(sources) == 0:
        raise ValueError("get_intersectin needs at least one source")

    ip_sets = [set(splits[day][source][fold][1]) for source in sources]
    # set.intersection accepts any number of sets, so every listed source is
    # taken into account rather than only the first two.
    return list(set.intersection(*ip_sets))

In [24]:
# Number of per-fold prediction files read for each strategy and day.
N_FOLDS = 10
# Column position -> class name; turns argmax indices into labels in one
# vectorised lookup and yields a string array even when a fold is empty.
label_names = np.asarray(prob_cols)

# metrics[strategy][class] collects one F1 score per day on which the class
# occurs among the true labels; a class absent on a given day adds no entry
# for that day, so the lists may differ in length.
metrics = {}
for strat in strats:
    metrics[strat] = {}
    for day in days:
        y, preds = [], []
        for fold in range(N_FOLDS):
            # Zero-pad through the format spec so that fold numbers >= 10
            # would still give a two-digit suffix.
            fpath = f"{data_dir}/{strat}_{day}_fold{fold:02d}.csv"
            print(fpath)
            # A dedicated name keeps the notebook-level `df` sample intact.
            fold_df = pd.read_csv(fpath)
            # Getting the intersection.
            ipset = get_intersectin(splits, SOURCES, day, fold)
            fold_df = fold_df[fold_df.src_ip.isin(ipset)]

            # Predicted class = the score column holding the row maximum.
            ps = fold_df[prob_cols].values.argmax(axis=1)
            preds.append(label_names[ps])

            y.append(fold_df.y_true.values)

        # Pool all folds of the day before scoring.
        preds = np.hstack(preds)
        y = np.hstack(y)
        if y.size == 0:
            # Nothing survived the intersection filter on this day.
            continue

        report = classification_report(
            y,
            preds,
            labels=np.unique(y),
            output_dict=True,
            zero_division=0,
        )

        for c in prob_cols:
            if c in report:
                metrics[strat].setdefault(c, []).append(report[c]["f1-score"])

../data/2022/input/stacking_predictions/out/k3/darknet/test/igcngru_features_20221021_fold00.csv
../data/2022/input/stacking_predictions/out/k3/darknet/test/igcngru_features_20221021_fold01.csv
../data/2022/input/stacking_predictions/out/k3/darknet/test/igcngru_features_20221021_fold02.csv
../data/2022/input/stacking_predictions/out/k3/darknet/test/igcngru_features_20221021_fold03.csv
../data/2022/input/stacking_predictions/out/k3/darknet/test/igcngru_features_20221021_fold04.csv
../data/2022/input/stacking_predictions/out/k3/darknet/test/igcngru_features_20221021_fold05.csv
../data/2022/input/stacking_predictions/out/k3/darknet/test/igcngru_features_20221021_fold06.csv
../data/2022/input/stacking_predictions/out/k3/darknet/test/igcngru_features_20221021_fold07.csv
../data/2022/input/stacking_predictions/out/k3/darknet/test/igcngru_features_20221021_fold08.csv
../data/2022/input/stacking_predictions/out/k3/darknet/test/igcngru_features_20221021_fold09.csv
../data/2022/input/stacking_pr

In [25]:
# Tabulate the collected scores: one column per strategy, one row per class,
# each cell holding that class's list of daily F1 values.
scores = pd.DataFrame(data=metrics)
scores

Unnamed: 0,igcngru_features,idarkvec,features
censys,"[0.7486631016042781, 0.7650273224043717, 0.787...","[0.7126436781609196, 0.6706586826347305, 0.749...","[0.911764705882353, 0.9674418604651163, 0.9626..."
driftnet,"[0.97265625, 0.9764705882352942, 0.97821782178...","[0.9921259842519685, 0.9940828402366864, 0.994...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.99604743..."
internetcensus,"[0.912280701754386, 0.9192825112107623, 0.9095...","[0.9777777777777777, 0.9771689497716896, 0.983...","[0.9299065420560747, 0.9716981132075471, 0.966..."
intrinsec,"[0.9523809523809523, 0.5714285714285715, 0.5, ...","[1.0, 0.7272727272727272, 0.9600000000000001, ...","[0.14285714285714288, 0.7058823529411764, 0.5,..."
ipip,"[1.0, 1.0, 0.9600000000000001, 1.0, 0.96000000...","[0.15384615384615385, 0.15384615384615385, 0.1...","[1.0, 1.0, 0.9090909090909091, 1.0, 1.0, 1.0, ..."
mirai,"[0.9974469326719673, 0.9975779816513761, 0.996...","[0.9931456905352194, 0.9939847417840375, 0.993...","[0.998469052999927, 0.9988266353769433, 0.9983..."
onyphe,"[0.9101123595505618, 0.8977272727272727, 0.928...","[0.9595959595959594, 0.9743589743589743, 0.994...","[0.9735449735449735, 0.9162011173184357, 0.951..."
securitytrails,"[0.9473684210526316, 0.9473684210526316, 0.9, ...","[0.972972972972973, 1.0, 1.0, 1.0, 1.0, 1.0, 1...","[0.9473684210526316, 0.9473684210526316, 1.0, ..."
shadowserver,"[0.6527777777777778, 0.665158371040724, 0.6438...","[0.9859649122807017, 0.9912739965095986, 0.996...","[0.6681818181818182, 0.6742081447963801, 0.662..."
shodan,"[0.55, 0.5263157894736842, 0.6976744186046512,...","[0.7727272727272727, 0.7727272727272727, 0.775...","[0.8846153846153846, 0.8461538461538461, 0.892..."


In [26]:
def mean(values):
    """Average ``values`` and truncate (not round) the result to two decimals."""
    scaled_average = np.mean(values) * 100
    return np.trunc(scaled_average) / 100
    
# Collapse every cell (a list of daily F1 scores) to its truncated mean.
# `DataFrame.applymap` is deprecated since pandas 2.1 in favour of
# `DataFrame.map`; use the new name when this pandas provides it and fall back
# to `applymap` on older versions.
if hasattr(pd.DataFrame, "map"):
    result_df = scores.map(mean)
else:
    result_df = scores.applymap(mean)
# Order the rows alphabetically by class name.
macro = result_df.sort_index()
macro

Unnamed: 0,igcngru_features,idarkvec,features
censys,0.79,0.69,0.96
driftnet,0.97,0.99,0.99
internetcensus,0.9,0.98,0.93
intrinsec,0.72,0.89,0.46
ipip,0.98,0.06,0.98
mirai,0.99,0.99,0.99
onyphe,0.91,0.98,0.94
rapid7,0.97,0.99,0.98
securitytrails,0.92,0.99,0.96
shadowserver,0.66,0.99,0.66


In [27]:
# Print one semicolon-separated line of mean F1 scores per class, following
# the order of `prob_cols`. A class with no row in `macro` is printed as one
# "0.0" per strategy column, so the line width always matches the table.
missing_row = ';'.join(["0.0"] * macro.shape[1])
for col in prob_cols:
    if col in macro.index:
        print(';'.join(str(v) for v in macro.loc[col].values))
    else:
        print(missing_row)
    

0.79;0.69;0.96
0.97;0.99;0.99
0.9;0.98;0.93
0.72;0.89;0.46
0.98;0.06;0.98
0.99;0.99;0.99
0.91;0.98;0.94
0.97;0.99;0.98
0.92;0.99;0.96
0.66;0.99;0.66
0.67;0.76;0.87
0.98;0.98;0.6
0.53;0.48;0.53
0.45;0.21;0.41
0.75;0.8;0.81
0.0;0.0;0.0
