In [29]:
import json
import numpy as np
import pandas as pd
from glob import glob
from sklearn.metrics import classification_report, f1_score

In [30]:
# File-name prefixes of the prediction CSVs that are evaluated (one per strategy).
strats = ["igcngru_features", "idarkvec", "features"]
# Sources whose per-fold IP lists are intersected by get_intersectin.
SOURCES = ["darknet", "honeypot"]
# Source whose prediction directory is read; it is interpolated into data_dir.
TARGET_SOURCE = "honeypot"

In [31]:
# Directory holding the per-fold test predictions of the target source.
data_dir = f"../data/2022/input/stacking_predictions/out/k3/{TARGET_SOURCE}/test"
# The day is the second '_'-separated token of the file name. Only the fold00
# files of idarkvec are globbed, presumably so that each day is listed once.
days = sorted(
    fname.split('/')[-1].split('_')[1]
    for fname in glob(f"{data_dir}/idarkvec_*_fold00.csv")
)
days

['20221021',
 '20221022',
 '20221023',
 '20221024',
 '20221025',
 '20221026',
 '20221027',
 '20221028',
 '20221029',
 '20221030',
 '20221031']

In [32]:
# Stratification splits, later indexed as splits[day][source][fold][1] by
# get_intersectin. JSON is UTF-8 by specification, so the encoding is pinned
# instead of relying on the locale-dependent default of open().
with open("../data/2022/input/skf/stratification/stratification.json", 'r', encoding="utf-8") as fd:
    splits = json.load(fd)

In [33]:
# Load a single prediction file to inspect its layout. The path is built from
# data_dir so that it follows TARGET_SOURCE from one place only.
df = pd.read_csv(f"{data_dir}/idarkvec_20221021_fold00.csv")

In [34]:
# Drop the rows whose ground-truth label is "unknown"; copy so the result is
# an independent frame rather than a view of the loaded one.
df = df.loc[df["y_true"] != "unknown"].copy()

In [35]:
# Preview the first rows of the sample frame.
df.head()

Unnamed: 0.1,Unnamed: 0,src_ip,censys,driftnet,internetcensus,intrinsec,ipip,mirai,onyphe,rapid7,securitytrails,shadowserver,shodan,u_mich,unk_bruteforcer,unk_exploiter,unk_spammer,unknown,y_true
0,17639,117.187.173.104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.666667,unk_spammer
1,17640,222.185.146.149,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,mirai
2,17641,117.196.109.167,0.0,0.0,0.0,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,mirai
3,17642,170.106.173.40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,unk_spammer
4,17643,123.130.210.226,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,mirai


In [36]:
# Class-probability columns selected from every prediction CSV. The list is
# kept sorted because an argmax position over these columns is mapped back to
# a label by indexing into this same list.
prob_cols = sorted(
    [
        "censys", "driftnet", "internetcensus", "intrinsec",
        "ipip", "mirai", "onyphe", "rapid7",
        "securitytrails", "shadowserver", "shodan", "u_mich",
        "unk_bruteforcer", "unk_exploiter", "unk_spammer", "unknown",
    ]
)

In [37]:
def get_intersectin(splits, sources, day, fold):
    """Return the items shared by every source for a given day and fold.

    Parameters
    ----------
    splits : mapping
        Nested structure indexed as ``splits[day][source][fold][1]``; the
        element at position 1 must be an iterable of hashable items (IPs in
        this notebook). Presumably position 1 is the test partition of the
        fold -- TODO confirm against the code that writes the JSON.
    sources : sequence
        Source names to intersect. Any number of sources is accepted; with
        two sources the result is the same as the pairwise intersection.
    day : hashable
        Key of the day inside ``splits``.
    fold : int
        Position of the fold inside ``splits[day][source]``.

    Returns
    -------
    list
        Items present in every source, in arbitrary order. Empty when
        ``sources`` is empty.
    """
    ip_sets = [set(splits[day][source][fold][1]) for source in sources]
    if not ip_sets:
        # Nothing to intersect: report no common items.
        return []
    return list(set.intersection(*ip_sets))

In [38]:
# metrics[strat][label] collects one F1 score per day in which `label` is
# present in that day's classification report.
metrics = {}
for strat in strats:
    metrics[strat] = {}
    for day in days:
        # Ground truth and predicted labels, one chunk per fold of this day.
        y, preds = [], []
        for fold in np.arange(10):
            fpath = f"{data_dir}/{strat}_{day}_fold0{fold}.csv"
            print(fpath)
            df = pd.read_csv(fpath)
            # Restrict the evaluation to the IPs returned by get_intersectin
            # for this day and fold.
            ipset = get_intersectin(splits, SOURCES, day, fold)
            df = df.loc[df["src_ip"].isin(ipset)]

            # The predicted label of a row is the probability column holding
            # its highest value.
            ps = df[prob_cols].values.argmax(axis=1)
            preds.append([prob_cols[idx] for idx in ps])
            y.append(df["y_true"].values)

        # Pool all folds so that the report is computed once per day.
        preds = np.hstack(preds)
        y = np.hstack(y)

        report = classification_report(
            y, preds, labels=np.unique(y), output_dict=True, zero_division=0
        )

        # Only classes that have an entry in this day's report contribute.
        for c in prob_cols:
            if c not in report:
                continue
            metrics[strat].setdefault(c, []).append(report[c]["f1-score"])

../data/2022/input/stacking_predictions/out/k3/honeypot/test/igcngru_features_20221021_fold00.csv
../data/2022/input/stacking_predictions/out/k3/honeypot/test/igcngru_features_20221021_fold01.csv
../data/2022/input/stacking_predictions/out/k3/honeypot/test/igcngru_features_20221021_fold02.csv
../data/2022/input/stacking_predictions/out/k3/honeypot/test/igcngru_features_20221021_fold03.csv
../data/2022/input/stacking_predictions/out/k3/honeypot/test/igcngru_features_20221021_fold04.csv
../data/2022/input/stacking_predictions/out/k3/honeypot/test/igcngru_features_20221021_fold05.csv
../data/2022/input/stacking_predictions/out/k3/honeypot/test/igcngru_features_20221021_fold06.csv
../data/2022/input/stacking_predictions/out/k3/honeypot/test/igcngru_features_20221021_fold07.csv
../data/2022/input/stacking_predictions/out/k3/honeypot/test/igcngru_features_20221021_fold08.csv
../data/2022/input/stacking_predictions/out/k3/honeypot/test/igcngru_features_20221021_fold09.csv
../data/2022/input/s

In [39]:
# Rows are class labels and columns are strategies; each cell holds the list
# of F1 scores collected for that (class, strategy) pair.
scores = pd.DataFrame(metrics)
scores

Unnamed: 0,igcngru_features,idarkvec,features
censys,"[0.6129032258064517, 0.6557377049180327, 0.702...","[0.9371980676328503, 0.9423076923076923, 0.952...","[0.8, 0.8148148148148148, 0.7634408602150539, ..."
driftnet,"[0.8212389380530972, 0.867513611615245, 0.8633...","[0.9760956175298804, 0.9656565656565657, 0.981...","[0.9683794466403162, 0.988235294117647, 0.9590..."
internetcensus,"[0.7574257425742574, 0.7769784172661871, 0.764...","[0.973568281938326, 0.9771689497716896, 0.9793...","[0.9248291571753987, 0.9306930693069306, 0.909..."
intrinsec,"[0.7200000000000001, 0.4, 0.4615384615384615, ...","[0.9565217391304348, 0.8695652173913044, 0.960...","[0.3076923076923077, 0.25, 0.0, 0.533333333333..."
ipip,"[0.611111111111111, 0.631578947368421, 0.66666...","[0.15384615384615385, 0.15384615384615385, 0.4...","[0.0, 0.0, 0.0, 0.15384615384615385, 0.0, 0.23..."
mirai,"[0.9863900537041125, 0.9877967605946305, 0.987...","[0.9942306287884319, 0.9945564219508607, 0.993...","[0.9750351877916884, 0.9590058346593923, 0.970..."
onyphe,"[0.8402366863905325, 0.8786127167630058, 0.911...","[0.9696969696969696, 0.964824120603015, 0.9696...","[0.8850574712643678, 0.8977272727272727, 0.903..."
securitytrails,"[0.9473684210526316, 0.972972972972973, 0.9473...","[0.972972972972973, 1.0, 0.972972972972973, 1....","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]"
shadowserver,"[0.6605504587155964, 0.6374133949191687, 0.712...","[0.9982668977469671, 1.0, 0.9982668977469671, ...","[0.7123893805309733, 0.6666666666666666, 0.684..."
shodan,"[0.6046511627906976, 0.43243243243243246, 0.62...","[0.6341463414634146, 0.7555555555555555, 0.711...","[0.8, 0.830188679245283, 0.8148148148148148, 0..."


In [40]:
def mean(values, decimals=2):
    """Average ``values`` and truncate (not round) the result.

    Parameters
    ----------
    values : array-like
        Numbers to average; passed directly to ``np.mean``.
    decimals : int, optional
        Number of decimal places kept after truncation. Defaults to 2, which
        reproduces the previous fixed ``* 100 ... / 100`` behaviour.

    Returns
    -------
    float
        The mean with every digit beyond ``decimals`` places dropped, e.g.
        ``mean([0.5, 0.75])`` is ``0.62`` rather than ``0.63``.
    """
    factor = 10 ** decimals
    return np.trunc(np.mean(values) * factor) / factor
    
# Collapse each cell (a list of per-day F1 scores) into its truncated mean.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map,
# so the new name is used when it exists and the old one is kept as a fallback
# for earlier pandas versions.
result_df = (scores.map if hasattr(scores, "map") else scores.applymap)(mean)
# Order the rows alphabetically by class label.
macro = result_df.sort_index()
macro

Unnamed: 0,igcngru_features,idarkvec,features
censys,0.71,0.94,0.8
driftnet,0.87,0.97,0.95
internetcensus,0.79,0.98,0.89
intrinsec,0.58,0.91,0.29
ipip,0.66,0.31,0.15
mirai,0.98,0.99,0.96
onyphe,0.9,0.97,0.88
rapid7,0.95,0.99,0.9
securitytrails,0.95,0.99,1.0
shadowserver,0.67,0.99,0.67


In [41]:
# Emit one ';'-separated line per class, following the order of prob_cols.
# A class without a row in `macro` gets one "0.0" per column; the filler is
# sized from the frame so that it stays aligned if the strategy list changes.
for col in prob_cols:
    if col in macro.index:
        print(';'.join(str(v) for v in macro.loc[col].values))
    else:
        print(';'.join(["0.0"] * macro.shape[1]))
    

0.71;0.94;0.8
0.87;0.97;0.95
0.79;0.98;0.89
0.58;0.91;0.29
0.66;0.31;0.15
0.98;0.99;0.96
0.9;0.97;0.88
0.95;0.99;0.9
0.95;0.99;1.0
0.67;0.99;0.67
0.67;0.78;0.82
0.51;0.98;0.0
0.58;0.68;0.61
0.55;0.3;0.64
0.7;0.79;0.74
0.0;0.0;0.0
