In [64]:
import pickle
import numpy as np
import pandas as pd
from itertools import product
from joblib import load
from sklearn.metrics import f1_score

In [65]:
TARGET_DAY = "20221029"

# Data sources used by each scenario, listed in scenario order (1 to 5).
_SOURCES_PER_SCENARIO = (
    ["darknet", "honeypot"],
    ["darknet"],
    ["honeypot"],
    ["darknet"],
    ["honeypot"],
)
SCENARIOS = {
    case: {"DATA_SOURCES": sources}
    for case, sources in enumerate(_SOURCES_PER_SCENARIO, start=1)
}

# Strategy names are kept in alphabetical order; joined with "-" they form
# the suffix used as a path component when the stacking data is loaded.
stacking_strategies = sorted(["igcngru_features", "idarkvec", "features"])
STRATS_POSFIX = "-".join(stacking_strategies)

# List of all classes, kept in alphabetical order.
probs_cols = sorted([
    "censys",
    "driftnet",
    "internetcensus",
    "intrinsec",
    "ipip",
    "mirai",
    "onyphe",
    "rapid7",
    "securitytrails",
    "shadowserver",
    "shodan",
    "u_mich",
    "unk_bruteforcer",
    "unk_exploiter",
    "unk_spammer",
    "unknown",
])

num_labels = len(probs_cols)

In [66]:
probs_cols

['censys',
 'driftnet',
 'internetcensus',
 'intrinsec',
 'ipip',
 'mirai',
 'onyphe',
 'rapid7',
 'securitytrails',
 'shadowserver',
 'shodan',
 'u_mich',
 'unk_bruteforcer',
 'unk_exploiter',
 'unk_spammer',
 'unknown']

In [67]:
# Map each class name to its position in the sorted class list.
label_to_idx = dict(zip(probs_cols, range(len(probs_cols))))
label_to_idx

{'censys': 0,
 'driftnet': 1,
 'internetcensus': 2,
 'intrinsec': 3,
 'ipip': 4,
 'mirai': 5,
 'onyphe': 6,
 'rapid7': 7,
 'securitytrails': 8,
 'shadowserver': 9,
 'shodan': 10,
 'u_mich': 11,
 'unk_bruteforcer': 12,
 'unk_exploiter': 13,
 'unk_spammer': 14,
 'unknown': 15}

In [68]:
# Both artifacts of this run live in the same directory, so the directory is spelled out once.
# NOTE(review): allow_pickle=True and joblib.load unpickle arbitrary objects; only use trusted files.
base_dir = f"data/2022/output/darknet-honeypot/features-idarkvec-igcngru_features/1/stacking_data/{TARGET_DAY}/0"
data = np.load(f"{base_dir}/data.npz", allow_pickle=True)
model = load(f"{base_dir}/lr.joblib")

In [69]:
# Show the class labels known to the loaded model. The recorded output lists
# 15 classes ('onyphe' is absent), one fewer than the 16 entries of probs_cols.
model.classes_

array(['censys', 'driftnet', 'internetcensus', 'intrinsec', 'ipip',
       'mirai', 'rapid7', 'securitytrails', 'shadowserver', 'shodan',
       'u_mich', 'unk_bruteforcer', 'unk_exploiter', 'unk_spammer',
       'unknown'], dtype=object)

In [70]:
# List the names of the arrays stored in the loaded archive.
list(data.keys())

['X_train', 'X_test', 'y_train', 'y_test', 'probas', 'us_idxs']

In [71]:
# Shape of the test matrix. The recorded 96 columns equal 6 (source, strategy)
# pairs times 16 classes.
data["X_test"].shape

(721, 96)

In [72]:
# Enumerate the (source, strategy) pairs of scenario 1 in the order product() yields them.
list(product(SCENARIOS[1]["DATA_SOURCES"], stacking_strategies))

[('darknet', 'features'),
 ('darknet', 'idarkvec'),
 ('darknet', 'igcngru_features'),
 ('honeypot', 'features'),
 ('honeypot', 'idarkvec'),
 ('honeypot', 'igcngru_features')]

In [73]:
# Small random integer matrix (values 0 to 4) used to illustrate column-block slicing.
# A seeded generator keeps the displayed values identical across kernel restarts.
m = np.random.default_rng(0).integers(0, 5, size=(4, 9))
m

array([[2, 3, 2, 3, 0, 4, 2, 1, 4],
       [4, 3, 0, 2, 0, 2, 2, 2, 0],
       [1, 4, 1, 1, 4, 4, 1, 4, 3],
       [0, 2, 3, 3, 0, 0, 2, 4, 3]])

In [74]:
# Split the 9 columns of m into three consecutive 3-column blocks.
tuple(m[:, start:start + 3] for start in range(0, 9, 3))

(array([[2, 3, 2],
        [4, 3, 0],
        [1, 4, 1],
        [0, 2, 3]]),
 array([[3, 0, 4],
        [2, 0, 2],
        [1, 4, 4],
        [3, 0, 0]]),
 array([[2, 1, 4],
        [2, 2, 0],
        [1, 4, 3],
        [2, 4, 3]]))

In [75]:
def models_miss(data, label_to_idx, model, num_labels, sources, strategies):
    """Flag the test samples that every base model misses but the stacking model gets right.

    Parameters
    ----------
    data : mapping
        Must provide "X_test", "y_test" and "probas". "X_test" is read as
        consecutive blocks of ``num_labels`` columns, one block per
        (source, strategy) pair in ``product(sources, strategies)`` order.
        "probas" holds one row of stacking scores per test sample, with
        columns following ``model.classes_``.
        NOTE(review): assumes each X_test block is ordered like the labels
        of ``label_to_idx`` -- confirm against the code that builds X_test.
    label_to_idx : dict
        Maps a class name to its index in the full class list.
    model : object
        Stacking model; only its ``classes_`` attribute is used.
    num_labels : int
        Width of each base-model block in "X_test".
    sources, strategies : iterable of str
        Their cartesian product names the base models.

    Returns
    -------
    numpy.ndarray of bool
        True where all base models are wrong and the stacking model is
        right. With no base models the result is simply the stacking hits.

    Side effects
    ------------
    Prints the macro F1 of every base model, restricted to the labels
    present in "y_test".
    """
    # Read each array once: when `data` is the lazy NpzFile returned by
    # np.load, every data[...] access reads the array from disk again.
    x_test = data["X_test"]
    stacking_probas = data["probas"]
    y = np.array([label_to_idx[label] for label in data["y_test"]])

    base_models_preds = {
        f"{source}-{strat}": x_test[:, i * num_labels:(i + 1) * num_labels].argmax(axis=1)
        for i, (source, strat) in enumerate(product(sources, strategies))
    }

    # The stacking argmax indexes model.classes_, which may hold fewer classes
    # than the full list; translate it to indices of the full class list so it
    # is comparable with y.
    stacking_preds = np.array(
        [label_to_idx[model.classes_[p]] for p in stacking_probas.argmax(axis=1)]
    )

    present_labels = np.unique(y)
    macros = ' - '.join([
        f"{key}: {f1_score(y, preds, average='macro', labels=present_labels)}"
        for key, preds in base_models_preds.items()
    ])

    print(f"Macros: {macros}")

    # Start from all-True so an empty set of base models does not crash.
    all_base_miss = np.ones(y.shape, dtype=bool)
    for preds in base_models_preds.values():
        all_base_miss = all_base_miss & (preds != y)

    return all_base_miss & (stacking_preds == y)

In [76]:
# Samples of the loaded fold that only the stacking model classifies correctly (scenario 1).
win_backs = models_miss(
    data=data,
    label_to_idx=label_to_idx,
    model=model,
    num_labels=num_labels,
    sources=SCENARIOS[1]["DATA_SOURCES"],
    strategies=stacking_strategies,
)

Macros: darknet-features: 0.8736863664077485 - darknet-idarkvec: 0.8029123586484652 - darknet-igcngru_features: 0.8633535908308075 - honeypot-features: 0.7491893897837353 - honeypot-idarkvec: 0.9592981920321239 - honeypot-igcngru_features: 0.8436777250371058


In [77]:
# Number of cross-validation folds stored on disk for every scenario.
N_FOLDS = 10

# hits[case][fold] records the test samples that every base model misses and
# the stacking model classifies correctly, plus what is needed to inspect them.
hits = {}
for case, scenario in SCENARIOS.items():
    case_sources = scenario["DATA_SOURCES"]
    case_sources_sufix = '-'.join(case_sources)
    hits[case] = {}
    # Plain ints (not np.int64) are used as fold keys.
    for fold in range(N_FOLDS):
        base_dir = f"data/2022/output/{case_sources_sufix}/{STRATS_POSFIX}/{case}/stacking_data/{TARGET_DAY}/{fold}"
        # np.load returns an NpzFile that keeps the archive open until closed.
        # Load only the needed arrays and close it right away, so the loop does
        # not leak one file handle per fold. Local names are used so the
        # notebook-level `data` and `model` are not overwritten.
        with np.load(f"{base_dir}/data.npz", allow_pickle=True) as archive:
            fold_data = {key: archive[key] for key in ("X_test", "y_test", "probas")}
        fold_model = load(f"{base_dir}/lr.joblib")
        idxs = models_miss(
            fold_data,
            label_to_idx,
            fold_model,
            num_labels,
            case_sources,
            stacking_strategies,
        )
        hits[case][fold] = {
            "X": fold_data["X_test"][idxs],
            "y": fold_data["y_test"][idxs],
            # Positions of the selected samples inside the full test set.
            "idxs": np.flatnonzero(idxs),
            # Kept unfiltered so it can be indexed by the positions stored in "idxs".
            "stacking_probas": fold_data["probas"],
            "columns_labels": [
                f"{s}-{st}-{c}"
                for s, st, c in product(case_sources, stacking_strategies, probs_cols)
            ],
            "model": fold_model,
        }

Macros: darknet-features: 0.8736863664077485 - darknet-idarkvec: 0.8029123586484652 - darknet-igcngru_features: 0.8633535908308075 - honeypot-features: 0.7491893897837353 - honeypot-idarkvec: 0.9592981920321239 - honeypot-igcngru_features: 0.8436777250371058
Macros: darknet-features: 0.9255995893755141 - darknet-idarkvec: 0.8318751590554173 - darknet-igcngru_features: 0.880086403248168 - honeypot-features: 0.8290008139419326 - honeypot-idarkvec: 0.8558775543428809 - honeypot-igcngru_features: 0.8079657115371401
Macros: darknet-features: 0.8004948662807915 - darknet-idarkvec: 0.7754998008848801 - darknet-igcngru_features: 0.8749598853802777 - honeypot-features: 0.6434291177810175 - honeypot-idarkvec: 0.829351498071056 - honeypot-igcngru_features: 0.7500236354816222
Macros: darknet-features: 0.8180776522605607 - darknet-idarkvec: 0.7457629822205715 - darknet-igcngru_features: 0.8201776243714702 - honeypot-features: 0.6853186517407405 - honeypot-idarkvec: 0.8621734184138036 - honeypot-igc

In [78]:
# Inspect everything recorded for scenario 1, fold 1.
hits[1][1]

{'X': array([[0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.33333333, 0.        , 0.        , 0.        , 0.33333333,
         0.33333333, 0.        , 0.33333333, 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.33333333, 0.        , 0.        , 0.        ,
         0.33333333, 0.        , 0.66666667, 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.33333333, 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 1.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.33333333, 0.        , 0.        , 0.        ,
         0.        , 0.      

# Teste de sanidade.

In [79]:
# Test-set positions of the samples selected for scenario 1, fold 1.
hits[1][1]["idxs"]

array([21])

In [80]:
# True labels of the samples selected for scenario 1, fold 1.
hits[1][1]["y"]

array(['unk_spammer'], dtype=object)

In [82]:
# Test-set position of the sample under inspection.
TEST_IDX = 21
# Locate TEST_IDX among the stored positions and fetch the matching true label.
LABEL = hits[1][1]["y"][np.flatnonzero(hits[1][1]["idxs"] == TEST_IDX)[0]]
LABEL

'unk_spammer'

In [83]:
# The first num_labels columns belong to the first (source, strategy) pair.
# Strategies are sorted alphabetically, so that pair is darknet-features
# (not idarkvec, as the previous variable name suggested).
pred_model_features = hits[1][1]["X"][0][:num_labels].argmax(axis=0)
pred_model_features, probs_cols[pred_model_features]

(10, 'shodan')

In [84]:
# The second block of num_labels columns belongs to darknet-idarkvec, because
# strategies are sorted alphabetically (features, idarkvec, igcngru_features).
pred_model_idarkvec = hits[1][1]["X"][0][num_labels:num_labels*2].argmax(axis=0)
pred_model_idarkvec, probs_cols[pred_model_idarkvec]

(1, 'driftnet')

In [85]:
# The third block of num_labels columns belongs to darknet-igcngru_features.
pred_model_igcngru = np.argmax(hits[1][1]["X"][0][2 * num_labels : 3 * num_labels])
pred_model_igcngru, probs_cols[pred_model_igcngru]

(0, 'censys')

In [86]:
# Number of stacking scores stored for the inspected sample. The recorded
# output is (15,), the same count as the classes listed by model.classes_.
hits[1][1]["stacking_probas"][TEST_IDX].shape

(15,)

In [87]:
# Stacking prediction for the inspected sample: the argmax of its scores
# indexes the model's own classes_ array, not the full class list.
hits[1][1]["model"].classes_[hits[1][1]["stacking_probas"][TEST_IDX].argmax()]

'unk_spammer'

In [88]:
# Persist the collected hits (arrays and fitted models included) for later analysis.
with open("filtered_preds.pkl", 'wb') as out_file:
    pickle.dump(hits, out_file)

In [89]:
# Scratch list of the keys stored for every (case, fold) entry of `hits`.
# These bare string literals have no effect; only the last one is echoed as the cell output.
"X"
"y"
"idxs"
"stacking_probas"
"columns_labels"
"model"

'model'

In [90]:
# Read the pickle back to check the round trip. From here on `data` refers to
# the reloaded hits dictionary, not to the .npz archive loaded earlier.
# NOTE(review): pickle.load runs arbitrary code; only load files produced by this notebook.
with open("filtered_preds.pkl", "rb") as in_file:
    data = pickle.load(in_file)
data

{1: {0: {'X': array([], shape=(0, 96), dtype=float64),
   'y': array([], dtype=object),
   'idxs': array([], dtype=int64),
   'stacking_probas': array([[2.71013485e-04, 3.28981019e-04, 3.04158789e-04, ...,
           1.05855205e-04, 2.66451010e-04, 3.28291675e-03],
          [1.84011744e-04, 2.15591563e-04, 2.13805503e-04, ...,
           5.22614504e-05, 1.30817570e-04, 1.36764827e-03],
          [1.84011744e-04, 2.15591563e-04, 2.13805503e-04, ...,
           5.22614504e-05, 1.30817570e-04, 1.36764827e-03],
          ...,
          [1.84011744e-04, 2.15591563e-04, 2.13805503e-04, ...,
           5.22614504e-05, 1.30817570e-04, 1.36764827e-03],
          [1.84011744e-04, 2.15591563e-04, 2.13805503e-04, ...,
           5.22614504e-05, 1.30817570e-04, 1.36764827e-03],
          [1.84011744e-04, 2.15591563e-04, 2.13805503e-04, ...,
           5.22614504e-05, 1.30817570e-04, 1.36764827e-03]]),
   'columns_labels': ['darknet-features-censys',
    'darknet-features-driftnet',
    'darknet-fe

In [91]:
# Fold keys of scenario 1 and the per-fold entry keys in the reloaded structure.
data[1].keys(), data[1][1].keys()

(dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 dict_keys(['X', 'y', 'idxs', 'stacking_probas', 'columns_labels', 'model']))

In [92]:
# Column names recorded for scenario 2, fold 0, built as source-strategy-class.
data[2][0]["columns_labels"]

['darknet-features-censys',
 'darknet-features-driftnet',
 'darknet-features-internetcensus',
 'darknet-features-intrinsec',
 'darknet-features-ipip',
 'darknet-features-mirai',
 'darknet-features-onyphe',
 'darknet-features-rapid7',
 'darknet-features-securitytrails',
 'darknet-features-shadowserver',
 'darknet-features-shodan',
 'darknet-features-u_mich',
 'darknet-features-unk_bruteforcer',
 'darknet-features-unk_exploiter',
 'darknet-features-unk_spammer',
 'darknet-features-unknown',
 'darknet-idarkvec-censys',
 'darknet-idarkvec-driftnet',
 'darknet-idarkvec-internetcensus',
 'darknet-idarkvec-intrinsec',
 'darknet-idarkvec-ipip',
 'darknet-idarkvec-mirai',
 'darknet-idarkvec-onyphe',
 'darknet-idarkvec-rapid7',
 'darknet-idarkvec-securitytrails',
 'darknet-idarkvec-shadowserver',
 'darknet-idarkvec-shodan',
 'darknet-idarkvec-u_mich',
 'darknet-idarkvec-unk_bruteforcer',
 'darknet-idarkvec-unk_exploiter',
 'darknet-idarkvec-unk_spammer',
 'darknet-idarkvec-unknown',
 'darknet-ig

In [93]:
# Nesting of the reloaded structure: scenario -> fold -> entry keys.
data.keys(), data[1].keys(), data[1][1].keys()

(dict_keys([1, 2, 3, 4, 5]),
 dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 dict_keys(['X', 'y', 'idxs', 'stacking_probas', 'columns_labels', 'model']))