In [12]:
import dask.dataframe as dd
import numpy as np
from dask_ml.cluster import KMeans
from dask_ml.metrics import accuracy_score
from dask_ml.model_selection import KFold

In [13]:
# Devices whose Mirai attack captures are loaded
# (one CSV per attack under <device>/mirai_attacks/).
mirai_devices = [
    'danmini_doorbell',
    'ecobee_thermostat',
    'philips_B120N10_baby_monitor',
    'provision_PT_737E_security_camera',
    'provision_PT_838_security_camera',
    'simplehome_XCS_1002_WHT_security_camera',
    'simplehome_XCS_1003_WHT_security_camera',
]

# Devices whose benign_traffic.csv is loaded. This is a superset of
# `mirai_devices`: it additionally contains 'ennio_doorbell' and
# 'samsung_SNH_1011_N_webcam'.
benign = [
    'danmini_doorbell',
    'ecobee_thermostat',
    'ennio_doorbell',
    'philips_B120N10_baby_monitor',
    'provision_PT_737E_security_camera',
    'provision_PT_838_security_camera',
    'samsung_SNH_1011_N_webcam',
    'simplehome_XCS_1002_WHT_security_camera',
    'simplehome_XCS_1003_WHT_security_camera',
]

# Mirai attack types. The position of each entry in this list is used as the
# integer class label when the captures are loaded, so do not reorder it.
mirai_attacks = [
    'ack',       # ack flood
    'scan',      # automatic scan for vulnerable devices
    'syn',       # syn flood
    'udp',       # udp flood
    'udpplain',  # optimized udp flood
]

In [14]:
# Root directory of the per-device CSV captures.
DATA_ROOT = '../data/n_balo_t'
# Seed for the random splits below so that "Restart Kernel -> Run All"
# reproduces the same samples.
SPLIT_SEED = 42

# Lazily read every (device, attack) capture and tag it with the attack's
# position in `mirai_attacks` as its integer class label (0..4).
attack_frames = [
    dd.read_csv(f'{DATA_ROOT}/{device}/mirai_attacks/{attack}.csv').assign(label=label)
    for device in mirai_devices
    for label, attack in enumerate(mirai_attacks)
]
# A single dd.concat replaces the chain of DataFrame.append calls: append is
# deprecated in dask (following pandas) and each call grew the graph again.
mirai_data = dd.concat(attack_frames)

# Benign traffic from every device is labelled 100 to keep it apart from
# the attack labels.
benign_data = dd.concat(
    [dd.read_csv(f'{DATA_ROOT}/{device}/benign_traffic.csv') for device in benign]
).assign(label=100)

# From here on `mirai_data` holds attack AND benign rows (name kept for
# compatibility with the rest of the notebook).
mirai_data = dd.concat([mirai_data, benign_data])

# Split with shuffle=True and stitch the halves back together; this is used
# purely as a way to shuffle rows before sampling.
mirai_data_one, mirai_data_two = mirai_data.random_split(
    [0.5, 0.5], random_state=SPLIT_SEED, shuffle=True
)
data = dd.concat([mirai_data_one, mirai_data_two])

# 85% / 15% sample; the cross-validation cell works on `small_sample`.
big_sample, small_sample = data.random_split([0.85, 0.15], random_state=SPLIT_SEED)

In [None]:
N_SPLITS = 10    # number of cross-validation folds
N_CLUSTERS = 5   # number of K-Means clusters
CV_SEED = 42     # seeds both the fold shuffling and the K-Means initialisation


def _majority_label_lookup(cluster_ids, labels, n_clusters):
    """Map every cluster index to the most frequent true label among its members.

    Parameters
    ----------
    cluster_ids : numpy.ndarray
        1-D array of cluster indices in ``range(n_clusters)``, one per sample.
    labels : numpy.ndarray
        1-D array of true class labels, aligned with ``cluster_ids``.
    n_clusters : int
        Total number of clusters.

    Returns
    -------
    numpy.ndarray
        Float array of length ``n_clusters``; entry ``c`` is the majority label
        of cluster ``c``, or NaN when no sample was assigned to that cluster
        (NaN never equals a real label, so such predictions count as wrong).
    """
    lookup = np.full(n_clusters, np.nan)
    for cluster in range(n_clusters):
        members = labels[cluster_ids == cluster]
        if members.size:
            values, counts = np.unique(members, return_counts=True)
            lookup[cluster] = values[np.argmax(counts)]
    return lookup


cv = KFold(n_splits=N_SPLITS, shuffle=True, random_state=CV_SEED)
total = 0
fold_predictions = []    # (raw cluster predictions, true labels) per fold
fold_label_lookups = []  # cluster index -> majority label table per fold
clusters = []
# The last column is the `label` column added during loading; everything
# before it is treated as a feature.
data = small_sample.to_dask_array(lengths=True)
for train_i, test_i in cv.split(data):
    train_rows, test_rows = data[train_i], data[test_i]
    X_train, y_train = train_rows[:, :-1], train_rows[:, -1]
    X_test, y_test = test_rows[:, :-1], test_rows[:, -1]

    model = KMeans(n_clusters=N_CLUSTERS, init='k-means||', random_state=CV_SEED)
    model.fit(X_train)
    y_pred = model.predict(X_test)

    # K-Means cluster indices are arbitrary, so comparing them directly with
    # the class labels is meaningless. Name each cluster after the majority
    # label of its TRAINING members and score the translated predictions.
    # NOTE(review): there are six classes (labels 0..4 plus 100) but only
    # N_CLUSTERS = 5 clusters, so at least one class can never be predicted.
    lookup = _majority_label_lookup(
        model.predict(X_train).compute(), y_train.compute(), N_CLUSTERS
    )
    # `table=lookup` binds this fold's table now; the mapping itself is lazy.
    y_pred_labels = y_pred.map_blocks(
        lambda block, table=lookup: table[block], dtype=lookup.dtype
    )

    fold_predictions.append((y_pred, y_test))
    fold_label_lookups.append(lookup)
    # NOTE(review): this keeps only the LAST fold's centers (original
    # behaviour); the empty-list initialisation suggests append was intended.
    clusters = model.cluster_centers_

    accuracy = accuracy_score(y_test, y_pred_labels)
    print(accuracy)
    total += accuracy
# Mean accuracy over the folds (previously divided by a hard-coded 10).
print(total / N_SPLITS)