In [81]:
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
import pandas as pd
from joblib import dump
from os.path import join, dirname, realpath
import sys
import numpy as np

# Directory (relative to the notebook) that holds the captures and derived files.
DATA_DIR = join("data")

# Packet captures, keyed by the dataset number used throughout the notebook.
PCAP = {"1": join(DATA_DIR, "mega104-17-12-18.pcapng"),
        "2": join(DATA_DIR, "10122018-104Mega.pcapng"),
        "3": join(DATA_DIR, "10122018-104Mega-anomaly.pcapng")}

# CSV file for each capture: the capture path with ".csv" appended.
# Deriving the paths from PCAP keeps both tables in sync and rules out
# per-entry extension typos.
CSV = {key: f"{pcap_path}.csv" for key, pcap_path in PCAP.items()}

In [114]:
def create_model(num=1, nu=0.0188, gamma=0.1):
    """Train a One-Class SVM on one dataset's CSV and report its anomaly rate.

    The CSV registered under ``CSV[str(num)]`` is loaded, the
    ``relative_time_stamp`` and ``io_type`` columns are dropped, and the
    remaining rows are split without shuffling: the first two thirds train
    the model and the last third is scored. The fitted model is saved to
    ``<DATA_DIR>/one-class-svm-<num>.joblib`` and a short summary is printed.

    Parameters
    ----------
    num : int or str
        Key of the dataset in ``CSV`` (converted with ``str``).
    nu : float
        ``nu`` parameter passed to ``OneClassSVM``.
    gamma : float
        RBF kernel coefficient passed to ``OneClassSVM``.

    Returns
    -------
    list
        ``[nu, anomaly_fraction, normal_fraction]`` measured on the held-out
        third, where ``normal_fraction == 1 - anomaly_fraction``.
    """
    iec104 = pd.read_csv(CSV[str(num)], header=0, skipinitialspace=True)
    features = iec104.drop(columns=["relative_time_stamp", "io_type"])

    # shuffle=False keeps the original row order, so the split is already
    # deterministic and no random_state is needed.
    x_train, x_test = train_test_split(features, train_size=2/3, test_size=1/3,
                                       shuffle=False)

    one_class_svm = OneClassSVM(nu=nu, kernel="rbf", gamma=gamma).fit(x_train)
    dump(one_class_svm, join(DATA_DIR, f"one-class-svm-{num}.joblib"))
    prediction = one_class_svm.predict(x_test)

    # predict() labels samples -1 (anomaly) or 1 (normal); count each label
    # with a vectorised comparison instead of a Python-level loop.
    size = len(prediction)
    anomalies = int(np.count_nonzero(prediction == -1))
    ok = int(np.count_nonzero(prediction == 1))
    perc_anom = anomalies / size

    print(f"Nu is: {nu:.4f}")
    print(f"Total number of samples: {size}")
    print(f"Normal: {ok} ({100*(1-perc_anom):.2f}%)")
    print(f"Anomalies: {anomalies} ({100*perc_anom:.2f}%)")
    return [nu, perc_anom, 1 - perc_anom]

In [115]:
# Baseline run: fit and score datasets 1 and 2 with the same nu.
nu = 0.02
create_model(num=1, nu=nu)
create_model(num=2, nu=nu)

Nu is: 0.0200
Total number of samples: 12554
Normal: 12294 (97.93%)
Anomalies: 260 (2.07%)
Nu is: 0.0200
Total number of samples: 22126
Normal: 17237 (77.90%)
Anomalies: 4889 (22.10%)


[0.02, 0.2209617644400253, 0.7790382355599748]

In [116]:
# Sweep nu from 0.002 to 0.030 in steps of 0.002 for datasets 1 and 2.
# The grid is built from integer steps and rounded, so the values do not pick
# up the floating-point drift that repeatedly adding 0.002 would accumulate.
result_pd_1 = []
result_pd_2 = []
for step in range(1, 16):
    nu = round(0.002 * step, 3)
    # create_model returns [nu, anomaly_fraction, normal_fraction]; keep the
    # two fractions alongside the nu that produced them.
    result_pd_1.append((nu, *create_model(1, nu=nu)[1:]))
    result_pd_2.append((nu, *create_model(2, nu=nu)[1:]))

df_1 = pd.DataFrame(result_pd_1, columns=["nu", "anomalies_1", "ok_1"])
df_2 = pd.DataFrame(result_pd_2, columns=["nu", "anomalies_2", "ok_2"])

# Each frame gets its own file so one result set cannot overwrite the other.
df_1.to_csv(join(DATA_DIR, "pandas-df-1.csv"))
df_2.to_csv(join(DATA_DIR, "pandas-df-2.csv"))

# Rank the nu values by the fraction of samples classified as normal.
df_1_sort = df_1.sort_values(by=['ok_1'], ascending=False).reset_index(drop=True)
df_2_sort = df_2.sort_values(by=["ok_2"], ascending=False).reset_index(drop=True)

Nu is: 0.0020
Total number of samples: 12554
Normal: 11297 (89.99%)
Anomalies: 1257 (10.01%)
Nu is: 0.0020
Total number of samples: 22126
Normal: 16747 (75.69%)
Anomalies: 5379 (24.31%)
Nu is: 0.0040
Total number of samples: 12554
Normal: 12237 (97.47%)
Anomalies: 317 (2.53%)
Nu is: 0.0040
Total number of samples: 22126
Normal: 16215 (73.28%)
Anomalies: 5911 (26.72%)
Nu is: 0.0060
Total number of samples: 12554
Normal: 12146 (96.75%)
Anomalies: 408 (3.25%)
Nu is: 0.0060
Total number of samples: 22126
Normal: 16244 (73.42%)
Anomalies: 5882 (26.58%)
Nu is: 0.0080
Total number of samples: 12554
Normal: 12191 (97.11%)
Anomalies: 363 (2.89%)
Nu is: 0.0080
Total number of samples: 22126
Normal: 16932 (76.53%)
Anomalies: 5194 (23.47%)
Nu is: 0.0100
Total number of samples: 12554
Normal: 12402 (98.79%)
Anomalies: 152 (1.21%)
Nu is: 0.0100
Total number of samples: 22126
Normal: 20335 (91.91%)
Anomalies: 1791 (8.09%)
Nu is: 0.0120
Total number of samples: 12554
Normal: 12298 (97.96%)
Anomalies: 

In [113]:
# Order each sweep result from the highest to the lowest fraction of samples
# classified as normal, renumber the rows, and show both rankings.
df_1_sort = df_1.sort_values(by="ok_1", ascending=False).reset_index(drop=True)
df_2_sort = df_2.sort_values(by="ok_2", ascending=False).reset_index(drop=True)
for ranking in (df_1_sort, df_2_sort):
    print(ranking)

       nu  anomalies_1      ok_1
0   0.004     0.178987  0.821013
1   0.020     0.188864  0.811136
2   0.014     0.191891  0.808109
3   0.016     0.197069  0.802931
4   0.012     0.198582  0.801418
5   0.026     0.200653  0.799347
6   0.008     0.204716  0.795284
7   0.018     0.205273  0.794727
8   0.010     0.214035  0.785965
9   0.022     0.217540  0.782460
10  0.030     0.221443  0.778557
11  0.024     0.227179  0.772821
12  0.028     0.229648  0.770352
13  0.002     0.239764  0.760236
14  0.006     0.321889  0.678111
       nu  anomalies_2      ok_2
0   0.020     0.015231  0.984769
1   0.006     0.021332  0.978668
2   0.030     0.025988  0.974012
3   0.022     0.032044  0.967956
4   0.008     0.048585  0.951415
5   0.012     0.081488  0.918512
6   0.018     0.090165  0.909835
7   0.028     0.092380  0.907620
8   0.014     0.094911  0.905089
9   0.002     0.105396  0.894604
10  0.024     0.189460  0.810540
11  0.010     0.210567  0.789433
12  0.016     0.222046  0.777954
13  0.004 

In [105]:
# Absolute gap between the two datasets' anomaly fractions, compared row by row.
# NOTE(review): df_1_sort and df_2_sort are sorted independently, so row i of
# each frame can belong to a different nu — confirm that pairing by rank
# (rather than by nu) is what is intended here.
difs = [
    abs(df_1_sort.iloc[i]['anomalies_1'] - df_2_sort.iloc[i]['anomalies_2'])
    for i in range(len(df_1_sort))
]
    
    

In [111]:

# Convert the gaps to an array, show them, and locate the smallest one.
difs = np.array(difs)
print(difs)

# np.nonzero on the boolean mask yields a tuple holding the index array of
# every position that equals the minimum.
smallest_gap = difs.min()
a = np.nonzero(difs == smallest_gap)



[0.16375583 0.16753174 0.1659035  0.16502491 0.14999675 0.11916534
 0.11455021 0.11289321 0.1191244  0.11214386 0.031983   0.01661183
 0.00760146 0.05599643 0.01351919]
(array([12]),)
