In [4]:
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
import pandas as pd
from joblib import dump
from os.path import join, dirname, realpath
import sys

DATA_DIR = join("data")

PCAP = {"1": join(DATA_DIR, "mega104-17-12-18.pcapng"),
        "2": join(DATA_DIR, "10122018-104Mega.pcapng"),
        "3": join(DATA_DIR, "10122018-104Mega-anomaly.pcapng")}

CSV = {"1": join(DATA_DIR, "mega104-17-12-18.pcapng.csv"),
       "2": join(DATA_DIR, "10122018-104Mega.pcapng.csv"),
       "3": join(DATA_DIR, "10122018-104Mega-anomaly.pcapng.scv")}

In [14]:
def create_model(num=1, nu=0.0188):
    iec104 = pd.read_csv(CSV[str(num)], header=0, skipinitialspace=True)

    iec104 = iec104.drop(columns=["relative_time_stamp"])
    x_train, x_test = train_test_split(iec104, train_size=2/3, test_size=1/3,
                                    shuffle=False, random_state=0)
    one_class_svm = OneClassSVM(nu=nu, kernel = 'rbf', gamma = 0.1).fit(x_train)
    dump(one_class_svm, f"{DATA_DIR}/one-class-svm.joblib")
    prediction = one_class_svm.predict(x_test)

    size = len(prediction)
    t = [i for i in prediction if i == -1]
    anomalies = len(t)
    t = [i for i in prediction if i == 1]
    ok = len(t)
    perc_anom = anomalies/size

    print(f"Nu is: {nu:.4f}")
    print(f"Total number of samples: {size}")
    print(f"Normal: {ok} ({100*(1-perc_anom):.2f}%)")
    print(f"Anomalies: {anomalies} ({100*perc_anom:.2f}%)")
    return [nu, perc_anom, 1 - perc_anom]

In [17]:
create_model(1)
create_model(2)

Nu is: 0.0188
Total number of samples: 12554
Normal: 12344 (98.33%)
Anomalies: 210 (1.67%)
Nu is: 0.0188
Total number of samples: 22126
Normal: 15500 (70.05%)
Anomalies: 6626 (29.95%)


[0.0188, 0.2994666907710386, 0.7005333092289614]

In [24]:
nu = 0.002
result_pd_1 = []
result_pd_2 = []
while nu < 0.031:
    entry = (nu, *create_model(1, nu=nu)[1:])
    result_pd_1.append(entry)
    entry = (nu, *create_model(2, nu=nu)[1:])
    result_pd_2.append(entry)
    nu += 0.002

df_1 = pd.DataFrame(result_pd_1, columns=["nu", "anomalies_1", "ok_1"])
df_2 = pd.DataFrame(result_pd_2, columns=["nu", "anomalies_2", "ok_2"])
df_1.to_csv("./data/pandas-df.csv")
df_2.to_csv("./data/pandas-df.csv")


Nu is: 0.002
Total number of samples: 12554
Normal: 11530 (91.84%)
Anomalies: 1024 (8.16%)
Nu is: 0.002
Total number of samples: 22126
Normal: 17333 (78.34%)
Anomalies: 4793 (21.66%)
Nu is: 0.004
Total number of samples: 12554
Normal: 11970 (95.35%)
Anomalies: 584 (4.65%)
Nu is: 0.004
Total number of samples: 22126
Normal: 18579 (83.97%)
Anomalies: 3547 (16.03%)
Nu is: 0.006
Total number of samples: 12554
Normal: 11999 (95.58%)
Anomalies: 555 (4.42%)
Nu is: 0.006
Total number of samples: 22126
Normal: 20548 (92.87%)
Anomalies: 1578 (7.13%)
Nu is: 0.008
Total number of samples: 12554
Normal: 12012 (95.68%)
Anomalies: 542 (4.32%)
Nu is: 0.008
Total number of samples: 22126
Normal: 16386 (74.06%)
Anomalies: 5740 (25.94%)
Nu is: 0.01
Total number of samples: 12554
Normal: 12025 (95.79%)
Anomalies: 529 (4.21%)
Nu is: 0.01
Total number of samples: 22126
Normal: 21824 (98.64%)
Anomalies: 302 (1.36%)
Nu is: 0.012
Total number of samples: 12554
Normal: 11913 (94.89%)
Anomalies: 641 (5.11%)
Nu i

In [39]:
df_1_sort = df_1.sort_values(by=['ok_1'], ascending=False).reset_index(drop=True)
df_2_sort = df_2.sort_values(by=["ok_2"], ascending=False).reset_index(drop=True)

In [51]:
for i in range(0,len(df_1_sort)):
    e_1 = df_1_sort.iloc[i]
    e_2 = df_2_sort.iloc[i]    
    if e_1["nu"] == e_2["nu"]
    print(e_1["nu"])
    

0.016
0.020000000000000004
0.022000000000000006
0.024000000000000007
0.02800000000000001
0.014
0.030000000000000013
0.018000000000000002
0.02600000000000001
0.01
0.008
0.006
0.004
0.012
0.002
