In [92]:
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
import pandas as pd
from joblib import dump
from os.path import join
import numpy as np
import pyshark
from traceback import print_exc
import csv
import concurrent.futures
from joblib import load
import nest_asyncio

DATA_DIR = join("data")
PCAP_D = join(DATA_DIR, "pcap")
CSV_D = join(DATA_DIR, "csv")
MODELS_D = join(DATA_DIR, "models")

PCAP = {"1": join(PCAP_D, "mega104-17-12-18.pcapng"),
        "2": join(PCAP_D, "10122018-104Mega.pcapng"),
        "3": join(PCAP_D, "10122018-104Mega-anomaly.pcapng")}

CSV = {"1": join(CSV_D, "mega104-17-12-18.csv"),
       "2": join(CSV_D, "10122018-104Mega.csv"),
       "3": join(CSV_D, "10122018-104Mega-anomaly.csv")}
nest_asyncio.apply()

In [40]:
def parse(num=1):
    pcap_file = PCAP[str(num)]
    print(f"Reading from {pcap_file}")
    packets = pyshark.FileCapture(pcap_file)

    parsed_data = [("asdu_len", "io_type", "type_id", "src", "dst", "interval", "relative_time_stamp")]
    
    previous = 0
    first_time_stamp = packets[0].sniff_time
    relative_time = 0
    interval = 0
    hosts = {}
    next_index = 0
    for p in packets:
        if "iec60870_104" not in [l.layer_name for l in p.layers]:
            continue
        
        # Count time from the previous IEC 104 packet
        if previous != 0:
            interval = float((p.sniff_time - previous).total_seconds())
            relative_time = (p.sniff_time - first_time_stamp).total_seconds()
        if p.ip.src not in hosts.keys():
            hosts[p.ip.src] = next_index
            next_index += 1
        if p.ip.dst not in hosts.keys():
            hosts[p.ip.dst] = next_index
            next_index += 1
        
        src = hosts[p.ip.src]
        dst = hosts[p.ip.dst]
        
        previous = p.sniff_time
        # Extract only one 'representative' for the current package
        asdu_layer = p.get_multiple_layers("iec60870_asdu")
        if len(asdu_layer) == 0:
            continue
        asdu_layer = asdu_layer[0]

        iec_header_layer = p.get_multiple_layers("iec60870_104")
        # Aggregate values if more then one header is present in the packet
        iec_header = iec_header_layer[0]
        try:
            iec_header.apdulen = int(iec_header.apdulen)
        except AttributeError:
            # Not all APDU has valid apdulen attribute. Those packets in
            # Wireshark displayed as a byte sequence, so this packet can
            # be parsed
            print("Error in converting the value in packet")
            print_exc()
            print(p)
            continue

        if len(iec_header_layer) != 1:
            for entry in iec_header_layer[1:]:
                iec_header.apdulen += int(entry.apdulen)

        try:
            if asdu_layer:
                parsed_data.append((iec_header.apdulen, asdu_layer.ioa, asdu_layer.typeid, src, dst, interval, relative_time))
        except:
            # Ignoring error if data can't be appended for some reasons.
            print("Error in parsing the packet")
            print_exc()
            print(p)

    with open(CSV[str(num)], "w") as f:
        writer = csv.writer(f)
        writer.writerows(parsed_data)

    print(f"CSV file is stored into {CSV[str(num)]}")


In [None]:
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        executor.map(parse, [1, 2, 3])

In [90]:
def predict(model, nu, dataset):
    model = join(MODELS_D, f"one-class-svm-{model}-nu-{nu}.joblib")

    svm = load(model)
    data = pd.read_csv(dataset).drop(columns=["relative_time_stamp"])

    prediction = svm.predict(data)
    size = len(prediction)
    t = [i for i in prediction if i == -1]
    anomalies = len(t)
    t = [i for i in prediction if i == 1]
    ok = len(t)
    perc_anom = anomalies/size

    print("-"*(len(f"Total number of samples: {size}") + 2))
    print(f"Datset: {dataset}")
    print(f"Total number of samples: {size}")
    print(f"Normal: {ok} ({100*(1-perc_anom):.2f}%)")
    print(f"Anomalies: {anomalies} ({100*perc_anom:.2f}%)")
    print("-"*(len(f"Total number of samples: {size}") + 2))



In [75]:
def create_model(num=1, nu=0.018):
    iec104 = pd.read_csv(CSV[str(num)], header=0, skipinitialspace=True)

    iec104 = iec104.drop(columns=["relative_time_stamp"])
    x_train, x_test = train_test_split(iec104, train_size=2/3, test_size=1/3,
                                    shuffle=False, random_state=0)
    one_class_svm = OneClassSVM(nu=nu, kernel = 'rbf', gamma = 0.1).fit(x_train)
    dump(one_class_svm, f"{DATA_DIR}/models/one-class-svm-{num}-nu-{nu:.3f}.joblib")
    prediction = one_class_svm.predict(x_test)

    size = len(prediction)
    t = [i for i in prediction if i == -1]
    anomalies = len(t)
    t = [i for i in prediction if i == 1]
    ok = len(t)
    perc_anom = anomalies/size
    
    print("-"*(len(f"Datset: {CSV[str(num)]}") + 2))
    print(f"Datset: {CSV[str(num)]}")
    print(f"Nu is: {nu:.4f}")
    print(f"Total number of samples: {size}")
    print(f"Normal: {ok} ({100*(1-perc_anom):.2f}%)")
    print(f"Anomalies: {anomalies} ({100*perc_anom:.2f}%)")
    print("-"*(len(f"Datset: {CSV[str(num)]}") + 2))

    return [nu, perc_anom, 1 - perc_anom]

In [76]:
# One-shot training
nu = 0.017
create_model(1, nu)
create_model(2, nu)

---------------------------------------
Datset: data/csv/mega104-17-12-18.csv
Nu is: 0.0170
Total number of samples: 12554
Normal: 12261 (97.67%)
Anomalies: 293 (2.33%)
---------------------------------------
---------------------------------------
Datset: data/csv/10122018-104Mega.csv
Nu is: 0.0170
Total number of samples: 22126
Normal: 21712 (98.13%)
Anomalies: 414 (1.87%)
---------------------------------------


[0.017, 0.018711018711018712, 0.9812889812889813]

In [78]:
# Check different Nu parameter for the training of the data
nu = 0.002
result_pd_1 = []
result_pd_2 = []
nus = np.arange(0.013, 0.031, 0.001)

def train(nu):
    entry = (nu, *create_model(1, nu=nu)[1:])
    result_pd_1.append(entry)
    entry = (nu, *create_model(2, nu=nu)[1:])
    result_pd_2.append(entry)

with concurrent.futures.ThreadPoolExecutor(max_workers=len(nus)) as executor:
        executor.map(train, nus)


df_1 = pd.DataFrame(result_pd_1, columns=["nu", "anomalies_1", "ok_1"])
df_2 = pd.DataFrame(result_pd_2, columns=["nu", "anomalies_2", "ok_2"])
df_1.to_csv(join(CSV_D, "pandas-df-1.csv"))
df_2.to_csv(join(CSV_D, "pandas-df-2.csv"))
df_1_sort = df_1.sort_values(by=['ok_1'], ascending=False).reset_index(drop=True)
df_2_sort = df_2.sort_values(by=["ok_2"], ascending=False).reset_index(drop=True)

---------------------------------------
Datset: data/csv/mega104-17-12-18.csv
Nu is: 0.0130
Total number of samples: 12554
Normal: 12157 (96.84%)
Anomalies: 397 (3.16%)
---------------------------------------
---------------------------------------
Datset: data/csv/mega104-17-12-18.csv
Nu is: 0.0150
Total number of samples: 12554
Normal: 12149 (96.77%)
Anomalies: 405 (3.23%)
---------------------------------------
---------------------------------------
Datset: data/csv/mega104-17-12-18.csv
Nu is: 0.0140
Total number of samples: 12554
Normal: 12231 (97.43%)
Anomalies: 323 (2.57%)
---------------------------------------
---------------------------------------
Datset: data/csv/mega104-17-12-18.csv
Nu is: 0.0160
Total number of samples: 12554
Normal: 12338 (98.28%)
Anomalies: 216 (1.72%)
---------------------------------------
---------------------------------------
Datset: data/csv/mega104-17-12-18.csv
Nu is: 0.0170
Total number of samples: 12554
Normal: 12261 (97.67%)
Anomalies: 293 (2.

In [79]:
df_1_sort = df_1.sort_values(by=['ok_1'], ascending=False).reset_index(drop=True)
df_2_sort = df_2.sort_values(by=["ok_2"], ascending=False).reset_index(drop=True)

ok_1 = df_1_sort.loc[(df_1_sort["ok_1"] > 0.97)]
ok_2 = df_2_sort.loc[(df_2_sort["ok_2"] > 0.97)]

merged = pd.merge(left=ok_1, right=ok_2, how="right", left_on="nu", right_on="nu")
print(merged)


      nu  anomalies_1      ok_1  anomalies_2      ok_2
0  0.013          NaN       NaN     0.008090  0.991910
1  0.021     0.023260  0.976740     0.016632  0.983368
2  0.017     0.023339  0.976661     0.018711  0.981289


In [154]:
data = pd.read_csv(CSV["2"])
data["relative_time_stamp"] = pd.to_datetime(data["relative_time_stamp"], format="%f")
# data = data.set_index("relative_time_stamp")
print(data)

       asdu_len  io_type  type_id  src  dst  interval  \
0            17    65537      122    0    1  0.000000   
1            19    65537      120    1    0  0.000595   
2            17    65537      122    0    1  0.000119   
3            20    65537      121    1    0  0.000338   
4            17    65537      122    0    1  0.062174   
...         ...      ...      ...  ...  ...       ...   
66372       533    65537      125    1    0  0.119856   
66373        25       31       36    1    0  0.000348   
66374        17    65537      124    0    1  0.000470   
66375        18    65537      123    1    0  0.000341   
66376        17    65537      124    0    1  0.000164   

             relative_time_stamp  
0     1900-01-01 00:00:00.000000  
1     1900-01-01 00:00:00.000000  
2     1900-01-01 00:00:00.000000  
3     1900-01-01 00:00:00.000000  
4     1900-01-01 00:00:00.000000  
...                          ...  
66372 1900-01-01 00:00:00.176010  
66373 1900-01-01 00:00:00.176010  


In [161]:
# split to intervals for 5 minutes
for i, frame in enumerate(data.groupby([data['relative_time_stamp'],pd.Grouper(key="relative_time_stamp",freq='Min')])):
    frame[1].to_csv(join(CSV_D, "intervals", f"frame-{i}.csv"))

In [98]:
# predict(1, 0.021)
predict(2, 0.021, CSV["2"])
predict(2, 0.021, CSV["3"])

--------------------------------
Datset: data/csv/10122018-104Mega.csv
Total number of samples: 66377
Normal: 64846 (97.69%)
Anomalies: 1531 (2.31%)
--------------------------------
--------------------------------
Datset: data/csv/10122018-104Mega-anomaly.csv
Total number of samples: 66377
Normal: 64826 (97.66%)
Anomalies: 1551 (2.34%)
--------------------------------
