In [1]:
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import pandas as pd
from joblib import dump
from os.path import join, basename
import numpy as np
import pyshark
from traceback import print_exc
import csv
import concurrent.futures
from joblib import load
import nest_asyncio
from os import makedirs, listdir

DATA_DIR = join("data")
PCAP_D = join(DATA_DIR, "pcap")
CSV_D = join(DATA_DIR, "csv")
MODELS_D = join(DATA_DIR, "models")
INTERVALS_D = join(CSV_D, "intervals")
ANOMALIES_D = join(CSV_D, "anomalies")

for d in (DATA_DIR, PCAP_D, CSV_D, MODELS_D, INTERVALS_D, ANOMALIES_D):
        makedirs(d, exist_ok=True)

PCAP = {"1": join(PCAP_D, "mega104-17-12-18.pcapng"),
        "2": join(PCAP_D, "10122018-104Mega.pcapng"),
        "3": join(PCAP_D, "10122018-104Mega-anomaly.pcapng")}

CSV = {"1": join(CSV_D, "mega104-17-12-18.csv"),
       "2": join(CSV_D, "10122018-104Mega.csv"),
       "3": join(CSV_D, "10122018-104Mega-anomaly.csv")}

nest_asyncio.apply()

In [2]:
def parse(num=1):
    pcap_file = PCAP[str(num)]
    print(f"Reading from {pcap_file}")
    packets = pyshark.FileCapture(pcap_file)

    parsed_data = [("asdu_len", "io_type", "type_id", "src", "dst", "interval", "relative_time_stamp")]
    
    previous = 0
    first_time_stamp = packets[0].sniff_time
    relative_time = 0
    interval = 0
    hosts = {}
    next_index = 0
    for p in packets:
        if "iec60870_104" not in [l.layer_name for l in p.layers]:
            continue
        
        # Count time from the previous IEC 104 packet
        if previous != 0:
            interval = float((p.sniff_time - previous).total_seconds())
            relative_time = (p.sniff_time - first_time_stamp).total_seconds()
        if p.ip.src not in hosts.keys():
            hosts[p.ip.src] = next_index
            next_index += 1
        if p.ip.dst not in hosts.keys():
            hosts[p.ip.dst] = next_index
            next_index += 1
        
        src = hosts[p.ip.src]
        dst = hosts[p.ip.dst]
        
        previous = p.sniff_time
        # Extract only one 'representative' for the current package
        asdu_layer = p.get_multiple_layers("iec60870_asdu")
        if len(asdu_layer) == 0:
            continue
        asdu_layer = asdu_layer[0]

        iec_header_layer = p.get_multiple_layers("iec60870_104")
        # Aggregate values if more then one header is present in the packet
        iec_header = iec_header_layer[0]
        try:
            iec_header.apdulen = int(iec_header.apdulen)
        except AttributeError:
            # Not all APDU has valid apdulen attribute. Those packets in
            # Wireshark displayed as a byte sequence, so this packet can
            # be parsed
            print("Error in converting the value in packet")
            print_exc()
            print(p)
            continue

        if len(iec_header_layer) != 1:
            for entry in iec_header_layer[1:]:
                iec_header.apdulen += int(entry.apdulen)

        try:
            if asdu_layer:
                parsed_data.append((iec_header.apdulen, asdu_layer.ioa, asdu_layer.typeid, src, dst, interval, relative_time))
        except:
            # Ignoring error if data can't be appended for some reasons.
            print("Error in parsing the packet")
            print_exc()
            print(p)

    with open(CSV[str(num)], "w") as f:
        writer = csv.writer(f)
        writer.writerows(parsed_data)

    print(f"CSV file is stored into {CSV[str(num)]}")


In [47]:
# with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
#         executor.map(parse, [1, 2])
parse(1)
parse(2)

Reading from data/pcap/mega104-17-12-18.pcapng


Exception ignored in: <function Capture.__del__ at 0x7fdc2ca0f790>
Traceback (most recent call last):
  File "/home/xyadlo00/studies/FIT/MITAI/1-rocnik/letni/PDS/proj/pds-env/lib/python3.8/site-packages/pyshark/capture/capture.py", line 445, in __del__
    self.close()
  File "/home/xyadlo00/studies/FIT/MITAI/1-rocnik/letni/PDS/proj/pds-env/lib/python3.8/site-packages/pyshark/capture/capture.py", line 436, in close
    self.eventloop.run_until_complete(self.close_async())
  File "/home/xyadlo00/studies/FIT/MITAI/1-rocnik/letni/PDS/proj/pds-env/lib/python3.8/site-packages/nest_asyncio.py", line 81, in run_until_complete
    return f.result()
  File "/usr/lib/python3.8/asyncio/futures.py", line 178, in result
    raise self._exception
  File "/usr/lib/python3.8/asyncio/tasks.py", line 280, in __step
    result = coro.send(None)
  File "/home/xyadlo00/studies/FIT/MITAI/1-rocnik/letni/PDS/proj/pds-env/lib/python3.8/site-packages/pyshark/capture/capture.py", line 440, in close_async
    awa

KeyboardInterrupt: 

In [3]:
def predict(model, nu, dataset, out=True):
    model = join(MODELS_D, f"one-class-svm-{model}-nu-{nu}.joblib")

    svm = load(model)
    data = pd.read_csv(dataset).drop(columns=["relative_time_stamp"])

    prediction = svm.predict(data)
    size = len(prediction)
    t = [i for i in prediction if i == -1]
    anomalies = len(t)
    t = [i for i in prediction if i == 1]
    ok = len(t)
    perc_anom = anomalies/size

    if out:
        print("-"*(len(f"Total number of samples: {size}") + 2))
        print(f"Datset: {dataset}")
        print(f"Total number of samples: {size}")
        print(f"Normal: {ok} ({100*(1-perc_anom):.2f}%)")
        print(f"Anomalies: {anomalies} ({100*perc_anom:.2f}%)")
        print("-"*(len(f"Total number of samples: {size}") + 2))
    return prediction



In [4]:
def create_model(num=1, nu=0.018):
    iec104 = pd.read_csv(CSV[str(num)], header=0, skipinitialspace=True)

    iec104 = iec104.drop(columns=["relative_time_stamp"])
    x_train, x_test = train_test_split(iec104, train_size=2/3, test_size=1/3,
                                    shuffle=False, random_state=0)
    one_class_svm = OneClassSVM(nu=nu, kernel = 'rbf', gamma = 0.1).fit(x_train)
    dump(one_class_svm, f"{DATA_DIR}/models/one-class-svm-{num}-nu-{nu:.3f}.joblib")
    prediction = one_class_svm.predict(x_test)

    size = len(prediction)
    t = [i for i in prediction if i == -1]
    anomalies = len(t)
    t = [i for i in prediction if i == 1]
    ok = len(t)
    perc_anom = anomalies/size
    
    print("-"*(len(f"Datset: {CSV[str(num)]}") + 2))
    print(f"Datset: {CSV[str(num)]}")
    print(f"Nu is: {nu:.4f}")
    print(f"Total number of samples: {size}")
    print(f"Normal: {ok} ({100*(1-perc_anom):.2f}%)")
    print(f"Anomalies: {anomalies} ({100*perc_anom:.2f}%)")
    print("-"*(len(f"Datset: {CSV[str(num)]}") + 2))

    return [nu, perc_anom, 1 - perc_anom]

In [11]:
# One-shot training
nu = 0.017
create_model(1, nu)
create_model(2, nu)

---------------------------------------
Datset: data/csv/mega104-17-12-18.csv
Nu is: 0.0170
Total number of samples: 12554
Normal: 12261 (97.67%)
Anomalies: 293 (2.33%)
---------------------------------------
---------------------------------------
Datset: data/csv/10122018-104Mega.csv
Nu is: 0.0170
Total number of samples: 22126
Normal: 21712 (98.13%)
Anomalies: 414 (1.87%)
---------------------------------------


[0.017, 0.018711018711018712, 0.9812889812889813]

In [97]:
# Check different Nu parameter for the training of the data
nu = 0.002
result_pd_1 = []
result_pd_2 = []
nus = np.arange(0.013, 0.028, 0.001)

def train(nu):
    entry = (nu, *create_model(1, nu=nu)[1:])
    result_pd_1.append(entry)
    entry = (nu, *create_model(2, nu=nu)[1:])
    result_pd_2.append(entry)

with concurrent.futures.ThreadPoolExecutor(max_workers=len(nus)) as executor:
        executor.map(train, nus)


df_1 = pd.DataFrame(result_pd_1, columns=["nu", "anomalies_1", "ok_1"])
df_2 = pd.DataFrame(result_pd_2, columns=["nu", "anomalies_2", "ok_2"])
df_1.to_csv(join(CSV_D, "pandas-df-1.csv"))
df_2.to_csv(join(CSV_D, "pandas-df-2.csv"))
df_1_sort = df_1.sort_values(by=['ok_1'], ascending=False).reset_index(drop=True)
df_2_sort = df_2.sort_values(by=["ok_2"], ascending=False).reset_index(drop=True)

ok_1 = df_1_sort.loc[(df_1_sort["ok_1"] > 0.97)]
ok_2 = df_2_sort.loc[(df_2_sort["ok_2"] > 0.97)]

merged = pd.merge(left=ok_1, right=ok_2, how="right", left_on="nu", right_on="nu")
print(merged)

---------------------------------------
Datset: data/csv/mega104-17-12-18.csv
Nu is: 0.0150
Total number of samples: 12554
Normal: 12149 (96.77%)
Anomalies: 405 (3.23%)
---------------------------------------
---------------------------------------
Datset: data/csv/mega104-17-12-18.csv
Nu is: 0.0130
Total number of samples: 12554
Normal: 12157 (96.84%)
Anomalies: 397 (3.16%)
---------------------------------------
---------------------------------------
Datset: data/csv/mega104-17-12-18.csv
Nu is: 0.0140
Total number of samples: 12554
Normal: 12231 (97.43%)
Anomalies: 323 (2.57%)
---------------------------------------
---------------------------------------
Datset: data/csv/mega104-17-12-18.csv
Nu is: 0.0160
Total number of samples: 12554
Normal: 12338 (98.28%)
Anomalies: 216 (1.72%)
---------------------------------------
---------------------------------------
Datset: data/csv/mega104-17-12-18.csv
Nu is: 0.0270
Total number of samples: 12554
Normal: 12125 (96.58%)
Anomalies: 429 (3.

In [5]:
model_num = str(2)
nu = 0.017
def get_interval(i, type_="o"):
    return join(INTERVALS_D, f"frame-{model_num}-{i}.csv") if type_ == "o" else join(ANOMALIES_D, f"frame-{model_num}-{i}.csv") 

    

In [14]:
data = pd.read_csv(CSV[model_num])
data["relative_time_stamp"] = pd.to_datetime(data["relative_time_stamp"], unit='s',)


In [148]:
# split to intervals for 5 minutes
for i, frame in enumerate(data.groupby(pd.Grouper(key="relative_time_stamp",freq='5min'))):
    frame[1].to_csv(join(INTERVALS_D, f"frame-{model_num}-{i}.csv"), index=False, date_format="%M:%S.%f")

In [6]:
# create anomalies
def create_anomalies(i):
    frame_name = get_interval(i, type_="o")
    frame = pd.read_csv(frame_name)
    row_num = frame.shape[0]
    # generate 15 random indexes to change size of data
    min_size = frame["asdu_len"].min()
    max_size = frame["asdu_len"].max()

    indexes = np.random.randint(0, row_num, size=20)
    values = np.random.randint(min_size, max_size, 20)

    frame.loc[indexes, ["asdu_len"]] = values
    # take range of 25 items to imitate DOS  
    index = np.random.randint(0, row_num-10)
    range_ = pd.RangeIndex(index, index + 10)
    
    src = frame.loc[range_, "src"]
    frame.loc[range_, "src"] = frame.loc[range_, "dst"]
    frame.loc[range_, "dst"] = src
    frame.to_csv(join(ANOMALIES_D, basename(frame_name)), index=False)
    indexes = np.append(indexes, range_)
    # indexes.append()

    return frame, (indexes, values), index


In [None]:
# split to intervals for 5 minutes
for i, frame in enumerate(data.groupby(pd.Grouper(key="relative_time_stamp",freq='5min'))):
    frame[1].to_csv(join(INTERVALS_D, f"frame-{model_num}-{i}.csv"), index=False, date_format="%M:%S.%f")

In [272]:
_, (ind, vals), i = create_anomalies(8)

In [8]:
frame_num = 3
frame_ok = get_interval(frame_num, type_='o')

res = predict(model_num, 0.017, frame_ok)
anoms_ok = [i for i, n in enumerate(res) if n == -1]
ok = [i for i, n in enumerate(res) if n == 1]
all_data = pd.read_csv(frame_ok)

# print(all_data.iloc[anoms])
# print(all_data.iloc[ok])
data_copy = all_data.copy()

------------------------------
Datset: data/csv/intervals/frame-2-3.csv
Total number of samples: 981
Normal: 944 (96.23%)
Anomalies: 37 (3.77%)
------------------------------


In [None]:
# split to intervals for 5 minutes
for i, frame in enumerate(data.groupby(pd.Grouper(key="relative_time_stamp",freq='5min'))):
    frame[1].to_csv(join(INTERVALS_D, f"frame-{model_num}-{i}.csv"), index=False, date_format="%M:%S.%f")

In [9]:
frame_num = 3
_, (ind, vals), i = create_anomalies(frame_num)
frame_anom = get_interval(frame_num, type_='a')

res = predict(num, 0.017, frame_anom)
anoms_an = [i for i, n in enumerate(res) if n == -1]
all_data = pd.read_csv(frame_anom)

anoms_real = sorted([i for i in anoms_an if i not in anoms_ok])
print(f"Created {len(ind)}: {sorted(ind)}")
print(f"Detected {len(anoms_real)}: {anoms_real}")
print(f"{len(ind)/len(anoms_real):.02f}%")

------------------------------
Datset: data/csv/anomalies/frame-2-3.csv
Total number of samples: 981
Normal: 915 (93.27%)
Anomalies: 66 (6.73%)
------------------------------
Created 30: [17, 24, 130, 171, 195, 216, 248, 256, 428, 442, 525, 556, 559, 596, 639, 643, 650, 666, 689, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 835]
Detected 29: [17, 24, 130, 171, 195, 216, 248, 256, 428, 442, 525, 556, 559, 596, 639, 643, 650, 666, 689, 794, 795, 796, 797, 798, 799, 800, 801, 802, 835]
1.03%


### Accuracy of original datasets

In [7]:
data_1 = pd.read_csv(CSV["1"])
data_1["relative_time_stamp"] = pd.to_datetime(data_1["relative_time_stamp"], unit='s',)
real_labels = np.ones(data_1.shape[0], dtype=int)
gen_labels = predict("1", 0.021, CSV["1"])
acc = accuracy_score(real_labels, gen_labels)
print(acc)

--------------------------------
Datset: data/csv/mega104-17-12-18.csv
Total number of samples: 37660
Normal: 36679 (97.40%)
Anomalies: 981 (2.60%)
--------------------------------
0.973951141795008


In [43]:
data_2 = pd.read_csv(CSV["2"])
data_2["relative_time_stamp"] = pd.to_datetime(data_2["relative_time_stamp"], unit='s',)
real_labels = np.ones(data_2.shape[0], dtype=int)
gen_labels = predict("2", nu, CSV["2"])
acc = accuracy_score(real_labels, gen_labels)
print(acc)

--------------------------------
Datset: data/csv/10122018-104Mega.csv
Total number of samples: 66377
Normal: 64714 (97.49%)
Anomalies: 1663 (2.51%)
--------------------------------
0.9749461409825693


In [None]:
".s".sta

In [10]:
model_num = "1"
frames = [f for f in listdir(INTERVALS_D) if f"frame-{model_num}" in f]
result_acc = []
for f in frames:
    df = pd.read_csv(join(INTERVALS_D, f))
    real_labels = np.ones(df.shape[0], dtype=int)
    gen_labels = predict(model_num, 0.017, join(INTERVALS_D, f))
    result_acc.append((join(INTERVALS_D, f), accuracy_score(real_labels, gen_labels)))
result_acc = pd.DataFrame(result_acc, columns=["dataframe", "accuracy"])


-----------------------------
Datset: data/csv/intervals/frame-1-451.csv
Total number of samples: 28
Normal: 27 (96.43%)
Anomalies: 1 (3.57%)
-----------------------------
-----------------------------
Datset: data/csv/intervals/frame-1-536.csv
Total number of samples: 46
Normal: 45 (97.83%)
Anomalies: 1 (2.17%)
-----------------------------
-----------------------------
Datset: data/csv/intervals/frame-1-306.csv
Total number of samples: 51
Normal: 49 (96.08%)
Anomalies: 2 (3.92%)
-----------------------------
-----------------------------
Datset: data/csv/intervals/frame-1-340.csv
Total number of samples: 50
Normal: 50 (100.00%)
Anomalies: 0 (0.00%)
-----------------------------
-----------------------------
Datset: data/csv/intervals/frame-1-236.csv
Total number of samples: 56
Normal: 55 (98.21%)
Anomalies: 1 (1.79%)
-----------------------------
-----------------------------
Datset: data/csv/intervals/frame-1-623.csv
Total number of samples: 52
Normal: 52 (100.00%)
Anomalies: 0 (0.0

In [11]:
max_acc = result_acc[result_acc.accuracy == result_acc.accuracy.max()]
print(f"Total count of intervals: {result_acc.shape[0]}")
print(f"Minimal accuracy in row:\n{result_acc[result_acc.accuracy == result_acc.accuracy.min()]}")
print(f"Number of 100% accuracy: {max_acc.shape[0]} ({(max_acc.shape[0]/result_acc.shape[0])*100:.2f}%)")
print(f"Mean of accuracy: {result_acc.accuracy.mean()}")
# print(result_acc[result_acc.accuracy > result_acc.accuracy.mean()].shape[0])


Total count of intervals: 816
Minimal accuracy in row:
                              dataframe  accuracy
540  data/csv/intervals/frame-1-697.csv   0.83871
Number of 100% accuracy: 266 (32.60%)
Mean of accuracy: 0.9727116890097145
