In [206]:
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
import pandas as pd
from joblib import dump
from os.path import join, basename
import numpy as np
import pyshark
from traceback import print_exc
import csv
import concurrent.futures
from joblib import load
import nest_asyncio
from os import makedirs

DATA_DIR = join("data")
PCAP_D = join(DATA_DIR, "pcap")
CSV_D = join(DATA_DIR, "csv")
MODELS_D = join(DATA_DIR, "models")
INTERVALS_D = join(CSV_D, "intervals")
ANOMALIES_D = join(CSV_D, "anomalies")

for d in (DATA_DIR, PCAP_D, CSV_D, MODELS_D, INTERVALS_D, ANOMALIES_D):
        makedirs(d, exist_ok=True)

PCAP = {"1": join(PCAP_D, "mega104-17-12-18.pcapng"),
        "2": join(PCAP_D, "10122018-104Mega.pcapng"),
        "3": join(PCAP_D, "10122018-104Mega-anomaly.pcapng")}

CSV = {"1": join(CSV_D, "mega104-17-12-18.csv"),
       "2": join(CSV_D, "10122018-104Mega.csv"),
       "3": join(CSV_D, "10122018-104Mega-anomaly.csv")}

nest_asyncio.apply()

In [44]:
def parse(num=1):
    pcap_file = PCAP[str(num)]
    print(f"Reading from {pcap_file}")
    packets = pyshark.FileCapture(pcap_file)

    parsed_data = [("asdu_len", "io_type", "type_id", "src", "dst", "interval", "relative_time_stamp")]
    
    previous = 0
    first_time_stamp = packets[0].sniff_time
    relative_time = 0
    interval = 0
    hosts = {}
    next_index = 0
    for p in packets:
        if "iec60870_104" not in [l.layer_name for l in p.layers]:
            continue
        
        # Count time from the previous IEC 104 packet
        if previous != 0:
            interval = float((p.sniff_time - previous).total_seconds())
            relative_time = (p.sniff_time - first_time_stamp).total_seconds()
        if p.ip.src not in hosts.keys():
            hosts[p.ip.src] = next_index
            next_index += 1
        if p.ip.dst not in hosts.keys():
            hosts[p.ip.dst] = next_index
            next_index += 1
        
        src = hosts[p.ip.src]
        dst = hosts[p.ip.dst]
        
        previous = p.sniff_time
        # Extract only one 'representative' for the current package
        asdu_layer = p.get_multiple_layers("iec60870_asdu")
        if len(asdu_layer) == 0:
            continue
        asdu_layer = asdu_layer[0]

        iec_header_layer = p.get_multiple_layers("iec60870_104")
        # Aggregate values if more then one header is present in the packet
        iec_header = iec_header_layer[0]
        try:
            iec_header.apdulen = int(iec_header.apdulen)
        except AttributeError:
            # Not all APDU has valid apdulen attribute. Those packets in
            # Wireshark displayed as a byte sequence, so this packet can
            # be parsed
            print("Error in converting the value in packet")
            print_exc()
            print(p)
            continue

        if len(iec_header_layer) != 1:
            for entry in iec_header_layer[1:]:
                iec_header.apdulen += int(entry.apdulen)

        try:
            if asdu_layer:
                parsed_data.append((iec_header.apdulen, asdu_layer.ioa, asdu_layer.typeid, src, dst, interval, relative_time))
        except:
            # Ignoring error if data can't be appended for some reasons.
            print("Error in parsing the packet")
            print_exc()
            print(p)

    with open(CSV[str(num)], "w") as f:
        writer = csv.writer(f)
        writer.writerows(parsed_data)

    print(f"CSV file is stored into {CSV[str(num)]}")


In [47]:
# with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
#         executor.map(parse, [1, 2])
parse(1)
parse(2)

Reading from data/pcap/mega104-17-12-18.pcapng


Exception ignored in: <function Capture.__del__ at 0x7fdc2ca0f790>
Traceback (most recent call last):
  File "/home/xyadlo00/studies/FIT/MITAI/1-rocnik/letni/PDS/proj/pds-env/lib/python3.8/site-packages/pyshark/capture/capture.py", line 445, in __del__
    self.close()
  File "/home/xyadlo00/studies/FIT/MITAI/1-rocnik/letni/PDS/proj/pds-env/lib/python3.8/site-packages/pyshark/capture/capture.py", line 436, in close
    self.eventloop.run_until_complete(self.close_async())
  File "/home/xyadlo00/studies/FIT/MITAI/1-rocnik/letni/PDS/proj/pds-env/lib/python3.8/site-packages/nest_asyncio.py", line 81, in run_until_complete
    return f.result()
  File "/usr/lib/python3.8/asyncio/futures.py", line 178, in result
    raise self._exception
  File "/usr/lib/python3.8/asyncio/tasks.py", line 280, in __step
    result = coro.send(None)
  File "/home/xyadlo00/studies/FIT/MITAI/1-rocnik/letni/PDS/proj/pds-env/lib/python3.8/site-packages/pyshark/capture/capture.py", line 440, in close_async
    awa

KeyboardInterrupt: 

In [32]:
def predict(model, nu, dataset):
    model = join(MODELS_D, f"one-class-svm-{model}-nu-{nu}.joblib")

    svm = load(model)
    data = pd.read_csv(dataset).drop(columns=["relative_time_stamp"])

    prediction = svm.predict(data)
    size = len(prediction)
    t = [i for i in prediction if i == -1]
    anomalies = len(t)
    t = [i for i in prediction if i == 1]
    ok = len(t)
    perc_anom = anomalies/size

    print("-"*(len(f"Total number of samples: {size}") + 2))
    print(f"Datset: {dataset}")
    print(f"Total number of samples: {size}")
    print(f"Normal: {ok} ({100*(1-perc_anom):.2f}%)")
    print(f"Anomalies: {anomalies} ({100*perc_anom:.2f}%)")
    print("-"*(len(f"Total number of samples: {size}") + 2))
    return prediction



In [9]:
def create_model(num=1, nu=0.018):
    iec104 = pd.read_csv(CSV[str(num)], header=0, skipinitialspace=True)

    iec104 = iec104.drop(columns=["relative_time_stamp"])
    x_train, x_test = train_test_split(iec104, train_size=2/3, test_size=1/3,
                                    shuffle=False, random_state=0)
    one_class_svm = OneClassSVM(nu=nu, kernel = 'rbf', gamma = 0.1).fit(x_train)
    dump(one_class_svm, f"{DATA_DIR}/models/one-class-svm-{num}-nu-{nu:.3f}.joblib")
    prediction = one_class_svm.predict(x_test)

    size = len(prediction)
    t = [i for i in prediction if i == -1]
    anomalies = len(t)
    t = [i for i in prediction if i == 1]
    ok = len(t)
    perc_anom = anomalies/size
    
    print("-"*(len(f"Datset: {CSV[str(num)]}") + 2))
    print(f"Datset: {CSV[str(num)]}")
    print(f"Nu is: {nu:.4f}")
    print(f"Total number of samples: {size}")
    print(f"Normal: {ok} ({100*(1-perc_anom):.2f}%)")
    print(f"Anomalies: {anomalies} ({100*perc_anom:.2f}%)")
    print("-"*(len(f"Datset: {CSV[str(num)]}") + 2))

    return [nu, perc_anom, 1 - perc_anom]

In [11]:
# One-shot training
nu = 0.017
create_model(1, nu)
create_model(2, nu)

---------------------------------------
Datset: data/csv/mega104-17-12-18.csv
Nu is: 0.0170
Total number of samples: 12554
Normal: 12261 (97.67%)
Anomalies: 293 (2.33%)
---------------------------------------
---------------------------------------
Datset: data/csv/10122018-104Mega.csv
Nu is: 0.0170
Total number of samples: 22126
Normal: 21712 (98.13%)
Anomalies: 414 (1.87%)
---------------------------------------


[0.017, 0.018711018711018712, 0.9812889812889813]

In [97]:
# Check different Nu parameter for the training of the data
nu = 0.002
result_pd_1 = []
result_pd_2 = []
nus = np.arange(0.013, 0.028, 0.001)

def train(nu):
    entry = (nu, *create_model(1, nu=nu)[1:])
    result_pd_1.append(entry)
    entry = (nu, *create_model(2, nu=nu)[1:])
    result_pd_2.append(entry)

with concurrent.futures.ThreadPoolExecutor(max_workers=len(nus)) as executor:
        executor.map(train, nus)


df_1 = pd.DataFrame(result_pd_1, columns=["nu", "anomalies_1", "ok_1"])
df_2 = pd.DataFrame(result_pd_2, columns=["nu", "anomalies_2", "ok_2"])
df_1.to_csv(join(CSV_D, "pandas-df-1.csv"))
df_2.to_csv(join(CSV_D, "pandas-df-2.csv"))
df_1_sort = df_1.sort_values(by=['ok_1'], ascending=False).reset_index(drop=True)
df_2_sort = df_2.sort_values(by=["ok_2"], ascending=False).reset_index(drop=True)

ok_1 = df_1_sort.loc[(df_1_sort["ok_1"] > 0.97)]
ok_2 = df_2_sort.loc[(df_2_sort["ok_2"] > 0.97)]

merged = pd.merge(left=ok_1, right=ok_2, how="right", left_on="nu", right_on="nu")
print(merged)

---------------------------------------
Datset: data/csv/mega104-17-12-18.csv
Nu is: 0.0150
Total number of samples: 12554
Normal: 12149 (96.77%)
Anomalies: 405 (3.23%)
---------------------------------------
---------------------------------------
Datset: data/csv/mega104-17-12-18.csv
Nu is: 0.0130
Total number of samples: 12554
Normal: 12157 (96.84%)
Anomalies: 397 (3.16%)
---------------------------------------
---------------------------------------
Datset: data/csv/mega104-17-12-18.csv
Nu is: 0.0140
Total number of samples: 12554
Normal: 12231 (97.43%)
Anomalies: 323 (2.57%)
---------------------------------------
---------------------------------------
Datset: data/csv/mega104-17-12-18.csv
Nu is: 0.0160
Total number of samples: 12554
Normal: 12338 (98.28%)
Anomalies: 216 (1.72%)
---------------------------------------
---------------------------------------
Datset: data/csv/mega104-17-12-18.csv
Nu is: 0.0270
Total number of samples: 12554
Normal: 12125 (96.58%)
Anomalies: 429 (3.

In [164]:
num = str(2)
def get_interval(i, type_="o"):
    return join(INTERVALS_D, f"frame-{num}-{i}.csv") if type_ == "o" else join(ANOMALIES_D, f"frame-{num}-{i}.csv") 

    

In [147]:
data = pd.read_csv(CSV[num])
data["relative_time_stamp"] = pd.to_datetime(data["relative_time_stamp"], unit='s',)
print(data)

       asdu_len  io_type  type_id  src  dst  interval  \
0            46       67       31    1    0  0.202736   
1            25        2       36    1    0  3.285928   
2            25        2       36    1    0  5.799724   
3            25        2       36    1    0  2.992513   
4            25        2       36    1    0  4.799476   
...         ...      ...      ...  ...  ...       ...   
37655        25        2       36    1    0  4.663499   
37656        25        2       36    1    0  5.069004   
37657        25        2       36    1    0  8.402325   
37658        25        2       36    1    0  0.779002   
37659        25        2       36    1    0  7.800874   

             relative_time_stamp  
0     1970-01-01 00:00:08.191193  
1     1970-01-01 00:00:11.477121  
2     1970-01-01 00:00:17.276845  
3     1970-01-01 00:00:21.276420  
4     1970-01-01 00:00:26.075896  
...                          ...  
37655 1970-01-03 19:54:37.419562  
37656 1970-01-03 19:54:47.555711  


In [148]:
# split to intervals for 5 minutes
for i, frame in enumerate(data.groupby(pd.Grouper(key="relative_time_stamp",freq='5min'))):
    frame[1].to_csv(join(INTERVALS_D, f"frame-{num}-{i}.csv"), index=False, date_format="%M:%S.%f")

In [286]:
# create anomalies
def create_anomalies(i):
    frame_name = get_interval(i, type_="o")
    frame = pd.read_csv(frame_name)
    row_num = frame.shape[0]
    # generate 15 random indexes to change size of data
    min_size = frame["asdu_len"].min()
    max_size = frame["asdu_len"].max()

    indexes = np.random.randint(0, row_num, size=20)
    values = np.random.randint(min_size, max_size, 20)

    frame.loc[indexes, ["asdu_len"]] = values
    # take range of 25 items to imitate DOS  
    index = np.random.randint(0, row_num-10)
    range_ = pd.RangeIndex(index, index + 10)
    
    src = frame.loc[range_, "src"]
    frame.loc[range_, "src"] = frame.loc[range_, "dst"]
    frame.loc[range_, "dst"] = src
    frame.to_csv(join(ANOMALIES_D, basename(frame_name)), index=False)
    indexes = np.append(indexes, range_)
    # indexes.append()

    return frame, (indexes, values), index


In [None]:
# split to intervals for 5 minutes
for i, frame in enumerate(data.groupby(pd.Grouper(key="relative_time_stamp",freq='5min'))):
    frame[1].to_csv(join(INTERVALS_D, f"frame-{num}-{i}.csv"), index=False, date_format="%M:%S.%f")

In [272]:
_, (ind, vals), i = create_anomalies(8)

In [284]:
frame_num = 3
frame_ok = get_interval(frame_num, type_='o')

res = predict(num, 0.017, frame_ok)
anoms_ok = [i for i, n in enumerate(res) if n == -1]
ok = [i for i, n in enumerate(res) if n == 1]
all_data = pd.read_csv(frame_ok)

# print(all_data.iloc[anoms])
# print(all_data.iloc[ok])
data_copy = all_data.copy()

------------------------------
Datset: data/csv/intervals/frame-2-3.csv
Total number of samples: 981
Normal: 944 (96.23%)
Anomalies: 37 (3.77%)
------------------------------


In [None]:
# split to intervals for 5 minutes
for i, frame in enumerate(data.groupby(pd.Grouper(key="relative_time_stamp",freq='5min'))):
    frame[1].to_csv(join(INTERVALS_D, f"frame-{num}-{i}.csv"), index=False, date_format="%M:%S.%f")

In [285]:
frame_num = 3
_, (ind, vals), i = create_anomalies(frame_num)
frame_anom = get_interval(frame_num, type_='a')

res = predict(num, 0.017, frame_anom)
anoms_an = [i for i, n in enumerate(res) if n == -1]
all_data = pd.read_csv(frame_anom)

anoms_real = sorted([i for i in anoms_an if i not in anoms_ok])
print(f"Created {len(ind)}: {sorted(ind)}")
print(f"Detected {len(anoms_real)}: {anoms_real}")
print(f"{len(ind)/len(anoms_real):.02f}%")

------------------------------
Datset: data/csv/anomalies/frame-2-3.csv
Total number of samples: 981
Normal: 918 (93.58%)
Anomalies: 63 (6.42%)
------------------------------
Created 30: [68, 119, 123, 149, 175, 302, 337, 379, 532, 587, 685, 694, 728, 729, 730, 731, 732, 733, 734, 735, 736, 736, 737, 747, 753, 775, 790, 803, 809, 869]
Detected 26: [68, 119, 123, 149, 175, 302, 337, 379, 532, 587, 685, 694, 728, 730, 731, 732, 733, 734, 735, 736, 737, 747, 753, 775, 790, 809]
1.15%


In [204]:
min_size = data_copy["asdu_len"].min()
max_size = data_copy["asdu_len"].max()

indexes = np.random.randint(0, data_copy.shape[0], 10)
values = np.random.randint(min_size, max_size, 10)

print(indexes)
print(values)
data_copy.loc[indexes, ["asdu_len"]] = values
print(all_data.iloc[indexes])
print(data_copy.iloc[indexes])


[289 428 401 830 785   1 759 580 185  16]
[321  41  62 460 413 109 425 286 110 187]
     asdu_len  io_type  type_id  src  dst  interval relative_time_stamp
289        17    65537      124    0    1  0.000193        26:53.578603
428       506    65537      125    1    0  0.000808        27:35.760142
401        17    65537      124    0    1  0.000494        27:25.386267
830       506    65537      125    1    0  0.000001        29:26.054751
785        17    65537      122    0    1  5.025298        29:15.614414
1          17    65537      122    0    1  0.471748        25:03.769655
759       506    65537      125    1    0  0.000000        29:05.098639
580        17    65537      122    0    1  0.061247        28:17.752857
185        57    65537      125    1    0  0.000001        26:27.417900
16         17    65537      124    0    1  0.000517        25:04.035790
     asdu_len  io_type  type_id  src  dst  interval relative_time_stamp  817  \
289       321    65537      124    0    1  0