In [1]:
# CONSTANT
# CURRENT PATH : ./notebooks

# Raw capture that is loaded further down.
DATA_PATH = './data/car-hacking/DoS_dataset.csv'

# Column names given to the CSV (which is read with header=None), in file order.
DATA_PROPERTY = [
    'Timestamp', 'canID', 'DLC',
    'Data0', 'Data1', 'Data2', 'Data3',
    'Data4', 'Data5', 'Data6', 'Data7',
    'Flag',
]

ATTACK_TYPES = ['DoS', 'Fuzzy', 'gear', 'RPM']

# Column name -> dtype string.
# NOTE(review): not referenced by any of the cells visible here — confirm it is still needed.
DATA_META = {
    'Timestamp': 'float64',
    'canID': 'object',
    'DLC': 'int64',
    'Data0': 'object',
    'Data1': 'object',
    'Data2': 'object',
    'Data3': 'object',
    'Data4': 'int64',
    'Data5': 'object',
    'Data6': 'object',
    'Data7': 'object',
    'Flag': 'object',
}


In [2]:
import pandas as pd

In [3]:
# Read every hex-encoded column (CAN ID, payload bytes, flag) as text.
# Left to type inference, a run of digit-only values such as "21" or "08" is
# parsed as an integer, which drops leading zeros and makes the later
# int(value, 16) decoding fail. Timestamp and DLC stay numeric.
raw_df = pd.read_csv(
    DATA_PATH,
    header=None,
    names=DATA_PROPERTY,
    dtype={col: 'object' for col in DATA_PROPERTY if col not in ('Timestamp', 'DLC')},
)

In [4]:
# Peek at the first five raw rows.
raw_df.head(n=5)

Unnamed: 0,Timestamp,canID,DLC,Data0,Data1,Data2,Data3,Data4,Data5,Data6,Data7,Flag
0,1478198000.0,0316,8,05,21,68,09,21,21,00,6f,R
1,1478198000.0,018f,8,fe,5b,00,00,0,3c,00,00,R
2,1478198000.0,0260,8,19,21,22,30,8,8e,6d,3a,R
3,1478198000.0,02a0,8,64,00,9a,1d,97,02,bd,00,R
4,1478198000.0,0329,8,40,bb,7f,14,11,20,00,14,R


In [5]:
def hex_to_int(hex_value):
    """Parse a hexadecimal string such as ``'1f'`` into its integer value."""
    return int(hex_value, 16)


def hex_string_to_array(hex_string):
    """Convert every item yielded by ``hex_string`` with ``hex_to_int``.

    Iterating a plain string yields single characters, so ``'0316'`` becomes
    ``[0, 3, 1, 6]``; a list of hex strings is converted element by element.
    """
    return [hex_to_int(item) for item in hex_string]

In [26]:
def split_into_list(string, type='cid'):
    """Split a hexadecimal string into a list of integers.

    Parameters
    ----------
    string : str
        Hexadecimal text to split.
    type : str, default 'cid'
        ``'payload'`` cuts ``string`` into eight two-character bytes.
        Any other value treats ``string`` as a CAN identifier and returns one
        integer per hex digit, left-padded with zeros to at least four digits.
        (The name shadows the builtin; it is kept so keyword callers still work.)

    Returns
    -------
    list of int

    Raises
    ------
    ValueError
        If a piece is not valid hexadecimal (including an empty piece from a
        payload shorter than 16 characters).
    """
    if type == 'payload':
        # Seven leading byte pairs, then the last two characters of the remainder.
        pieces = [string[pos:pos + 2] for pos in range(0, 14, 2)]
        pieces.append(string[14:][-2:])
    else:
        # Round-trip through int() to normalise the text, then zero-pad.
        # zfill never truncates, so identifiers with four or more digits keep
        # every digit instead of being collapsed to three zeros and one digit.
        pieces = list(format(int(string, 16), 'x').zfill(4))
    return [int(piece, 16) for piece in pieces]


In [7]:
# Convert a hexadecimal CAN ID string to its integer value.
def convert_canid_bits(cid):
    """Parse ``cid`` as a base-16 integer.

    Returns the integer value, or ``None`` when ``cid`` is not a string
    (``TypeError``) or is not valid hexadecimal (``ValueError``). Only those
    two errors are caught, so interrupts and other failures still propagate.
    """
    try:
        return int(cid, 16)
    except (TypeError, ValueError):
        return None

In [8]:
def fill_flag(sample):
    """Swap a misplaced flag back into the ``Flag`` field of one record.

    When ``sample['Flag']`` is not a string, its value is exchanged with the
    value stored under ``'Data<DLC>'``. ``sample`` is modified in place and
    returned; records whose flag is already a string are returned untouched.
    """
    current_flag = sample['Flag']
    if isinstance(current_flag, str):
        return sample

    shifted_col = 'Data' + str(sample['DLC'])
    sample['Flag'], sample[shifted_col] = sample[shifted_col], current_flag
    return sample

In [9]:
from numpy.lib.stride_tricks import as_strided

def sliding_window(data, win=29, s=1):
    """Return windows over a 1-D sequence as a 2-D strided view.

    Parameters
    ----------
    data : pandas.Series or numpy.ndarray
        One-dimensional input. For a Series the backing ``.values`` array is used.
    win : int, default 29
        Number of elements per window.
    s : int, default 1
        Step, in elements, between the starts of consecutive windows.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_windows, win)`` whose row ``k`` starts at element
        ``k * s``. Shape is ``(0, win)`` when the input is shorter than ``win``.

    Notes
    -----
    The result is a view, not a copy: it shares memory with ``data``, and
    overlapping windows alias the same elements. Copy before writing if the
    source must stay unchanged.
    """
    # A Series exposes its array through .values; a plain ndarray is used as is.
    values = getattr(data, 'values', data)
    # Floor division can go negative for short inputs; clamp to "no windows".
    n_windows = max((len(values) - win) // s + 1, 0)
    # Use the array's real element stride so non-contiguous inputs
    # (e.g. arr[::2]) are windowed correctly; itemsize is only right when the
    # data happens to be contiguous.
    step = values.strides[0]
    return as_strided(values, shape=(n_windows, win), strides=(step * s, step))

In [10]:
import torch.nn.functional as F
from torch.utils.data import Dataset
from copy import copy, deepcopy
import math
import numpy as np

In [11]:
# CONSTANTS for the timestamp feature built in process_data
GRAN: float = 1e-7  # timestamps are divided by this before the logarithm is taken
LOG_E: int = 2      # base handed to math.log for that logarithm
# NOTE(review): smaller than the 16-message windows process_data builds — confirm it is still needed.
PAD_SIZE: int = 15

In [29]:
# Turn the raw CAN log into labelled windows of per-message feature vectors.
def process_data(df, window_size=16, stride=16):
    """Convert raw CAN messages into fixed-size, labelled windows.

    Steps:

    1. Repair rows whose flag landed in a data column (``fill_flag``).
    2. Sort by ``Timestamp``, ``canID`` and ``Flag``.
    3. Decode ``canID`` into hex digits (``split_into_list``) and the eight
       payload columns into integers; missing bytes become 0.
    4. Build one float vector per message:
       ``[log-scaled timestamp, *canID digits, *payload bytes, 0, 0, 0]``.
    5. Take ``window_size`` consecutive messages every ``stride`` messages.
       A window is labelled 1 when any of its messages carries flag ``'T'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with columns ``Timestamp``, ``canID``, ``DLC``,
        ``Data0``..``Data7`` and ``Flag``. It is not modified.
    window_size : int, default 16
        Messages per window.
    stride : int, default 16
        Messages between the starts of consecutive windows.

    Returns
    -------
    pandas.DataFrame
        Columns ``data`` (a list of ``window_size`` numpy arrays per row) and
        ``label`` (0 or 1), one row per window. Trailing messages that do not
        fill a whole window are dropped.

    Raises
    ------
    ValueError
        If ``window_size`` or ``stride`` is smaller than 1.
    """
    if window_size < 1 or stride < 1:
        raise ValueError("window_size and stride must both be >= 1")

    print("Starting processing data")
    df = df.apply(fill_flag, axis=1)
    # sort_values returns a new frame; without the assignment nothing is sorted.
    df = df.sort_values(['Timestamp', "canID", "Flag"], ascending=True).reset_index(drop=True)
    df["canBits"] = df.canID.apply(split_into_list)
    df['Flag'] = df['Flag'] == 'T'

    # Payload bytes: hex text -> int, absent bytes -> 0.
    num_data_bytes = 8
    data_cols = ['Data{}'.format(k) for k in range(num_data_bytes)]
    for col in data_cols:
        df[col] = df[col].map(lambda byte: int(byte, 16), na_action='ignore')
    df = df.fillna(0)
    df[data_cols] = df[data_cols].astype(int)
    df['Data'] = df[data_cols].values.tolist()

    print('Processing: DONE', df.head(1))

    print('Aggregate data -----------------')
    # One feature vector per message, computed exactly once. Nothing is
    # written back through a strided view, so overlapping windows
    # (stride < window_size) cannot re-transform an element and read-only
    # backing arrays are not a problem.
    # NOTE(review): with absolute epoch timestamps the log-scaled value is
    # almost constant (the recorded sample output shows 54.0 for every
    # message) — confirm whether an inter-arrival time was intended.
    messages = [
        np.concatenate((
            [float(round(math.log(round(stamp / GRAN) + 1, LOG_E)))],
            can_digits,
            payload,
            [0, 0, 0],
        ))
        for stamp, can_digits, payload in zip(df['Timestamp'], df['canBits'], df['Data'])
    ]

    # Clamp at zero so a frame shorter than one window yields an empty result.
    n_windows = max((len(messages) - window_size) // stride + 1, 0)
    starts = [k * stride for k in range(n_windows)]
    windows = [messages[start:start + window_size] for start in starts]

    flags = df['Flag'].to_numpy()
    labels = [1 if flags[start:start + window_size].any() else 0 for start in starts]

    result = pd.DataFrame({
        'data': pd.Series(windows),
        'label': pd.Series(labels, dtype='int64'),
    }, index=range(n_windows))

    print((n_windows, window_size))

    print("Aggregating data: Done")
    print('#Normal: ', int((result['label'] == 0).sum()))
    print('#Attack: ', int((result['label'] != 0).sum()))

    return result[['data', 'label']].reset_index(drop=True)

In [30]:
# Process only the first 2000 raw messages.
df = process_data(raw_df.iloc[:2000])

Starting processing data
Processing: DONE       Timestamp canID  DLC  Data0  Data1  Data2  Data3  Data4  Data5  Data6  \
0  1.478198e+09  0316    8      5     33    104      9     33     33      0   

   Data7   Flag       canBits                             Data  
0    111  False  [0, 3, 1, 6]  [5, 33, 104, 9, 33, 33, 0, 111]  
Aggregate data -----------------
(125, 16)
Aggregating data: Done
#Normal:  92
#Attack:  33


In [35]:
# Number of messages stored in the first window.
len(df.loc[0, 'data'])

16

In [36]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the first 100 windows. The fixed seed keeps the split, and
# the CSV files written from it below, identical across reruns.
train, test = train_test_split(df.head(100), test_size=0.2, random_state=42)

In [37]:
# Training targets: the per-window 0/1 label column.
y_train = train.loc[:, 'label']
y_train

69    0
85    0
61    0
38    0
78    0
     ..
72    0
21    0
14    0
8     0
37    0
Name: label, Length: 80, dtype: int64

In [38]:
# x_train is a second name for `train` (no copy is made), so it still
# contains the label column at this point.
x_train = train
x_train.head(5)

Unnamed: 0,data,label
69,"[[54.0, 0.0, 3.0, 5.0, 0.0, 5.0, 32.0, 196.0, ...",0
85,"[[54.0, 0.0, 4.0, 11.0, 1.0, 41.0, 39.0, 39.0,...",0
61,"[[54.0, 0.0, 0.0, 10.0, 1.0, 128.0, 137.0, 0.0...",0
38,"[[54.0, 0.0, 3.0, 7.0, 0.0, 0.0, 32.0, 0.0, 0....",0
78,"[[54.0, 0.0, 4.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0...",0


In [39]:
# drop() returns a new frame, so the result must be assigned; otherwise the
# label column stays in x_train and ends up in the exported feature file.
# errors="ignore" keeps the cell safe to run more than once.
x_train = x_train.drop(columns="label", errors="ignore")
x_train

Unnamed: 0,data
69,"[[54.0, 0.0, 3.0, 5.0, 0.0, 5.0, 32.0, 196.0, ..."
85,"[[54.0, 0.0, 4.0, 11.0, 1.0, 41.0, 39.0, 39.0,..."
61,"[[54.0, 0.0, 0.0, 10.0, 1.0, 128.0, 137.0, 0.0..."
38,"[[54.0, 0.0, 3.0, 7.0, 0.0, 0.0, 32.0, 0.0, 0...."
78,"[[54.0, 0.0, 4.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0..."
...,...
72,"[[54.0, 0.0, 3.0, 2.0, 9.0, 64.0, 187.0, 127.0..."
21,"[[54.0, 0.0, 2.0, 6.0, 0.0, 25.0, 34.0, 34.0, ..."
14,"[[54.0, 0.0, 2.0, 10.0, 0.0, 100.0, 0.0, 154.0..."
8,"[[54.0, 0.0, 3.0, 1.0, 6.0, 5.0, 34.0, 96.0, 9..."


In [40]:
# Write the training features to disk.
x_train.to_csv(path_or_buf="./data/car-hacking/processed/x_train.csv")

In [41]:
# Write the training labels to disk.
y_train.to_csv(path_or_buf="./data/car-hacking/processed/y_train.csv")

In [42]:
# Split the held-out set into targets and features, then write both to disk.
y_test = test["label"]
# drop() returns a new frame; assigning it keeps the label out of x_test.csv.
x_test = test.drop(columns="label")
x_test.to_csv("./data/car-hacking/processed/x_test.csv")
y_test.to_csv("./data/car-hacking/processed/y_test.csv")

In [43]:
from torch import nn
from torch.nn.utils.parametrizations import spectral_norm

In [44]:
# Small convolutional classifier with spectrally normalised convolutions.
# Shape comments are (channels x height x width) for the 1 x 16 x 16 input
# used in the summary call below.
main_model = nn.Sequential(
    # 1 x 16 x 16 -> 16 x 8 x 8
    spectral_norm(nn.Conv2d(in_channels=1, out_channels=16, kernel_size=4, stride=2, padding=1)),
    nn.LeakyReLU(negative_slope=0.2),
    # 16 x 8 x 8 -> 32 x 4 x 4
    spectral_norm(nn.Conv2d(in_channels=16, out_channels=32, kernel_size=4, stride=2, padding=1)),
    nn.LeakyReLU(negative_slope=0.2),
    # 32 x 4 x 4 -> 64 x 1 x 1
    spectral_norm(nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=1, padding=0)),
    nn.LeakyReLU(negative_slope=0.2),
    # 64 x 1 x 1 -> 64
    nn.Flatten(),
    # 64 -> 2 class scores
    nn.Linear(in_features=64, out_features=2),
    # NOTE(review): if this is trained with nn.CrossEntropyLoss (which applies
    # log-softmax itself), this layer should be removed — confirm against the
    # training loop.
    nn.Softmax(dim=1),
)

In [45]:
import torch
from torchsummary import summary

In [46]:
# Move the model's parameters to the GPU when CUDA is available; otherwise
# the model is left where it was created.
if torch.cuda.is_available():
    main_model.cuda()

In [47]:
# Device handle: the first CUDA device when one is available, else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [48]:
# Print a layer-by-layer summary for a single-channel 16 x 16 input.
summary(main_model, (1, 16, 16))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
     _SpectralNorm-1              [-1, 1, 4, 4]               0
ParametrizedConv2d-2             [-1, 16, 8, 8]             272
     _SpectralNorm-3              [-1, 1, 4, 4]               0
     _SpectralNorm-4              [-1, 1, 4, 4]               0
     _SpectralNorm-5              [-1, 1, 4, 4]               0
     _SpectralNorm-6              [-1, 1, 4, 4]               0
         LeakyReLU-7             [-1, 16, 8, 8]               0
     _SpectralNorm-8             [-1, 16, 4, 4]               0
ParametrizedConv2d-9             [-1, 32, 4, 4]           8,224
    _SpectralNorm-10             [-1, 16, 4, 4]               0
    _SpectralNorm-11             [-1, 16, 4, 4]               0
    _SpectralNorm-12             [-1, 16, 4, 4]               0
    _SpectralNorm-13             [-1, 16, 4, 4]               0
        LeakyReLU-14             [-1, 3