In [3]:
# import library
import pandas as pd
import numpy as np
import glob
import json
from sklearn.model_selection import train_test_split
import torch
import swifter


In [4]:
# Directory that holds the attack logs and the capture metadata file.
DATA_PATH = '../raw-data/road/road/attacks/'

# Single capture used by the exploratory cells that follow.
TEST_PATH = f'{DATA_PATH}fuzzing_attack_3.log'


In [5]:
# Parse one capture with the fixed-width reader. Space, '#', '(' and ')' are
# treated as filler characters, the first and the last line of the file are
# skipped, and fields 0, 2 and 3 are kept as time / aid / data.
df = pd.read_fwf(
    TEST_PATH,
    names=['time', 'aid', 'data'],
    usecols=[0, 2, 3],
    dtype={'time': 'float64', 'aid': str, 'data': str},
    delimiter=' #()',
    skiprows=1,
    skipfooter=1,
)

In [6]:
# Preview the first rows of the parsed log.
df.head()

Unnamed: 0,time,aid,data
0,1020000000.0,033,123F19256DC127D0
1,1020000000.0,00E,2054160208097380
2,1020000000.0,193,00080803E8080000
3,1020000000.0,107,0000000000000000
4,1020000000.0,FFF,0000000000000000


In [7]:
# Select rows whose `aid` is the literal string 'XXX' (the same placeholder
# that mark_label checks for in the metadata's injection_id field).
df[df['aid'] == 'XXX']

Unnamed: 0,time,aid,data


In [8]:
# True if any cell of the frame is missing.
df.isnull().values.any()

False

In [9]:
# dtype of the `aid` column (it was read with dtype str).
df['aid'].dtypes

dtype('O')

In [10]:
# dtypes of every column in the parsed frame.
df.dtypes

time    float64
aid      object
data     object
dtype: object

In [11]:
# Arbitration IDs arrive as hex strings; convert them to integers.
df['aid'] = df['aid'].map(lambda hex_id: int(hex_id, 16))
# Left-pad payloads to 16 hex characters (frames with dlc < 8 are shorter).
df['data'] = df['data'].map(lambda payload: payload.zfill(16))
# Rebase timestamps so the capture starts at t = 0.
df['time'] = df['time'] - df['time'].min()

In [12]:
# Display the frame after the hex / padding / time conversions.
df

Unnamed: 0,time,aid,data
0,0.000000e+00,51,123F19256DC127D0
1,9.536743e-07,14,2054160208097380
2,1.033902e-03,403,00080803E8080000
3,1.034975e-03,263,0000000000000000
4,1.035929e-03,4095,0000000000000000
...,...,...,...
13231,5.478795e+00,51,121EF925CEC127D0
13232,5.478796e+00,14,20541602080973C4
13233,5.479791e+00,403,00080803E8080000
13234,5.480816e+00,263,0000000000000000


In [13]:
# Rows whose arbitration ID is above 0x700. Per the original note, these are
# noise CAN IDs inserted by the dataset author.
result = df[df['aid'] > 0x700]
print(result)

           time   aid              data
4      0.001036  4095  0000000000000000
21     0.008704  4095  0000000000000000
48     0.021619  4095  0000000000000000
67     0.028300  4095  0000000000000000
68     0.029293  4095  0000000000000000
...         ...   ...               ...
13196  5.467582  4095  0000000000000000
13197  5.467583  4095  0000000000000000
13199  5.468572  4095  0000000000000000
13203  5.469655  4095  0000000000000000
13235  5.480817  4095  0000000000000000

[835 rows x 3 columns]


In [24]:
# Keep only frames whose arbitration ID is at most 0x700, i.e. drop the rows
# with IDs above 0x700.
# NOTE(review): this cell's execution count (24) is out of sequence with its
# neighbours; confirm it is meant to run at this point.
df = df[df.aid<=0x700]

# Start processing

In [14]:
# Load the capture metadata (JSON) stored alongside the attack logs.
attack_dict = {}
with open(DATA_PATH + 'capture_metadata.json', "r") as metadata_file:
    attack_dict = json.load(metadata_file)

In [15]:
# Put the project's ../code directory on the import path so the local
# helper_functions module can be imported.
import sys 
import os
sys.path.append(os.path.abspath("../code"))
import helper_functions

In [16]:
def get_all_data(attack_dict):
    """
    Load and preprocess the log of every selected capture in the metadata.

    Entries whose name contains "accelerator" or "metadata" are skipped. For
    every other name, ``DATA_PATH + '<name>.log'`` is parsed with
    ``helper_functions.make_can_df`` and the result is passed through
    ``helper_functions.add_time_diff_per_aid_col``.

    NOTE: a second ``get_all_data`` is defined further down in this notebook;
    when the cells run in order, that later definition replaces this one.

    :param attack_dict: Mapping of capture name -> metadata entry. Only the
        names are used here.
    :return: List with one preprocessed frame per selected capture, in the
        iteration order of ``attack_dict``.
    """
    df_aggregation = []

    for attack_name in attack_dict:
        if "accelerator" in attack_name or "metadata" in attack_name:
            continue

        print(f"{attack_name}")
        # Resolve the log relative to DATA_PATH instead of a user-specific
        # absolute path, so the notebook also runs on other machines.
        file_name = DATA_PATH + '{}.log'.format(attack_name)
        df_attack = helper_functions.make_can_df(file_name)
        df_attack = helper_functions.add_time_diff_per_aid_col(df_attack)
        df_aggregation.append(df_attack)
        print(f"Finish preprocess {file_name}")

    return df_aggregation

def get_time_interval(attack_dict):
    """
    Collect the injection interval of every selected capture.

    Entries whose name contains "accelerator" or "metadata" are skipped.

    :param attack_dict: Mapping of capture name -> metadata entry; each
        selected entry must provide an "injection_interval" sequence.
    :return: List with one element per selected capture, each element being a
        one-item list that holds the interval as a tuple.
    """
    intervals = []

    for attack_name, entry in attack_dict.items():
        if "accelerator" in attack_name or "metadata" in attack_name:
            continue

        print(f"Finish get time interval of {attack_name}")
        # Interval comes straight from the metadata entry.
        intervals.append([tuple(entry["injection_interval"])])

    return intervals

In [17]:
def mark_label(df_aggregation, attack_metadata, attack_dict):
    """
    Run ``helper_functions.add_actual_attack_col`` on every selected capture.

    Entries whose name contains "accelerator" or "metadata" are skipped. The
    n-th selected capture is paired with ``df_aggregation[n]`` and
    ``attack_metadata[n]``, so both lists must follow the iteration order of
    ``attack_dict``. After each call, the number of rows whose ``label``
    column equals True and False is printed.

    :param df_aggregation: List of per-capture frames; updated in place.
    :param attack_metadata: List of per-capture interval lists.
    :param attack_dict: Mapping of capture name -> metadata entry providing
        "injection_id" and "injection_data_str".
    :return: The same ``df_aggregation`` list, with each selected slot
        replaced by the helper's result.
    """
    selected = [
        name for name in attack_dict
        if "accelerator" not in name and "metadata" not in name
    ]

    for count, attack_name in enumerate(selected):
        print(f"Index {count}: {attack_name} --- {attack_dict[attack_name]['injection_id']}")

        entry = attack_dict[attack_name]
        frame = df_aggregation[count]
        intervals = attack_metadata[count]

        # "XXX" is passed through unchanged; any other id is parsed as hex.
        raw_id = entry["injection_id"]
        injection_id = int(raw_id, 16) if raw_id != "XXX" else "XXX"

        df_aggregation[count] = helper_functions.add_actual_attack_col(
            frame,
            intervals,
            injection_id,
            entry["injection_data_str"],
            attack_name
        )

        labels = df_aggregation[count]['label']
        print(len(labels[labels == True]))
        print(len(labels[labels == False]))

    return df_aggregation

def get_time_interval(attack_dict):
    """
    Gather the "injection_interval" of each selected capture from the metadata.

    Capture names containing "accelerator" or "metadata" are ignored.

    :param attack_dict: Mapping of capture name -> metadata entry.
    :return: List of one-item lists; each inner item is the capture's
        interval converted to a tuple, in the iteration order of
        ``attack_dict``.
    """
    selected = [
        name for name in attack_dict.keys()
        if not ("accelerator" in name or "metadata" in name)
    ]

    attack_metadata = []
    for attack_name in selected:
        print(f"Finish get time interval of {attack_name}")

        # From metadata file
        window = tuple(attack_dict[attack_name]["injection_interval"])
        attack_metadata.append([window])
    return attack_metadata

def get_all_data(attack_dict):
    """
    Build one preprocessed frame per selected capture.

    Capture names containing "accelerator" or "metadata" are ignored. Each
    remaining capture's log (``DATA_PATH + '<name>.log'``) goes through
    ``helper_functions.make_can_df`` followed by
    ``helper_functions.add_time_diff_per_aid_col``.

    :param attack_dict: Mapping of capture name -> metadata entry. Only the
        names are used.
    :return: List of preprocessed frames in the iteration order of
        ``attack_dict``.
    """
    frames = []

    for attack_name in attack_dict.keys():
        if "accelerator" in attack_name or "metadata" in attack_name:
            continue

        print(f"{attack_name}")
        file_name = DATA_PATH + '{}.log'.format(attack_name)
        parsed = helper_functions.make_can_df(file_name)
        frames.append(helper_functions.add_time_diff_per_aid_col(parsed))
        print(f"Finish preprocess {file_name}")

    return frames

In [18]:
# Load and preprocess every selected capture listed in the metadata.
df_aggregation = get_all_data(attack_dict)

correlated_signal_attack_1
Finish preprocess ../raw-data/road/road/attacks/correlated_signal_attack_1.log
correlated_signal_attack_1_masquerade
Finish preprocess ../raw-data/road/road/attacks/correlated_signal_attack_1_masquerade.log
correlated_signal_attack_2
Finish preprocess ../raw-data/road/road/attacks/correlated_signal_attack_2.log
correlated_signal_attack_2_masquerade
Finish preprocess ../raw-data/road/road/attacks/correlated_signal_attack_2_masquerade.log
correlated_signal_attack_3
Finish preprocess ../raw-data/road/road/attacks/correlated_signal_attack_3.log
correlated_signal_attack_3_masquerade
Finish preprocess ../raw-data/road/road/attacks/correlated_signal_attack_3_masquerade.log
fuzzing_attack_1
Finish preprocess ../raw-data/road/road/attacks/fuzzing_attack_1.log
fuzzing_attack_2
Finish preprocess ../raw-data/road/road/attacks/fuzzing_attack_2.log
fuzzing_attack_3
Finish preprocess ../raw-data/road/road/attacks/fuzzing_attack_3.log
max_engine_coolant_temp_attack
Finish pr

In [21]:
# Peek at the first preprocessed capture.
df_aggregation[0].head()


Unnamed: 0,time,aid,data,time_diffs
3873,1.618163,6,800006400000000,0.999845
6266,2.618064,6,800006400000000,0.999901
8655,3.617806,6,800006400000000,0.999742
11049,4.61781,6,800006400000000,1.000004
13441,5.618164,6,800006400000000,1.000354


In [30]:
# Collect the injection interval of each selected capture, in metadata order.
attack_metadata = get_time_interval(attack_dict)

Finish get time interval of correlated_signal_attack_1
Finish get time interval of correlated_signal_attack_1_masquerade
Finish get time interval of correlated_signal_attack_2
Finish get time interval of correlated_signal_attack_2_masquerade
Finish get time interval of correlated_signal_attack_3
Finish get time interval of correlated_signal_attack_3_masquerade
Finish get time interval of fuzzing_attack_1
Finish get time interval of fuzzing_attack_2
Finish get time interval of fuzzing_attack_3
Finish get time interval of max_engine_coolant_temp_attack
Finish get time interval of max_engine_coolant_temp_attack_masquerade
Finish get time interval of max_speedometer_attack_1
Finish get time interval of max_speedometer_attack_1_masquerade
Finish get time interval of max_speedometer_attack_2
Finish get time interval of max_speedometer_attack_2_masquerade
Finish get time interval of max_speedometer_attack_3
Finish get time interval of max_speedometer_attack_3_masquerade
Finish get time interv

In [31]:
# Label every capture; mark_label delegates to
# helper_functions.add_actual_attack_col and prints the True/False label counts.
df_aggregation = mark_label(df_aggregation, attack_metadata, attack_dict)

Index 0: correlated_signal_attack_1 --- 0x6e0
2086
74045
Index 1: correlated_signal_attack_1_masquerade --- 0x6e0
2086
71959
Index 2: correlated_signal_attack_2 --- 0x6e0
2140
63153
Index 3: correlated_signal_attack_2_masquerade --- 0x6e0
2140
61013
Index 4: correlated_signal_attack_3 --- 0x6e0
1264
37897
Index 5: correlated_signal_attack_3_masquerade --- 0x6e0
1264
36633
Index 6: fuzzing_attack_1 --- XXX
36
45549
Index 7: fuzzing_attack_2 --- XXX
15
29858
Index 8: fuzzing_attack_3 --- XXX
3
12182
Index 9: max_engine_coolant_temp_attack --- 0x4e7
42
57871
Index 10: max_engine_coolant_temp_attack_masquerade --- 0x4e7
42
57829
Index 11: max_speedometer_attack_1 --- 0xd0
2459
197542
Index 12: max_speedometer_attack_1_masquerade --- 0xd0
2444
195113
Index 13: max_speedometer_attack_2 --- 0xd0
3169
133604
Index 14: max_speedometer_attack_2_masquerade --- 0xd0
3140
130493
Index 15: max_speedometer_attack_3 --- 0xd0
6126
194216
Index 16: max_speedometer_attack_3_masquerade --- 0xd0
6107
18812

In [32]:
# Inspect the collected intervals: one single-item list per capture, each
# holding the interval as a tuple.
attack_metadata

[[(9.191851, 30.050109)],
 [(9.191851, 30.050109)],
 [(6.830477, 28.225908)],
 [(6.830477, 28.225908)],
 [(4.318482, 16.95706)],
 [(4.318482, 16.95706)],
 [(4.622975, 7.958234)],
 [(11.367798, 13.346811)],
 [(4.824447, 5.470669)],
 [(19.979078, 24.170183)],
 [(19.979078, 24.170183)],
 [(42.009204, 66.449011)],
 [(42.009204, 66.449011)],
 [(16.009225, 47.408246)],
 [(16.009225, 47.408246)],
 [(9.516489, 70.587285)],
 [(9.516489, 70.587285)],
 [(16.627923, 23.347311)],
 [(16.627923, 23.347311)],
 [(13.168608, 36.87663)],
 [(13.168608, 36.87663)],
 [(16.524085, 40.862015)],
 [(16.524085, 40.862015)],
 [(18.929177, 38.836015)],
 [(18.929177, 38.836015)],
 [(20.407134, 57.297253)],
 [(20.407134, 57.297253)],
 [(23.070278, 46.580686)],
 [(23.070278, 46.580686)]]

In [33]:
# Distinct values of the `label` column in the first capture.
df_aggregation[0]['label'].unique()

array([False,  True])

In [34]:
# Write each labelled capture to CSV: masquerade captures go to out_mas,
# every other selected capture goes to out_fab.
# NOTE(review): both output directories are absolute, user-specific paths;
# consider deriving them from a configurable base directory.
out_mas = '/home/ntmduy/car-ids/processed-data/road/mar_dataset/'
out_fab = '/home/ntmduy/car-ids/processed-data/road/fab_dataset/'

# to_csv does not create missing directories, so make sure both exist first.
os.makedirs(out_mas, exist_ok=True)
os.makedirs(out_fab, exist_ok=True)

# Index into df_aggregation; advanced only for captures that are not skipped,
# which keeps it aligned with the order used when the list was built.
count = 0
for attack_name, metadata in attack_dict.items():
    if "accelerator" in attack_name or "metadata" in attack_name:
        continue

    print(f"Saving {attack_name}_dataset.csv")
    # os.path.join avoids the doubled '/' that plain string formatting
    # produced, because the directory names already end with a separator.
    foutput = os.path.join(
        out_mas if "masquerade" in attack_name else out_fab,
        '{}_dataset.csv'.format(attack_name),
    )
    df_aggregation[count].to_csv(foutput, index=False)
    count += 1

Saving correlated_signal_attack_1_dataset.csv
Saving correlated_signal_attack_1_masquerade_dataset.csv
Saving correlated_signal_attack_2_dataset.csv
Saving correlated_signal_attack_2_masquerade_dataset.csv
Saving correlated_signal_attack_3_dataset.csv
Saving correlated_signal_attack_3_masquerade_dataset.csv
Saving fuzzing_attack_1_dataset.csv
Saving fuzzing_attack_2_dataset.csv
Saving fuzzing_attack_3_dataset.csv
Saving max_engine_coolant_temp_attack_dataset.csv
Saving max_engine_coolant_temp_attack_masquerade_dataset.csv
Saving max_speedometer_attack_1_dataset.csv
Saving max_speedometer_attack_1_masquerade_dataset.csv
Saving max_speedometer_attack_2_dataset.csv
Saving max_speedometer_attack_2_masquerade_dataset.csv
Saving max_speedometer_attack_3_dataset.csv
Saving max_speedometer_attack_3_masquerade_dataset.csv
Saving reverse_light_off_attack_1_dataset.csv
Saving reverse_light_off_attack_1_masquerade_dataset.csv
Saving reverse_light_off_attack_2_dataset.csv
Saving reverse_light_off_a

In [None]:
# Display the current contents of `df`.
df

Unnamed: 0,time,aid,data
0,0.000000,852,1FFF40000003C580
1,0.000001,1505,893FE0070A000080
2,0.000002,651,0000000000000000
3,0.000992,167,005108E5112A00A0
4,0.000994,722,0000500000000000
...,...,...,...
204753,86.459022,560,F700000A7C000E00
204754,86.461950,339,00000000000C1002
204755,86.462905,1634,4E60000040000000
204756,86.462906,412,02FC200002002730


In [None]:
import pandas as pd

def split_hex_string_column(df, column_name, num_bytes=8):
    """
    Splits a column of hexadecimal strings into one integer column per byte.

    Each value is left-padded with '0' to ``2 * num_bytes`` characters (the
    same convention this notebook uses for payloads with dlc < 8), then cut
    into two-character chunks. Chunk ``k`` (1-based) is parsed as base 16 and
    stored in a new column named ``Data<k>``. Characters beyond the first
    ``2 * num_bytes`` are ignored.

    The columns are added to ``df`` itself; the same object is returned.

    :param df: The DataFrame containing the column to split (modified in place).
    :param column_name: The name of the column to split.
    :param num_bytes: Number of byte columns to create (default 8, i.e. 16
        hexadecimal characters).
    :return: ``df`` with the columns ``Data1`` .. ``Data<num_bytes>`` added.
    :raises KeyError: If ``column_name`` is not a column of ``df``.
    :raises ValueError: If a chunk is not valid hexadecimal.
    """
    if column_name not in df.columns:
        raise KeyError(
            f"column {column_name!r} not found; available columns: {list(df.columns)}"
        )

    # Number of characters that make up one byte in the hex string
    chunk_size = 2
    width = chunk_size * num_bytes

    # Pad once up front so short payloads stay byte-aligned and no chunk is
    # ever empty (int('', 16) would raise).
    padded = df[column_name].str.zfill(width)

    for byte_index in range(num_bytes):
        start = byte_index * chunk_size
        new_col_name = f'Data{byte_index + 1}'
        # Slice out one byte and convert it from base 16 to an integer
        df[new_col_name] = padded.str[start:start + chunk_size].apply(
            lambda pair: int(pair, 16)
        )

    return df


          HexString  Data1  Data2  Data3  Data4  Data5  Data6  Data7  Data8
0  1A2B3C4D5E6F7A8B     26     43     60     77     94    111    122    139
1  9C0D1E2F3A4B5C6D    156     13     30     47     58     75     92    109


In [None]:
# Expand the `data` column of `df` into per-byte integer columns.
# NOTE(review): the saved output of this cell is `KeyError: 'data'`; confirm
# that `df` still has a `data` column when this cell runs.
df = split_hex_string_column(df, "data")

KeyError: 'data'