##### First we do a little testing!

In [139]:
import os, sys
import pandas as pd

First we read the CSV file into a pandas dataframe!

In [2]:
# Read the raw log by hand: every line is split on commas into a list of
# string fields, one list per message.
with open('CANFD_Malfunction.csv', 'r') as f:
    # Iterate over the file object directly (no need to materialise all lines
    # with readlines()) and skip blank lines, which would otherwise become
    # [''] rows and crash the float()/int() parsing in the processing cell.
    data = [row.strip().split(',') for row in f if row.strip()]

In [3]:
# Wrap the parsed rows in a DataFrame. No column names are given, so the
# columns are simply numbered 0, 1, 2, ...; rows shorter than the longest one
# are padded with None by pandas.
df = pd.DataFrame(data)

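Then we fix the differing payload lengths: every message is left-padded with zeros so that it always has 64 data fields, where the number of real data bytes is given by the DLC. We also convert the arbitration ID and the data bytes from hex to decimal. Finally, we map the flag to a label: R → 0 and T → 1.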

In [132]:
N_DATA_FIELDS = 64  # every message is padded to this many data fields
N_TEST_ROWS = 500   # 500 entries because this is a test

processed_data = []

# Take exactly the first N_TEST_ROWS rows (the old `if i > 500: break` check ran
# after the append and therefore let 502 rows through) and walk over them as
# plain tuples of string fields.
for fields in df.head(N_TEST_ROWS).itertuples(index=False, name=None):
    timestamp = float(fields[0])
    arb_id = int(fields[1], 16)
    dlc = int(fields[2])
    data_padding = [0] * (N_DATA_FIELDS - dlc)  # Pad with zeros to make 64 data fields
    data_cols = [int(val, 16) for val in fields[3:3 + dlc]]  # Extract data columns and convert to decimal
    # Use a fresh name here: rebinding `data` would clobber the raw rows read
    # from the CSV and break a re-run of `df = pd.DataFrame(data)`.
    payload = data_padding + data_cols
    label = 1 if fields[3 + dlc] == 'T' else 0  # the flag sits right after the payload
    processed_data.append([timestamp, arb_id] + payload + [label])

# The column layout is fixed by N_DATA_FIELDS rather than by the length of
# whichever row happened to be processed last.
processed_df = pd.DataFrame(
    processed_data,
    columns=['Timestamp', 'Arbitration_ID'] + [f'Data[{i}]' for i in range(N_DATA_FIELDS)] + ['Label'],
)

Then we sort by Arbitration_ID and Timestamp to see what the first 20 rows of the same ID look like!

In [133]:
# Order by ID first, then chronologically within each ID, and show the top 20 rows
processed_df.sort_values(by=['Arbitration_ID', 'Timestamp']).head(20)

Unnamed: 0,Timestamp,Arbitration_ID,Data[0],Data[1],Data[2],Data[3],Data[4],Data[5],Data[6],Data[7],...,Data[55],Data[56],Data[57],Data[58],Data[59],Data[60],Data[61],Data[62],Data[63],Label
20,3581.0073,64,0,0,0,0,0,0,0,0,...,255,0,0,0,0,0,0,0,0,0
44,3581.01723,64,0,0,0,0,0,0,0,0,...,255,0,0,0,0,0,0,0,0,0
70,3581.02729,64,0,0,0,0,0,0,0,0,...,255,0,0,0,0,0,0,0,0,0
94,3581.03727,64,0,0,0,0,0,0,0,0,...,255,0,0,0,0,0,0,0,0,0
117,3581.0473,64,0,0,0,0,0,0,0,0,...,255,0,0,0,0,0,0,0,0,0
145,3581.0573,64,0,0,0,0,0,0,0,0,...,255,0,0,0,0,0,0,0,0,0
168,3581.06726,64,0,0,0,0,0,0,0,0,...,255,0,0,0,0,0,0,0,0,0
194,3581.07724,64,0,0,0,0,0,0,0,0,...,255,0,0,0,0,0,0,0,0,0
217,3581.08739,64,0,0,0,0,0,0,0,0,...,255,0,0,0,0,0,0,0,0,0
245,3581.09725,64,0,0,0,0,0,0,0,0,...,255,0,0,0,0,0,0,0,0,0


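Then we replace the timestamp with a new feature called dTIME. dTIME measures the time between the current message and the previous message with the same Arbitration_ID; the first message of each ID has no predecessor and gets a dTIME of 0. This is useful because CAN messages are periodic, so for a given ID the gap between consecutive messages should stay roughly constant.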

In [137]:
further_processed_data = []

# Handle the messages of one Arbitration_ID at a time
for group_key, id_frame in processed_df.groupby('Arbitration_ID'):
    # Put this ID's messages in chronological order
    chronological = id_frame.sort_values(by='Timestamp', ascending=True)
    # Gap to the previous message of the same ID; the first message has none, so it gets 0
    gaps = chronological['Timestamp'].diff().fillna(0)
    # Swap the absolute timestamp for the gap, keeping dTIME as the leading column
    without_time = chronological.drop(columns=['Timestamp'])
    without_time.insert(0, 'dTIME', gaps)
    further_processed_data.append(without_time)

further_processed_df = pd.concat(further_processed_data)

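Then we look at the first 20 rows again! Note that this time the rows are sorted by Arbitration_ID and dTIME, so within an ID they appear in order of gap size rather than in chronological order.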

In [138]:
# Note: sorting on dTIME orders each ID's rows by gap size, not by arrival time
further_processed_df.sort_values(by=['Arbitration_ID', 'dTIME']).head(20)

Unnamed: 0,dTIME,Arbitration_ID,Data[0],Data[1],Data[2],Data[3],Data[4],Data[5],Data[6],Data[7],...,Data[55],Data[56],Data[57],Data[58],Data[59],Data[60],Data[61],Data[62],Data[63],Label
20,0.0,64,0,0,0,0,0,0,0,0,...,255,0,0,0,0,0,0,0,0,0
245,0.00986,64,0,0,0,0,0,0,0,0,...,255,0,0,0,0,0,0,0,0,0
400,0.00992,64,0,0,0,0,0,0,0,0,...,255,0,0,0,0,0,0,0,0,0
44,0.00993,64,0,0,0,0,0,0,0,0,...,255,0,0,0,0,0,0,0,0,0
168,0.00996,64,0,0,0,0,0,0,0,0,...,255,0,0,0,0,0,0,0,0,0
353,0.00996,64,0,0,0,0,0,0,0,0,...,255,0,0,0,0,0,0,0,0,0
94,0.00998,64,0,0,0,0,0,0,0,0,...,255,0,0,0,0,0,0,0,0,0
194,0.00998,64,0,0,0,0,0,0,0,0,...,255,0,0,0,0,0,0,0,0,0
423,0.00999,64,0,0,0,0,0,0,0,0,...,255,0,0,0,0,0,0,0,0,0
145,0.01,64,0,0,0,0,0,0,0,0,...,255,0,0,0,0,0,0,0,0,0


##### Now, we write the code that will process entire CSV files! 

In [153]:
# Dataset folder. Set the CANFD_DATASET_DIR environment variable to point the
# notebook at another location; the original local path remains the default.
folder_path = os.environ.get(
    'CANFD_DATASET_DIR',
    '/Users/valencycolaco/Desktop/CAN-FD_Intrusion_Dataset/',
)

# Raw dataset files only: skip the "processed_*.csv" outputs that process_csv
# writes, so that re-running the notebook does not feed them back in as input.
csv_files = [
    file for file in os.listdir(folder_path)
    if file.endswith('.csv') and not file.startswith('processed_')
]

In [160]:
def _parse_can_fd_fields(fields, n_data_fields=64):
    """Turn the string fields of one log line into ``[timestamp, arb_id, *data, label]``.

    ``fields`` is laid out as ``timestamp, arbitration_id (hex), dlc,
    <dlc data bytes (hex)>, flag``. The data bytes are left-padded with zeros
    to ``n_data_fields`` values; the label is 1 when the flag is ``'T'`` and 0
    otherwise (including when the flag is missing).

    Raises ValueError when the DLC is outside ``0..n_data_fields`` or when the
    line carries fewer data bytes than its DLC announces.
    """
    timestamp = float(fields[0])
    arb_id = int(fields[1], 16)
    dlc = int(fields[2])
    if not 0 <= dlc <= n_data_fields:
        raise ValueError(f"DLC {dlc} is outside 0..{n_data_fields}")

    data_cols = [int(val, 16) for val in fields[3:3 + dlc]]  # hex -> decimal
    if len(data_cols) != dlc:
        raise ValueError(f"expected {dlc} data bytes, found {len(data_cols)}")
    data_padding = [0] * (n_data_fields - dlc)  # zeros go in front of the real bytes

    # The flag sits directly after the data bytes.
    flag = fields[3 + dlc] if len(fields) > 3 + dlc else None
    label = 1 if flag == 'T' else 0
    return [timestamp, arb_id] + data_padding + data_cols + [label]


def process_csv(filename, n_data_fields=64):
    """Convert one raw CAN FD log into a fixed-width CSV with a dTIME feature.

    Every non-blank line of ``filename`` is parsed into a timestamp, a decimal
    arbitration ID, ``n_data_fields`` zero-padded data values and a 0/1 label.
    The timestamp is then replaced by ``dTIME``, the time since the previous
    message with the same Arbitration_ID (0 for the first message of an ID).
    Rows are written grouped by Arbitration_ID and ordered by time within each
    ID. The result is saved next to the input as ``processed_<name>``.

    Parameters
    ----------
    filename : str
        Path of the raw CSV file. A bare file name is resolved against the
        current working directory, and the output is written there as well.
    n_data_fields : int, optional
        Number of data columns every message is padded to (default 64).

    Raises
    ------
    ValueError
        If a line cannot be parsed; the message names the file and line number.
    """
    print("Dataset Processing: ", filename)

    columns = (['Timestamp', 'Arbitration_ID']
               + [f'Data[{i}]' for i in range(n_data_fields)]
               + ['Label'])
    out_columns = ['dTIME'] + columns[1:]

    # Parse straight from the file: no intermediate DataFrame of raw strings
    # and no row-by-row iterrows() pass over it.
    processed_data = []
    with open(filename, 'r') as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue  # blank lines carry no message
            try:
                processed_data.append(_parse_can_fd_fields(line.split(','), n_data_fields))
            except (ValueError, IndexError) as exc:
                raise ValueError(f"{filename}:{line_no}: malformed row ({exc})") from exc

    if processed_data:
        processed_df = pd.DataFrame(processed_data, columns=columns)
        # Group the rows by ID and order them chronologically within each ID.
        processed_df = processed_df.sort_values(by=['Arbitration_ID', 'Timestamp'])
        # Gap to the previous message of the same ID; the first message of an
        # ID has no predecessor, so its NaN is replaced by 0.
        dtime = processed_df.groupby('Arbitration_ID', sort=False)['Timestamp'].diff().fillna(0)
        further_processed_df = processed_df.drop(columns=['Timestamp'])
        further_processed_df.insert(0, 'dTIME', dtime)
    else:
        # An empty input still produces a (header-only) output file.
        further_processed_df = pd.DataFrame(columns=out_columns)

    # Prefix the file name only, so paths that include a directory keep working.
    out_path = os.path.join(os.path.dirname(filename), "processed_" + os.path.basename(filename))
    further_processed_df.to_csv(out_path, index=False)
    print("Dataset Processed: ", filename)
    

In [161]:
# Run the full pipeline on each dataset file in turn. The entries of csv_files
# are bare file names, so process_csv opens them relative to the current
# working directory.
for csv_name in csv_files:
    process_csv(csv_name)

Dataset Processing:  CANFD_Malfunction.csv
Dataset Processed:  CANFD_Malfunction.csv
Dataset Processing:  CANFD_Fuzzing.csv
Dataset Processed:  CANFD_Fuzzing.csv
Dataset Processing:  CANFD_Flooding.csv
Dataset Processed:  CANFD_Flooding.csv
