#### Dataset preprocessing

In [7]:
import os

import numpy as np
import pandas as pd

In [8]:
# Column layout shared by every capture file. Rows that contain fewer fields
# than there are names are padded with NaN on the right by read_csv.
names = ['timestamp', 'canId', 'dlc', 'data0', 'data1', 'data2', 'data3', 'data4', 'data5', 'data6', 'data7', 'flag']

# Paths are assembled with os.path.join instead of hard-coded '\\' separators
# so the notebook also runs where the separator is '/'. On Windows the
# resulting strings are the same as before, and the *_path names stay str.
sonata_path = os.path.join('survival', 'dataset', 'Sonata')
sonata_data_flooding = pd.read_csv(os.path.join(sonata_path, 'Flooding_dataset_SONATA.txt'), sep=',', names=names)
sonata_data_fuzzy = pd.read_csv(os.path.join(sonata_path, 'Fuzzy_dataset_SONATA.txt'), sep=',', names=names)
sonata_data_malfunction = pd.read_csv(os.path.join(sonata_path, 'Malfunction_dataset_SONATA.txt'), sep=',', names=names)
# The free-driving capture is split on commas and/or whitespace, and every row
# of it is labelled 'R'.
sonata_data_normal = pd.read_csv(os.path.join(sonata_path, 'FreeDrivingData_20180323_SONATA.txt'), sep=r'[,\s]+', engine='python', names=names)
sonata_data_normal['flag'] = 'R'

soul_path = os.path.join('survival', 'dataset', 'Soul')
soul_data_flooding = pd.read_csv(os.path.join(soul_path, 'Flooding_dataset_KIA.txt'), sep=',', names=names)
soul_data_fuzzy = pd.read_csv(os.path.join(soul_path, 'Fuzzy_dataset_KIA.txt'), sep=',', names=names)
soul_data_malfunction = pd.read_csv(os.path.join(soul_path, 'Malfunction153_dataset_KIA.txt'), sep=',', names=names)
soul_data_normal = pd.read_csv(os.path.join(soul_path, 'FreeDrivingData_20180112_KIA.txt'), sep=r'[,\s]+', engine='python', names=names)
soul_data_normal['flag'] = 'R'

spark_path = os.path.join('survival', 'dataset', 'Spark')
spark_data_flooding = pd.read_csv(os.path.join(spark_path, 'Flooding_dataset_Spark.txt'), sep=',', names=names)
spark_data_fuzzy = pd.read_csv(os.path.join(spark_path, 'Fuzzy_dataset_Spark.txt'), sep=',', names=names)
spark_data_malfunction = pd.read_csv(os.path.join(spark_path, 'Malfunction18E_dataset_Spark.txt'), sep=',', names=names)
spark_data_normal = pd.read_csv(os.path.join(spark_path, 'FreeDrivingData_20171231_Spark.txt'), sep=r'[,\s]+', engine='python', names=names)
spark_data_normal['flag'] = 'R'

# Report (rows, columns) of every loaded capture, grouped by vehicle.
print('Sonata Flooding:', sonata_data_flooding.shape)
print('Sonata Fuzzy:', sonata_data_fuzzy.shape)
print('Sonata Malfunction:', sonata_data_malfunction.shape)
print('Sonata Normal:', sonata_data_normal.shape)
print('---------------------------------')

print('Soul Flooding:', soul_data_flooding.shape)
print('Soul Fuzzy:', soul_data_fuzzy.shape)
print('Soul Malfunction:', soul_data_malfunction.shape)
print('Soul Normal:', soul_data_normal.shape)
print('---------------------------------')

print('Spark Flooding:', spark_data_flooding.shape)
print('Spark Fuzzy:', spark_data_fuzzy.shape)
print('Spark Malfunction:', spark_data_malfunction.shape)
print('Spark Normal:', spark_data_normal.shape)
print('---------------------------------')

Sonata Flooding: (149547, 12)
Sonata Fuzzy: (135670, 12)
Sonata Malfunction: (132651, 12)
Sonata Normal: (117173, 12)
---------------------------------
Soul Flooding: (181901, 12)
Soul Fuzzy: (249990, 12)
Soul Malfunction: (173436, 12)
Soul Normal: (192516, 12)
---------------------------------
Spark Flooding: (120570, 12)
Spark Fuzzy: (65665, 12)
Spark Malfunction: (79787, 12)
Spark Normal: (136934, 12)
---------------------------------


In [9]:
# Stack the four captures of each vehicle into a single frame; ignore_index
# replaces the per-file row labels with one continuous 0..n-1 index.
sonata_parts = [sonata_data_flooding, sonata_data_fuzzy, sonata_data_malfunction, sonata_data_normal]
soul_parts = [soul_data_flooding, soul_data_fuzzy, soul_data_malfunction, soul_data_normal]
spark_parts = [spark_data_flooding, spark_data_fuzzy, spark_data_malfunction, spark_data_normal]

sonata_data = pd.concat(sonata_parts, ignore_index=True)
soul_data = pd.concat(soul_parts, ignore_index=True)
spark_data = pd.concat(spark_parts, ignore_index=True)

# Print the combined (rows, columns) of every vehicle.
for label, combined in (('Sonata:', sonata_data), ('Soul:', soul_data), ('Spark:', spark_data)):
    print(label, combined.shape)

Sonata: (535041, 12)
Soul: (797843, 12)
Spark: (402956, 12)


In [10]:
# Per-column NaN counts for each vehicle, separated by a rule between
# (but not after) the reports.
nan_reports = (
    ('Sonata NaN in each column:', sonata_data),
    ('Soul NaN in each column:', soul_data),
    ('Spark NaN in each column:', spark_data),
)
for report_number, (heading, frame) in enumerate(nan_reports):
    if report_number > 0:
        print('---------------------------------')
    print(heading)
    print(frame.isna().sum())

Sonata NaN in each column:
timestamp        0
canId            0
dlc              0
data0            0
data1            0
data2         1200
data3         4796
data4         4796
data5        10793
data6        28792
data7        28792
flag         21595
dtype: int64
---------------------------------
Soul NaN in each column:
timestamp        0
canId            0
dlc              0
data0            0
data1            0
data2          461
data3         2641
data4         6167
data5        18147
data6        43277
data7        43277
flag         31669
dtype: int64
---------------------------------
Spark NaN in each column:
timestamp         0
canId             0
dlc               0
data0             0
data1          2502
data2         10503
data3         19069
data4         35928
data5         62459
data6         87679
data7        137125
flag         106450
dtype: int64


In [11]:
# Preview the combined Spark frame (the notebook renders a cell's last bare
# expression). Rows with a missing 'flag' show their label in a data column.
spark_data

Unnamed: 0,timestamp,canId,dlc,data0,data1,data2,data3,data4,data5,data6,data7,flag
0,1.513920e+09,04C1,8,00,CC,80,5E,52,08,00,00,R
1,1.513920e+09,04C7,3,10,00,00,R,,,,,
2,1.513920e+09,01E1,7,00,00,00,00,00,00,00,R,
3,1.513920e+09,00C1,8,00,F9,05,41,02,85,8B,91,R
4,1.513920e+09,00C5,8,03,52,0F,1D,C3,F4,03,D4,R
...,...,...,...,...,...,...,...,...,...,...,...,...
402951,1.513920e+09,232,8,00,00,00,08,00,00,00,00,R
402952,1.513920e+09,18E,8,00,00,00,69,A6,9A,06,9A,R
402953,1.513920e+09,191,8,06,9A,06,AA,06,9A,00,00,R
402954,1.513920e+09,0C7,4,00,07,02,20,,,,,R


In [12]:
# Rows with fewer fields than named columns end up with their label in a data
# column and NaN in 'flag'. For every such row, move the right-most non-null
# value into 'flag' and blank the cell it came from.
#
# This is done one column at a time with boolean masks rather than with
# iterrows(): the row loop is very slow on a frame of this size and it wrote to
# the frame while iterating over it, which pandas does not guarantee to work.
needs_flag = spark_data['flag'].isna().to_numpy()
present = spark_data.notna().to_numpy()
# argmax on the column-reversed mask finds the first True from the right, i.e.
# the position of the last non-null cell in each row.
last_filled = present.shape[1] - 1 - present[:, ::-1].argmax(axis=1)

for position, column_name in enumerate(spark_data.columns):
    if column_name == 'flag':
        continue
    # `present[:, position]` excludes rows that are entirely NaN, for which
    # argmax has no True to find.
    misplaced = needs_flag & (last_filled == position) & present[:, position]
    if not misplaced.any():
        continue
    spark_data.loc[misplaced, 'flag'] = spark_data.loc[misplaced, column_name].to_numpy()
    spark_data.loc[misplaced, column_name] = np.nan


In [13]:
# Display the Spark frame again, after the previous cell moved misplaced
# labels into the 'flag' column.
spark_data

Unnamed: 0,timestamp,canId,dlc,data0,data1,data2,data3,data4,data5,data6,data7,flag
0,1.513920e+09,04C1,8,00,CC,80,5E,52,08,00,00,R
1,1.513920e+09,04C7,3,10,00,00,,,,,,R
2,1.513920e+09,01E1,7,00,00,00,00,00,00,00,,R
3,1.513920e+09,00C1,8,00,F9,05,41,02,85,8B,91,R
4,1.513920e+09,00C5,8,03,52,0F,1D,C3,F4,03,D4,R
...,...,...,...,...,...,...,...,...,...,...,...,...
402951,1.513920e+09,232,8,00,00,00,08,00,00,00,00,R
402952,1.513920e+09,18E,8,00,00,00,69,A6,9A,06,9A,R
402953,1.513920e+09,191,8,06,9A,06,AA,06,9A,00,00,R
402954,1.513920e+09,0C7,4,00,07,02,20,,,,,R


In [14]:
# Rows with fewer fields than named columns end up with their label in a data
# column and NaN in 'flag'. For every such row, move the right-most non-null
# value into 'flag' and blank the cell it came from.
#
# This is done one column at a time with boolean masks rather than with
# iterrows(): the row loop is very slow on a frame of this size and it wrote to
# the frame while iterating over it, which pandas does not guarantee to work.
needs_flag = sonata_data['flag'].isna().to_numpy()
present = sonata_data.notna().to_numpy()
# argmax on the column-reversed mask finds the first True from the right, i.e.
# the position of the last non-null cell in each row.
last_filled = present.shape[1] - 1 - present[:, ::-1].argmax(axis=1)

for position, column_name in enumerate(sonata_data.columns):
    if column_name == 'flag':
        continue
    # `present[:, position]` excludes rows that are entirely NaN, for which
    # argmax has no True to find.
    misplaced = needs_flag & (last_filled == position) & present[:, position]
    if not misplaced.any():
        continue
    sonata_data.loc[misplaced, 'flag'] = sonata_data.loc[misplaced, column_name].to_numpy()
    sonata_data.loc[misplaced, column_name] = np.nan

In [15]:
# Rows with fewer fields than named columns end up with their label in a data
# column and NaN in 'flag'. For every such row, move the right-most non-null
# value into 'flag' and blank the cell it came from.
#
# This is done one column at a time with boolean masks rather than with
# iterrows(): the row loop is very slow on a frame of this size and it wrote to
# the frame while iterating over it, which pandas does not guarantee to work.
needs_flag = soul_data['flag'].isna().to_numpy()
present = soul_data.notna().to_numpy()
# argmax on the column-reversed mask finds the first True from the right, i.e.
# the position of the last non-null cell in each row.
last_filled = present.shape[1] - 1 - present[:, ::-1].argmax(axis=1)

for position, column_name in enumerate(soul_data.columns):
    if column_name == 'flag':
        continue
    # `present[:, position]` excludes rows that are entirely NaN, for which
    # argmax has no True to find.
    misplaced = needs_flag & (last_filled == position) & present[:, position]
    if not misplaced.any():
        continue
    soul_data.loc[misplaced, 'flag'] = soul_data.loc[misplaced, column_name].to_numpy()
    soul_data.loc[misplaced, column_name] = np.nan

In [16]:
# Per-column NaN counts for each vehicle after the flag repair, separated by a
# rule between (but not after) the reports.
nan_reports = (
    ('Sonata NaN in each column:', sonata_data),
    ('Soul NaN in each column:', soul_data),
    ('Spark NaN in each column:', spark_data),
)
for report_number, (heading, frame) in enumerate(nan_reports):
    if report_number > 0:
        print('---------------------------------')
    print(heading)
    print(frame.isna().sum())

Sonata NaN in each column:
timestamp        0
canId            0
dlc              0
data0            0
data1            0
data2         4796
data3         4796
data4         4796
data5        28792
data6        28792
data7        28792
flag             0
dtype: int64
---------------------------------
Soul NaN in each column:
timestamp        0
canId            0
dlc              0
data0            0
data1            0
data2         1719
data3         5154
data4         8935
data5        43277
data6        43277
data7        43277
flag             0
dtype: int64
---------------------------------
Spark NaN in each column:
timestamp         0
canId             0
dlc               0
data0             0
data1          6691
data2         16905
data3         22703
data4         58126
data5         69733
data6        117506
data7        170051
flag              0
dtype: int64


In [17]:
# Replace every remaining NaN with the sentinel -1 in all three vehicle frames.
sonata_data, soul_data, spark_data = (
    frame.fillna(-1) for frame in (sonata_data, soul_data, spark_data)
)

In [18]:
# Write each cleaned frame to a CSV in the working directory, without the
# row index column.
csv_exports = (
    ('sonata_data.csv', sonata_data),
    ('soul_data.csv', soul_data),
    ('spark_data.csv', spark_data),
)
for csv_name, frame in csv_exports:
    frame.to_csv(csv_name, index=False)