## Import Libraries

In [18]:
from __future__ import print_function

import glob
import os
import random
import pickle

import numpy as np
import torch

from sklearn.model_selection import train_test_split

In [19]:
# Single random seed for the whole notebook: passed to seed_everything() and
# used as `random_state` for both train_test_split calls.
seed = 42

In [20]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (CPU and CUDA), then configures cuDNN for deterministic execution.

    Args:
        seed (int): Seed value applied to every generator.

    Note:
        ``PYTHONHASHSEED`` is only read when the interpreter starts, so the
        assignment below affects subprocesses launched afterwards, not the
        hash randomisation of the kernel that is already running.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # The CUDA seeding calls are silently ignored when no GPU is available.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner can pick a different convolution algorithm on each
    # run; force it off so the deterministic flag above is not undermined by
    # a benchmark setting enabled elsewhere.
    torch.backends.cudnn.benchmark = False

# Seed all RNGs once, up front, before any data is listed or split.
seed_everything(seed)

## Load Data

In [21]:
# Root directories of the preprocessed data for each class; every entry
# directly below a root is treated as one drive.
PATH_none_crash_prep = '../../dataset/data_preprocessed/None-crash'
PATH_vulner_prep = '../../dataset/data_preprocessed/Vulner'

# glob.glob() returns entries in a file-system-dependent order, so sort them
# to get the same drive order on every machine and every run.
none_crash_drive_list = sorted(glob.glob(PATH_none_crash_prep + "/*"))
vulner_drive_list = sorted(glob.glob(PATH_vulner_prep + "/*"))

In [22]:
# Collect the path of every entry found inside each drive directory, per class.
none_crash_data_list = list()
vulner_data_list = list()

for PATH_drive in none_crash_drive_list:
    none_crash_data_list += glob.glob(PATH_drive + "/*")

for PATH_drive in vulner_drive_list:
    vulner_data_list += glob.glob(PATH_drive + "/*")

# glob.glob() gives no ordering guarantee. The lists are later shuffled by a
# seeded train_test_split, which only reproduces the same split when its input
# order is identical, so put both lists into a canonical (sorted) order.
none_crash_data_list.sort()
vulner_data_list.sort()

In [23]:
# Report the number of drive directories per class, then the number of data
# files per class (the second pair of lines has no header of its own).
print("Number of Drives")
for class_name, drives in (("None-crash", none_crash_drive_list),
                           ("Vulner", vulner_drive_list)):
    print(f"{class_name}: {len(drives)}")

for class_name, files in (("None-crash", none_crash_data_list),
                          ("Vulner", vulner_data_list)):
    print(f"{class_name}: {len(files)}")

Number of Drives
None-crash: 51
Vulner: 91
None-crash: 30498
Vulner: 3788


### Split data into Train & Test

In [24]:
# One constant class label per data file; these lists are passed as the
# `stratify` argument of the per-class splits.
none_crash_labels = ['None-crash'] * len(none_crash_data_list)
vulner_labels = ['Vulner'] * len(vulner_data_list)

In [25]:
# Hold out 10% of the files of each class as a test set. The two classes are
# split independently, both with the same seed.
# NOTE(review): each `stratify` list holds a single constant label, so it does
# not balance anything within a call - confirm it is still wanted.
# NOTE(review): the split is made per file, not per drive, so files from one
# drive can land in both train and test - confirm this is intended.
none_crash_train_list, none_crash_test_list = train_test_split(
    none_crash_data_list,
    test_size=0.1,
    stratify=none_crash_labels,
    random_state=seed,
)

vulner_train_list, vulner_test_list = train_test_split(
    vulner_data_list,
    test_size=0.1,
    stratify=vulner_labels,
    random_state=seed,
)

In [26]:
# Show how many files ended up in each split of each class.
for split_name, split_files in (
    ("Train None-crash", none_crash_train_list),
    ("Test None-crash", none_crash_test_list),
    ("Train Vulner", vulner_train_list),
    ("Test Vulner", vulner_test_list),
):
    print(f"{split_name}: {len(split_files)}")

Train None-crash: 27448
Test None-crash: 3050
Train Vulner: 3409
Test Vulner: 379


## Save split data

In [27]:
# Write each list of file paths to its own pickle file in the current working
# directory, using the highest pickle protocol available.
for pickle_name, path_list in (
    ('train_none_crash.pickle', none_crash_train_list),
    ('test_none_crash.pickle', none_crash_test_list),
    ('train_vulner.pickle', vulner_train_list),
    ('test_vulner.pickle', vulner_test_list),
):
    with open(pickle_name, 'wb') as f:
        pickle.dump(path_list, f, pickle.HIGHEST_PROTOCOL)