In [None]:
# Import Libraries

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
import numpy as np
import pandas as pd

import pickle
import tqdm
import time
import glob
import re
import os

import torch

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [None]:
##### report the installed PyTorch version and the CUDA version it reports #####
cuda_available = torch.cuda.is_available()
torch_version = torch.__version__

# torch.version.cuda is only reported when a CUDA device is actually usable
if cuda_available:
    cuda_version = torch.version.cuda
else:
    cuda_version = 'not available'

# last expression of the cell -> rendered as the cell output
(torch_version, cuda_version)

In [None]:
##### remove every pandas display limit #####
# A value of None lifts the limit for the option:
#   display.max_colwidth -> show full cell content without truncation
#   display.max_columns  -> show all columns
#   display.width        -> do not wrap the frame to a fixed character width
#   display.max_rows     -> show all rows
for _display_option in ('display.max_colwidth', 'display.max_columns',
                        'display.width', 'display.max_rows'):
    pd.set_option(_display_option, None)

del _display_option

# Data Preparation

In [None]:
# Glob pattern for the input CSV files. os.path.join builds the pattern with
# the separator of the current platform (on Windows this is identical to the
# previous hard-coded backslash pattern).
path = os.path.join('..', 'Transformer', 'data', 'input', '*.csv')

# glob.glob returns matches in arbitrary (file-system dependent) order; sort
# them so the files are always processed -- and appended to the master
# tensor file -- in the same order.
path_csv = sorted(glob.glob(path))

In [None]:
# display the list of input CSV paths collected by the glob in the previous cell
path_csv

In [None]:
def train_test(data):
    """
    Engineer features, scale them, number contiguous sequences and split by month.

    Steps, in order:
      1. Replace 'RightShoulderWidth' (> 10), 'LeftShoulderWidth' (> 5) and
         'SegmentType' (!= 1) by 0/1 indicators that keep the original column
         names, and replace 'Crash_Precursor_SPFs' by its natural log
         ('Ln_Crash_Precursor_SPF').
      2. Parse 'measurement_start' and sort by ('SegmentId', 'measurement_start').
      3. Min-max scale every column except 'measurement_start',
         'Crash_Prediction_Index' and 'SegmentId' to [0, 1].
      4. Keep only rows whose 'Crash_Prediction_Index' is 0 or 1 and drop rows
         containing missing values.
      5. Assign 'sequenceNo': a new number starts whenever the SegmentId
         changes or the gap to the previous row exceeds one whole minute.
      6. Split on the calendar month of 'measurement_start': months 10-12 form
         the test set, all other months the training set.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'SegmentId', 'measurement_start', 'Crash_Prediction_Index',
        'RightShoulderWidth', 'LeftShoulderWidth', 'SegmentType' and
        'Crash_Precursor_SPFs'. All remaining columns are passed to the scaler.
        The frame is not modified.

    Returns
    -------
    tuple of pandas.DataFrame
        (train, test), each with a fresh 0..n-1 index and the extra columns
        'sequenceNo' and 'month'.
    """
    # work on a copy so the caller's frame is left untouched
    data = data.copy()

    # 0/1 indicators for shoulder widths and segment type (created in this
    # order so the resulting feature column order stays the same as before)
    data['RightShoulderWidthCat'] = (data['RightShoulderWidth'] > 10).astype(int)
    data['LeftShoulderWidthCat'] = (data['LeftShoulderWidth'] > 5).astype(int)
    data['Seg_BO'] = (data['SegmentType'] != 1).astype(int)

    # NOTE(review): np.log yields -inf for 0 and NaN for negative inputs --
    # confirm 'Crash_Precursor_SPFs' is strictly positive.
    data['Ln_Crash_Precursor_SPF'] = np.log(data['Crash_Precursor_SPFs'])

    data = data.drop(columns=['RightShoulderWidth', 'LeftShoulderWidth', 'SegmentType', 'Crash_Precursor_SPFs'])

    # the indicators take over the names of the raw columns they replace
    renamed_columns = {'LeftShoulderWidthCat': 'LeftShoulderWidth',
                       'RightShoulderWidthCat': 'RightShoulderWidth',
                       'Seg_BO': 'SegmentType'}
    data = data.rename(columns=renamed_columns)

    data['measurement_start'] = pd.to_datetime(data['measurement_start'])
    data = data.sort_values(by=['SegmentId', 'measurement_start']).reset_index(drop=True)

    # scale data
    # NOTE(review): the scaler is fitted on all rows, i.e. on the later test
    # months as well -- consider fitting on the training months only.
    key_columns = ['measurement_start', 'Crash_Prediction_Index', 'SegmentId']
    columns = data.columns.drop(key_columns)

    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled = pd.DataFrame(scaler.fit_transform(data[columns]), columns=columns, index=data.index)
    data = pd.concat([data[key_columns], scaled], axis=1)

    # filter: keep binary targets only, and drop incomplete rows BEFORE the
    # sequences are numbered so that a removed row breaks the sequence instead
    # of leaving a silent hole inside it
    data = data[data['Crash_Prediction_Index'].isin([0, 1])]
    data = data.dropna().reset_index(drop=True)

    # sequence numbering
    gap_minutes = data['measurement_start'].diff().dt.total_seconds().div(60).fillna(0).astype(int)

    # The frame is sorted by segment first, so at a segment boundary the time
    # difference is usually negative and would not be detected as a gap; a
    # change of SegmentId therefore has to start a new sequence explicitly.
    # The first row always counts as a change, so numbering starts at 1.
    segment_changed = data['SegmentId'].ne(data['SegmentId'].shift())

    data['sequenceNo'] = ((gap_minutes > 1) | segment_changed).cumsum()

    data['month'] = data['measurement_start'].dt.month

    # split in train and test data on months
    is_test_month = data['month'].isin([10, 11, 12])
    train = data[~is_test_month].reset_index(drop=True)
    test = data[is_test_month].reset_index(drop=True)

    return train, test

In [None]:
##### function to create sequences #####
def create_sequences(X, y, window):
    """
    Cut sliding windows of `window` consecutive rows out of X.

    Window i covers rows i .. i + window - 1 of X and is labelled with the
    row of y that belongs to the LAST row of that window (row i + window - 1).
    Every complete window is returned, including the one that ends on the
    final row, so an input of n rows yields n - window + 1 windows.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows in temporal order.
    y : pandas.DataFrame or pandas.Series
        Targets aligned row by row with X.
    window : int
        Number of consecutive rows per sequence (>= 1).

    Returns
    -------
    tuple of numpy.ndarray
        X_sequences of shape (n_windows, window, n_features) and y_sequences
        of shape (n_windows,) + y.values.shape[1:]. When X has fewer rows than
        `window` both arrays are empty but keep these trailing dimensions, so
        they can still be stacked with the results of other calls.

    Raises
    ------
    ValueError
        If `window` is smaller than 1.
    """
    if window < 1:
        raise ValueError('window must be a positive integer')

    # materialise the arrays once instead of on every loop iteration
    X_values, y_values = X.values, y.values

    n_windows = X_values.shape[0] - window + 1

    if n_windows <= 0:
        return (np.empty((0, window) + X_values.shape[1:], dtype=X_values.dtype),
                np.empty((0,) + y_values.shape[1:], dtype=y_values.dtype))

    X_sequences = np.stack([X_values[i:(i + window)] for i in range(n_windows)])

    # label of window i is row i + window - 1, i.e. rows window-1 .. n-1 of y
    y_sequences = y_values[(window - 1):].copy()

    return X_sequences, y_sequences

In [None]:
def data_sequencing(data, sequence_column, drop_columns, target_column, window):
    """
    Build sliding-window samples separately for every sequence in `data`.

    Rows are grouped by `sequence_column` (groups are visited in ascending
    order of the sequence value). Groups with fewer than `window` rows are
    skipped; every other group is passed to `create_sequences`, so no window
    ever spans two different sequences.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows ordered in time within each sequence.
    sequence_column : str
        Column holding the sequence identifier.
    drop_columns : list of str
        Columns removed from the feature matrix.
    target_column : str
        Column used as the target.
    window : int
        Number of consecutive rows per sample.

    Returns
    -------
    tuple of numpy.ndarray
        X of shape (n_samples, window, n_features) and y of shape
        (n_samples, 1). If no group is long enough, both arrays are returned
        empty with these shapes (n_samples == 0) instead of raising.
    """
    X, y = [], []

    # groupby selects each sequence in one pass and honours `sequence_column`
    for _, data_temp in data.groupby(sequence_column, sort=True):

        if data_temp.shape[0] < window:
            continue

        data_temp = data_temp.reset_index(drop=True)

        X_temp, y_temp = create_sequences(X=data_temp.drop(columns=drop_columns), y=data_temp[[target_column]],
                                          window=window)

        # a group may still produce no window; an empty, differently shaped
        # array would make np.vstack fail below
        if len(X_temp) == 0:
            continue

        X.append(X_temp)
        y.append(y_temp)

    if not X:
        n_features = data.drop(columns=drop_columns).shape[1]
        return np.empty((0, window, n_features)), np.empty((0, 1))

    return np.vstack(X), np.vstack(y)

In [None]:
def data_preparation(path_input, path_output, window, cuda=True):
    """
    Turn one input CSV into windowed tensors and append them to the master file.

    The CSV is read (selected columns only), rows with missing values are
    dropped, `train_test` produces the train / test frames, and
    `data_sequencing` converts each of them into sliding-window arrays. The
    arrays are converted to float32 tensors. If `path_output` already exists,
    its tensors are loaded and the new ones are concatenated behind them along
    dimension 0. The tuple (X_train, y_train, X_test, y_test) is then written
    to `path_output` with `torch.save`.

    Parameters
    ----------
    path_input : str
        Path of the CSV file to process.
    path_output : str
        Path of the master tensor file that is created or extended.
    window : int
        Number of consecutive rows per sample.
    cuda : bool, default True
        Place the tensors on the GPU when CUDA is available; otherwise (or
        when False) they stay on the CPU.

    Returns
    -------
    None

    Notes
    -----
    Calling this function twice for the same CSV appends its samples twice;
    delete `path_output` before re-processing all files from scratch.
    """
    input_columns = ['SegmentId', 'measurement_start',
                     '5min_avg_speed', '5min_std_speed',
                     '5min_cv_speed', '5min_avg_volume', '5min_std_volume', '5min_cv_volume',
                     '5min_avg_occupancy', '5min_std_occupancy', '5min_cv_occupancy',
                     '5min_avg_speed_upstream', '5min_std_speed_upstream',
                     '5min_cv_speed_upstream', '5min_avg_volume_upstream',
                     '5min_std_volume_upstream', '5min_cv_volume_upstream',
                     '5min_avg_occupancy_upstream', '5min_std_occupancy_upstream',
                     '5min_cv_occupancy_upstream', '5min_avg_speed_downstream',
                     '5min_std_speed_downstream', '5min_cv_speed_downstream',
                     '5min_avg_volume_downstream', '5min_std_volume_downstream',
                     '5min_cv_volume_downstream', '5min_avg_occupancy_downstream',
                     '5min_std_occupancy_downstream', '5min_cv_occupancy_downstream', 'Segment Length',
                     'ThruLanes', 'SegmentType', 'RightShoulderWidth', 'LeftShoulderWidth',
                     'Turnout', 'Temperature', 'Relative Humidity', 'Wind Speed', 'Cloud Cover',
                     'Precipitation', 'Snow Depth', 'Visibility', 'Diff_Avg_Volume',
                     'Diff_Avg_Occupancy', 'Diff_Std_Occupancy', 'Diff_CV_Occupancy', 'Diff_Avg_Speed',
                     'Diff_Std_Speed', 'Diff_CV_Speed', 'Crash_Precursor_SPFs', 'Day', 'Crash_Prediction_Index']

    df_input = pd.read_csv(path_input, low_memory=True, usecols=input_columns)

    df_input = df_input.dropna()

    df_train, df_test = train_test(data=df_input)

    # bookkeeping columns that must not end up in the feature matrix
    non_feature_columns = ['measurement_start', 'Crash_Prediction_Index', 'SegmentId', 'sequenceNo', 'month']

    X_train, y_train = data_sequencing(data=df_train, sequence_column='sequenceNo',
                                       drop_columns=non_feature_columns,
                                       target_column='Crash_Prediction_Index', window=window)

    X_test, y_test = data_sequencing(data=df_test, sequence_column='sequenceNo',
                                     drop_columns=non_feature_columns,
                                     target_column='Crash_Prediction_Index', window=window)

    # use the gpu only if requested and available
    device = torch.device('cuda' if cuda and torch.cuda.is_available() else 'cpu')

    # convert to pytorch tensors, created directly on the target device
    X_train = torch.tensor(X_train, dtype=torch.float32, device=device)
    y_train = torch.tensor(y_train, dtype=torch.float32, device=device)
    X_test = torch.tensor(X_test, dtype=torch.float32, device=device)
    y_test = torch.tensor(y_test, dtype=torch.float32, device=device)

    # append to the master file
    if os.path.exists(path_output):
        # map_location puts the stored tensors on the same device as the new
        # ones; without it a file written from the GPU cannot be loaded on a
        # CPU-only machine, and torch.cat fails on tensors of mixed devices
        output_X_train, output_y_train, output_X_test, output_y_test = torch.load(path_output,
                                                                                  map_location=device)

        # concatenate new data with existing data
        X_train, y_train = torch.cat((output_X_train, X_train), 0), torch.cat((output_y_train, y_train), 0)
        X_test, y_test = torch.cat((output_X_test, X_test), 0), torch.cat((output_y_test, y_test), 0)

    # save the concatenated data
    torch.save((X_train, y_train, X_test, y_test), path_output)

In [None]:
# Master tensor file. The path is built with os.path.join instead of a plain
# string literal: in a non-raw string '\T', '\d' and '\o' are invalid escape
# sequences (a warning today, an error in future Python versions).
# On Windows the resulting path is identical to the previous literal.
path_output = os.path.join('..', 'Transformer', 'data', 'output', 'data_sequence_pm.pt')

# NOTE: data_preparation appends to path_output, so re-running this cell
# without deleting the file first adds every CSV a second time.
for path_input in tqdm.tqdm(path_csv):

    data_preparation(path_input=path_input, path_output=path_output, window=5)