<font size="+0.5">Notebook for transforming the raw data into the format used to train the model</font>

# <center> Data transform

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os

from datetime import datetime

from scipy.signal import savgol_filter
from sklearn.utils import shuffle

In [None]:
# strptime pattern of the 'timestamp' column in the raw sensor CSV files
# (day/month/year hour:minute:second); make_labeled_data uses it to parse timestamps.
form = "%d/%m/%Y %H:%M:%S"

In [None]:
# Lookup table: raw ground-truth label -> English event name used in the rest of
# the notebook. "No label" (samples outside every annotated interval) maps to itself.
replacement = {
    "curva_direita_agressiva": "Aggressive right turn",
    "curva_esquerda_agressiva": "Aggressive left turn",
    "evento_nao_agressivo": "Non-aggressive event",
    "troca_faixa_direita_agressiva": "Aggressive right lane change",
    "aceleracao_agressiva": "Aggressive acceleration",
    "freada_agressiva": "Aggressive breaking",
    "troca_faixa_esquerda_agressiva": "Aggressive left lane change",
    "No label": "No label",
}

In [None]:
# Row-wise helper that translates a raw event label
def replace_event(row):
    """Return the English event name for the raw label stored in ``row['event']``.

    The lookup goes through the module-level ``replacement`` table, so a label
    that is not one of its keys raises ``KeyError``.
    """
    raw_label = row['event']
    return replacement[raw_label]

In [None]:
# Load and concatenate accelerometer data with its events
def make_labeled_data(folder_num):
    """Load one trip's linear-acceleration log and label every sample with its event.

    Reads ``groundTruth.csv`` (annotated events) and ``aceleracaoLinear_terra.csv``
    (accelerometer samples) from ``data/data_init/<folder_num>/``.

    Parameters
    ----------
    folder_num : int or str
        Name of the trip folder inside ``data/data_init``.

    Returns
    -------
    pandas.DataFrame
        The accelerometer samples with the axis columns renamed to
        ``x_accelerometer`` / ``y_accelerometer`` / ``z_accelerometer`` and two
        added columns: ``time_duration`` (whole seconds elapsed since the first
        sample) and ``event`` (English event name, ``"No label"`` for samples
        outside every annotated interval).

    Raises
    ------
    KeyError
        If the ground truth contains a label missing from ``replacement``.
    """
    trip_dir = os.path.join('data', 'data_init', str(folder_num))
    # Load events and its time
    data_label = pd.read_csv(os.path.join(trip_dir, 'groundTruth.csv'))
    # Load accelerometer data
    data = pd.read_csv(os.path.join(trip_dir, 'aceleracaoLinear_terra.csv'))

    data = data.rename(columns={"x": "x_accelerometer", "y": "y_accelerometer", "z": "z_accelerometer"})

    # Parse every timestamp in one vectorised call; the first sample marks the
    # start of the trip.
    timestamps = pd.to_datetime(data['timestamp'], format=form)
    # total_seconds() instead of the timedelta ``.seconds`` field: ``.seconds``
    # drops the days component, so it would wrap around after 24 hours.
    elapsed = (timestamps - timestamps.iloc[0]).dt.total_seconds()
    data['time_duration'] = elapsed.astype('int64')

    # Create the column up front so it also exists when the ground truth is empty.
    data['event'] = None
    for _, row in data_label.iterrows():
        # The ground-truth headers carry a leading space (' inicio', ' fim').
        start = row[' inicio']
        finish = row[' fim']
        # Half-open interval [start, finish) in seconds since the trip start.
        in_interval = (data['time_duration'] >= start) & (data['time_duration'] < finish)
        data.loc[in_interval, 'event'] = row['evento']

    data['event'] = data['event'].fillna("No label")
    data['event'] = data.apply(replace_event, axis=1)

    return data

In [None]:
# Function for creating sequence of events in one dataframe
# Each event has its own number if it is on different time interval
def create_events_sequence(data):
    """Number the consecutive runs of identical values in ``data["event"]``.

    Every row receives the 1-based index of the run it belongs to, so two
    occurrences of the same event separated by a different event get
    different numbers.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an ``"event"`` column; rows are processed in positional order.

    Returns
    -------
    list of int
        One run number per row; an empty list for an empty frame.
    """
    events = data["event"]
    # A run starts wherever the value differs from the previous row. The first
    # row is compared with the missing value produced by shift(), so it always
    # opens run number 1. Working positionally (instead of looking labels up
    # with .loc) keeps the result correct for frames with duplicate index labels.
    run_starts = events != events.shift()
    return run_starts.cumsum().tolist()

In [None]:
# Collect the labeled segments of one dataframe into the per-event dictionary
def add_events_to_dict(data, dictionary):
    """Append every labeled segment of ``data`` to ``dictionary[event name]``.

    A segment is a run of consecutive rows sharing the same event. Rows whose
    event is ``"No label"`` are skipped.

    Side effect: an ``"event_number"`` column is added to ``data`` itself.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an ``"event"`` column.
    dictionary : dict
        Maps event name -> list of segment dataframes; updated in place.

    Returns
    -------
    dict
        The same ``dictionary`` object, after the update.
    """
    # Number the runs before filtering so that labeled runs separated only by
    # unlabeled rows still receive different numbers.
    data["event_number"] = create_events_sequence(data)
    labeled = data[data["event"] != "No label"]

    # groupby yields the segments in ascending run-number order.
    for _, segment in labeled.groupby("event_number"):
        label = segment["event"].values[0]
        collected = dictionary.get(label)
        if not collected:
            # First segment seen for this event: start a new list.
            dictionary[label] = [segment]
        else:
            collected.append(segment)
    return dictionary

In [None]:
# Labeled accelerometer data of the trips stored in folders 16, 17, 20 and 21
data1, data2, data3, data4 = [make_labeled_data(trip) for trip in (16, 17, 20, 21)]

# <center> Data filtering

### <center> Accelerometer data filtering

<font size="+0.5">Compare the initial and the filtered curves and choose the <i><b>window length</b></i> for which the filtered curve describes the data best.</font>

In [None]:
# Candidate Savitzky-Golay window lengths to compare: 11, 21, ..., 141
window_lengths = np.arange(11, 151, 10)
# Polynomial order held fixed while the window length is varied
polyorder = 3

In [None]:
# For every candidate window length: filter the three axes of the first trip and
# plot the first 500 raw and filtered samples side by side, one subplot per axis.
for window_length in window_lengths:

    fig, ax = plt.subplots(1, 3, figsize=(10, 5))

    for subplot, axis_name in zip(ax, ("x", "y", "z")):
        raw_column = f"{axis_name}_accelerometer"
        filtered_column = f"{axis_name}_accelerometer_fil"

        data1[filtered_column] = savgol_filter(data1[raw_column].values, window_length, polyorder)

        subplot.plot(data1[:500][raw_column].values, label=f"{axis_name} accelerometer")
        subplot.plot(data1[:500][filtered_column].values, label=f"{axis_name} accelerometer filtered")
        # Every subplot gets its legend (the z subplot previously had none).
        subplot.legend()

    plt.suptitle(f"Window length: {window_length}", fontsize=20)
    # Render each figure as soon as it is complete, like the polyorder comparison does.
    plt.show()
    

<font size="+0.5">Compare the initial and the filtered curves and choose the <i><b>polyorder</b></i> for which the filtered curve describes the data best.</font>

In [None]:
# Candidate polynomial orders to compare: 2, 3, ..., 14
polyorders = np.arange(2, 15, 1)
# Window length held fixed while the polynomial order is varied
window_length = 51

In [None]:
# For every candidate polynomial order: filter the three axes of the first trip
# and plot the first 500 raw and filtered samples, one subplot per axis.
for polyorder in polyorders:

    fig, ax = plt.subplots(1, 3, figsize=(10, 5))

    for subplot, axis_name in zip(ax, ("x", "y", "z")):
        raw_column = f"{axis_name}_accelerometer"
        filtered_column = f"{axis_name}_accelerometer_fil"

        data1[filtered_column] = savgol_filter(data1[raw_column].values, window_length, polyorder)

        head = data1[:500]
        subplot.plot(head[raw_column].values, label=f"{axis_name} accelerometer")
        subplot.plot(head[filtered_column].values, label=f"{axis_name} accelerometer filtered")
        subplot.legend()

    plt.suptitle(f"Polyorder: {polyorder}", fontsize=20)
    plt.show()
    

In [None]:
# Polynomial order selected for the final filtering; it replaces the value left
# behind by the comparison loop (window_length is still the 51 assigned earlier).
polyorder = 5

<font size="+0.5">Use selected parameters for filtering accelerometer data.</font>

In [None]:
# Smooth the three accelerometer axes of every trip with the selected
# Savitzky-Golay parameters; results go to the "<axis>_accelerometer_fil" columns.
for trip_data in (data1, data2, data3, data4):
    for axis_name in ("x", "y", "z"):
        raw_values = trip_data[f"{axis_name}_accelerometer"].values
        trip_data[f"{axis_name}_accelerometer_fil"] = savgol_filter(raw_values, window_length, polyorder)

# <center> Features creating

<font size="+0.5">Create sliding-window features: the mean, median and standard deviation of each window, and the increase/decrease tendency between consecutive windows.</font>

In [None]:
# Rolling mean of the filtered signal over the last 8 samples (fewer at the
# start of a trip, since min_periods=1), for every trip and axis.
for trip_data in (data1, data2, data3, data4):
    for axis_name in ("x", "y", "z"):
        window = trip_data[f"{axis_name}_accelerometer_fil"].rolling(8, min_periods=1)
        trip_data[f"mean_window_{axis_name}_accelerometer"] = window.mean()

In [None]:
# Rolling standard deviation of the filtered signal over the last 8 samples, for
# every trip and axis. The first row of a trip has a one-sample window, for which
# pandas' default sample std is NaN; missing values are back-filled later on.
for trip_data in (data1, data2, data3, data4):
    for axis_name in ("x", "y", "z"):
        window = trip_data[f"{axis_name}_accelerometer_fil"].rolling(8, min_periods=1)
        trip_data[f"std_window_{axis_name}_accelerometer"] = window.std()

In [None]:
# Rolling median of the filtered signal over the last 8 samples (fewer at the
# start of a trip, since min_periods=1), for every trip and axis.
for trip_data in (data1, data2, data3, data4):
    for axis_name in ("x", "y", "z"):
        window = trip_data[f"{axis_name}_accelerometer_fil"].rolling(8, min_periods=1)
        trip_data[f"median_window_{axis_name}_accelerometer"] = window.median()

In [None]:
def roll_column_with_duplicate(column):
    """Shift ``column`` one position to the right, repeating its first value.

    ``[c0, c1, ..., cn]`` becomes ``[c0, c0, c1, ..., c(n-1)]``, i.e. position
    ``i`` holds the previous element, and the first position holds the first
    element itself instead of the value wrapped around from the end.

    Parameters
    ----------
    column : array-like
        One-dimensional sequence of values.

    Returns
    -------
    numpy.ndarray
        New array of the same length; inputs with fewer than two elements are
        returned as an unchanged copy.
    """
    result = np.roll(column, 1)
    # np.roll wraps the last element around to the front; overwrite it with the
    # original first element (now at position 1). With fewer than two elements
    # there is nothing to overwrite, and indexing position 1 would fail.
    if len(result) > 1:
        result[0] = result[1]
    return result

In [None]:
# Tendency feature: the previous row's rolling mean divided by the current row's
# rolling mean, for every trip and axis.
for trip_data in (data1, data2, data3, data4):
    for axis_name in ("x", "y", "z"):
        current_mean = trip_data[f"mean_window_{axis_name}_accelerometer"]
        previous_mean = roll_column_with_duplicate(current_mean.values)
        trip_data[f"tendency_window_{axis_name}_accelerometer"] = previous_mean / current_mean

In [None]:
# Maps event name -> list of dataframes, one per contiguous occurrence of that
# event; filled by add_events_to_dict.
event_dict = {}

In [None]:
# Collect the labeled segments of all four trips, in trip order
for trip_data in (data1, data2, data3, data4):
    event_dict = add_events_to_dict(trip_data, event_dict)

In [None]:
train_agg_br = pd.concat(event_dict["Aggressive breaking"][:9])
val_agg_br = pd.concat(event_dict["Aggressive breaking"][9:])

train_agg_ac = pd.concat(event_dict["Aggressive acceleration"][:9])
val_agg_ac = pd.concat(event_dict["Aggressive acceleration"][9:])

train_agg_lt = pd.concat(event_dict["Aggressive left turn"][:9])
val_agg_lt = pd.concat(event_dict["Aggressive left turn"][9:])

train_agg_rt = pd.concat(event_dict["Aggressive right turn"][:9])
val_agg_rt = pd.concat(event_dict["Aggressive right turn"][9:])

train_agg_lc = pd.concat(event_dict["Aggressive left lane change"][:3])
val_agg_lc = pd.concat(event_dict["Aggressive left lane change"][3:])

train_agg_rc = pd.concat(event_dict["Aggressive right lane change"][:3])
val_agg_rc = pd.concat(event_dict["Aggressive right lane change"][3:])

train_agg_rc = pd.concat(event_dict["Non-aggressive event"][:11])
val_agg_rc = pd.concat(event_dict["Non-aggressive event"][11:])

In [None]:
train = pd.concat([train_agg_br, train_agg_ac, train_agg_lt, train_agg_rt, train_agg_lc, train_agg_rc, train_agg_rc])
val = pd.concat([val_agg_br, val_agg_ac, val_agg_lt, val_agg_rt, val_agg_lc, val_agg_rc, val_agg_rc])

In [None]:
# Columns written to the feature files: the four window statistics for each of
# the three axes (grouped by statistic), followed by the event label.
columns_to_save = [
    f"{statistic}_window_{axis_name}_accelerometer"
    for statistic in ("mean", "std", "median", "tendency")
    for axis_name in ("x", "y", "z")
] + ["event"]

In [None]:
# Replace missing feature values with the next valid value below them.
# DataFrame.bfill() is the supported spelling; fillna(method="bfill") is deprecated.
train = train.bfill()
val = val.bfill()

In [None]:
# Randomly permute the rows of both sets.
# NOTE(review): no random_state is passed, so the row order presumably differs
# between runs -- confirm whether reproducible output files are needed.
train = shuffle(train)
val = shuffle(val)

In [None]:
# Write the selected feature columns and the event label of the training set to CSV (row index omitted)
train[columns_to_save].to_csv('data/train_accelerometer_features.csv', index=False)

In [None]:
# Write the selected feature columns and the event label of the validation set to CSV (row index omitted)
val[columns_to_save].to_csv('data/val_accelerometer_features.csv', index=False)