# # FOR EVALUATION USE BY GRAB

___
# Background:
According to the World Health Organization, approximately 1.35 million people die as a result of a road traffic crash every year. Between 20 and 50 million more suffer non-fatal injuries, with many incurring a disability as a result of their injury.

Road traffic injuries cause considerable economic losses to individuals, their families, and to nations as a whole. These losses arise from the cost of treatment as well as lost productivity for those killed or disabled by their injuries, and for family members who need to take time off work or school to care for the injured. Road traffic crashes cost most countries 3% of their gross domestic product.

In my project, I will derive a classification model primarily through feature engineering to detect dangerous driving trips using time-series telemetry data from Grab. This model will be evaluated based on the AUC-ROC curve.

##### Dataset Obtained from (https://s3-ap-southeast-1.amazonaws.com/grab-aiforsea-dataset/safety.zip)

_______
# Dataset characteristics:

Dataset is a time-series of the following features for every "bookingID"

|Datapoint:     | Description:                                               | 
| ------------- |:-------------:                                             |
|bookingID      |trip id                                                     |
|Label          |1 indicates dangerous driving & 0 indicates safe driving    |

|Features (time-series):     | Description:                          |
| -------------              |:-------------:                        |
|Accuracy                    |accuracy inferred by GPS in meters     | 
|Bearing                     |GPS bearing in degree                  |
|acceleration_x              |accelerometer reading at x axis (m/s2) |
|acceleration_y              |accelerometer reading at y axis (m/s2) |
|acceleration_z              |accelerometer reading at z axis (m/s2) |
|gyro_x                      |gyroscope reading in x axis (rad/s)    |
|gyro_y                      |gyroscope reading in y axis (rad/s)    |
|gyro_z                      |gyroscope reading in z axis (rad/s)    |
|second                      |time of the record by number of seconds|
|Speed                       |speed measured by GPS in m/s           |

____
# Engineered Features:
Features are decided upon primarily through domain knowledge and analysis of correlation with label using a pairplot.

|Engineered Features (Datapoint):|Description: |
| -------------                  |---|
|Accuracy                        |Median Accuracy value with respect to range of timeseries data. If Median Accuracy is not available, Min Accuracy value is used instead 
|Bearing_delta                   |Standard Deviation of the differences between consecutive timeseries datapoints
|acceleration_x                  |Number of threshold crossovers with threshold set at 0
|acceleration_y                  |Number of threshold crossovers with threshold set at Mean of acceleration_y
|acceleration_z                  |Number of threshold crossovers with threshold set at Mean of acceleration_z
|gyro_x                          |Max gyro_x value - Min gyro_x value
|gyro_y                          |Max gyro_y value - Min gyro_y value
|gyro_z                          |Max gyro_z value - Min gyro_z value
|Speed                           |Mean of Speed in timeseries data
|missing_rng                     |Number of missing values in the range of timeseries data

In [1]:
import pandas as pd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import matplotlib.pyplot as plt
from pprint import pprint
sns.set()
from pathlib import Path

In [2]:
import tensorflow as tf
from sklearn.metrics import roc_auc_score
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier

from sklearn.model_selection import train_test_split
from keras.models import load_model

Using TensorFlow backend.


# Helper Functions

In [6]:
def threshold_crosssovers(df, metric, threshold):
    """Count rows whose thresholded reading differs in sign from ``second``.

    Args:
        df: DataFrame holding a ``second`` column and the ``metric`` column.
        metric: Name of the reading column to evaluate.
        threshold: Reference level; it is truncated with ``int()`` before use.

    Returns:
        int: Number of rows where ``sign(reading - int(threshold))`` is not
        equal to ``sign(second)``.

    NOTE(review): the name suggests counting crossings between consecutive
    samples, but ``np.diff`` operates along the last axis, which for the
    (rows, 2) array built here compares the two columns of each row rather
    than neighbouring rows. The behaviour is kept as-is because the result
    is used as a model feature -- presumably the saved model was trained on
    this exact computation; confirm before changing it.
    """
    # One (second, reading) pair per sample.
    samples = np.array(list(df[['second', metric]].itertuples(index=False, name=None)))
    # Per-row offsets: `second` is left untouched, the truncated threshold is
    # subtracted from the reading.
    offsets = [(0, int(threshold))] * len(samples)
    signs = np.sign(samples - offsets)
    # Difference along the last axis: reading sign minus `second` sign per row.
    sign_gaps = np.diff(signs)
    return int(np.count_nonzero(sign_gaps))

def missing_range(df):
    """Return how many values differ between the observed seconds and 0..last-1.

    Args:
        df: DataFrame with a ``second`` column; its final row supplies the
            upper bound of the expected range.

    Returns:
        int: Size of the symmetric difference between the set of observed
        ``second`` values and ``range(0, int(last_second))``.

    NOTE(review): the expected range stops one short of the last second, so
    a whole-numbered, non-negative last second is itself counted and the
    result is then at least 1. Kept unchanged because downstream feature
    values depend on it.
    """
    seconds = df['second']
    observed = set(seconds)
    expected = set(range(int(seconds.iloc[-1])))
    return len(observed.symmetric_difference(expected))

def difference(dataset, interval=1):
    """Return the lagged differences of an indexable sequence.

    Args:
        dataset: Sequence supporting ``len()`` and integer indexing.
        interval: Lag between the two elements being subtracted.

    Returns:
        list: ``dataset[i] - dataset[i - interval]`` for every ``i`` from
        ``interval`` up to ``len(dataset) - 1``.
    """
    return [
        dataset[position] - dataset[position - interval]
        for position in range(interval, len(dataset))
    ]

def match_ID_to_label(ID):
    """Look up the label of a booking in the global ``df_labels`` table.

    Args:
        ID: Value compared against ``df_labels['bookingID']``.

    Returns:
        The ``label`` of the first row whose ``bookingID`` equals ``ID``.

    Raises:
        IndexError: If no row matches ``ID``.
    """
    matching_labels = df_labels.loc[df_labels['bookingID'] == ID, 'label']
    return matching_labels.iloc[0]

# Function to convert TimeSeries to Datapoint

In [1]:
def convert_timeseries_to_datapoint(ID):
    """Collapse one booking's rows in ``df_features`` into a single feature row.

    Args:
        ID: Booking id used to select rows from the global ``df_features``.

    Returns:
        list: ``[ID, Accuracy, Bearing_delta, acceleration_x, acceleration_y,
        acceleration_z, gyro_x, gyro_y, gyro_z, Speed, missing_rng]``.
    """
    trip = df_features[df_features['bookingID'] == ID]
    summary = trip.describe()

    missing_rng = missing_range(trip)

    # Accuracy: the median when missing_range reports 0; otherwise the reading
    # taken at the halfway second, falling back to the minimum when that
    # second was not recorded.
    if missing_rng == 0:
        Accuracy = summary['Accuracy']['50%']
    else:
        halfway_second = int(trip['second'].iloc[-1] / 2)
        halfway_accuracy = trip.loc[trip['second'] == halfway_second, 'Accuracy']
        if len(halfway_accuracy) > 0:
            Accuracy = halfway_accuracy.iloc[0]
        else:
            Accuracy = summary['Accuracy']['min']

    # NOTE(review): the deltas are taken over (Bearing, second) row pairs and
    # np.std is applied to the whole result, so the second-to-second time
    # deltas are pooled with the bearing deltas -- confirm this is intended.
    bearing_and_time = trip.loc[:, ['Bearing', 'second']].values
    Bearing_delta = np.std(difference(bearing_and_time))

    # x is compared against 0, y and z against their own per-trip mean.
    acceleration_x = threshold_crosssovers(trip, 'acceleration_x', 0)
    acceleration_y, acceleration_z = (
        threshold_crosssovers(trip, axis, summary[axis]['mean'])
        for axis in ('acceleration_y', 'acceleration_z')
    )

    # Per-axis spread of the gyroscope readings (max minus min).
    gyro_x, gyro_y, gyro_z = (
        summary[axis]['max'] - summary[axis]['min']
        for axis in ('gyro_x', 'gyro_y', 'gyro_z')
    )

    Speed = summary['Speed']['mean']

    return [ID, Accuracy, Bearing_delta, acceleration_x, acceleration_y, acceleration_z, gyro_x, gyro_y, gyro_z, Speed, missing_rng]

# Read Datasets & sort them according to BookingID & seconds(if any)

In [3]:
# Read cleaned features .csv file to ipynb
# Load the cleaned features .csv and order rows by trip, then by second within a trip
df_features = (
    pd.read_csv('safety/data_cleaned_features.csv')
    .sort_values(['bookingID', 'second'])
    .reset_index(drop=True)
)
# Load the cleaned labels .csv and order rows by trip
df_labels = (
    pd.read_csv('safety/data_cleaned_labels.csv')
    .sort_values('bookingID')
    .reset_index(drop=True)
)

# Find unique_BookingIDs 

In [None]:
# Distinct booking ids, in the order they first appear in df_labels
unique_BookingIDs = df_labels['bookingID'].drop_duplicates().tolist()
print(unique_BookingIDs)

# Convert timeseries to Datapoint

In [5]:
col=['BookingID', 'Accuracy', 'Bearing', 
     'acceleration_x', 'acceleration_y', 'acceleration_z', 
     'gyro_x', 'gyro_y', 'gyro_z', 
     'Speed']

# Column order matches the list returned by convert_timeseries_to_datapoint
eng_features_col = ['ID', 'Accuracy', 'Bearing_delta', 
                    'acceleration_x', 'acceleration_y', 'acceleration_z',
                    'gyro_x', 'gyro_y', 'gyro_z',
                    'Speed', 'missing_rng']

# Collect one feature row per trip and build the frame once at the end.
# Concatenating inside the loop re-copies the whole frame on every iteration,
# and list.index() rescans the id list each time; both are quadratic.
datapoint_rows = []
for position, ID in enumerate(unique_BookingIDs):
    datapoint_rows.append(convert_timeseries_to_datapoint(ID))
    print(position)  # progress indicator: position of the trip just processed

# Every column is float64 (as the row-by-row construction produced) before the
# id column is cast back to an integer type.
df_datapoints = pd.DataFrame(datapoint_rows, columns=eng_features_col, dtype='float64')
df_datapoints['ID'] = df_datapoints['ID'].astype('int64')
print("df_datapoints shape is ", df_datapoints.shape)
df_datapoints.head()

In [None]:
# Write the engineered feature table to disk, omitting the DataFrame index
df_datapoints.to_csv(r'safety/engineered_features.csv', index=False)

# Get df_combined

In [None]:
# Work on a copy so the engineered feature table itself stays label-free
df_combined = df_datapoints.copy()
# The engineered table stores the trip id in the 'ID' column (there is no
# 'bookingID' column). Labels are looked up row by row and deliberately not
# sorted afterwards, so every label stays on the row of its own trip.
df_combined['label'] = df_combined['ID'].apply(match_ID_to_label)

# Load Model

In [8]:
# The model is only used for prediction in this notebook, so it is loaded
# without compiling. That removes the need for the custom metric functions
# (auroc, precision, recall), which are not defined in this notebook and made
# the original call fail with NameError.
loaded_model = load_model("train_model_ver1.h5", compile=False)
print("Model loaded from disk")

# Get X & y to input into model for prediction

In [11]:
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler

print(df_combined.shape)
# NOTE(review): eng_features_col includes 'ID', so the booking id is passed to
# the model as a feature -- confirm this matches the saved model's inputs.
# NOTE(review): the scaler is fitted on this evaluation data, not loaded from
# training -- confirm that is intended.
sc = StandardScaler()
X = sc.fit_transform(df_combined[eng_features_col].values)
X

In [None]:
print(df_labels.shape)
# Ground-truth labels in df_labels row order.
# NOTE(review): this lines up with X only if df_labels has exactly one row per
# booking id, in the same order as unique_BookingIDs -- confirm.
y = np.array(df_labels['label'].tolist())
print(y)

# Get AUC-ROC score & Confusion Matrix

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

# Set arbitrary_threshold for CM
arbitrary_threshold = 0.5 

# Raw model scores; flattened to 1-D wherever they are compared against y
y_pred_raw = loaded_model.predict(X)
# Hard 0/1 predictions for the confusion matrix, derived from the scores
# computed just above (the original referenced an undefined name here)
y_pred = (y_pred_raw.ravel() > arbitrary_threshold).astype(int)

fpr_model, tpr_model, thresholds_model = roc_curve(y, y_pred_raw.ravel())
auc_model = auc(fpr_model, tpr_model)
# Report the AUC that was just computed (the original printed an undefined name)
print("AUC-ROC score is %.4f" % auc_model)


from sklearn.metrics import confusion_matrix

# plt.subplots(1, 1) returns a single Axes object rather than an array,
# so it is used directly instead of being indexed.
f, axes = plt.subplots(1, 1, figsize=(12, 4))
sns.heatmap(confusion_matrix(y, y_pred),
           annot = True, fmt=".0f", annot_kws={"size": 18}, ax = axes)

axes.set_title("Confusion Matrix Results", fontsize=15)

<img src="Images/ConfusionMatrix_Eg.png">

(Credits: https://cdn-images-1.medium.com/max/1600/1*Z54JgbS4DUwWSknhDCvNTQ.png)

# Plot AUC-ROC graph

In [None]:
# Full-range ROC curve drawn on figure 1
roc_fig = plt.figure(1)
roc_ax = roc_fig.gca()
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference
roc_ax.plot([0, 1], [0, 1], 'k--')
roc_ax.plot(fpr_model, tpr_model, label='Model (area = {:.3f})'.format(auc_model))
roc_ax.set_xlabel('False positive rate')
roc_ax.set_ylabel('True positive rate (Sensitivity/Recall)')
roc_ax.set_title('ROC curve')
roc_ax.legend(loc='best')
plt.show()

In [None]:
# Same ROC curve on figure 2, restricted to the upper-left corner
zoom_fig = plt.figure(2)
zoom_ax = zoom_fig.gca()
zoom_ax.set_xlim(0, 0.5)
zoom_ax.set_ylim(0.5, 1)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference
zoom_ax.plot([0, 1], [0, 1], 'k--')
zoom_ax.plot(fpr_model, tpr_model, label='Model (area = {:.3f})'.format(auc_model))
zoom_ax.set_xlabel('False positive rate')
zoom_ax.set_ylabel('True positive rate (Sensitivity/Recall)')
zoom_ax.set_title('ROC curve (zoomed in at top left)')
zoom_ax.legend(loc='best')
plt.show()