# Grab Safety Analysis

In [None]:
from IPython.core.interactiveshell import InteractiveShell
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Configure data visualization
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

# Background and Objective
Safety is an important aspect of online transportation. We want customers to feel safe riding Grab so that they can do other things on the way without worry. 

Customers could feel unsafe because of the driver's behaviour or driving skill, e.g.: 
- The driver uses unpopular shortcuts 
- The driver talks on the phone or with customers
- The driver keeps looking at the GPS and doesn't pay attention to the road 
- The driver is sleepy 
- Speeding
- Harsh acceleration, braking, or cornering
- Running over a speed bump / pothole at high speed

If we could quickly detect when a driver starts driving unsafely, we could remind the driver in real time to prevent something bad from happening.  

# Raw Data

In [None]:
import glob

# Sort the paths: glob.glob() returns files in arbitrary, OS-dependent order,
# so without sorting the `[:1]` slice below could select a different part file
# on different machines and make the analysis non-reproducible.
features_raw_files = sorted(glob.glob('./data/features/*.csv'))

# Only the first part file is loaded here.
# Remove the `[:1]` slice to take data from all parts.
features_raw_list = [
    pd.read_csv(file_name, index_col=None, header=0)
    for file_name in features_raw_files[:1]
]

# Stack the loaded parts into one frame with a fresh 0..n-1 row index.
features_raw = pd.concat(features_raw_list, axis=0, ignore_index=True)
labels = pd.read_csv('./data/labels/part-00000-e9445087-aa0a-433b-a7f6-7f4c19d78ad6-c000.csv')

# The per-file frames are no longer needed once concatenated.
del features_raw_list

In [None]:
print('Features data sample:')
features_raw.head()
print('Labels data sample:')
labels.head()

In [None]:
features_raw.loc[features_raw.bookingID==1202590843006,:].sort_values(by='second').head()

## Feature description
### Booking id
- Trip id
- Possibly relate to service type (GrabCar/Bike) ?

### Accuracy
- Accuracy inferred by GPS in meters
- Affect uncertainty level for GPS Bearing and Speed

### Bearing
- GPS bearing in degree
- The degree of the GPS movement relative from North
- Could relate to GPS accuracy. Less accurate means more uncertainty in the real bearing.
- Beware that 10 degree to 340 degree is 30 degree difference

### Acceleration (x, y, z)
- Accelerometer reading at certain axis (m/s2)
- [YouTube explanation of how an accelerometer works](https://www.youtube.com/watch?v=KZVgKu6v808)
- Concern: how could we factor out gravity acceleration.
- Concern: phone orientation

### Gyro (x, y, z)
- Gyroscope reading in certain axis (rad/s)
- Measure angular velocity / speed of rotation
- [Explanation of how a gyroscope works](https://learn.sparkfun.com/tutorials/gyroscope/all)
- Concern: Gyroscope bias, usually caused by heat

### Second
- Time of the record by number of seconds
- Remember that the interval between records is not constant (e.g. it is not always one record every 2s). If we want to use lag features, we need to account for the time delta or interpolate.

### Speed
- Speed measured by GPS in m/s
- Could relate with GPS accuracy. Less accurate means more uncertainty in the real speed.

# Basic Data Preprocess

### Split to train and test set by bookingID
We split the data into only two sets, a train set and a test set, and we use the test set as the validation set as well. 
The data is split by `bookingID` to make sure there is no data leakage from the train set to the test set.

In [None]:
# Fraction of the trips (bookingIDs) that goes into the training set.
train_dataset_ratio = 0.7

all_booking_ids = features_raw.bookingID.unique()
n_train_bookings = int(train_dataset_ratio * all_booking_ids.shape[0])

# Seed the global NumPy RNG so the same trips are drawn on every run.
np.random.seed(1)
train_booking_id = np.random.choice(all_booking_ids, size=n_train_bookings, replace=False)

# Row masks: True where the row belongs to a trip drawn for training.
features_in_train = features_raw.bookingID.isin(train_booking_id)
labels_in_train = labels.bookingID.isin(train_booking_id)

# Everything that was not drawn for training becomes the test set.
train_dataset = features_raw[features_in_train].copy(deep=False)
train_label = labels[labels_in_train].copy(deep=False)
test_dataset = features_raw[~features_in_train].copy(deep=False)
test_label = labels[~labels_in_train].copy(deep=False)

In [None]:
print('Safe and un-save trips')
labels.label.value_counts()
print('\n')
print('#BookingID with more than 1 rows in labels: {0}'.format((labels.bookingID.value_counts() > 1).sum()))

### Handle double label

In [None]:
def preproces_label(labels):
    """Collapse the label table to one row per bookingID.

    When a bookingID appears more than once, the maximum of each remaining
    column is kept for it. The result has 'bookingID' as a regular column
    and a fresh 0..n-1 index, ordered by bookingID.
    """
    deduplicated = labels.groupby('bookingID', as_index=False).max()
    return deduplicated

In [None]:
train_label = preproces_label(train_label)
test_label = preproces_label(test_label)

# Feature Engineering

### Sequence and Sort

In [None]:
def ensure_sorted(dataset):
    """Return the rows ordered by (bookingID, second) with a 'sequence' column.

    'sequence' is the 1-based rank of 'second' inside each bookingID; ties are
    numbered in order of appearance. The column is added to a shallow copy, so
    the frame passed in does not gain it. The original row index labels are
    kept (only their order changes).
    """
    ordered = dataset.copy(deep=False)

    # Position of each record within its own trip, by ascending time.
    ordered['sequence'] = ordered.groupby('bookingID')['second'].rank(method='first', ascending=True)

    return ordered.sort_values(by=['bookingID', 'second'])

### Gyroscope Data

- Usually the phone is not rotating all the time, so the gyroscope value should be 0 most of the time.
- [Knowing that gyroscope readings have a bias](https://base.xsens.com/hc/en-us/articles/209611089-Understanding-Sensor-Bias-offset-), we could use the median to find the expected reading while the phone is in a stable position. Note that the code below currently uses the per-trip mean as the bias estimate. 

In [None]:
sns.distplot(features_raw.loc[features_raw.bookingID==1477468749954,['gyro_x']])
print('Gyroscope reading, x-axis bias:', features_raw.loc[features_raw.bookingID==1477468749954,['gyro_x']].mean());

In [None]:
def gyro_data_enrich(dataset):
    """Add bias-corrected gyroscope features to a per-record dataset.

    The rows are first ordered with ensure_sorted. Columns added:
    - gyro_{x,y,z}_stable: per-bookingID mean of the raw axis reading,
      used as the estimate of the sensor bias.
    - gyro_{x,y,z}_filtered: raw reading minus the '_stable' value.
    - gyro_filtered_magnitude: Euclidean norm of the three filtered axes.
    - gyro_filtered_std: per-bookingID standard deviation of that magnitude.

    '_stable' and '_filtered' columns that already exist are left as they are.
    Because of the merges, the returned frame has a fresh 0..n-1 index.
    """
    def attach_per_trip(frame, per_trip_values, new_name):
        # Broadcast one value per bookingID back onto every row of that trip.
        stats = per_trip_values.reset_index()
        stats.columns = ['bookingID', new_name]
        return pd.merge(frame, stats, how='left', on='bookingID', validate='m:1')

    result = ensure_sorted(dataset.copy(deep=False))
    axes = ['gyro_x', 'gyro_y', 'gyro_z']

    # Gyroscope bias / stable value of every axis
    for axis in axes:
        stable_name = axis + '_stable'
        if stable_name not in result.columns:
            trip_mean = result.groupby('bookingID')[axis].mean()
            result = attach_per_trip(result, trip_mean, stable_name)

    # Calibrated readings: subtract the bias estimate
    for axis in axes:
        filtered_name = axis + '_filtered'
        if filtered_name not in result.columns:
            result[filtered_name] = result[axis] - result[axis + '_stable']

    # Magnitude of the calibrated rotation vector
    x_sq = result['gyro_x_filtered'] ** 2
    y_sq = result['gyro_y_filtered'] ** 2
    z_sq = result['gyro_z_filtered'] ** 2
    result['gyro_filtered_magnitude'] = np.sqrt(x_sq + y_sq + z_sq)

    # Spread of that magnitude over the whole trip
    trip_std = result.groupby('bookingID')['gyro_filtered_magnitude'].std()
    result = attach_per_trip(result, trip_std, 'gyro_filtered_std')

    return result

In [None]:
# gyro_data_enrich(train_dataset).head()

### Accelerometer Data

- Accelerometer readings depend on gravity
- [Phone orientation](https://www.digikey.com/en/articles/techzone/2011/may/using-an-accelerometer-for-inclination-sensing) could change over time, which changes the share of gravity acceleration measured on each axis

Could we?
- Handle accelerometer bias? The measured gravity is not always 9.8 m/s2. It depends on height and accelerometer bias. 
- Distinguish between vehicle movement and the user moving the phone?
- Normalize all data, assuming all phones have the same orientation?

In [None]:
def accel_data_enrich(dataset, smoothing: int=3):
    """Add gravity, magnitude and orientation features from accelerometer data.

    The rows are first ordered with ensure_sorted. Columns added:
    - acceleration_{x,y,z}_gravity: centred rolling mean of each axis within
      a bookingID, used as the estimate of the gravity component.
    - acceleration_magnitude: Euclidean norm of the raw reading.
    - acceleration_{x,y,z}_gravity_diff: raw reading minus gravity estimate.
    - acceleration_gravity_diff_magnitude: norm of those differences.
    - acceleration_std, acceleration_gravity_diff_std: per-bookingID standard
      deviation of the two magnitudes.
    - orientation_theta, orientation_psi, orientation_phi: inclination angles
      of the gravity estimate, in degrees.

    Parameters:
    ----------
    dataset -- DataFrame with bookingID, second and acceleration_{x,y,z}.
               Its row index must be unique (rows are re-joined by index).
    smoothing -- window size (number of records) of the rolling mean.

    Raises ValueError (from pandas) if the gravity columns already exist.
    Because of the merge, the returned frame has a fresh 0..n-1 index.
    """
    enriched_dataset = ensure_sorted(dataset.copy(deep=False))

    accel_cols = ['acceleration_x', 'acceleration_y', 'acceleration_z']
    gravity_cols = [col + '_gravity' for col in accel_cols]

    # Rolling mean of acceleration data to find gravity.
    # group_keys=False keeps the original row index on the result; without it
    # pandas >= 2.0 prepends bookingID to the index and the column-wise concat
    # below no longer lines up with enriched_dataset.
    rolling_mean_data = enriched_dataset.groupby('bookingID', group_keys=False)[accel_cols].apply(
        lambda trip: trip.rolling(window=smoothing, min_periods=1, center=True).mean())
    rolling_mean_data.columns = gravity_cols
    enriched_dataset = pd.concat([enriched_dataset, rolling_mean_data], axis=1, verify_integrity=True)

    # Acceleration magnitude
    enriched_dataset['acceleration_magnitude'] = np.sqrt(enriched_dataset['acceleration_x']**2 +
                                                         enriched_dataset['acceleration_y']**2 +
                                                         enriched_dataset['acceleration_z']**2)

    # Current acceleration vs gravity diff
    for col in accel_cols:
        enriched_dataset[col+'_gravity_diff'] = enriched_dataset[col] - enriched_dataset[col+'_gravity']
    enriched_dataset['acceleration_gravity_diff_magnitude'] = np.sqrt(
        enriched_dataset['acceleration_x_gravity_diff']**2 +
        enriched_dataset['acceleration_y_gravity_diff']**2 +
        enriched_dataset['acceleration_z_gravity_diff']**2)

    # Acceleration magnitude standard deviation.
    # Columns are selected with a list: the tuple form
    # groupby(...)['a', 'b'] is rejected by current pandas.
    agg_std = enriched_dataset.groupby('bookingID')[
        ['acceleration_magnitude', 'acceleration_gravity_diff_magnitude']
    ].std().reset_index()
    agg_std.columns = ['bookingID', 'acceleration_std', 'acceleration_gravity_diff_std']
    enriched_dataset = pd.merge(enriched_dataset, agg_std, how='left', on='bookingID', validate='m:1')

    # Phone orientation from the gravity estimate.
    # np.degrees converts radians with the factor 180/pi; the previous
    # "/ np.pi * 360" doubled every angle.
    gravity_x = enriched_dataset['acceleration_x_gravity']
    gravity_y = enriched_dataset['acceleration_y_gravity']
    gravity_z = enriched_dataset['acceleration_z_gravity']
    enriched_dataset['orientation_theta'] = np.degrees(
        np.arctan(gravity_x / np.sqrt(gravity_y**2 + gravity_z**2)))
    enriched_dataset['orientation_psi'] = np.degrees(
        np.arctan(gravity_y / np.sqrt(gravity_x**2 + gravity_z**2)))
    enriched_dataset['orientation_phi'] = np.degrees(
        np.arctan(np.sqrt(gravity_x**2 + gravity_y**2) / gravity_z))

    return enriched_dataset

In [None]:
# accel_data_enrich(train_dataset).head()

### Sequence Difference Data

In [None]:
def diff_data_enrich(dataset):
    """Add record-to-record difference features within each bookingID.

    The rows are first ordered with ensure_sorted. Columns added:
    - second_diff, Bearing_diff, Speed_diff: difference to the previous
      record of the same trip. Bearing_diff is wrapped to [-180, 180], so a
      change from 10 to 340 degrees is -30, not 330.
    - Bearing_dps, Speed_dps: the differences divided by second_diff.
    - Accuracy_sum: Accuracy of the record plus Accuracy of the previous
      record of the same trip (just the record's own value for the first one).

    Missing differences (first record of a trip) and non-finite rates
    (second_diff equal to zero) are set to 0.

    The row index of `dataset` must be unique, because the new columns are
    joined back by index.
    """
    enriched_dataset = ensure_sorted(dataset.copy(deep=False))

    # Construct diff. Columns are selected with a list: the tuple form
    # groupby(...)['a', 'b'] is rejected by current pandas.
    diff_data = enriched_dataset.groupby('bookingID')[['second', 'Bearing', 'Speed']].diff()
    diff_data = diff_data.add_suffix('_diff')

    # Wrap the bearing difference to -180..180. A full turn is 360 degrees,
    # so that is the amount to add/subtract (the previous code used 180).
    # Series.mask builds a new column instead of assigning through a chained
    # indexer, which pandas does not guarantee to write back.
    bearing_diff = diff_data['Bearing_diff']
    bearing_diff = bearing_diff.mask(bearing_diff > 180.0, bearing_diff - 360.0)
    bearing_diff = bearing_diff.mask(bearing_diff < -180.0, bearing_diff + 360.0)
    diff_data['Bearing_diff'] = bearing_diff

    # Difference / second (normalization)
    diff_data['Bearing_dps'] = diff_data['Bearing_diff'] / diff_data['second_diff']
    diff_data['Speed_dps'] = diff_data['Speed_diff'] / diff_data['second_diff']

    # Combine. A zero time delta yields +/-inf, which fillna does not touch,
    # so those are turned into NaN first and then zeroed like the rest.
    diff_data = diff_data.replace([np.inf, -np.inf], np.nan).fillna(0)
    enriched_dataset = pd.concat([enriched_dataset, diff_data], axis=1, verify_integrity=True)

    # Combine accuracy of two consecutive records. The grouped rolling result
    # is indexed by (bookingID, original row label); dropping the bookingID
    # level lets pandas align it by row label instead of relying on row order.
    acc_sum = enriched_dataset.groupby('bookingID')['Accuracy'] \
        .rolling(window=2, min_periods=1).sum()
    enriched_dataset['Accuracy_sum'] = acc_sum.reset_index(level=0, drop=True)

    return enriched_dataset

In [None]:
# diff_data_enrich(train_dataset)

In [None]:
def preprocess(dataset):
    """Run the full per-record feature pipeline and return the enriched frame.

    The steps run in a fixed order: gyroscope features, accelerometer
    features (rolling window of 5 records), then difference features.
    """
    with_gyro = gyro_data_enrich(dataset)
    with_accel = accel_data_enrich(with_gyro, smoothing=5)
    return diff_data_enrich(with_accel)

# Analysis

In [None]:
train_dataset_prep = preprocess(train_dataset)

## Charting

In [None]:
def chart_trip(dataset, booking_id):
    """Plot the measurements of one trip against time in three stacked panels.

    Panels, top to bottom: accelerometer axes and magnitude, raw gyroscope
    axes, and GPS Speed together with Accuracy.

    Parameters:
    ----------
    dataset -- preprocessed DataFrame; must contain bookingID, second, the
               acceleration_* columns incl. acceleration_magnitude, gyro_*,
               Speed and Accuracy.
    booking_id -- bookingID of the trip to draw.
    """
    booking_id_data = dataset.loc[dataset.bookingID==booking_id,:].sort_values(by='second')
    
    plt.figure(figsize=(15,10))
    plt.subplots_adjust(hspace = .001)
    
    # Acceleration
    booking_id_acc = booking_id_data[
        ['second','acceleration_x', 'acceleration_y','acceleration_z', 'acceleration_magnitude']
    ].melt(id_vars=["second"], var_name="axis", value_name="value")
    # Subplot positions are passed as integers (rows, columns, index): the
    # string form plt.subplot('311') is no longer accepted by matplotlib.
    ax1 = plt.subplot(3, 1, 1)
    
    plt.title("Measured data for booking ID: {}".format(booking_id))
    sns.lineplot(x="second", y="value", hue='axis', data=booking_id_acc, ax=ax1, marker="o");
    
    # Gyroscope
    booking_id_gyro = booking_id_data[
        ['second', 'gyro_x', 'gyro_y', 'gyro_z']
    ].melt(id_vars=["second"], var_name="axis", value_name="value")
    ax2 = plt.subplot(3, 1, 2)
    sns.lineplot(x="second", y="value", hue='axis', data=booking_id_gyro, ax=ax2, markers=True, marker="o");
    
    # Speed
    booking_id_speed = booking_id_data[
        ['second', 'Speed', 'Accuracy']
    ].melt(id_vars=["second"], var_name="type", value_name="value")
    ax3 = plt.subplot(3, 1, 3)
    sns.lineplot(x='second', y="value", hue='type', data=booking_id_speed, ax=ax3, markers=True, marker="o")

In [None]:
chart_trip(train_dataset_prep, 1477468749954)

### Sample of non-safe trip

In [None]:
# Draw 5 trips labelled unsafe (label == 1); fixed seed for a repeatable pick.
samp = train_label.bookingID[train_label.label == 1].sample(5, random_state=1)
# The loop variable is named booking_id so it does not shadow the builtin id().
for booking_id in samp:
    chart_trip(train_dataset_prep, booking_id)

### Sample of safe trip

In [None]:
# Draw 5 trips labelled safe (label == 0); fixed seed for a repeatable pick.
samp = train_label.bookingID[train_label.label == 0].sample(5, random_state=1)
# The loop variable is named booking_id so it does not shadow the builtin id().
for booking_id in samp:
    chart_trip(train_dataset_prep, booking_id)

## Correlation

In [None]:
analytics_features = ['gyro_filtered_magnitude',
                      'acceleration_magnitude',
                      'Speed',
                      'Bearing_dps',
                      'Speed_dps',
                      'second_diff',
                      'second',
                      'Accuracy_sum',
                      'acceleration_x', 
                      'acceleration_y', 
                      'acceleration_z',
                      'acceleration_x_gravity_diff',
                      'acceleration_y_gravity_diff',
                      'acceleration_z_gravity_diff',
                      'acceleration_gravity_diff_magnitude',
                      'gyro_x_filtered',
                      'gyro_y_filtered',
                      'gyro_z_filtered']

### Mean correlation

In [None]:
analytics_mean_corr = train_dataset_prep.groupby('bookingID')[analytics_features].mean().reset_index()
analytics_mean_corr = pd.merge(analytics_mean_corr, train_label, on='bookingID')
analytics_mean_corr = analytics_mean_corr.corr()

plt.figure( figsize=(15,5) )
ax1 = plt.subplot2grid((1, 4), (0, 0), colspan=2)
ax2 = plt.subplot2grid((1, 4), (0, 3), colspan=1)
sns.heatmap(analytics_mean_corr, ax=ax1)
sns.heatmap(pd.DataFrame(analytics_mean_corr.loc[analytics_mean_corr.index != 'label','label']), annot=True, ax=ax2);

This correlation shows that mean / average data is less effective for determining whether a trip is safe or unsafe. An unsafe event may be recorded only once, or during less than 5% of the trip. However, the acceleration and gyroscope magnitude data stand out here. In what percentage of trips does the driver consistently drive unsafely?

### Max correlation

In [None]:
analytics_max_corr = train_dataset_prep.groupby('bookingID')[analytics_features].max().reset_index()
analytics_max_corr = pd.merge(analytics_max_corr, train_label, on='bookingID')
analytics_max_corr = analytics_max_corr.corr()

plt.figure( figsize=(15,5) )
ax1 = plt.subplot2grid((1, 4), (0, 0), colspan=2)
ax2 = plt.subplot2grid((1, 4), (0, 3), colspan=1)
sns.heatmap(analytics_max_corr, ax=ax1)
sns.heatmap(pd.DataFrame(analytics_max_corr.loc[analytics_max_corr.index != 'label','label']), annot=True, ax=ax2);

### Standard deviation correlation

In [None]:
analytics_std_corr = train_dataset_prep.groupby('bookingID')[analytics_features].std().reset_index()
analytics_std_corr = pd.merge(analytics_std_corr, train_label, on='bookingID')
analytics_std_corr = analytics_std_corr.corr()

plt.figure( figsize=(15,5) )
ax1 = plt.subplot2grid((1, 4), (0, 0), colspan=2)
ax2 = plt.subplot2grid((1, 4), (0, 3), colspan=1)
sns.heatmap(analytics_std_corr, ax=ax1)
sns.heatmap(pd.DataFrame(analytics_std_corr.loc[analytics_std_corr.index != 'label','label']), annot=True, ax=ax2);

Gyro and acceleration standard deviations are highly correlated. The hypothesis is that when there is angular velocity (a gyroscope reading), the phone orientation is changing. Hence, the direction of gravity relative to the phone changes, and the reading on each accelerometer axis changes too. 

Because they are highly correlated, we will use just a subset of them:
- gyroscope filter magnitude's standard deviation
- acceleration gravity diff magnitude's standard deviation

## Others

### Why accelerometer and gyroscope magnitude have high correlation with label?

In [None]:
# Per-trip mean of the two magnitude features.
# The columns are selected with a list: the tuple form
# groupby(...)['a', 'b'] is rejected by current pandas.
acc_gyro_mean = train_dataset_prep.groupby('bookingID')[
    ['acceleration_gravity_diff_magnitude',
     'gyro_filtered_magnitude']
].mean().reset_index()

# Attach the label of each trip; both sides hold one row per bookingID.
acc_gyro_mean = pd.merge(acc_gyro_mean, train_label, on='bookingID', validate='1:1')

In [None]:
plt.figure( figsize=(15,6))

# Subplot positions are passed as integers (rows, columns, index): the
# string form plt.subplot('121') is no longer accepted by matplotlib.

# Left panel: 1000 sampled trips at full scale.
ax1 = plt.subplot(1, 2, 1)
sns.scatterplot(x="acceleration_gravity_diff_magnitude", 
                y="gyro_filtered_magnitude", 
                hue="label", 
                data=acc_gyro_mean.sample(1000, random_state=1),
                ax=ax1);

# Right panel: 500 sampled trips, zoomed in on the region near the origin.
ax2 = plt.subplot(1, 2, 2)
sns.scatterplot(x="acceleration_gravity_diff_magnitude", 
                y="gyro_filtered_magnitude", 
                hue="label", 
                data=acc_gyro_mean.sample(500, random_state=1),
                ax=ax2);
# The limits apply to the current axes, i.e. the right panel.
plt.xlim(0, 2);
plt.ylim(0, 0.5);

### Could we cluster the trips?

In [None]:
def aggregate_data(preprocessed_dataset):
    """Summarise a preprocessed per-record dataset into one row per bookingID.

    Returns a DataFrame with 'bookingID', a '<feature>_max' column for each
    feature in the max list, a '<feature>_std' column for the two magnitude
    features, and 'second_sequence_ratio' (largest 'second' divided by the
    largest 'sequence' of the trip).
    """
    max_features = ['gyro_filtered_magnitude',
                    'acceleration_magnitude',
                    'Speed',
                    'Bearing_dps',
                    'Speed_dps',
                    'Accuracy_sum',
                    'second',
                    'sequence',
                    'acceleration_x_gravity_diff',
                    'acceleration_y_gravity_diff',
                    'acceleration_z_gravity_diff',
                    'acceleration_gravity_diff_magnitude',
                    'gyro_x_filtered',
                    'gyro_y_filtered',
                    'gyro_z_filtered']
    std_features = ['gyro_filtered_magnitude', 'acceleration_gravity_diff_magnitude']

    per_trip = preprocessed_dataset.groupby('bookingID')

    # Largest value of every feature within the trip
    trip_max = per_trip[max_features].max().add_suffix('_max').reset_index()

    # Spread of the two magnitude features within the trip
    trip_std = per_trip[std_features].std().add_suffix('_std').reset_index()

    summary = trip_max.merge(trip_std, on='bookingID', validate='1:1')

    # Seconds covered per recorded row
    summary['second_sequence_ratio'] = summary['second_max'] / summary['sequence_max'].astype(float)
    return summary

In [None]:
train_agg_data = aggregate_data(train_dataset_prep)
train_agg_data = pd.merge(train_agg_data, train_label, on='bookingID', validate='1:1')

In [None]:
from sklearn import preprocessing
from sklearn.decomposition import PCA

In [None]:
features = train_agg_data.columns[train_agg_data.columns.str.contains("max|std")]

std_scaler = preprocessing.StandardScaler()
x = std_scaler.fit_transform(train_agg_data[features])

pca = PCA(n_components=2)
pc = pca.fit_transform(x)
pc_df = pd.DataFrame(data = pc, columns = ['pc1', 'pc2'])

In [None]:
plt.figure(figsize=(8, 8))
# Plot a 2000-trip subsample of the two principal components, coloured by label.
# random_state is fixed so the same subsample is drawn on every run, like the
# other sampled plots in this notebook.
sns.scatterplot(x="pc1", 
                y="pc2", 
                hue="label",
                data=pd.concat([train_agg_data, pc_df], axis=1, verify_integrity=True).sample(2000, random_state=1));

There are **no visually distinct clusters** after we reduce the features to 2 dimensions. 

# Modeling
Our goal is to build a model that is capable of finding the patterns of non-safe events such as:
- The driver uses unpopular shortcuts 
- The driver talks on the phone or with customers
- The driver keeps looking at the GPS and doesn't pay attention to the road 
- The driver is sleepy 
- Speeding
- Harsh acceleration, braking, or cornering
- Running over a speed bump / pothole at high speed

There are 3 rough ideas for approaching this problem:
1. Learning from the summary of a trip
2. Learning directly from the trip sequence with a recurrent model, similar to NLP sentiment classification with a recurrent neural network. 
3. Stacking two models: the first model detects non-safe events and the second model summarizes them per trip.  

## Evaluation

In [None]:
from sklearn.metrics import roc_auc_score, auc
from sklearn.metrics import roc_curve

def combine_pred_label(prediction_df, label_df):
    """Attach labels to predictions by 'bookingID'.

    Every row of prediction_df is kept (left join); the columns of label_df
    are added where the bookingID matches. Both frames must contain a
    'bookingID' column with unique values, otherwise pandas raises
    MergeError because of validate='1:1'.
    """
    # Merge with the label_df argument; the previous code ignored it and
    # always merged the global test_label.
    return pd.merge(prediction_df, label_df, how='left', on='bookingID', validate='1:1')

def plot_roc(prediction_df, label_df):
    """Draw the ROC curve of the predictions, with the AUC in the legend.

    Both DataFrames should have a 'bookingID' column; prediction_df provides
    'prediction' and the merged labels provide 'label'. The figure is shown
    with plt.show(); nothing is returned.
    """
    merged = combine_pred_label(prediction_df, label_df)

    # roc_curve also returns the thresholds, which the plot does not need.
    false_pos_rate, true_pos_rate, _ = roc_curve(merged.label, merged.prediction)
    area = auc(false_pos_rate, true_pos_rate)

    plt.title('Receiver Operating Characteristic')
    plt.plot(false_pos_rate, true_pos_rate, 'b', label='AUC = %0.3f' % area)
    plt.legend(loc='lower right')
    # Diagonal reference line from (0, 0) to (1, 1)
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()
    
def evaluate(prediction_df, label_df):
    """Return the ROC AUC of the predictions against the labels.

    Both DataFrames should have a 'bookingID' column; they are joined with
    combine_pred_label before scoring 'prediction' against 'label'.
    """
    merged = combine_pred_label(prediction_df, label_df)
    y_true, y_score = merged.label, merged.prediction
    return roc_auc_score(y_true, y_score)
                         
def generate_second_dataset(dataset, prediciton, n_column=5):
    """
    Generate per booking secondary dataset from per event unsafeness prediction.
    Return top n_column unsafeness values for each bookingID.
    
    Parameters:
    ----------
    dataset -- DataFrame contains bookingID
    prediciton -- List / Series with length equal to dataset # rows. Each indicates unsafeness. 
    n_column -- number of column generated
    
    Returns:
    -------
    DataFrame with 'bookingID' and 'val_1' .. 'val_<n_column>', where val_1 is
    the highest prediction of the booking. Bookings with fewer than n_column
    rows get 0 in the remaining columns.
    """
    sec_data = pd.DataFrame(data={'bookingID': dataset.bookingID, 'row_prob': prediciton})

    # Rank the predictions inside each booking, 1 = highest; ties are broken
    # by order of appearance so every rank is used at most once per booking.
    sec_data['rank'] = sec_data.groupby('bookingID')['row_prob'].rank(ascending=False, method='first')
    sec_data = sec_data.loc[sec_data['rank'] <= n_column, :].copy()
    # Ranks come back as floats; integer labels are needed for the reindex.
    sec_data['rank'] = sec_data['rank'].astype(int)

    wide = pd.pivot_table(data=sec_data,
                          values='row_prob',
                          index='bookingID',
                          columns='rank')

    # Force exactly n_column rank columns. Without this, a dataset in which no
    # booking has n_column rows yields fewer columns and the renaming fails,
    # and train/test frames could end up with different columns.
    wide = wide.reindex(columns=range(1, n_column + 1)).fillna(0)
    wide.columns = ['val_' + str(i) for i in range(1, n_column + 1)]
    return wide.reset_index()

## Baseline: Random

In [None]:
import random

prediction_df = pd.DataFrame({
    'bookingID': test_dataset.bookingID.unique(),
    'prediction': np.random.random(size=test_dataset.bookingID.unique().shape[0])
})

In [None]:
evaluate(prediction_df, test_label)
plot_roc(prediction_df, test_label)

## Random Forest - Aggregated Data

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
train_dataset_prep = preprocess(train_dataset)

In [None]:
train_agg_data = aggregate_data(train_dataset_prep)
train_agg_data = pd.merge(train_agg_data, train_label, on='bookingID', validate='1:1')

test_dataset_prep = preprocess(test_dataset)
test_agg_data = aggregate_data(test_dataset_prep)

features = train_agg_data.columns[train_agg_data.columns.str.contains("max|std|ratio")]

In [None]:
cls = RandomForestClassifier(n_estimators=100, random_state=0, min_samples_leaf=75)
cls.fit(train_agg_data[features], train_agg_data.label)
pred = cls.predict_proba(test_agg_data[features])
pred = pred[:,np.argwhere(cls.classes_==1)[0][0]]
prediction_df = pd.DataFrame(data={'bookingID':test_agg_data.bookingID, 'prediction': pred})
print('AUC:',evaluate(prediction_df, test_label))
plot_roc(prediction_df, test_label);

In [None]:
sns.distplot(pred);

In [None]:
from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score
temp = combine_pred_label(prediction_df=prediction_df, label_df=test_label)
temp['pred'] = (temp.prediction >= 0.66).astype(int)
pd.crosstab(temp.label, temp.pred)
recall_score(temp.label, temp.pred)
precision_score(temp.label, temp.pred)
f1_score(temp.label, temp.pred)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
train_dataset_prep = preprocess(train_dataset)
train_dataset_prep = pd.merge(train_dataset_prep, train_label, on='bookingID', validate='m:1')
test_dataset_prep = preprocess(test_dataset)
test_dataset_prep = pd.merge(test_dataset_prep, test_label, on='bookingID', validate='m:1')

In [None]:
features = ['Accuracy', 'Bearing', 'acceleration_x', 'acceleration_y', 'acceleration_z', 'gyro_x_filtered', 
            'gyro_y_filtered', 'gyro_z_filtered', 'Speed', 'gyro_filtered_magnitude', 'gyro_filtered_std',
            'acceleration_x_gravity', 'acceleration_y_gravity', 'acceleration_z_gravity', 'acceleration_magnitude',
            'orientation_theta', 'orientation_psi', 'orientation_phi', 'Bearing_dps', 'Speed_dps', 'Accuracy_sum']

# features = ['Accuracy', 'Bearing', 'acceleration_x', 'acceleration_y', 'acceleration_z', 'Speed']

In [None]:
cls = RandomForestClassifier(n_estimators=10, random_state=0, min_samples_leaf=500)
cls.fit(train_dataset_prep[features], train_dataset_prep.label)

In [None]:
# Comment this
y_pred = cls.predict_proba(test_dataset_prep[features])
y_pred = y_pred[:,np.argwhere(cls.classes_==1)[0][0]]
roc_auc_score(test_dataset_prep.label, y_pred)

### Stacking with Logistic Regression

In [None]:
train_pred_first = cls.predict_proba(train_dataset_prep[features])
train_pred_first = train_pred_first[:,np.argwhere(cls.classes_==1)[0][0]]

test_pred_first = cls.predict_proba(test_dataset_prep[features])
test_pred_first = test_pred_first[:,np.argwhere(cls.classes_==1)[0][0]]

In [None]:
from sklearn.linear_model import LogisticRegression

# Second stage, training: the top-5 row-level probabilities of every trip
# become the features of a logistic regression on the trip label.
train_sec = generate_second_dataset(train_dataset_prep, train_pred_first, n_column=5)
train_sec = pd.merge(train_sec, train_label, on='bookingID')
sec_features = train_sec.columns[train_sec.columns.str.contains('val')]
sec_reg = LogisticRegression(random_state=0, penalty='l2', C=0.1)
sec_reg.fit(train_sec[sec_features], train_sec.label)


# Second stage, prediction on the test trips.
test_sec = generate_second_dataset(test_dataset_prep, test_pred_first, n_column=5)
test_sec = pd.merge(test_sec, test_label, on='bookingID')
y_pred = sec_reg.predict_proba(test_sec[sec_features])
# Pick the probability column of class 1 from the model that produced
# y_pred (sec_reg), not from the first-stage classifier cls.
y_pred = y_pred[:,np.argwhere(sec_reg.classes_==1)[0][0]]
prediction_df = pd.DataFrame(data={'bookingID':test_sec.bookingID, 'prediction': y_pred})


evaluate(prediction_df, test_label)

In [None]:
plot_roc(prediction_df, test_label)

## LightGBM

In [None]:
import lightgbm as lgb

In [None]:
train_dataset_prep = preprocess(train_dataset)
train_dataset_prep = pd.merge(train_dataset_prep, train_label, on='bookingID', validate='m:1')
test_dataset_prep = preprocess(test_dataset)
test_dataset_prep = pd.merge(test_dataset_prep, test_label, on='bookingID', validate='m:1')

In [None]:
train_dataset_prep.columns

In [None]:
# features = ['Accuracy', 'acceleration_x', 'acceleration_y', 'acceleration_z', 'Speed', 'gyro_x_filtered', 
#             'gyro_y_filtered', 'gyro_z_filtered', 'gyro_filtered_magnitude', 'gyro_filtered_std',
#             'acceleration_x_gravity', 'acceleration_y_gravity', 'acceleration_z_gravity', 'acceleration_magnitude',
#             'acceleration_x_gravity_diff', 'acceleration_y_gravity_diff', 'acceleration_z_gravity_diff', 
#             'acceleration_z_gravity_diff', 'acceleration_gravity_diff_magnitude', 'acceleration_std',
#             'acceleration_gravity_diff_magnitude_std', 'Bearing_dps', 'Speed_dps', 'Accuracy_sum', 'second_diff']

features = ['Accuracy', 'gyro_x_filtered', 'gyro_y_filtered', 'gyro_z_filtered', 'Speed', 'gyro_filtered_magnitude', 
            'acceleration_magnitude', 'Bearing_dps', 'Speed_dps', 'Accuracy_sum', 
            'acceleration_x_gravity_diff', 'acceleration_y_gravity_diff', 'acceleration_z_gravity_diff', 
            'acceleration_gravity_diff_magnitude', 'orientation_theta', 
            'orientation_psi', 'orientation_phi', 'second'] + \
            ['acceleration_gravity_diff_std','acceleration_std','gyro_filtered_std']

# features = ['Accuracy', 'Bearing', 'acceleration_x', 'acceleration_y', 
#             'acceleration_z', 'gyro_x', 'gyro_y', 'gyro_z', 'Speed', 'second']

In [None]:
# from sklearn.model_selection import GridSearchCV
# import lightgbm as lgb

# gridParams = {
#     # Efficiency
#     "num_leaves": [70], # more for accuracy, beware overfitting, max to 2^(max_depth)
#     "min_data_in_leaf": [150],
#     "max_depth": [6], 
#     "n_estimators": [100],
#     # Speed
# #     "bagging_fraction": [1.0, 0.9],
# #     "bagging_freq": [5],
# #     "feature_fraction": [0.8, 0.9, 1.0],
# #     "subsample":[1.0, 0.8],
#     "max_bin": [125], # more bin for accuration,
#     # Accuracy
#     "boosting_type": ['dart'],
# #     # Overfit
#     "lambda_l1": [0.4],
#     # Others
#     "learning_rate": [0.05, 0.1, 0.15]
# }

# cslf = lgb.LGBMClassifier(boosting_type='gbdt', 
#                           objective='binary', 
#                           max_depth=6, 
#                           learning_rate=0.1,
#                           n_estimators=100,
#                           num_leaves=70,  
#                           metric='auc',
#                           random_state=10)

# grid_search = GridSearchCV(cslf, gridParams, n_jobs=4, verbose=2, return_train_score=False)
# grid_search.fit(train_dataset_prep[features], train_dataset_prep.label)

# best_parameters = grid_search.best_estimator_.get_params()
# best_parameters

# cv_result = pd.DataFrame(grid_search.cv_results_)
# cv_result = cv_result.loc[:,('params', 'rank_test_score', 'mean_test_score')]
# with pd.option_context('display.max_colwidth', -1):
#     cv_result.sort_values('rank_test_score')

In [None]:
# Row-level LightGBM classifier (first stage).
# The number of boosting rounds is passed as n_estimators; the previous
# spelling n_estimator is not a LightGBM parameter name, so the setting was
# not applied.
cls = lgb.LGBMClassifier(boosting_type='dart', 
                         objective='binary', 
                         max_depth=6, 
                         n_estimators=100,
                         learning_rate=0.1, 
                         max_bin=100, 
                         num_leaves=70, 
                         lambda_l1=0.4,
                         min_data_in_leaf=150,
                         metric='auc')

cls.fit(train_dataset_prep[features], train_dataset_prep.label)

In [None]:
# Comment this
y_pred = cls.predict_proba(test_dataset_prep[features])
y_pred = y_pred[:,np.argwhere(cls.classes_==1)[0][0]]
roc_auc_score(test_dataset_prep.label, y_pred)

### Stacking with Logistic Regression

In [None]:
train_pred_first = cls.predict_proba(train_dataset_prep[features])
train_pred_first = train_pred_first[:,np.argwhere(cls.classes_==1)[0][0]]

test_pred_first = cls.predict_proba(test_dataset_prep[features])
test_pred_first = test_pred_first[:,np.argwhere(cls.classes_==1)[0][0]]

In [None]:
from sklearn.linear_model import LogisticRegression

# Second stage, training: the top-5 row-level probabilities of every trip
# become the features of a logistic regression on the trip label.
train_sec = generate_second_dataset(train_dataset_prep, train_pred_first, n_column=5)
train_sec = pd.merge(train_sec, train_label, on='bookingID')
sec_features = train_sec.columns[train_sec.columns.str.contains('val')]
sec_reg = LogisticRegression(random_state=0, penalty='l2', C=0.1)
sec_reg.fit(train_sec[sec_features], train_sec.label)

# Second stage, prediction on the test trips.
test_sec = generate_second_dataset(test_dataset_prep, test_pred_first, n_column=5)
test_sec = pd.merge(test_sec, test_label, on='bookingID')
y_pred = sec_reg.predict_proba(test_sec[sec_features])
# Pick the probability column of class 1 from the model that produced
# y_pred (sec_reg), not from the first-stage classifier cls.
y_pred = y_pred[:,np.argwhere(sec_reg.classes_==1)[0][0]]
prediction_df = pd.DataFrame(data={'bookingID':test_sec.bookingID, 'prediction': y_pred})

evaluate(prediction_df, test_label)

In [None]:
plot_roc(prediction_df, test_label)

In [None]:
sns.distplot(prediction_df.prediction)

In [None]:
from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score
temp = combine_pred_label(prediction_df=prediction_df, label_df=test_label)
temp['pred'] = (temp.prediction >= 0.6).astype(int)
pd.crosstab(temp.label, temp.pred)
recall_score(temp.label, temp.pred)
precision_score(temp.label, temp.pred)
f1_score(temp.label, temp.pred)