## EY Datawave Challenge Code

**Simple rule**: 
- all "df_xx" types are pd.DataFrame
- "xx_data" are usually NumPy arrays

# What is in this Version:

Before, we have been predicting by considering all the trajectories separately. Yet, this approach may be misleading in that our goal is to predict **each person's position between 15:00 ~ 16:00 PM, not other time period.** So, it may be better to group trajectories of the same person into one row.

What I used for feature here:

1. Total time elapsed
2. distance from park center (last point)
3. within the park center (last trajectories' entry)
4. within the park center (overall trajectories' exit)
5. Average Velocity
6. Average Bearing (maybe deviation angle from the straight line from starting point to park center?)
7. velocity of last trajectory


In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import normalize
from tensorflow.keras import backend as K
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
import math

pd.set_option('display.max_columns', None)

# Cool point got from here

https://machinelearningmastery.com/display-deep-learning-model-training-history-in-keras/

In [2]:
# # fix random seed for reproducibility
# seed = 7
# np.random.seed(seed)

# Create a Callback

In [3]:
class MyCallback(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):
        if logs.get('acc') >= 0.9:
            print("Reached 90% acc so cancelling training!")
            self.model.stop_training = True

reach_90acc = MyCallback()

# Metric for F1

https://medium.com/@thongonary/how-to-compute-f1-score-for-each-epoch-in-keras-a1acd17715a2

In [4]:
class F1(keras.callbacks.Callback):
    def __init__(self, val_data):
        super().__init__()
        self.validation_data = val_data
        
    def on_train_begin(self, logs={}):
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs={}):
        val_predict = (np.asarray(self.model.predict(self.validation_data[0]))).round()
        val_targ = self.validation_data[1]
        _val_f1 = f1_score(val_targ, val_predict)
        _val_recall = recall_score(val_targ, val_predict)
        _val_precision = precision_score(val_targ, val_predict)
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print ("— val_f1: %f — val_precision: %f — val_recall %f" %(_val_f1, _val_precision, _val_recall))
        return

# Read the Data

df is training data + label

In [5]:
#read training data
raw_train = pd.read_csv("/Users/Godwithus/Desktop/EY/data_train.csv", low_memory=False) #nrows = integer
raw_train = raw_train.loc[:,'hash':'y_exit']
raw_train.fillna('', inplace=True)

#read test data
raw_test = pd.read_csv("/Users/Godwithus/Desktop/EY/data_test.csv", low_memory=False)
raw_test = raw_test.loc[:,'hash':'y_exit']
raw_test.fillna('', inplace=True)

In [8]:
#time to seconds
df_train = raw_train
df_train['time_entry_seconds'] = pd.to_timedelta(df_train['time_entry']).dt.total_seconds()
df_train['time_exit_seconds']=pd.to_timedelta(df_train['time_exit']).dt.total_seconds()

df_test = raw_test
df_test['time_entry_seconds'] = pd.to_timedelta(df_test['time_entry']).dt.total_seconds()
df_test['time_exit_seconds']=pd.to_timedelta(df_test['time_exit']).dt.total_seconds()

## Some look at the data

In [7]:
#debugging

# print (df_train.info())
# print (df_test.info())

df_train.describe()
df_train.head()

Unnamed: 0,hash,trajectory_id,time_entry,time_exit,vmax,vmin,vmean,x_entry,y_entry,x_exit,y_exit
0,0000a8602cf2def930488dee7cdad104_1,traj_0000a8602cf2def930488dee7cdad104_1_0,07:04:31,07:08:32,,,,3751014.0,-19093980.0,3750326.0,-19136340.0
1,0000a8602cf2def930488dee7cdad104_1,traj_0000a8602cf2def930488dee7cdad104_1_1,07:20:34,07:25:42,,,,3743937.0,-19322470.0,3744975.0,-19319660.0
2,0000a8602cf2def930488dee7cdad104_1,traj_0000a8602cf2def930488dee7cdad104_1_2,07:53:32,08:03:25,,,,3744868.0,-19293560.0,3744816.0,-19292840.0
3,0000a8602cf2def930488dee7cdad104_1,traj_0000a8602cf2def930488dee7cdad104_1_3,08:17:50,08:37:23,,,,3744880.0,-19292290.0,3744809.0,-19290490.0
4,0000a8602cf2def930488dee7cdad104_1,traj_0000a8602cf2def930488dee7cdad104_1_4,14:38:09,14:38:09,,,,3744909.0,-19285580.0,3744909.0,-19285580.0


## Group the Dataset

In particular, for each person, store the last trajectories' data in separate dataframe for later use.

In [9]:
#for train data

last_traj_train = df_train.groupby('hash').last()

# count_one_traj = df_train.groupby('hash').count()
# count_one_traj = count_one_traj[count_one_traj['trajectory_id']==1]
# one_traj = last_traj_train.loc[count_one_traj.index]

df_train = df_train.merge(last_traj_train, how='left', indicator=True)
df_train = df_train[(df_train['_merge'] == 'left_only')]

# one_traj['hash'] = one_traj.index
# one_traj.reset_index(drop = True, inplace=True)
# df_train = df_train.append(one_traj, sort=False)


In [10]:
#for test data

last_traj_test = df_test.groupby('hash').last()

# count_one_traj = df_test.groupby('hash').count()
# count_one_traj = count_one_traj[count_one_traj['trajectory_id']==1]
# one_traj = last_traj_test.loc[count_one_traj.index]

df_test = df_test.merge(last_traj_test, how='left', indicator=True)
df_test = df_test[(df_test['_merge'] == 'left_only')]

# one_traj['hash'] = one_traj.index
# one_traj.reset_index(drop = True, inplace=True)
# df_test = df_test.append(one_traj, sort=False)


# Prepare the Training Data

choose the features: 

Change the time values into float (total seconds)

Finally, store train_data as NumPy arrays, and normalize them.

Features:
1. Total time elapsed
2. within the park center (last trajectories' entry)
3. distance from park center (last point)
4. Total distance traveled  
5. Average Velocity
6. Average Bearing (maybe deviation angle from the straight line from starting point to park center?)
7. velocity of last trajectory



# Bearing Calculation Function

https://gist.github.com/jeromer/2005586

In [11]:
def calculate_initial_compass_bearing(pointA, pointB):
    """
    Calculates the bearing between two points.
    The formulae used is the following:
        θ = atan2(sin(Δlong).cos(lat2),
                  cos(lat1).sin(lat2) − sin(lat1).cos(lat2).cos(Δlong))
    :Parameters:
      - `pointA: The tuple representing the latitude/longitude for the
        first point. Latitude and longitude must be in decimal degrees
      - `pointB: The tuple representing the latitude/longitude for the
        second point. Latitude and longitude must be in decimal degrees
    :Returns:
      The bearing in degrees
    :Returns Type:
      float
    """
    if (type(pointA) != tuple) or (type(pointB) != tuple):
        raise TypeError("Only tuples are supported as arguments")

    lat1 = math.radians(pointA[0])
    lat2 = math.radians(pointB[0])

    diffLong = math.radians(pointB[1] - pointA[1])

    x = math.sin(diffLong) * math.cos(lat2)
    y = math.cos(lat1) * math.sin(lat2) - (math.sin(lat1)
            * math.cos(lat2) * math.cos(diffLong))

    initial_bearing = math.atan2(x, y)

    # Now we have the initial bearing but math.atan2 return values
    # from -180° to + 180° which is not what we want for a compass bearing
    # The solution is to normalize the initial bearing as shown below
    initial_bearing = math.degrees(initial_bearing)
    compass_bearing = (initial_bearing + 360) % 360

    return compass_bearing

In [12]:
# 0. Prepare required stats in each trajectory (seconds)

aggregation = {
    'time_entry_seconds': {'first'},
    'time_exit_seconds': {'last'},
    'x_entry' : {'first'},
    'y_entry' : {'first'},
    'x_exit' : {'last'},
    'y_exit' : {'last'}
}

df_train_traj = df_train.groupby('hash').agg(aggregation)

df_train_traj.columns = ['time_entry','time_exit','x_entry','y_entry','x_exit','y_exit']

df_train_traj.describe()

print(len(df_train.groupby('hash').count()))

132753


In [13]:
# 1. total time elapsed (seconds)

df_train_traj['total_time'] = df_train_traj['time_exit'] - df_train_traj['time_entry']

# 2. prepare whether entry point of last trajectory is in cityhall

x_in_city = (last_traj_train['x_entry'] >=3750901.5068) & (last_traj_train['x_entry']<=3770901.5068)
y_in_city = (last_traj_train['y_entry'] >= -19268905.6133) & (last_traj_train['y_entry'] <= -19208905.6133)

last_traj_train['entry_inside'] = 1*(x_in_city & y_in_city)

# 3.0 time stayed in last trajectory

last_traj_train['total_time']=last_traj_train['time_exit_seconds']-last_traj_train['time_entry_seconds']

# 3. the distance from the entry point of last trajectory from the city hall's mid point

last_traj_train['distance_from_center'] = ((3760901.5068 - last_traj_train['x_entry']).pow(2) + \
                        (-19238905.6133 - last_traj_train['y_entry']).pow(2)).pow(1/2)

# 4. total distance traveled

df_train_traj['total_travel'] = ((df_train_traj['x_exit'] - df_train_traj['x_entry']).pow(2) + \
                                 (df_train_traj['y_exit'] - df_train_traj['y_entry']).pow(2)).pow(1/2)

# distance from city hall boundaries

last_traj_train.loc[(last_traj_train['x_entry'] >=3750901.5068) & (last_traj_train['x_entry']<=3770901.5068) & (last_traj_train['y_entry'] >= -19268905.6133) & (last_traj_train['y_entry'] <= -19208905.6133), 'distance_2'] = 0
last_traj_train.loc[(last_traj_train['x_entry'] <3750901.5068) & (last_traj_train['y_entry'] >= -19268905.6133) & (last_traj_train['y_entry'] <= -19208905.6133), 'distance_2'] = 3750901.5068 - last_traj_train['x_entry']
last_traj_train.loc[(last_traj_train['x_entry']>3770901.5068) & (last_traj_train['y_entry'] >= -19268905.6133) & (last_traj_train['y_entry'] <= -19208905.6133), 'distance_2'] = last_traj_train['x_entry'] - 3770901.5068
last_traj_train.loc[(last_traj_train['x_entry'] >=3750901.5068) & (last_traj_train['x_entry']<=3770901.5068) & (last_traj_train['y_entry'] < -19268905.6133), 'distance_2'] = -19268905.6133 - last_traj_train['y_entry']
last_traj_train.loc[(last_traj_train['x_entry'] >=3750901.5068) & (last_traj_train['x_entry']<=3770901.5068) & (last_traj_train['y_entry'] > -19208905.6133), 'distance_2'] = last_traj_train['y_entry'] + 19208905.6133
last_traj_train.loc[(last_traj_train['x_entry']>3770901.5068) & (last_traj_train['y_entry'] > -19208905.6133), 'distance_2'] = ((3770901.5068 - last_traj_train['x_entry']).pow(2) + (-19208905.6133 - last_traj_train['y_entry']).pow(2)).pow(1/2)
last_traj_train.loc[(last_traj_train['x_entry'] <3750901.5068) & (last_traj_train['y_entry'] > -19208905.6133), 'distance_2'] = ((3750901.5068 - last_traj_train['x_entry']).pow(2) + (-19208905.6133 - last_traj_train['y_entry']).pow(2)).pow(1/2)
last_traj_train.loc[(last_traj_train['x_entry']>3770901.5068) & (last_traj_train['y_entry'] < -19268905.6133), 'distance_2'] = ((3770901.5068 - last_traj_train['x_entry']).pow(2) + (-19268905.6133 - last_traj_train['y_entry']).pow(2)).pow(1/2)
last_traj_train.loc[(last_traj_train['x_entry'] <3750901.5068) & (last_traj_train['y_entry'] < -19268905.6133), 'distance_2'] = ((3750901.5068 - last_traj_train['x_entry']).pow(2) + (-19268905.6133 - last_traj_train['y_entry']).pow(2)).pow(1/2)


# 5. Avg. Velocity

df_train_traj['Avg_velocity'] = df_train_traj['total_travel'] / df_train_traj['total_time']

# 6. Avg. Bearing

a = []
for i in range(len(df_train_traj['x_entry'].values)):
    a.append(calculate_initial_compass_bearing((df_train_traj['x_entry'].values[i], df_train_traj['y_entry'].values[i]) , \
                                 (df_train_traj['x_exit'].values[i],  df_train_traj['y_exit'].values[i])))

bearing = np.array(a)


df_bearing = pd.DataFrame(bearing, columns = ['bearing'])

df_bearing.index = df_train_traj.index

df_train_traj = df_train_traj.merge(df_bearing, left_index=True, right_index=True)

df_train_traj.head()


Unnamed: 0_level_0,time_entry,time_exit,x_entry,y_entry,x_exit,y_exit,total_time,total_travel,Avg_velocity,bearing
hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0000a8602cf2def930488dee7cdad104_1,25471.0,52689.0,3751014.0,-19093980.0,3744909.0,-19285580.0,27218.0,191701.58092,7.043191,80.402028
0000cf177130469eeac79f67b6bcf3df_9,52163.0,53414.0,3749450.0,-19265060.0,3749042.0,-19266320.0,1251.0,1333.516162,1.06596,12.281473
0001f97b99a80f18f62e2d44e54ef33d_3,41826.0,42728.0,3771461.0,-19104130.0,3757004.0,-19296980.0,902.0,193384.920615,214.395699,74.334681
0002124248b0ca510dea42824723ccac_31,35768.0,53941.0,3765544.0,-19172270.0,3768391.0,-19202110.0,18173.0,29978.550018,1.64962,179.197578
000219c2a6380c307e8bffd85b5e404b_23,4.0,41934.0,3760336.0,-19228180.0,3763808.0,-19269950.0,41930.0,41914.389647,0.999628,192.881739


In [14]:
# 7. merge with last traj

last_data_train = last_traj_train.loc[:,['entry_inside', 'distance_from_center', 'total_time','distance_2']]

df_train_traj = df_train_traj.merge(last_data_train, on='hash', how = 'outer')

df_train_traj.fillna(0, inplace=True) #fill nan

# real total time
df_train_traj['total_time'] = df_train_traj['total_time_x'] + df_train_traj['total_time_y'] 

#random shuffle
df_train_traj = df_train_traj.sample(frac=1).reset_index()

df_train_traj

Unnamed: 0,hash,time_entry,time_exit,x_entry,y_entry,x_exit,y_exit,total_time_x,total_travel,Avg_velocity,bearing,entry_inside,distance_from_center,total_time_y,distance_2,total_time
0,8dda4dc276b11e72f8e28b6dce9b8e79_3,14434.0,46874.0,3.753419e+06,-1.927770e+07,3.749213e+06,-1.924692e+07,32440.0,31064.189542,0.957589,182.481105,1,26215.083262,0.0,0.000000,32440.0
1,718ce9a4d66c39ccc28a4603159912e4_9,8703.0,45041.0,3.746154e+06,-1.935850e+07,3.772502e+06,-1.914695e+07,36338.0,213176.354075,5.866486,339.339489,0,111702.563303,0.0,81174.654562,36338.0
2,ff82872d200d7b9932fbb0fdf3ba4d3c_23,1132.0,47416.0,3.760519e+06,-1.925016e+07,3.758048e+06,-1.924816e+07,46284.0,3177.122327,0.068644,209.078055,1,8294.254212,0.0,0.000000,46284.0
3,5ddbd0d2fa1ef46dbdf4a29c84b4abe8_23,31052.0,51046.0,3.767944e+06,-1.920914e+07,3.744752e+06,-1.928528e+07,19994.0,79590.303541,3.980709,185.292385,0,46055.451777,379.0,14506.981309,20373.0
4,59d5015441d59aaceb8af9d12a075e16_29,22802.0,41230.0,3.768098e+06,-1.907634e+07,3.768090e+06,-1.907599e+07,18428.0,355.474044,0.019290,207.043646,0,162098.962024,0.0,131930.566484,18428.0
5,3e9f6b3ac63770ce11723fa137262bca_5,26944.0,51395.0,3.770773e+06,-1.921195e+07,3.770848e+06,-1.920559e+07,24451.0,6361.082887,0.260156,90.183946,1,24185.030045,770.0,0.000000,25221.0
6,35392661f5a740c098d20c8c7a27f455_15,27590.0,53784.0,3.761854e+06,-1.911751e+07,3.762164e+06,-1.911349e+07,26194.0,4034.617577,0.154028,240.566514,0,116869.335144,540.0,86865.720569,26734.0
7,b2be3c6953367fd89f1e10572db5687c_23,49153.0,50689.0,3.743263e+06,-1.924741e+07,3.746516e+06,-1.920734e+07,1536.0,40203.245633,26.173988,100.220443,0,53677.463461,2099.0,22455.218504,3635.0
8,803c0f8a3e645368b811470a2ae58e91_11,35580.0,53177.0,3.761660e+06,-1.923918e+07,3.746382e+06,-1.928918e+07,17597.0,52282.491100,2.971103,228.311345,0,124515.109972,329.0,94010.862499,17926.0
9,028111171207f15192358be9396a80f0_15,42784.0,50457.0,3.742736e+06,-1.935613e+07,3.755559e+06,-1.923976e+07,7673.0,117074.524186,15.257986,128.870509,0,118622.105300,0.0,87604.324905,7673.0


In [35]:
#make a numpy array

train_data=df_train_traj.loc[:,['total_time_x','total_time_y','entry_inside','distance_from_center',
                                'total_travel','Avg_velocity','bearing','distance_2']].values #'total_travel','Avg_velocity','bearing',



normalized_col = normalize(train_data[:,[0,1,3,4,5,6,7]], axis = 0)

train_data = np.concatenate((train_data[:,[2]],normalized_col), axis = 1)

df_train_data = pd.DataFrame(train_data)

df_train_data.columns = ['entry_inside','total_time_x','total_time_y','distance_from_center',
                                'total_travel','Avg_velocity','bearing','distance_2']

Unnamed: 0,entry_inside,total_time_x,total_time_y,distance_from_center,total_travel,Avg_velocity,bearing,distance_2
0,1.0,0.003416,0.000000,0.000911,1.145428e-03,1.127355e-04,0.002434,0.000000
1,0.0,0.003826,0.000000,0.003884,7.860437e-03,6.906521e-04,0.004526,0.003923
2,1.0,0.004873,0.000000,0.000288,1.171498e-04,8.081359e-06,0.002789,0.000000
3,0.0,0.002105,0.001271,0.001601,2.934728e-03,4.686427e-04,0.002472,0.000701
4,0.0,0.001940,0.000000,0.005636,1.310737e-05,2.270968e-06,0.002762,0.006376
5,1.0,0.002575,0.002583,0.000841,2.345518e-04,3.062780e-05,0.001203,0.000000
6,0.0,0.002758,0.001811,0.004063,1.487682e-04,1.813351e-05,0.003209,0.004198
7,0.0,0.000162,0.007040,0.001866,1.482412e-03,3.081423e-03,0.001337,0.001085
8,0.0,0.001853,0.001103,0.004329,1.927809e-03,3.497832e-04,0.003045,0.004543
9,0.0,0.000808,0.000000,0.004124,4.316881e-03,1.796299e-03,0.001719,0.004234


# prepare Train Labels

prepare the label for training:

x_exit and y_exit values have to be within certain range. Do each of the comparison and store the value as 0 or 1 in train_label NumPy array.

In [36]:
#prepare training label

target_x = (last_traj_train['x_exit']>=3750901.5068) & (last_traj_train['x_exit']<=3770901.5068)
target_y = (last_traj_train['y_exit']>=-19268905.6133) & (last_traj_train['y_exit']<=-19208905.6133)

train_label = 1*(target_x & target_y)
df_train_data['train_label'] = train_label.values


train_label = train_label.values

# train_label = to_categorical(train_label)
df_train_data

Unnamed: 0,entry_inside,total_time_x,total_time_y,distance_from_center,total_travel,Avg_velocity,bearing,distance_2,train_label
0,1.0,0.003416,0.000000,0.000911,1.145428e-03,1.127355e-04,0.002434,0.000000,0
1,0.0,0.003826,0.000000,0.003884,7.860437e-03,6.906521e-04,0.004526,0.003923,0
2,1.0,0.004873,0.000000,0.000288,1.171498e-04,8.081359e-06,0.002789,0.000000,0
3,0.0,0.002105,0.001271,0.001601,2.934728e-03,4.686427e-04,0.002472,0.000701,0
4,0.0,0.001940,0.000000,0.005636,1.310737e-05,2.270968e-06,0.002762,0.006376,0
5,1.0,0.002575,0.002583,0.000841,2.345518e-04,3.062780e-05,0.001203,0.000000,0
6,0.0,0.002758,0.001811,0.004063,1.487682e-04,1.813351e-05,0.003209,0.004198,0
7,0.0,0.000162,0.007040,0.001866,1.482412e-03,3.081423e-03,0.001337,0.001085,1
8,0.0,0.001853,0.001103,0.004329,1.927809e-03,3.497832e-04,0.003045,0.004543,0
9,0.0,0.000808,0.000000,0.004124,4.316881e-03,1.796299e-03,0.001719,0.004234,0


# Custom F1 loss function

In [37]:
def f1_loss(y_true, y_pred):
    
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    tn = K.sum(K.cast((1-y_true)*(1-y_pred), 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2*p*r / (p+r+K.epsilon())
    f1 = tf.where(tf.is_nan(f1), tf.zeros_like(f1), f1)
    return 1 - K.mean(f1)

In [38]:
print(len(train_data), len(train_label))

134063 134063


# define Keras NN model

binary softmax, but categorical_crossentropy loss. *can improve loss, optimizer, layer*

In [61]:
# train_data = train_data.reshape(134063, 8,1)
#define model
model = keras.Sequential([
    keras.layers.Flatten(),
    keras.layers.Dense(100, input_shape = (134063,8,1)),
#     keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

#complile the model
model.compile(optimizer='Adam',
              loss='binary_crossentropy',
              metrics=['accuracy']) 

#fit the model
# f1 = F1((test_data_eval, test_label))
history = model.fit(train_data, train_label, epochs=10, \
                     callbacks=[reach_90acc]) #, validation_data=(test_data_eval, test_label)

Epoch 1/10

KeyboardInterrupt: 

In [40]:
pred = model.predict(train_data)

pred

array([[0.25976482],
       [0.25976482],
       [0.25976482],
       ...,
       [0.25976482],
       [0.25976482],
       [0.25976482]], dtype=float32)

# Prepare the Test Data

choose the features: 

*velocty for pred data is weired

Change the time values into float, by dividing into minutes.

Finally, store test_data_pred and test_data_eval as NumPy arrays, and normalize them.

In [20]:
# 0. Prepare required stats in each trajectory (seconds)

aggregation = {
    'time_entry_seconds': {'first'},
    'time_exit_seconds': {'last'},
    'x_entry' : {'first'},
    'y_entry' : {'first'},
    'x_exit' : {'last'},
    'y_exit' : {'last'}
}

df_test_traj = df_test.groupby('hash').agg(aggregation)

df_test_traj.columns = ['time_entry','time_exit','x_entry','y_entry','x_exit','y_exit']

df_test_traj.head()

Unnamed: 0_level_0,time_entry,time_exit,x_entry,y_entry,x_exit,y_exit
hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00032f51796fd5437b238e3a9823d13d_31,42197.0,49393.0,3773413.0,-19098280.0,3773131.0,-19144650.0
000479418b5561ab694a2870cc04fd43_25,29303.0,44004.0,3771380.0,-19332740.0,3769983.0,-19342650.0
000506a39775e5bca661ac80e3f466eb_29,31505.0,40142.0,3760880.0,-19100420.0,3755349.0,-19161350.0
0005401ceddaf27a9b7f0d42ef1fbe95_1,33003.0,34659.0,3751328.0,-19162360.0,3751349.0,-19162840.0
00063a4f6c12e1e4de7d876580620667_3,31718.0,52502.0,3747364.0,-19278460.0,3766296.0,-19170290.0


In [21]:
# 1. total time elapsed (seconds)

df_test_traj['total_time'] = df_test_traj['time_exit'] - df_test_traj['time_entry']

# 2. prepare whether entry point of last trajectory is in cityhall

x_in_city = (last_traj_test['x_entry'] >=3750901.5068) & (last_traj_test['x_entry']<=3770901.5068)
y_in_city = (last_traj_test['y_entry'] >= -19268905.6133) & (last_traj_test['y_entry'] <= -19208905.6133)

last_traj_test['entry_inside'] = 1*(x_in_city & y_in_city)

# 3. the distance from the entry point of last trajectory from the city hall's mid point

last_traj_test['distance_from_center'] = ((3760901.5068 - last_traj_test['x_entry']).pow(2) + \
                        (-19238905.6133 - last_traj_test['y_entry']).pow(2)).pow(1/2)

# 3.0 time stayed in last trajectory

last_traj_test['total_time']=last_traj_test['time_exit_seconds']-last_traj_train['time_entry_seconds']

# 4. total distance traveled

df_test_traj['total_travel'] = ((df_test_traj['x_exit'] - df_test_traj['x_entry']).pow(2) + \
                                 (df_test_traj['y_exit'] - df_test_traj['y_entry']).pow(2)).pow(1/2)

# 5. Avg. Velocity

df_test_traj['Avg_velocity'] = df_test_traj['total_travel'] / df_test_traj['total_time']

df_test_traj.head()



Unnamed: 0_level_0,time_entry,time_exit,x_entry,y_entry,x_exit,y_exit,total_time,total_travel,Avg_velocity
hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
00032f51796fd5437b238e3a9823d13d_31,42197.0,49393.0,3773413.0,-19098280.0,3773131.0,-19144650.0,7196.0,46372.36472,6.444186
000479418b5561ab694a2870cc04fd43_25,29303.0,44004.0,3771380.0,-19332740.0,3769983.0,-19342650.0,14701.0,10012.111189,0.68105
000506a39775e5bca661ac80e3f466eb_29,31505.0,40142.0,3760880.0,-19100420.0,3755349.0,-19161350.0,8637.0,61184.374019,7.083984
0005401ceddaf27a9b7f0d42ef1fbe95_1,33003.0,34659.0,3751328.0,-19162360.0,3751349.0,-19162840.0,1656.0,484.498442,0.292572
00063a4f6c12e1e4de7d876580620667_3,31718.0,52502.0,3747364.0,-19278460.0,3766296.0,-19170290.0,20784.0,109821.492536,5.283944


In [22]:
# 6. Avg. Bearing

a = []
for i in range(len(df_test_traj['x_entry'].values)):
    a.append(calculate_initial_compass_bearing((df_test_traj['x_entry'].values[i], df_test_traj['y_entry'].values[i]) , \
                                 (df_test_traj['x_exit'].values[i],  df_test_traj['y_exit'].values[i])))

bearing = np.array(a)


df_bearing = pd.DataFrame(bearing, columns = ['bearing'])

df_bearing.index = df_test_traj.index

df_test_traj = df_test_traj.merge(df_bearing, left_index=True, right_index=True)

df_test_traj.head()



Unnamed: 0_level_0,time_entry,time_exit,x_entry,y_entry,x_exit,y_exit,total_time,total_travel,Avg_velocity,bearing
hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
00032f51796fd5437b238e3a9823d13d_31,42197.0,49393.0,3773413.0,-19098280.0,3773131.0,-19144650.0,7196.0,46372.36472,6.444186,61.231058
000479418b5561ab694a2870cc04fd43_25,29303.0,44004.0,3771380.0,-19332740.0,3769983.0,-19342650.0,14701.0,10012.111189,0.68105,6.510434
000506a39775e5bca661ac80e3f466eb_29,31505.0,40142.0,3760880.0,-19100420.0,3755349.0,-19161350.0,8637.0,61184.374019,7.083984,94.14922
0005401ceddaf27a9b7f0d42ef1fbe95_1,33003.0,34659.0,3751328.0,-19162360.0,3751349.0,-19162840.0,1656.0,484.498442,0.292572,134.438556
00063a4f6c12e1e4de7d876580620667_3,31718.0,52502.0,3747364.0,-19278460.0,3766296.0,-19170290.0,20784.0,109821.492536,5.283944,2.462068


In [23]:
# 7. merge with last traj

last_data_test = last_traj_test.loc[:,['entry_inside', 'distance_from_center', 'total_time']]

df_test_traj = df_test_traj.merge(last_data_test, on='hash', how = 'outer')

In [24]:
#make Numpy
df_test_traj.fillna(0, inplace=True)

df_test_traj['total_time'] = df_test_traj['total_time_x'] + df_test_traj['total_time_y']
test_data=df_test_traj.loc[:,['total_time','entry_inside','distance_from_center',
                                'total_travel','Avg_velocity','bearing']].values


test_data = normalize(test_data)

df_test_traj.describe()

Unnamed: 0,time_entry,time_exit,x_entry,y_entry,x_exit,y_exit,total_time_x,total_travel,Avg_velocity,bearing,entry_inside,distance_from_center,total_time_y,total_time
count,33515.0,33515.0,33515.0,33515.0,33515.0,33515.0,33515.0,33515.0,33515.0,33515.0,33515.0,33515.0,33515.0,33515.0
mean,25494.049083,46740.311711,3725760.0,-19042570.0,3726023.0,-19045670.0,21246.262629,46109.033046,6.599191,173.601493,0.284768,62611.400029,0.0,21246.262629
std,14468.322996,9505.955544,358365.0,1833042.0,358376.0,1832814.0,14801.562616,57767.432463,26.110612,110.308711,0.451311,47876.450102,0.0,14801.562616
min,0.0,0.0,0.0,-19376740.0,0.0,-19376750.0,0.0,0.0,0.0,0.0,0.0,108.826792,0.0,0.0
25%,15144.0,44759.5,3753112.0,-19287910.0,3754830.0,-19274890.0,8755.5,1851.770575,0.119105,79.070229,0.0,21366.826758,0.0,8755.5
50%,26899.0,50378.0,3759903.0,-19227200.0,3760143.0,-19229710.0,20100.0,21490.481422,1.196971,177.167462,0.0,51459.78668,0.0,20100.0
75%,35493.0,52741.0,3767997.0,-19144150.0,3767635.0,-19168610.0,30352.0,71953.648115,4.401597,265.647896,1.0,96292.817298,0.0,30352.0
max,53992.0,53999.0,3776975.0,0.0,3776983.0,0.0,53909.0,318307.314835,2792.186704,359.99915,1.0,191843.315907,0.0,53909.0


# Evaluation of the Model

print the summary and test accuracy

In [25]:
#test_data_eval = test_data_eval.reshape(169422, 7, 1)

#evaluate the accuracy of the model
model.summary()

test_loss, test_acc = model.evaluate(test_data_eval, test_label)

print('Test accuracy:', test_acc)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 8)                 0         
_________________________________________________________________
dense (Dense)                (None, 100)               900       
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 11,101
Trainable params: 11,101
Non-trainable params: 0
_________________________________________________________________


NameError: name 'test_data_eval' is not defined

# Plot the model's Learning Curve

https://machinelearningmastery.com/display-deep-learning-model-training-history-in-keras/

In [None]:
# summarize history for acc
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

# summarize history for f1, recall, precision
plt.plot(f1.val_f1s)
plt.plot(f1.val_recalls)
plt.plot(f1.val_precisions)
plt.title('f1')
plt.ylabel('f1')
plt.xlabel('epoch')
plt.legend(['f1', 'recall','precision'], loc='upper left')
plt.show()

# Predict the Data

Predict the test_data_pred and if the p(xi) is over 0.5, save it as 1, otherwise 0. Predictions is the NumPy array saving the result. Formulate pd.DataFrame from df_testPred['trajectory_id'] and predictions ('target') so that the output DataFrame is in ['id', 'target'] format.

In [None]:
#test_data_pred = test_data_pred.reshape(33515, 7, 1)

#predict and handle output
predictions = model.predict(test_data_pred)

print(predictions)
predictions = (predictions >= 0.5) *1

id = pd.DataFrame(df_test_pred['trajectory_id'])

target = pd.DataFrame(predictions)
# target.columns = ['zeros','target']
# target = target['target']
target.columns = ['target']

output = pd.concat([id.reset_index(drop=True),target.reset_index(drop=True)], axis=1)
output.columns = ['id', 'target']
output.to_csv("/Users/Godwithus/Desktop/EY/try_1.csv", index=False)

In [None]:
#debugging
print("test_data_pred", test_data_pred.shape)
print("predictions", predictions.shape)
print("df_test_pred", df_test_pred.shape)
print("id", id.shape)
print("target", target.shape)
print("output", output.shape)

print(output)

print(target.sum()/np.size(target, 0))