## EY Datawave Challenge Code

**Simple rule**: 
- all "df_xx" types are pd.DataFrame
- "xx_data" are usually NumPy arrays

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import normalize

pd.set_option('display.max_columns', None)

# Read the training Data

df is training data + label

In [2]:
#read training data
df = pd.read_csv("/Users/synch/Desktop/EY/data_train.csv", low_memory=False) #nrows = integer
df = df.loc[:,'hash':'y_exit']
df.fillna('', inplace=True)

# Read the test Data

df_test is test data + label

In [3]:
#read test data
df_test = pd.read_csv("/Users/synch/Desktop/EY/data_test.csv", low_memory=False)
df_test = df_test.loc[:,'hash':'y_exit']
df_test.fillna('', inplace=True)

# Pred and Eval

First, change the time variables into pd.timedelta.

Then, divide the df_test into 2 subparts:

1. df_test_eval: Used for evaluating test accuracy (where x_exit value exists)
2. df_test_pred: the data we want to predict from (where x_exit value do not exist)

In [4]:
#divide the test data into pred and eval
df_test['time_entry']=pd.to_timedelta(df_test['time_entry'])
df_test['time_exit']=pd.to_timedelta(df_test['time_exit'])

df_test_pred = df_test[df_test['x_exit'] == ""]
df_test_eval = df_test[df_test['x_exit'] != ""]

In [5]:
#debugging
print(df_test_pred)
print(df_test_eval)

                                       hash  \
4       00032f51796fd5437b238e3a9823d13d_31   
7       000479418b5561ab694a2870cc04fd43_25   
10      000506a39775e5bca661ac80e3f466eb_29   
13       0005401ceddaf27a9b7f0d42ef1fbe95_1   
17       00063a4f6c12e1e4de7d876580620667_3   
23       0006535be25bb52dd06983447880c964_5   
27      0006f84bb33ec929d1cda7686f861d0a_31   
31      00093ae562586aed0e053b8431e8ace4_23   
34      000c739e444a70e1804d757a0580caaa_31   
39      000d479078af08618bddc7f09082b8c3_11   
45      000d6fba12a7d06defe90e3c9162c11e_29   
56       000efd6f95db39d6ba35027f02b5628f_9   
63      000feace3a33be9245e2783cdff467a8_15   
66       0011b48d0cf4f962833edd701afaf419_3   
69      001484cf0f956c316070be4b4c6352ff_29   
72       001d29060cfe35996cfeb141210ecbeb_9   
77       001dff40eab7eea1c10d0fadcea0c779_9   
79      001f9345a2979c553125c65499630f25_29   
87      002447d99e8f6c3bc6c1329140979a91_25   
94       002531dea45cd4059e92099630ebf775_3   
100     00265

# Prepare the Training Data

choose the features: 

*for rn, I cannot handle velocity since there are NaN.*

Change the time values into float, by dividing into minutes.

Finally, store train_data as NumPy arrays, and normalize them.

In [6]:
#prepare training data
df_train = df.loc[:,['time_entry','time_exit','x_entry', 'y_entry']]
# df_train = df.loc[:,'time_entry':'y_entry']
df_train['time_entry']=pd.to_timedelta(df_train['time_entry'])/pd.offsets.Minute(1)
df_train['time_exit']=pd.to_timedelta(df_train['time_exit'])/pd.offsets.Minute(1)
train_data = df_train.values
train_data = normalize(train_data)

# prepare Train Labels

prepare the label for training:

x_exit and y_exit values have to be within certain range. Do each of the comparison and store the value as 0 or 1 in train_label NumPy array.

In [7]:
#prepare training label
df_target = df.loc[:,['x_exit','y_exit']]

target_x1 = df_target['x_exit']>=3750901.5068
target_x2 = df_target['x_exit']<=3770901.5068

target_x = target_x1 & target_x2

target_y1 = df_target['y_exit'] >= -19268905.6133
target_y2 = df_target['y_exit'] <= -19208905.6133

target_y = target_y1 & target_y2

train_label = 1*(target_x & target_y)

train_label = train_label.values

# define Keras NN model

binary softmax, but categorical_crossentropy loss. *can improve loss, optimizer, layer*

In [8]:
#define model
model = keras.Sequential([
    keras.layers.Flatten(),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(2, activation='softmax')
])

#complile the model
model.compile(optimizer='adagrad',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

#fit the model
model.fit(train_data, train_label, epochs=1)

Epoch 1/1


<tensorflow.python.keras.callbacks.History at 0x14dfd5cf8>

# Prepare the Test Data

choose the features: 

*for rn, I cannot handle velocity since there are NaN.*

Change the time values into float, by dividing into minutes.

Finally, store test_data_pred and test_data_eval as NumPy arrays, and normalize them.

In [9]:
#prepare test data **for rn, we don't care about velocity since there are so many nan

test_data_pred = df_test_pred.loc[:,['time_entry','time_exit','x_entry', 'y_entry']]
test_data_eval = df_test_eval.loc[:,['time_entry','time_exit','x_entry', 'y_entry']]
# test_data = df_test.loc[:,'time_entry':'y_entry']


test_data_pred['time_entry']=test_data_pred['time_entry']/pd.offsets.Minute(1)
test_data_pred['time_exit']=test_data_pred['time_exit']/pd.offsets.Minute(1)

test_data_eval['time_entry']=test_data_eval['time_entry']/pd.offsets.Minute(1)
test_data_eval['time_exit']=test_data_eval['time_exit']/pd.offsets.Minute(1)

test_data_pred = test_data_pred.values
test_data_pred = normalize(test_data_pred)

test_data_eval = test_data_eval.values
test_data_eval = normalize(test_data_eval)

# prepare Test Labels

prepare the label for test:

x_exit and y_exit values from df_test_eval (not pred). 0 or 1 in train_label NumPy array.

In [10]:
#prepare test label
df_test_target = df_test_eval.loc[:,['x_exit','y_exit']]

test_target_x1 = df_test_target['x_exit'].astype(float) >= 3750901.5068
test_target_x2 = df_test_target['x_exit'].astype(float) <= 3770901.5068

test_target_x = test_target_x1 & test_target_x2

test_target_y1 = df_test_target['y_exit'].astype(float) >= -19268905.6133
test_target_y2 = df_test_target['y_exit'].astype(float) <= -19208905.6133

test_target_y = test_target_y1 & test_target_y2

test_label = 1*(test_target_x & test_target_y)

test_label = test_label.values

# Evaluation of the Model

print the summary and test accuracy

In [11]:
#evaluate the accuracy of the model
model.summary()

test_loss, test_acc = model.evaluate(test_data_eval, test_label)

print('Test accuracy:', test_acc)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            multiple                  0         
_________________________________________________________________
dense (Dense)                multiple                  320       
_________________________________________________________________
dense_1 (Dense)              multiple                  1040      
_________________________________________________________________
dense_2 (Dense)              multiple                  34        
Total params: 1,394
Trainable params: 1,394
Non-trainable params: 0
_________________________________________________________________
Test accuracy: 0.5451122050272055


# Predict the Data

Predict the test_data_pred and if the p(xi) is over 0.5, save it as 1, otherwise 0. Predictions is the NumPy array saving the result. Formulate pd.DataFrame from df_testPred['trajectory_id'] and predictions ('target') so that the output DataFrame is in ['id', 'target'] format.

In [12]:
#predict and handle output
predictions = model.predict(test_data_pred)
predictions = (predictions >= 0.5) *1

id = pd.DataFrame(df_test_pred['trajectory_id'])

target = pd.DataFrame(predictions)
target.columns = ['zeros','target']
target = target['target']

output = pd.concat([id.reset_index(drop=True),target.reset_index(drop=True)], axis=1)
output.columns = ['id', 'target']
output.to_csv("/Users/synch/Desktop/EY/try_1.csv", index=False)

In [14]:
#debugging
print("test_data_pred", test_data_pred.shape)
print("predictions", predictions.shape)
print("df_test_pred", df_test_pred.shape)
print("id", id.shape)
print("target", target.shape)
print("output", output.shape)

print(output)

test_data_pred (33515, 4)
predictions (33515, 2)
df_test_pred (33515, 11)
id (33515, 1)
target (33515,)
output (33515, 2)
                                                id  target
0       traj_00032f51796fd5437b238e3a9823d13d_31_5       1
1      traj_000479418b5561ab694a2870cc04fd43_25_10       0
2       traj_000506a39775e5bca661ac80e3f466eb_29_5       0
3        traj_0005401ceddaf27a9b7f0d42ef1fbe95_1_4       0
4        traj_00063a4f6c12e1e4de7d876580620667_3_4       1
5       traj_0006535be25bb52dd06983447880c964_5_12       1
6       traj_0006f84bb33ec929d1cda7686f861d0a_31_3       0
7      traj_00093ae562586aed0e053b8431e8ace4_23_10       1
8       traj_000c739e444a70e1804d757a0580caaa_31_3       1
9       traj_000d479078af08618bddc7f09082b8c3_11_6       1
10     traj_000d6fba12a7d06defe90e3c9162c11e_29_11       0
11      traj_000efd6f95db39d6ba35027f02b5628f_9_20       0
12     traj_000feace3a33be9245e2783cdff467a8_15_12       1
13       traj_0011b48d0cf4f962833edd701afaf419_3_7  