# Starter with Neural Networks
There is almost no feature engineering here: I only split the datetime column into 6 columns, one-hot encoded 'passenger_count', extracted the order ID from 'key', and used two features from the baseline kernel. The model is flawed and not tuned at all; its only purpose was to make sure that the loss goes down no matter what, hence dropout + L2 + BN. I almost purposefully made a bunch of mistakes in the hope that somebody publicly corrects them.

Despite all that, I achieved 3.95 MSE with 10M samples and 3.83 MSE with all data. There is plenty of work ahead, though.

In [2]:
# Initial Python environment setup...
import numpy as np # linear algebra
import pandas as pd # CSV file I/O (e.g. pd.read_csv)
import os # reading the input files we have access to

from tqdm import tqdm

from sklearn.preprocessing import StandardScaler
from keras_tqdm import TQDMNotebookCallback

Using TensorFlow backend.


In [3]:
#features from basic linear model kernel
def add_travel_vector_features(df):
    """Add absolute pickup-to-dropoff coordinate differences to ``df`` in place.

    Two columns are created (or overwritten): ``abs_diff_longitude`` and
    ``abs_diff_latitude``, each the absolute value of the dropoff coordinate
    minus the pickup coordinate. The frame is modified in place and nothing
    is returned.
    """
    for axis in ('longitude', 'latitude'):
        delta = df['dropoff_' + axis] - df['pickup_' + axis]
        df['abs_diff_' + axis] = delta.abs()

# Loading and preprocessing data in its entirety
I managed to load and preprocess the whole dataset with pandas, but it took ~20 minutes. Again, I'm uploading it so that somebody can show how to do it properly with, say, Dask.

In [4]:
# Read the training CSV in chunks, clean and featurize each chunk, and collect
# the results in `dfs` for later concatenation.
filename = 'data/nyc-taxi/train.csv'
chunksize = 10 ** 6

# Reference year for `year_after_0`. A fixed constant keeps the feature
# identical in every chunk; a per-chunk minimum would silently shift it for
# any chunk that happens to lack the earliest year.
BASE_YEAR = 2009
# Fixed passenger-count categories so that every chunk yields the same six
# `pass_1` .. `pass_6` columns, even when a chunk is missing one of the values
# (otherwise the concatenated frame would contain NaN-filled dummy columns).
PASSENGER_DTYPE = pd.api.types.CategoricalDtype(categories=list(range(1, 7)))

dfs = []
for chunk in tqdm(pd.read_csv(filename, chunksize=chunksize)):
    # --- cleaning ---
    add_travel_vector_features(chunk)
    chunk = chunk.dropna(how='any', axis='rows')
    keep = (
        (chunk.abs_diff_longitude < 5.0)
        & (chunk.abs_diff_latitude < 5.0)
        & (chunk.passenger_count > 0)
        & (chunk.passenger_count <= 6)
    )
    # Explicit copy: the columns added below must be written to an
    # independent frame, not to a filtered slice (avoids
    # SettingWithCopyWarning and ambiguous view/copy semantics).
    chunk = chunk.loc[keep].copy()
    if chunk.empty:
        # Nothing survived the filters; the string splits below would fail
        # on an empty frame.
        continue

    # --- feature extraction ---
    # 'pickup_datetime' is whitespace-separated: "<date> <time> <timezone>".
    stamp = chunk['pickup_datetime'].str.split(expand=True)
    chunk[['year', 'month', 'day']] = (
        stamp[0].str.split('-', expand=True).astype('int64'))
    chunk[['hour', 'minute', 'second']] = (
        stamp[1].str.split(':', expand=True).astype('int64'))
    chunk['year_after_0'] = chunk['year'] - BASE_YEAR
    # The order number is the part of 'key' after the '.'.
    chunk['order_no'] = (
        chunk['key'].str.split('.', expand=True)[1].astype('int64'))
    # Casting the Series (not a bare Categorical) keeps the chunk's index, so
    # the dummies align row-for-row in the concat below.
    passenger_dummies = pd.get_dummies(
        chunk['passenger_count'].astype(PASSENGER_DTYPE),
        prefix='pass', dtype='uint8')
    chunk = pd.concat([chunk, passenger_dummies], axis=1)
    chunk = chunk.drop(['pickup_datetime', 'key', 'passenger_count'], axis=1)
    # append chunk to the list
    dfs.append(chunk)

56it [13:45, 14.74s/it]


In [5]:
%%time
# concatenate all preprocessed chunks into one large DataFrame
train_df = pd.concat(dfs)

Wall time: 10.1 s


In [6]:
# delete the list of chunks to release memory (only 16 GB RAM available);
# the concatenated data remains available as train_df
del dfs

In [7]:
# preview the first rows of the preprocessed training data
train_df.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,abs_diff_longitude,abs_diff_latitude,year,month,day,...,minute,second,year_after_0,order_no,pass_1,pass_2,pass_3,pass_4,pass_5,pass_6
0,4.5,-73.844311,40.721319,-73.84161,40.712278,0.002701,0.009041,2009,6,15,...,26,21,0,1,1,0,0,0,0,0
1,16.9,-74.016048,40.711303,-73.979268,40.782004,0.03678,0.070701,2010,1,5,...,52,16,1,2,1,0,0,0,0,0
2,5.7,-73.982738,40.76127,-73.991242,40.750562,0.008504,0.010708,2011,8,18,...,35,0,2,49,0,1,0,0,0,0
3,7.7,-73.98713,40.733143,-73.991567,40.758092,0.004437,0.024949,2012,4,21,...,30,42,3,1,1,0,0,0,0,0
4,5.3,-73.968095,40.768008,-73.956655,40.783762,0.01144,0.015754,2010,3,9,...,51,0,1,135,1,0,0,0,0,0


In [8]:
# (rows, columns) of the preprocessed training data
train_df.shape

(55115115, 21)

In [9]:
# column dtypes and total memory footprint of the training frame
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55115115 entries, 0 to 55423855
Data columns (total 21 columns):
fare_amount           float64
pickup_longitude      float64
pickup_latitude       float64
dropoff_longitude     float64
dropoff_latitude      float64
abs_diff_longitude    float64
abs_diff_latitude     float64
year                  int64
month                 int64
day                   int64
hour                  int64
minute                int64
second                int64
year_after_0          int64
order_no              int64
pass_1                uint8
pass_2                uint8
pass_3                uint8
pass_4                uint8
pass_5                uint8
pass_6                uint8
dtypes: float64(7), int64(8), uint8(6)
memory usage: 6.9 GB


In [10]:
# separate the inputs (every column except the fare) from the regression
# target (the 'fare_amount' column)
X_train = train_df.drop(['fare_amount'],axis=1)
Y_train = train_df['fare_amount']

In [11]:
# the combined frame is no longer needed once X_train / Y_train exist
del train_df

In [12]:
# one scaler for the input features and a separate one for the target, so the
# target scaling can be undone independently at prediction time
scaler = StandardScaler()
y_scaler = StandardScaler()

In [13]:
# scale the data so that columns have zero mean and unit variance;
# both scalers are fitted here, on the training data
train = scaler.fit_transform(X_train.values)
# the target is reshaped into a single column before being passed to the scaler
y_train =  y_scaler.fit_transform(Y_train.values.reshape(-1,1))

In [14]:
# the unscaled pandas objects are not used again; training works on the
# scaled arrays `train` / `y_train`
del X_train
del Y_train

In [15]:
import keras
import tensorflow as tf

In [16]:
#some imports are unnecessary
from keras import layers
from keras.layers import Input, Dropout,Dense, Activation, BatchNormalization
from keras.models import Model, load_model
from keras.initializers import glorot_uniform
from keras.callbacks import ModelCheckpoint,  ReduceLROnPlateau
from keras.regularizers import l2
from keras.optimizers import Adam

# Model

In [17]:
# Fully connected regression network: four hidden stages of
# Dense(1024, Glorot-uniform init, L2 1e-2) -> BatchNormalization -> leaky ReLU.
# The first three stages are each followed by Dropout(0.5); the fourth feeds
# straight into the single-unit output layer.
stack = []
for stage in range(4):
    stack.append(
        keras.layers.Dense(
            1024,
            kernel_initializer=glorot_uniform(),
            kernel_regularizer=l2(1e-2),
        )
    )
    stack.append(keras.layers.BatchNormalization())
    stack.append(keras.layers.Activation(tf.nn.leaky_relu))
    if stage < 3:
        stack.append(keras.layers.Dropout(0.5))

# NOTE(review): the output unit also uses leaky ReLU rather than a linear
# activation; it is kept as-is so previously saved weights keep their meaning.
stack.append(keras.layers.Dense(1, activation=tf.nn.leaky_relu))

model = keras.Sequential(stack)

In [18]:
# Adam optimizer constructed with 5e-4, minimizing mean squared error
# (computed on the standardized target that the model is trained on)
model.compile(optimizer=Adam(5e-4), 
              loss='mean_squared_error')

# Callbacks

In [19]:
# Make sure the checkpoint directory exists before training starts. Not every
# Keras version creates it on its own, and a failed save would only surface
# after the first full epoch has already been spent.
os.makedirs('./model_weights', exist_ok=True)

# The file name embeds the epoch number and validation loss of each saved model.
filepath = './model_weights/weights-improvement-55M-{epoch:02d}-{val_loss:.4f}.hdf5'
# Save only when the monitored validation loss improves; the monitor is spelled
# out so it visibly matches the quantity used by the LR schedule below.
best_callback = ModelCheckpoint(filepath,
                                monitor='val_loss',
                                save_best_only=True)
# Multiply the learning rate by 0.2 after 5 epochs without val_loss improvement.
lr_sched = ReduceLROnPlateau(monitor='val_loss', factor = 0.2, patience = 5, verbose = 1)
# Notebook progress bars with 5-decimal metric formatting.
tqdm_callback = TQDMNotebookCallback(leave_inner=True,metric_format="{name}: {value:0.5f}")

# Training

In [None]:
# Train for 20 epochs with batches of 2048 rows.
# verbose=0 turns off Keras' own console output; progress is reported through
# tqdm_callback instead.
# validation_split=0.0002 holds out 0.02% of the rows for validation, which
# provides the val_loss used by the checkpoint and learning-rate callbacks.
history = model.fit(train, y_train, 
          epochs=20,
          verbose=0,
          batch_size=2048,
          validation_split=0.0002,
          callbacks=[tqdm_callback,best_callback, lr_sched])

# Load best result

In [None]:
# NOTE: this file name encodes the epoch (19) and val_loss (0.0471) of one
# particular training run; after retraining it must be changed to a checkpoint
# that actually exists in ./model_weights.
model.load_weights('./model_weights/weights-improvement-55M-19-0.0471.hdf5')

# Load and preprocess test data

In [None]:
# NOTE(review): the training file was read from 'data/nyc-taxi/'; confirm that
# 'test.csv' really lives in the working directory.
test_df = pd.read_csv('test.csv')
# show the column types of the raw test data
test_df.dtypes

In [None]:
# Keep the original keys; they are required for the submission file and the
# 'key' column is dropped from the feature frame below.
key = test_df.key
add_travel_vector_features(test_df)

# Reference year for `year_after_0`. It must be the training base year (the
# training preview shows year 2009 <-> year_after_0 == 0), not the minimum of
# whatever years happen to occur in the test file.
BASE_YEAR = 2009
# Fixed passenger-count categories: always produce exactly `pass_1` .. `pass_6`
# so the column layout matches what `scaler` was fitted on, even if the test
# file lacks one of the counts or contains a count outside 1..6 (such rows get
# all-zero dummies instead of adding an extra column).
PASSENGER_DTYPE = pd.api.types.CategoricalDtype(categories=list(range(1, 7)))

# 'pickup_datetime' is whitespace-separated: "<date> <time> <timezone>".
stamp = test_df['pickup_datetime'].str.split(expand=True)
test_df[['year', 'month', 'day']] = (
    stamp[0].str.split('-', expand=True).astype('int64'))
test_df[['hour', 'minute', 'second']] = (
    stamp[1].str.split(':', expand=True).astype('int64'))
test_df['year_after_0'] = test_df['year'] - BASE_YEAR
# The order number is the part of 'key' after the '.'.
test_df['order_no'] = (
    test_df['key'].str.split('.', expand=True)[1].astype('int64'))
# Casting the Series (not a bare Categorical) keeps the index aligned for the
# concat below.
passenger_dummies = pd.get_dummies(
    test_df['passenger_count'].astype(PASSENGER_DTYPE),
    prefix='pass', dtype='uint8')
test_df = pd.concat([test_df, passenger_dummies], axis=1)
test_df = test_df.drop(['pickup_datetime', 'key', 'passenger_count'], axis=1)
# Sanity check: the column count should equal the number of training features.
test_df.shape

# Inference and submission

In [None]:
# Apply the feature scaling fitted on the training data to the test features.
test = scaler.transform(test_df.values)
# Predict in the standardized target space, then undo the target scaling and
# flatten the result to one value per test row.
y_test = y_scaler.inverse_transform(model.predict(test)).ravel()

# Assemble the submission (key first, then the predicted fare) and write it
# to a CSV file which we can submit to the competition.
submission = pd.DataFrame({'key': key, 'fare_amount': y_test})
submission = submission[['key', 'fare_amount']]
submission.to_csv('submission_100.csv', index=False)

# List the working directory so the written file can be seen.
print(os.listdir('.'))

# What's next
1. Extract better features.
2. Choose a better architecture.
3. Tune the hyperparameters.
4. Forget all that and resort to XGBoost and ensembling.