In [1]:
%matplotlib inline

import numpy as np
import pandas as pd

import torch 
import torch.autograd as autograd 
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from pathlib import Path

In [2]:
PATH = Path("data")

In [3]:
list(PATH.iterdir())

[PosixPath('data/plot.tok.gt9.5000'),
 PosixPath('data/subjdata.README.1.0'),
 PosixPath('data/quote.tok.gt9.5000'),
 PosixPath('data/train.csv')]

# Load the data
Download the data from
https://www.kaggle.com/c/nyc-taxi-trip-duration/data
or from Canvas.
Then change `PATH` so that it points to the directory where you saved the files.

In [4]:
path = PATH/"train.csv"
!head $path

id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982154846191406,40.767936706542969,-73.964630126953125,40.765602111816406,N,455
id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415344238281,40.738563537597656,-73.999481201171875,40.731151580810547,N,663
id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979026794433594,40.763938903808594,-74.005332946777344,40.710086822509766,N,2124
id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.010040283203125,40.719970703125,-74.01226806640625,40.706718444824219,N,429
id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973052978515625,40.793209075927734,-73.972923278808594,40.782520294189453,N,435
id0801584,2,2016-01-30 22:01:40,2016-01-30 22:09:03,6,-73.982856750488281,40.742195129394531,-73.992080688476562,40.749183654785156,N,443
id181

Data Description
* id - a unique identifier for each trip
* vendor_id - a code indicating the provider associated with the trip record
* pickup_datetime - date and time when the meter was engaged
* dropoff_datetime - date and time when the meter was disengaged
* passenger_count - the number of passengers in the vehicle (driver entered value)
* pickup_longitude - the longitude where the meter was engaged
* pickup_latitude - the latitude where the meter was engaged
* dropoff_longitude - the longitude where the meter was disengaged
* dropoff_latitude - the latitude where the meter was disengaged
* store_and_fwd_flag - This flag indicates whether the trip record was held in vehicle memory before sending to the vendor because the vehicle did not have a connection to the server (Y=store and forward; N=not a store and forward trip)
* trip_duration - duration of the trip in seconds

In [5]:
data = pd.read_csv(PATH/"train.csv", sep=',')
data.shape

(1458644, 11)

In [6]:
# Keep a 20% random sample of the rows, drawn without replacement;
# random_state fixes which rows are drawn so the sample is reproducible.
# NOTE: this rebinds `data` to a sample of itself, so re-running this cell on
# its own samples the sample again -- re-run the loading cell first.
data = data.sample(frac=0.2, replace=False, random_state=1)
data.shape

(291729, 11)

In [7]:
data.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
1457636,id0880738,2,2016-02-27 20:13:05,2016-02-27 20:24:37,1,-73.981728,40.7495,-73.945915,40.792061,N,692
615369,id2002545,2,2016-06-04 09:54:05,2016-06-04 10:10:35,1,-73.979088,40.771606,-73.946518,40.822655,N,990
491096,id0289724,2,2016-05-06 17:40:05,2016-05-06 17:50:52,1,-73.9897,40.738651,-73.997772,40.754051,N,647
82632,id3767649,2,2016-05-30 19:20:26,2016-05-30 19:25:04,1,-73.988441,40.723267,-73.99588,40.716717,N,278
71403,id2530846,2,2016-01-19 14:19:22,2016-01-19 14:34:03,2,-73.985733,40.752598,-73.969231,40.755997,N,881


## Transforming Y

In [8]:
# Bound the target to the range [60 s, 3600 s] (trip_duration is in seconds),
# then store its natural logarithm as the regression target.
data["trip_duration"] = data["trip_duration"].clip(lower=60, upper=60 * 60)
data["log_trip_duration"] = np.log(data["trip_duration"].values)

In [9]:
def get_time_features(df):
    """Return a copy of ``df`` with weekday/hour features for pickup and dropoff.

    The ``pickup_datetime`` and ``dropoff_datetime`` columns are parsed with
    ``pd.to_datetime`` and, for each of them, two integer columns are added:
    ``<prefix>_weekday`` (Monday=0 ... Sunday=6) and ``<prefix>_hour`` (0-23).

    Args:
        df: DataFrame with ``pickup_datetime`` and ``dropoff_datetime`` columns
            (strings or datetimes).

    Returns:
        A new DataFrame; the frame passed in is left untouched, so calling this
        function repeatedly on the same input always gives the same result.
    """
    # Work on a copy so the caller's frame is not silently modified.
    out = df.copy()
    for prefix in ("pickup", "dropoff"):
        stamps = pd.to_datetime(out[prefix + "_datetime"])
        out[prefix + "_datetime"] = stamps
        out[prefix + "_weekday"] = stamps.dt.weekday
        out[prefix + "_hour"] = stamps.dt.hour
    return out

In [10]:
data = get_time_features(data)

In [11]:
data.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,log_trip_duration,pickup_weekday,pickup_hour,dropoff_weekday,dropoff_hour
1457636,id0880738,2,2016-02-27 20:13:05,2016-02-27 20:24:37,1,-73.981728,40.7495,-73.945915,40.792061,N,692,6.539586,5,20,5,20
615369,id2002545,2,2016-06-04 09:54:05,2016-06-04 10:10:35,1,-73.979088,40.771606,-73.946518,40.822655,N,990,6.897705,5,9,5,10
491096,id0289724,2,2016-05-06 17:40:05,2016-05-06 17:50:52,1,-73.9897,40.738651,-73.997772,40.754051,N,647,6.472346,4,17,4,17
82632,id3767649,2,2016-05-30 19:20:26,2016-05-30 19:25:04,1,-73.988441,40.723267,-73.99588,40.716717,N,278,5.627621,0,19,0,19
71403,id2530846,2,2016-01-19 14:19:22,2016-01-19 14:34:03,2,-73.985733,40.752598,-73.969231,40.755997,N,881,6.781058,1,14,1,14


In [12]:
# Keep only the four coordinates, the log-transformed target and the derived
# weekday/hour features; every other column is dropped from `data` here.
# NOTE(review): dropoff_weekday / dropoff_hour are derived from
# dropoff_datetime, which is only known once the trip has ended -- confirm
# that using them as model inputs is intended.
data = data[['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'log_trip_duration',
        'pickup_weekday', 'pickup_hour', 'dropoff_weekday', 'dropoff_hour']]

# Train, validation split

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
train_data, val_data = train_test_split(data, test_size=0.2, random_state=23)

In [15]:
print(train_data.shape, val_data.shape)

(233383, 9) (58346, 9)


In [16]:
val_data.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,log_trip_duration,pickup_weekday,pickup_hour,dropoff_weekday,dropoff_hour
1027287,-73.984573,40.739681,-73.972954,40.736752,6.200509,5,20,5,20
1076151,-73.963165,40.75771,-73.956093,40.767696,6.43615,3,14,3,15
964948,-73.782089,40.644524,-73.98587,40.759975,8.188689,3,15,3,16
1326065,-73.992416,40.753059,-73.995659,40.764355,6.095825,2,19,2,19
1345325,-73.955009,40.786049,-73.982101,40.74395,6.958448,6,13,6,14


In [17]:
# Column names of the modelling frame (the same list used to subset `data`).
# NOTE(review): `cols` is not referenced by any of the cells shown here --
# confirm it is used further down, otherwise it can be removed.
cols = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'log_trip_duration',
        'pickup_weekday', 'pickup_hour', 'dropoff_weekday', 'dropoff_hour']

In [18]:
train_data.pickup_hour.unique()

array([22, 19, 20, 14, 10, 15, 23, 21,  0, 11,  8,  6, 18, 12, 16, 13,  7,
       17,  2,  1,  4,  9,  5,  3])

## Dataset

In [19]:
# Standardize the continuous features (the pickup/dropoff coordinates);
# the scaler is fitted on the training split in a later cell.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [20]:
# Split the training frame into continuous inputs (coordinates),
# categorical inputs (weekday/hour codes) and the regression target.
train_data_cont = train_data[['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']]
train_data_cat = train_data[['pickup_weekday', 'pickup_hour', 'dropoff_weekday', 'dropoff_hour']]
train_data_y = train_data[['log_trip_duration']]

In [21]:
# Split the validation frame into continuous inputs (coordinates),
# categorical inputs (weekday/hour codes) and the regression target.
val_data_cont = val_data[['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']]
val_data_cat = val_data[['pickup_weekday', 'pickup_hour', 'dropoff_weekday', 'dropoff_hour']]
val_data_y = val_data[['log_trip_duration']]

In [22]:
scaler.fit(train_data_cont.values)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [23]:
class TaxiDataset(Dataset):
    """Map-style dataset yielding ``(categorical, continuous, target)`` per row.

    Args:
        df_cont: DataFrame of continuous features. It is passed through
            ``scaler.transform`` once, at construction time.
        df_cat: DataFrame of categorical features.
        y: DataFrame holding the target in a ``log_trip_duration`` column.
        scaler: already-fitted object exposing ``transform(ndarray) -> ndarray``.

    Attributes:
        df_cat: private copy of the categorical DataFrame.
        df_cont: ndarray of scaled continuous features, one row per sample.
        y: ndarray of targets.
    """

    def __init__(self, df_cont, df_cat, y, scaler):
        self.df_cat = df_cat.copy()
        # Materialize the categorical rows once as an ndarray: indexing it in
        # __getitem__ avoids a per-item DataFrame.iloc lookup.
        self._cat_values = self.df_cat.values.copy()
        # Transform a private copy so the caller's frame cannot be altered
        # even by a scaler configured to work in place.
        self.df_cont = scaler.transform(df_cont.copy().values)
        self.y = y['log_trip_duration'].values

    def __len__(self):
        """Number of samples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(categorical_row, scaled_continuous_row, target)`` for ``idx``."""
        return self._cat_values[idx], self.df_cont[idx], self.y[idx]

In [24]:
train_ds = TaxiDataset(train_data_cont, train_data_cat, train_data_y, scaler)
val_ds = TaxiDataset(val_data_cont, val_data_cat, val_data_y, scaler)

In [25]:
train_data_cat.iloc[0].values

array([ 1, 22,  1, 22])

In [26]:
train_ds[0]

(array([ 1, 22,  1, 22]),
 array([-0.34412672,  0.22993443, -0.22406964, -0.13888053]),
 6.411818267709897)

In [27]:
val_ds[0]

(array([ 5, 20,  5, 20]),
 array([-0.24136607, -0.32657616,  0.00527577, -0.38610695]),
 6.20050917404269)

## Model
Write a simple two- or three-layer feed-forward model in which the categorical variables are represented by embeddings.

## Training
Train your model. Use mean squared error (MSE) as the training loss. For validation, report both the MSE loss and the $R^2$ metric. <br>
Hint: `batch_size=50000` and `lr=0.01` worked for me.

In [28]:
from sklearn import metrics