In [None]:
import os
import time

import torch
import torch.nn as nn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the raw NYC taxi fares CSV.
# The location can be overridden through the NYC_TAXI_FARES_CSV environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset, the original hard-coded path is used unchanged.
df = pd.read_csv(os.environ.get('NYC_TAXI_FARES_CSV', '/Users/sumanthshankar/Documents/DataSets/NYCTaxiFares.csv'))

def haversine_distance_formula(lat1,long1,lat2,long2,r=6371):
    """Great-circle distance between two points given in decimal degrees.

    Works element-wise on scalars, NumPy arrays and pandas Series, since only
    NumPy ufuncs and arithmetic are used.

    Args:
        lat1, long1: latitude / longitude of the first point, in degrees.
        lat2, long2: latitude / longitude of the second point, in degrees.
        r: sphere radius; the result is expressed in the same unit.
            Defaults to 6371 (mean Earth radius in kilometres).

    Returns:
        The haversine distance(s), in the unit of ``r``.
    """
    long1, lat1, long2, lat2 = map(np.radians,[long1,lat1,long2,lat2])
    dlong = long2-long1
    dlat = lat2-lat1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlong/2)**2
    # Floating-point rounding can push sqrt(a) marginally above 1 for
    # near-antipodal points, which would make arcsin return NaN; clamp it.
    c = 2 * np.arcsin(np.minimum(1.0, np.sqrt(a)))
    return c * r

In [None]:
# Preview the first rows of the freshly loaded frame as a quick sanity check.
df.head()

In [None]:
# Trip length feature: haversine distance between the pickup and dropoff
# coordinates, passed in (lat1, long1, lat2, long2) order.
df['dist_km'] = haversine_distance_formula(
    *(df[name] for name in (
        'pickup_latitude', 'pickup_longitude',
        'dropoff_latitude', 'dropoff_longitude',
    ))
)

In [None]:
# Parse the pickup timestamps into datetimes.
# utc=True guarantees a timezone-aware (UTC) column; the following
# .dt.tz_convert(...) step raises a TypeError on tz-naive data.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], utc=True)

In [None]:
# Convert the timezone-aware pickup timestamps to the 'US/Eastern' zone so the
# hour / weekday features derived from this column reflect local time.
df['EDTDate'] = df['pickup_datetime'].dt.tz_convert('US/Eastern')

In [None]:
# Calendar features derived from the Eastern-time pickup stamp:
#   Hour    - hour of day (0-23)
#   AMorPM  - 'am' for hours before 12, otherwise 'pm'
#   Weekday - abbreviated day name produced by strftime('%a')
df['Hour'] = df['EDTDate'].dt.hour
df['AMorPM'] = np.where(df['Hour'].lt(12), 'am', 'pm')
df['Weekday'] = df['EDTDate'].dt.strftime('%a')

In [None]:
# Column groups used to build the model inputs and target.
# Categorical columns are turned into integer codes and embedded.
cat_cols = ['Hour', 'AMorPM', 'Weekday']
# Continuous columns are fed to the network as floats.
cont_cols = [
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'passenger_count',
    'dist_km',
]
# Regression target (kept as a list so selections stay two-dimensional).
y_col = ['fare_amount']

In [None]:
# Cast every categorical feature to pandas' category dtype so that integer
# codes (.cat.codes) and the list of categories (.cat.categories) can be read
# off the columns in the cells that follow.
for cat in cat_cols:
    df[cat] = df[cat].astype('category')

In [None]:
# Categorical inputs: one integer-code column per entry of cat_cols, stacked
# side by side into a (rows, len(cat_cols)) int64 tensor for the embeddings.
cats = torch.tensor(
    np.stack([df[name].cat.codes.to_numpy() for name in cat_cols], axis=1),
    dtype=torch.long,
)

In [None]:
# Continuous inputs: one column per entry of cont_cols, stacked side by side
# into a (rows, len(cont_cols)) float32 tensor.
conts = torch.tensor(
    np.stack([df[name].to_numpy() for name in cont_cols], axis=1),
    dtype=torch.float32,
)

In [None]:
# Regression target as a float32 tensor. y_col is a list, so the selection is
# a two-dimensional frame and the tensor keeps a trailing column axis.
y = torch.tensor(df[y_col].to_numpy(), dtype=torch.float32)
y.shape

In [None]:
# Shape check for the categorical input tensor.
cats.shape

In [None]:
# Shape check for the continuous input tensor.
conts.shape

In [None]:
# Shape check for the target tensor (same value the cell that builds y displays).
y.shape

In [None]:
# Cardinality of each categorical column, in cat_cols order; these counts
# determine how many rows each embedding table needs.
cat_szs = [df[name].cat.categories.size for name in cat_cols]
cat_szs

In [None]:
# (num_categories, embedding_dim) per categorical column. The embedding width
# is half the cardinality, rounded up, and never more than 50.
emb_szs = [(n_cat, min(50, (n_cat + 1) // 2)) for n_cat in cat_szs]

In [None]:
# Display the (num_categories, embedding_dim) pairs chosen for each categorical column.
emb_szs

In [None]:
# Stand-alone list of embedding layers, one per (num_categories, embedding_dim)
# pair; it mirrors what TabularModel builds internally as `self.embeds`.
# NOTE(review): `selfembeds` is not referenced by any other cell visible here -
# presumably a leftover from prototyping; confirm it can be removed.
selfembeds = nn.ModuleList([nn.Embedding(ni,nf) for ni,nf in emb_szs])

In [None]:
class TabularModel(nn.Module):
    """Feed-forward network for mixed categorical / continuous tabular input.

    Every categorical column gets its own ``nn.Embedding``. The embedding
    outputs are concatenated and passed through dropout, the continuous
    features are batch-normalised, and the two parts are joined and fed through
    a stack of ``Linear -> ReLU -> BatchNorm1d -> Dropout`` blocks followed by
    a final ``Linear`` output layer.

    Args:
        emb_szs: sequence of ``(num_categories, embedding_dim)`` pairs, one per
            categorical column, in the column order of ``x_cat``.
        n_cont: number of continuous feature columns.
        out_sz: width of the output layer.
        layers: widths of the hidden layers. May be empty, in which case the
            output layer is applied directly to the concatenated inputs.
        p: dropout probability used after the embeddings and after every
            hidden block.
    """

    def __init__(self,emb_szs,n_cont,out_sz,layers,p=0.5):
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(ni,nf) for ni,nf in emb_szs])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)

        # Width of the first layer's input: all embedding outputs side by side
        # plus the continuous features.
        n_in = sum(nf for _, nf in emb_szs) + n_cont

        layerlist = []
        for width in layers:
            layerlist.extend([
                nn.Linear(n_in, width),
                nn.ReLU(inplace=True),
                nn.BatchNorm1d(width),
                nn.Dropout(p),
            ])
            n_in = width

        # Use the running width instead of layers[-1] so that an empty
        # `layers` list yields a plain linear model rather than an IndexError.
        layerlist.append(nn.Linear(n_in, out_sz))
        self.layers = nn.Sequential(*layerlist)

    def forward(self,x_cat,x_cont):
        """Run a forward pass.

        Args:
            x_cat: integer tensor whose column ``i`` holds the category codes
                for the ``i``-th embedding.
            x_cont: float tensor with ``n_cont`` columns.

        Returns:
            Tensor with ``out_sz`` columns, one row per input row.
        """
        # Column i of x_cat is looked up in the i-th embedding table.
        embeddings = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeds)]
        x = self.emb_drop(torch.cat(embeddings, 1))

        x_cont = self.bn_cont(x_cont)
        x = torch.cat([x, x_cont], 1)
        return self.layers(x)

In [None]:
# Seed the RNG so the weight initialisation is repeatable, then build the
# network: one output, two hidden layers (200 and 100 units), dropout 0.4.
torch.manual_seed(33)
model = TabularModel(
    emb_szs=emb_szs,
    n_cont=conts.shape[1],
    out_sz=1,
    layers=[200, 100],
    p=0.4,
)

In [None]:
# Mean-squared-error objective (the training loop takes its square root to
# get RMSE) and an Adam optimiser over all model parameters.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Number of leading rows used in total, and the share of them (20%) that is
# held out for testing.
batch_size = 60_000
test_size = int(0.2 * batch_size)

In [None]:
# Positional split of the inputs: the first (batch_size - test_size) rows are
# used for training, the next test_size rows for testing.
cat_train, cat_test = cats[:batch_size - test_size], cats[batch_size - test_size:batch_size]
con_train, con_test = conts[:batch_size - test_size], conts[batch_size - test_size:batch_size]

In [None]:
# Same positional split for the target, so rows stay aligned with the inputs.
y_train, y_test = y[:batch_size - test_size], y[batch_size - test_size:batch_size]

In [None]:
# Sanity check: number of rows in the training split.
len(cat_train)

In [None]:
# Full-batch training: every epoch runs the whole training split through the
# model once and takes a single optimiser step on the RMSE.
start_time = time.time()

epochs = 1200

losses = []

# Make sure dropout and batch-norm are in training mode, even if the model
# was switched to eval mode by an earlier run of an evaluation cell.
model.train()

for i in range(1, epochs + 1):
    y_pred = model(cat_train,con_train)
    loss = torch.sqrt(criterion(y_pred,y_train))
    # Record a plain Python float: keeping the tensor itself would retain a
    # grad-tracking object per epoch and prevents plotting the history.
    losses.append(loss.item())

    if i%10 == 1:
        print(f'epoch: {i} loss in {loss}')

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

duration = time.time()-start_time
print(f'Training took {duration/60} minutes')

In [None]:
%matplotlib inline
# Training RMSE per epoch. float() accepts both Python numbers and 0-dim
# tensors (including ones that require grad), so the plot works regardless of
# how the loss history was recorded.
plt.plot(range(epochs), [float(value) for value in losses])
plt.ylabel('RMSE Loss')
plt.xlabel('epoch');


In [None]:
# Evaluate on the held-out split.
# model.eval() switches dropout off and makes batch-norm use its running
# statistics; without it the predictions are randomly perturbed by dropout
# and the batch-norm statistics would be updated from test data.
model.eval()
with torch.no_grad():
    y_val = model(cat_test,con_test)
    loss = torch.sqrt(criterion(y_val,y_test))
print(f'RMSE: {loss.item():.8f}')

# Spot-check the first ten predictions against the true fares.
for i in range(10):
    diff = np.abs(y_val[i].item()-y_test[i].item())
    print(f'{(i+1)}.) Predicted: {y_val[i].item():3.2f} Actual: {y_test[i].item():3.2f} Difference: {diff:3.2f}')

In [None]:
# Persist only the learned parameters (the state dict), not the module object;
# the file is written to 'TaxiModel.pt' in the current working directory.
torch.save(model.state_dict(), 'TaxiModel.pt')