## Load the Python libraries used to run this notebook

In [4]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

## Import the NYC Taxi Fares data to train an ANN model to predict `fare_amount`

In [5]:
# Absolute location of the NYC taxi fares CSV.
# NOTE(review): this path is machine-specific — adjust it before running elsewhere.
path='/Users/zaahirdawood/Downloads/PYTORCH_NOTEBOOKS/Data/NYCTaxiFares.csv'
# Read the CSV into a DataFrame.
df= pd.read_csv(path)

# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1


## Feature Engineering

why?

```

Raw data is often not in a state that can be modelled directly; new features usually have to be created from existing ones.
For example, when you make a smoothie you start with fruits, vegetables and other ingredients, and these must
be prepared before blending.

On the other hand, 

Models have requirements on how data must be presented before they can output a prediction. It's like a baby learning about the world:
we can shape that information in a way the baby can use to build their understanding (a model of the world) and predict
whether the pet in front of them is a dog or a cat.

```

what?
(What features are we going to engineer?)

- `distance_km`
- `hour`
- `AMorPM`
- `weekday`

how?
```
Using our input data, specifically:
```
- `pickup_longitude`, `pickup_latitude`, `dropoff_longitude`, `dropoff_latitude` --> `distance_km`
- `pickup_datetime` --> `weekday`, `AMorPM`, `hour`


1.  We use a `haversine distance` calculation to measure the distance in km between the pickup and drop-off longitude/latitude coordinates.
https://en.wikipedia.org/wiki/Haversine_formula#:~:text=The%20haversine%20formula%20determines%20the,and%20angles%20of%20spherical%20triangles.
2.  We can extract the day of the week, AM or PM, and the specific hour with some pandas datetime wrangling (`pd.to_datetime` and the `.dt` accessor).

In [6]:
def haversine_distance(df, lat1, long1, lat2, long2, radius_km=6371):
    """
    Calculates the haversine (great-circle) distance between 2 sets of GPS coordinates in df.

    Parameters
    ----------
    df : pandas.DataFrame (or any mapping of column name -> numeric array)
        Container holding the coordinate columns, expressed in decimal degrees.
    lat1, long1 : hashable
        Column names of the latitude / longitude of the first point.
    lat2, long2 : hashable
        Column names of the latitude / longitude of the second point.
    radius_km : float, default 6371
        Radius of the sphere. The default is the average radius of Earth in
        kilometers, so the result is in kilometers.

    Returns
    -------
    Element-wise distance between the two points, in the same container type
    that ``df[lat1]`` yields (a Series for a DataFrame) and in the units of
    ``radius_km``.
    """
    phi1 = np.radians(df[lat1])
    phi2 = np.radians(df[lat2])

    delta_phi = np.radians(df[lat2] - df[lat1])
    delta_lambda = np.radians(df[long2] - df[long1])

    a = np.sin(delta_phi / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) return NaN.
    a = a.clip(0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return radius_km * c

In [8]:
# Add the pickup -> drop-off haversine distance (in km) as a new feature column.
df['distance_km'] = haversine_distance(df,'pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude')

In [10]:
# Parse the string timestamps into pandas datetime values.
df['pickup_datetime']= pd.to_datetime(df['pickup_datetime'])
# Shift the UTC timestamps back by 4 hours to approximate New York local time (EDT = UTC-4).
# NOTE(review): a fixed offset does not follow daylight-saving changes — confirm every row falls within EDT.
df['EDTdate']= df['pickup_datetime'] - pd.Timedelta(hours=4)

# Hour of day of the shifted timestamp.
df['hour']= df['EDTdate'].dt.hour

# 'am' for hours before 12, 'pm' otherwise.
df['AMorPM']= np.where(df['hour'] < 12, 'am','pm')

# Abbreviated weekday name of the shifted timestamp (strftime '%a', e.g. 'Mon').
df['weekday']= df['EDTdate'].dt.strftime('%a')

In [13]:
# Preview the four engineered feature columns.
df[['distance_km', 'hour', 'AMorPM', 'weekday']].head()

Unnamed: 0,distance_km,hour,AMorPM,weekday
0,2.126312,4,am,Mon
1,1.392307,11,am,Sat
2,3.326763,7,am,Sat
3,1.864129,17,pm,Sun
4,7.231321,22,pm,Fri


## More data wrangling

```
The objective at this stage is to split the categorical variables - dimensions
 
and

The continuous variables - numbers

and 

The target, e.g. the fare amount we want to predict
```

In [14]:

# Column roles are listed explicitly instead of being inferred from dtypes.
# dtype sniffing is fragile: `.dt.hour` is int32 only on pandas >= 2.0 (int64
# before), and once the categorical columns are cast to 'category' a re-run of
# a dtype-based selection would return an empty `catcols`.
# The order matters: it fixes the column order of the tensors built later.

# Categorical features (fed to embedding layers).
catcols = ['hour', 'AMorPM', 'weekday']

# Continuous features (fed to the network as floats).
contcols = [
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'passenger_count',
    'distance_km',
]

# Regression target.
y_col = ['fare_amount']


In [15]:
# Cast every categorical feature column to the pandas 'category' dtype
# so that integer codes are available through `.cat.codes`.
for column_name in catcols:
    df[column_name] = df[column_name].astype('category')

## Shape the data so the model (baby) can understand it

In [16]:
# Stack the integer category codes column-wise (one column per categorical
# feature) and wrap them in an int64 tensor.
cats = torch.tensor(
    np.stack([df[name].cat.codes.values for name in catcols], axis=1),
    dtype=torch.int64,
)

In [17]:
# Stack the continuous feature columns side by side and wrap them in a float tensor.
conts = torch.tensor(
    np.stack([df[name].values for name in contcols], axis=1),
    dtype=torch.float,
)

In [18]:
# Target values as a float tensor with a single column.
y = torch.tensor(df[y_col].to_numpy(), dtype=torch.float32).reshape(-1, 1)

### Is the shape in which we have processed the data going to make sense to the baby?

In [20]:
# Display the shapes of the categorical, continuous and target tensors for a visual check.
cats.shape, conts.shape, y.shape

(torch.Size([120000, 3]), torch.Size([120000, 6]), torch.Size([120000, 1]))

In [21]:
# Number of distinct categories in each categorical column, in `catcols` order.
cat_szs = [len(df[name].dtype.categories) for name in catcols]
cat_szs

[24, 2, 7]

In [22]:
# Pair each cardinality with an embedding width:
# half the cardinality (rounded up), capped at 50.
emb_szs = [
    (n_categories, min((n_categories + 1) // 2, 50))
    for n_categories in cat_szs
]
emb_szs

[(24, 12), (2, 1), (7, 4)]

In [23]:
# Stand-alone preview: one nn.Embedding per (cardinality, width) pair in emb_szs.
selfembeds = nn.ModuleList(
    [nn.Embedding(n_categories, emb_dim) for n_categories, emb_dim in emb_szs]
)
selfembeds

ModuleList(
  (0): Embedding(24, 12)
  (1): Embedding(2, 1)
  (2): Embedding(7, 4)
)

## Model Recipe (Learning strategy)

In [27]:
class TabularModel(nn.Module):
    """
    Feed-forward network for tabular data with categorical and continuous inputs.

    Each categorical column goes through its own embedding layer; the embedded
    vectors are concatenated, passed through dropout, joined with the
    batch-normalised continuous features and fed to a stack of
    Linear -> ReLU -> BatchNorm1d -> Dropout blocks followed by a final Linear.

    Parameters
    ----------
    emb_szs : iterable of (int, int)
        One ``(num_categories, embedding_dim)`` pair per categorical column,
        in the same order as the columns of ``x_cat``.
    n_cont : int
        Number of continuous input features.
    out_sz : int
        Number of output units.
    layers : sequence of int
        Width of each hidden block. May be empty, in which case the model is a
        single Linear layer on top of the embedded / normalised inputs.
    p : float, default 0.5
        Dropout probability used after the embeddings and in every hidden block.
    """

    def __init__(self, emb_szs, n_cont, out_sz, layers, p=0.5):
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in emb_szs])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)

        # Width of the first Linear layer: all embedding dims plus continuous features.
        n_emb = sum(nf for _, nf in emb_szs)
        n_in = n_emb + n_cont

        layerlist = []
        for width in layers:
            layerlist.append(nn.Linear(n_in, width))
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(width))
            layerlist.append(nn.Dropout(p))
            n_in = width
        # `n_in` equals layers[-1] whenever `layers` is non-empty; using it here
        # (instead of indexing layers[-1]) also supports an empty `layers`.
        layerlist.append(nn.Linear(n_in, out_sz))

        self.layers = nn.Sequential(*layerlist)

    def forward(self, x_cat, x_cont):
        """
        Run a forward pass.

        x_cat is indexed as ``x_cat[:, i]`` to feed the i-th embedding, and
        x_cont is passed through ``BatchNorm1d(n_cont)``; both are concatenated
        along dimension 1. Returns the output of the final Linear layer.
        """
        embeddings = [embed(x_cat[:, i]) for i, embed in enumerate(self.embeds)]
        x = torch.cat(embeddings, 1)
        x = self.emb_drop(x)

        x_cont = self.bn_cont(x_cont)
        x = torch.cat([x, x_cont], 1)
        return self.layers(x)

## Parameters of the lesson (How many hours a day we will run lessons at the kindergarten, breaks, learning flexibility)

In [28]:
# Seed torch's RNG before building the model so the initial weights are reproducible.
torch.manual_seed(33)

model = TabularModel(
    emb_szs=emb_szs,
    n_cont=conts.shape[1],
    out_sz=1,
    layers=[400, 200],
    p=0.4,
)

In [29]:
# Display the module structure of the model.
model

TabularModel(
  (embeds): ModuleList(
    (0): Embedding(24, 12)
    (1): Embedding(2, 1)
    (2): Embedding(7, 4)
  )
  (emb_drop): Dropout(p=0.4, inplace=False)
  (bn_cont): BatchNorm1d(6, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=23, out_features=400, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(400, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=400, out_features=200, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=200, out_features=1, bias=True)
  )
)

In [30]:
# Mean-squared-error criterion (the training loop takes its square root to get RMSE).
criterion = nn.MSELoss()
# Adam over all model parameters with a learning rate of 0.001.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Only the first `batch_size` rows are used; the last 20% of that window is held out for testing.
batch_size = 60000
test_size = int(batch_size * 0.2)
train_end = batch_size - test_size

# TODO: data requires shuffling — rows are currently split in file order.
cat_train, cat_test = cats[:train_end], cats[train_end:batch_size]
con_train, con_test = conts[:train_end], conts[train_end:batch_size]
y_train, y_test = y[:train_end], y[train_end:batch_size]

In [None]:
# Full-batch training: every epoch runs one forward/backward pass over all training rows.
start_time = time.time()

epochs = 400
losses = []  # one detached RMSE tensor per epoch

# Make sure dropout / batch-norm are in training mode, in case the model was
# switched to eval mode earlier in the session.
model.train()

for i in range(1, epochs + 1):
    y_pred = model(cat_train, con_train)
    loss = torch.sqrt(criterion(y_pred, y_train)) # RMSE
    # Store a detached tensor so the history does not keep a reference to
    # each epoch's autograd graph.
    losses.append(loss.detach())

    # a neat trick to save screen space:
    if i % 25 == 1:
        print(f'epoch: {i:3}  loss: {loss.item():10.8f}')

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(f'epoch: {i:3}  loss: {loss.item():10.8f}') # print the last line
print(f'\nDuration: {time.time() - start_time:.0f} seconds') # print the time elapsed

In [None]:
# Convert the recorded per-epoch losses to plain floats under a NEW name.
# `losses` itself is left untouched so this cell (and any later cell that reads
# `losses`) can be re-run; float() accepts 0-d tensors, numpy scalars and floats.
loss_values = [float(epoch_loss) for epoch_loss in losses]

fig, ax = plt.subplots()
ax.plot(range(1, len(loss_values) + 1), loss_values)
ax.set(xlabel='epoch', ylabel='RMSE loss', title='Training loss per epoch');

In [None]:
# Switch to evaluation mode so dropout is disabled and BatchNorm uses its
# running statistics instead of the statistics of the evaluation batch.
model.eval()

# No gradients are needed for evaluation.
with torch.no_grad():
    y_val = model(cat_test, con_test)
    loss = torch.sqrt(criterion(y_val, y_test))  # RMSE on the held-out rows

In [None]:
# Display the most recently computed loss value.
loss

In [None]:
# Repeat of the training-loss plot. The history is converted with float(),
# which accepts 0-d tensors, numpy scalars and plain floats, so this cell works
# regardless of whether `losses` was already converted by an earlier cell.
loss_values = [float(epoch_loss) for epoch_loss in losses]

fig, ax = plt.subplots()
ax.plot(range(1, len(loss_values) + 1), loss_values)
ax.set(xlabel='epoch', ylabel='RMSE loss', title='Training loss per epoch');

In [None]:
# Print the first 100 held-out predictions next to the true fares and their absolute difference.
for i in range(100):
    predicted = y_val[i].item()
    actual = y_test[i].item()
    diff = np.abs(predicted - actual)
    print(f'{i}.) PREDICTED:{predicted:7.2f} | TRUE:{actual:7.2f} | DIFF:{diff:7.2f}')

In [None]:
# Save only the model's state_dict (not the class definition) to 'TaxiModel.pt'.
torch.save(model.state_dict(), 'TaxiModel.pt')