In [1]:
import pyarrow.parquet as pq
import torch
import random
import time
import pandas as pd
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn.functional as F

In [2]:
temp = pq.read_table('./Data/cleaned_taxi_data.parquet')
temp = temp.to_pandas()

trip_info = torch.tensor(temp.values, dtype = torch.float32)

In [9]:
temp.describe()

Unnamed: 0,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,...,store_and_fwd_flag_0,store_and_fwd_flag_1,payment_type_1,payment_type_2,payment_type_3,payment_type_4,tip_bin_0,tip_bin_1,tip_bin_2,tip_bin_3
count,2276461.0,2276461.0,2276461.0,2276461.0,2276461.0,2276461.0,2276461.0,2276461.0,2276461.0,2276461.0,...,2276461.0,2276461.0,2276461.0,2276461.0,2276461.0,2276461.0,2276461.0,2276461.0,2276461.0,2276461.0
mean,1.472616,2.017087,9.89887,1.023657,0.4994395,1.966696,0.0041969,0.2999851,15.4309,2.434673,...,0.9922112,0.007788844,0.7807847,0.2155697,0.002394945,0.001250625,0.3378955,0.5871385,0.07437246,0.000593465
std,1.039033,1.671309,5.098086,1.194539,0.01673157,1.719994,0.1745916,0.002113585,6.083246,0.3988098,...,0.08791008,0.08791008,0.4137149,0.4112171,0.04887955,0.03534207,0.4729929,0.4923484,0.2623761,0.02435391
min,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.3,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.02,6.5,0.0,0.5,0.8,0.0,0.3,11.3,2.5,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,1.61,8.5,0.5,0.5,2.0,0.0,0.3,14.16,2.5,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,2.0,2.52,12.0,2.5,0.5,2.86,0.0,0.3,17.9,2.5,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
max,7.0,99.9,300.0,45.5,0.5,350.0,26.2,0.3,357.3,2.75,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [3]:
# Base values
RANDOM_SEED = 123
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 64
NUM_EPOCHS = 50

In [4]:
# Normalize and Split Input Data (Train = 75% (1,707,346), Validation = 20% (455,292), Test = 5% (113,823))
torch.manual_seed(RANDOM_SEED)

train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(trip_info,  [0.75, 0.2, 0.05])

trip_inf_loc = list(range(0, 5)) + list(range(6, 172))
tip_inf_loc = [172, 173, 174, 175]
x = train_dataset.dataset[:, trip_inf_loc]
y = train_dataset.dataset[:, tip_inf_loc]


In [6]:
print(x.shape)
print(y.shape)

torch.Size([2276461, 109])
torch.Size([2276461, 67])


In [7]:

class MLP(torch.nn.Module):

    def __init__(self, num_features, num_hidden_1, num_classes):
        super(MLP, self).__init__()
        
        self.num_classes = num_classes
        
        self.linear_1 = torch.nn.Linear(num_features, num_hidden_1)
        self.bn1 = torch.nn.BatchNorm1d(self.linear_1.weight.size(0))
        self.linear_out = torch.nn.Linear(num_hidden_1, num_classes)
        
    def forward(self, x):
         
        out = self.linear_1(x)
        print(out.shape)
        print(self.linear_1.weight.size(0))
        out = self.bn1(out)
        out = F.relu(out)
        logits = self.linear_out(out)
        probas = F.softmax(logits, dim=1)
        return logits, probas
    
#################################
### Model Initialization
#################################

random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

model = MLP(num_features=109,
            num_hidden_1=100,
            num_classes=67)


model = model.to(DEVICE)


In [10]:

optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

for epoch in range(NUM_EPOCHS):
    
    model.train()
    # pytorch is expecting minibatch of examples
    for i in range(y.size()[0]):
        
        ### PREPARE MINIBATCH
        features = x[i].view(-1, 109).to(DEVICE)
        targets = y[i].to(DEVICE) # This is just giving one example of y
        # We want it to be of size (batch size) x (# of classes)
        # Check examples of FashionMNIST dataset
            
        ### FORWARD AND BACK PROP
        logits, probas = model.forward(features)
        print(f'Target: {targets}')
        print(f'Logits: {logits}')
        print(f'Probabilities: {probas}')
        cost = F.cross_entropy(torch.flatten(logits), targets)
        optimizer.zero_grad()
        
        cost.backward()
        
        ### UPDATE MODEL PARAMETERS
        optimizer.step()
        
        ## LOGGING
        if not i % 20:
            print (f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
                   f'Batch {i:03d}/{len(x):03d} |' 
                   f' Cost: {cost:.4f}')

    # no need to build the computation graph for backprop when computing accuracy
    model.eval()
    

torch.Size([1, 100])
100


ValueError: Expected more than 1 value per channel when training, got input size torch.Size([1, 100])