In [64]:
import torch
import torch.nn as nn
import torchvision
import pandas as pd
import numpy as np
import zipfile
import io
import pandas as pd
import math
from IPython.display import display
from collections import defaultdict

zipped_data_path = "../data/clean_data/class-competition-cleaned.zip"

# Maps each CSV member name inside the archive to its parsed DataFrame.
# A defaultdict is kept so that looking up a name that was never loaded
# still yields an empty DataFrame instead of raising KeyError.
dataframes = defaultdict(pd.DataFrame)

# The archive handle is deliberately not named `zip`: in a notebook that name
# stays bound after the cell finishes and would shadow the built-in zip()
# for every later cell.
with zipfile.ZipFile(zipped_data_path, "r") as archive:
    for filename in archive.namelist():
        # Only CSV members are of interest; skip anything else in the archive.
        if not filename.endswith(".csv"):
            continue

        # Archive members open as binary streams; wrap them so pandas reads text.
        with archive.open(filename) as f:
            dataframes[filename] = pd.read_csv(io.TextIOWrapper(f))

        # Let's take a look at the files
        print(f"FILE: {filename}")
        # If you want to see file info uncomment this:
        # display(dataframes[filename].info())
        # display(dataframes[filename].head())

FILE: train_call_type_A.csv
FILE: train_call_type_B.csv
FILE: train_call_type_C.csv
FILE: test_public.csv


In [65]:
# Pull the call-type-A training frame and the public test frame out of the loaded CSVs.
train_data, test_data = dataframes["train_call_type_A.csv"], dataframes["test_public.csv"]

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(device)

# Last expression of the cell: preview the first training rows.
train_data.head()

cpu


Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,POLYLINE,TRAVEL_TIME,START_LOCATION,MON_sin,MON_cos,DAY_sin,DAY_cos,HR_sin,HR_cos,WK_sin,WK_cos,YR_2013
0,1372637343620000571,A,31508.0,,20000571,"[[-8.618868,41.155101],[-8.6175,41.154912],[-8...",465,,1.224647e-16,-1.0,-0.201299,0.97953,-0.965926,-0.258819,-0.781831,0.62349,True
1,1372639135620000570,A,33180.0,,20000570,"[[-8.666757,41.174055],[-8.666784,41.174064],[...",270,,1.224647e-16,-1.0,-0.201299,0.97953,-0.965926,-0.258819,-0.781831,0.62349,True
2,1372637254620000657,A,39233.0,,20000657,"[[-8.660646,41.168574],[-8.661087,41.167926],[...",630,,1.224647e-16,-1.0,-0.201299,0.97953,-0.965926,-0.258819,-0.781831,0.62349,True
3,1372637658620000596,A,22864.0,,20000596,"[[-8.665686,41.170626],[-8.665677,41.170653],[...",375,,1.224647e-16,-1.0,-0.201299,0.97953,-0.965926,-0.258819,-0.781831,0.62349,True
4,1372639535620000161,A,25862.0,,20000161,"[[-8.648226,41.148333],[-8.648514,41.148297],[...",840,,1.224647e-16,-1.0,-0.201299,0.97953,-0.965926,-0.258819,-0.781831,0.62349,True


In [66]:
# Keep only the public test rows whose CALL_TYPE is 'A'
is_call_type_a = test_data['CALL_TYPE'] == 'A'
test_data = test_data[is_call_type_a]

# Preview the filtered rows.
test_data.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,START_LOCATION,MON_sin,MON_cos,DAY_sin,DAY_cos,HR_sin,HR_cos,WK_sin,WK_cos,YR_2013
5,T6,A,42612.0,,20000607,,-0.866025,-0.5,0.299363,-0.954139,0.5,-0.866025,0.433884,-0.900969,False
7,T8,A,31780.0,,20000619,,-0.866025,-0.5,0.299363,-0.954139,0.5,-0.866025,0.433884,-0.900969,False
21,T22,A,85698.0,,20000199,,-0.866025,-0.5,0.299363,-0.954139,0.5,-0.866025,0.433884,-0.900969,False
22,T23,A,37007.0,,20000480,,-0.866025,-0.5,0.299363,-0.954139,0.5,-0.866025,0.433884,-0.900969,False
36,T37,A,2002.0,,20000159,,-0.866025,-0.5,0.299363,-0.954139,0.5,-0.866025,0.433884,-0.900969,False


In [67]:
from sklearn.model_selection import train_test_split

# Columns left out of the model inputs. This list could change; using these
# columns would probably require further preprocessing first.
ALL_FEATURES_NOT_SUITED_FOR_ESTIMATION = ['TRIP_ID', 'CALL_TYPE', 'ORIGIN_STAND', 'POLYLINE', 'START_LOCATION']

# `frac` controls the share of the training data that is used.
train_data_sample = train_data.sample(frac=0.2, random_state=420)

# Target column.
y = train_data_sample["TRAVEL_TIME"]

# Features: everything except the target and the excluded columns.
# errors="ignore" drops only the listed columns that are actually present.
X = train_data_sample.drop(columns=["TRAVEL_TIME"]).drop(
    columns=ALL_FEATURES_NOT_SUITED_FOR_ESTIMATION, errors="ignore"
)

# Hold out 20% of the sampled rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=420)

# Apply the same column exclusion to the public test frame.
test_features = test_data.drop(columns=ALL_FEATURES_NOT_SUITED_FOR_ESTIMATION, errors="ignore")

        ORIGIN_CALL   TAXI_ID       MON_sin   MON_cos   DAY_sin   DAY_cos   
87538        7078.0  20000591 -8.660254e-01  0.500000  0.937752  0.347305  \
170653      43914.0  20000187 -2.449294e-16  1.000000 -0.201299  0.979530   
131133      59689.0  20000347 -5.000000e-01  0.866025 -0.651372 -0.758758   
148697      14674.0  20000450 -2.449294e-16  1.000000  0.988468  0.151428   
48179        2002.0  20000055 -8.660254e-01 -0.500000 -0.848644  0.528964   
...             ...       ...           ...       ...       ...       ...   
342551      18439.0  20000497  1.224647e-16 -1.000000 -0.848644  0.528964   
214656      16752.0  20000280  8.660254e-01  0.500000  0.299363 -0.954139   
308712      41898.0  20000657  5.000000e-01 -0.866025 -0.988468  0.151428   
207210      16931.0  20000229  8.660254e-01  0.500000  0.988468  0.151428   
102226      55855.0  20000595 -8.660254e-01  0.500000 -0.651372 -0.758758   

              HR_sin        HR_cos    WK_sin    WK_cos  YR_2013  
87538   7

In [68]:
from torch.utils.data import DataLoader, Dataset
from mlp_utils import TaxiDataset
    
# Cast every feature column to float so the frames convert cleanly to float32
# tensors (the YR_2013 column shows up as True/False in the previews above).
X_train = X_train.astype(float)
X_test = X_test.astype(float)
X_test_public = test_features.astype(float) # This is what we can predict on

# Feature / target tensors. Targets are converted straight from the underlying
# NumPy array instead of going through an intermediate Python list.
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.float32)
X_test_public_tensor = torch.tensor(X_test_public.values, dtype=torch.float32)

# Wrap the tensors in the project's dataset class; the public test set has no targets.
train_dataset = TaxiDataset(X_train_tensor, y_train_tensor)
test_dataset = TaxiDataset(X_test_tensor, y_test_tensor)
test_public_dataset = TaxiDataset(X_test_public_tensor)

batch_size = 32

# Only the training loader shuffles. The evaluation and prediction loaders keep
# a fixed order so evaluation is deterministic and predictions stay aligned
# with the rows of the public test frame.
dataloader_train = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dataloader_test = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
dataloader_pred = DataLoader(test_public_dataset, batch_size=batch_size)

In [69]:
class MLP(nn.Module):
  '''
    Multilayer Perceptron for regression.

    A stack of Linear layers, each followed by a Sigmoid activation, and a
    final Linear output layer with no activation.

    Args:
      in_features: number of input features per sample (default 11).
      hidden_sizes: widths of the hidden layers, in order
        (default (128, 64, 32)). An empty sequence yields a single Linear layer.
      out_features: number of regression outputs per sample (default 1).

    Calling MLP() with no arguments builds the same architecture as the
    previous hard-coded version, with identical module indices in self.layers.
  '''
  def __init__(self, in_features=11, hidden_sizes=(128, 64, 32), out_features=1):
    super().__init__()
    modules = []
    previous_width = in_features
    # Hidden blocks: Linear followed by Sigmoid, chained width to width.
    for width in hidden_sizes:
      modules.append(nn.Linear(previous_width, width))
      modules.append(nn.Sigmoid())
      previous_width = width
    # Output layer has no activation because this is a regression head.
    modules.append(nn.Linear(previous_width, out_features))
    self.layers = nn.Sequential(*modules)

  def forward(self, x):
    '''
      Forward pass: apply the layer stack to x and return its output.
    '''
    return self.layers(x)

In [70]:
from mlp_utils import num_parameters

# Build the network and move it to the notebook-wide `device`, the same object
# that is later handed to `pipeline`, instead of calling .cuda() separately.
# This keeps the model placement tied to a single device selection.
mlp = MLP().to(device)

# Last expression of the cell: show the value returned by the project helper.
num_parameters(mlp)

11905

In [71]:
from mlp_utils import pipeline
import torch.optim as optim

# Mean squared error as the regression objective.
criterion = nn.MSELoss()

# Adam over all of the MLP's parameters, with weight decay.
optimizer = optim.Adam(mlp.parameters(), lr=0.001, weight_decay=1e-2)

# Run the project's pipeline helper for 30 epochs. Judging by the unpacked
# names it returns the training losses, a test loss and the predictions for
# the prediction loader (assumption - confirm against mlp_utils.pipeline).
train_losses, test_loss, predictions = pipeline(
    mlp,
    optimizer,
    dataloader_train=dataloader_train,
    dataloader_test=dataloader_test,
    dataloader_pred=dataloader_pred,
    device=device,
    criterion=criterion,
    epochs=30,
)

100%|██████████| 30/30 [00:25<00:00,  1.16it/s]

Training process has finished.





In [72]:
# Report the last recorded training loss next to the test loss.
final_train_loss = train_losses[-1]
print(f"Final Train Loss: {final_train_loss}")
print(f"Test Loss: {test_loss}")

Final Train Loss: 384.07492524588105
Test Loss: 388.2664176308683


In [73]:
from mlp_utils import test_prediction_to_csv

# Hand the predictions, the output file name and the filtered public test frame
# to the project helper (presumably it writes the submission CSV - confirm in mlp_utils).
submission_filename = "predicting_five_layer_mlp_call_type_a.csv"
test_prediction_to_csv(predictions, submission_filename, test_data)