In [11]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import zipfile
import io
import pandas as pd
from IPython.display import display
from collections import defaultdict

# Archive holding the cleaned competition CSVs (path is relative to this notebook).
zipped_data_path = "../data/clean_data/class-competition-not-one-hot-encoders.zip"

# Every CSV found in the archive, keyed by its member name inside the archive.
dataframes = defaultdict(pd.DataFrame)
# The handle is called `archive`, not `zip`: a notebook cell runs at module
# scope, so binding `zip` here would shadow the builtin for all later cells.
with zipfile.ZipFile(zipped_data_path, "r") as archive:
    for filename in archive.namelist():
        if not filename.endswith(".csv"):
            continue
        with archive.open(filename) as f:
            # Archive members are opened as binary streams; wrap for text parsing.
            dataframes[filename] = pd.read_csv(io.TextIOWrapper(f))

            # Let's take a look at the files
            print(f"FILE: {filename}")
            # If you want to see file info uncomment this:
            # display(dataframes[filename].info())
            # display(dataframes[filename].head())

# `dataframes` is a defaultdict, so indexing a missing key would silently
# return an empty DataFrame. Fail loudly here instead of much later.
for required in ("train.csv", "test_public.csv"):
    if required not in dataframes:
        raise KeyError(f"{required!r} was not found in {zipped_data_path!r}")

train_data = dataframes["train.csv"]
test_data  = dataframes["test_public.csv"]

# Prefer the first CUDA device when one is available, otherwise run on CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

FILE: train.csv
FILE: test_public.csv
cuda:0


In [12]:
from sklearn.model_selection import train_test_split

# Column names of the cleaned training frame, kept for reference.
# The comma after 'TIMESTAMP' matters: without it Python concatenates the two
# adjacent string literals into the single bogus name 'TIMESTAMPPOLYLINE'.
ALL_FEATURES = ['TRIP_ID', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID', 'MISSING_DATA', 'TIMESTAMP',
 'POLYLINE', 'TRAVEL_TIME', 'YR', 'MON', 'DAY', 'HR', 'WK', 'CALL_TYPE_A', 'CALL_TYPE_B', 'CALL_TYPE_C']

# I just want to train on a couple features
# ('TRAVEL_TIME' is the target; it is dropped from X before this filter is applied.)
FEATURES_SUITED_FOR_ESTIMATION = ['TRAVEL_TIME', 'TIMESTAMP', 'YR', 'MON', 'DAY', 'HR', 'WK', 'CALL_TYPE_A', 'CALL_TYPE_B', 'CALL_TYPE_C']

train_data_sample = train_data.sample(frac=0.2, random_state=420) # frac is used to control percentage of train data used

# Inputs: every whitelisted column except the target.
X = train_data_sample.drop("TRAVEL_TIME", axis=1)
X = X.loc[:, X.columns.isin(FEATURES_SUITED_FOR_ESTIMATION)]
# Target: trip duration.
y = train_data_sample["TRAVEL_TIME"]

# Hold out 20% of the sampled rows for local evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=420)

# Same column whitelist applied to the public test set.
# NOTE(review): this keeps test_data's own column order and assumes it matches
# X's columns — confirm before trusting predictions.
test_features = test_data.loc[:, test_data.columns.isin(FEATURES_SUITED_FOR_ESTIMATION)]

In [13]:
# Preview the first rows of the selected feature matrix (target column excluded).
X.head()

Unnamed: 0,TIMESTAMP,YR,MON,DAY,HR,WK,CALL_TYPE_A,CALL_TYPE_B,CALL_TYPE_C
958889,1390572538,2014,1,24,6,4,False,True,False
181477,1376062304,2013,8,9,8,4,True,False,False
525127,1382430060,2013,10,22,1,1,False,True,False
1006306,1391537128,2014,2,4,10,1,True,False,False
617077,1384033267,2013,11,9,13,5,False,False,True


In [14]:
from torch.utils.data import DataLoader, Dataset
from mlp_utils import TaxiDataset
    
# Cast every feature column (including the boolean CALL_TYPE_* flags) to float
# so the frames convert cleanly to float32 tensors.
# NOTE(review): features are fed to the network unscaled (TIMESTAMP is ~1e9);
# consider standardising with training-set statistics — confirm with the
# model owner before changing.
X_train = X_train.astype(float)
X_test = X_test.astype(float)
X_test_public = test_features.astype(float) # This is what we can predict on

X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
# Convert targets straight from the underlying array; going through a Python
# list first only adds an extra copy.
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.float32)
X_test_public_tensor = torch.tensor(X_test_public.values, dtype=torch.float32)

# TaxiDataset comes from mlp_utils; the public test set has no targets, so it
# is constructed from features only.
train_dataset = TaxiDataset(X_train_tensor, y_train_tensor)
test_dataset = TaxiDataset(X_test_tensor, y_test_tensor)
test_public_dataset = TaxiDataset(X_test_public_tensor)

batch_size = 32

# Only the training loader is shuffled. Evaluation and prediction loaders keep
# a fixed order so results are reproducible and predictions stay aligned with
# the rows of the input frame.
dataloader_train = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dataloader_test = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
dataloader_pred = DataLoader(test_public_dataset, batch_size=batch_size)

In [15]:
class MLP(nn.Module):
  '''
    Multilayer Perceptron for regression.

    A stack of fully connected layers, each followed by a sigmoid, ending in
    a linear output layer with no activation.

    Args:
      in_features: number of input features per sample. Must equal the
        number of columns in the feature matrix fed to the model.
      hidden_sizes: width of each hidden layer, in order.
      out_features: number of regression outputs per sample.

    The defaults reproduce the original fixed architecture
    (9 -> 256 -> 128 -> 64 -> 32 -> 1), including the module indices inside
    ``self.layers``, so ``MLP()`` and existing state dicts keep working.
  '''
  def __init__(self, in_features=9, hidden_sizes=(256, 128, 64, 32), out_features=1):
    super().__init__()
    modules = []
    width = in_features
    # Layers are created in input-to-output order so parameter initialisation
    # consumes the RNG in the same sequence as a hand-written Sequential.
    for hidden in hidden_sizes:
      modules.append(nn.Linear(width, hidden))
      modules.append(nn.Sigmoid())
      width = hidden
    # Final projection has no activation: the output is an unbounded real value.
    modules.append(nn.Linear(width, out_features))
    self.layers = nn.Sequential(*modules)

  def forward(self, x):
    '''
      Forward pass.

      Args:
        x: tensor whose last dimension has size ``in_features``.

      Returns:
        Tensor with the same leading dimensions and a last dimension of
        size ``out_features``.
    '''
    return self.layers(x)

In [16]:
from mlp_utils import num_parameters

# Build the network and place it on `device`, the same device object that is
# later handed to the training pipeline, so model and batches cannot end up
# on different devices. On a CPU-only machine this is a no-op move.
mlp = MLP().to(device)

# Displayed as the cell output: parameter count reported by the mlp_utils helper.
num_parameters(mlp)

45825

In [17]:
from mlp_utils import pipeline
import torch.optim as optim

# Objective: mean squared error between predictions and targets.
criterion = nn.MSELoss()

# Plain SGD; weight_decay applies an L2 penalty to the parameters.
optimizer = optim.SGD(
    mlp.parameters(),
    lr=0.001,
    weight_decay=1e-2,
)

# `pipeline` is the mlp_utils helper. Here it receives the model, optimizer,
# all three loaders, the device and the loss, and returns the training-loss
# history, a held-out loss, and predictions for the prediction loader.
train_losses, test_loss, predictions = pipeline(
    mlp,
    optimizer,
    dataloader_train=dataloader_train,
    dataloader_test=dataloader_test,
    dataloader_pred=dataloader_pred,
    device=device,
    criterion=criterion,
)

100%|██████████| 10/10 [03:07<00:00, 18.77s/it]


Training process has finished.


In [18]:
# Report the last recorded training loss next to the held-out loss,
# one per line.
print(
    f"Final Train Loss: {train_losses[-1]}",
    f"Test Loss: {test_loss}",
    sep="\n",
)

Final Train Loss: 693.543380563071
Test Loss: 687.854970199424


In [19]:
from mlp_utils import test_prediction_to_csv

# Hand the predictions, a run name and the public test frame to the mlp_utils
# export helper.
# NOTE(review): presumably the second argument becomes the output file name
# and test_data supplies the row identifiers — confirm in mlp_utils.
test_prediction_to_csv(predictions, "predicting_five_layer_mlp_other_features", test_data)