In [1]:
import torch
import torch.nn as nn
import torchvision
import pandas as pd
import numpy as np
import zipfile
import io
import pandas as pd
import math
from IPython.display import display
from collections import defaultdict

zipped_data_path = "../data/clean_data/class-competition-not-one-hot-encoders.zip"

dataframes = defaultdict(pd.DataFrame)
with zipfile.ZipFile(zipped_data_path, "r") as zipped:
    for filename in zipped.namelist():
        if filename.endswith(".csv"):
            with zipped.open(filename) as f:
                dataframes.update({ filename : pd.read_csv(io.TextIOWrapper(f)) })

                # Lets take a look at the files
                print(f"FILE: {filename}")
                # If you want to see file info uncomment this:
                # display(dataframes[filename].info())
                # display(dataframes[filename].head())

train_data = dataframes["train.csv"]
test_data  = dataframes["test_public.csv"]

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

FILE: train.csv
FILE: test_public.csv
cuda:0


In [2]:
ALL_FEATURES = ['TRIP_ID', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID', 'MISSING_DATA', 'TIMESTAMP'
 'POLYLINE', 'TRAVEL_TIME', 'YR', 'MON', 'DAY', 'HR', 'WK', 'CALL_TYPE_A', 'CALL_TYPE_B', 'CALL_TYPE_C']

# I just want to train on a couple features
FEATURES_SUITED_FOR_ESTIMATION = ['TAXI_ID', 'TRAVEL_TIME', 'TIMESTAMP', 'YR', 'MON', 'DAY', 'HR', 'WK', 'CALL_TYPE_A', 'CALL_TYPE_B', 'CALL_TYPE_C']

X = train_data.loc[:, train_data.columns.isin(FEATURES_SUITED_FOR_ESTIMATION)]
y = train_data["TRAVEL_TIME"]

test_features = test_data.loc[:, test_data.columns.isin(FEATURES_SUITED_FOR_ESTIMATION)]

X = X.drop(['TRAVEL_TIME', 'TIMESTAMP'], axis=1)
test_features = test_features.drop(['TIMESTAMP'], axis=1)
test_features.head()

Unnamed: 0,TAXI_ID,YR,MON,DAY,HR,WK,CALL_TYPE_A,CALL_TYPE_B,CALL_TYPE_C
0,20000542,2014,8,14,10,3,False,True,False
1,20000108,2014,8,14,10,3,False,True,False
2,20000370,2014,8,14,10,3,False,True,False
3,20000492,2014,8,14,10,3,False,True,False
4,20000621,2014,8,14,10,3,False,True,False


In [3]:
taxi_ids = sorted(list(set(X['TAXI_ID'].unique())))
# original_calls = sorted(list(set(X['ORIGIN_CALL'].unique())))
# data_size, unique_taxi_id_size, unique_origin_call_size = len(X), len(taxi_ids), len(original_calls)
data_size, unique_taxi_id_size = len(X), len(taxi_ids)
print("----------------------------------------")
# print("Data has {} data points, {} unique taxi ids, {} unique origin calls".format(data_size, unique_taxi_id_size, unique_origin_call_size))
print("Data has {} data points, {} unique taxi ids".format(data_size, unique_taxi_id_size))
print("----------------------------------------")

# id to index and index to id maps
taxi_id_to_ix = { id:i for i, id in enumerate(taxi_ids) }
# origin_call_to_ix = { id:i for i, id in enumerate(original_calls) }

X['TAXI_ID'] = X['TAXI_ID'].map(taxi_id_to_ix)
test_features['TAXI_ID'] = test_features['TAXI_ID'].map(taxi_id_to_ix)

----------------------------------------
Data has 1710660 data points, 448 unique taxi ids
----------------------------------------


In [4]:
from torch.utils.data import DataLoader, Dataset
from rnn_utils import TaxiDataset

X = X.astype(int)
test_features = test_features.astype(int) # This is what we can predict on

X_taxi_id_tensor = torch.tensor(X['TAXI_ID'].values, dtype=torch.int64)
X_other_features_tensor = torch.tensor(X[['YR', 'MON', 'DAY', 'HR', 'WK', 'CALL_TYPE_A', 'CALL_TYPE_B', 'CALL_TYPE_C']].values, dtype=torch.int64)
y_tensor = torch.tensor(y.tolist(), dtype=torch.int64)
X_taxi_id_test_public_tensor = torch.tensor(test_features['TAXI_ID'].values, dtype=torch.int64)
X_other_features_test_public_tensor = torch.tensor(test_features[['YR', 'MON', 'DAY', 'HR', 'WK', 'CALL_TYPE_A', 'CALL_TYPE_B', 'CALL_TYPE_C']].values, dtype=torch.int64)

X_taxi_id_dataset = TaxiDataset(X_taxi_id_tensor, y_tensor)
X_other_features_dataset = TaxiDataset(X_other_features_tensor, y_tensor)
X_taxi_id_test_public_dataset = TaxiDataset(X_taxi_id_test_public_tensor)
X_other_features_test_public_dataset = TaxiDataset(X_other_features_test_public_tensor)

batch_size = 32

dataloader_taxi_train = DataLoader(X_taxi_id_dataset, batch_size=batch_size)
dataloader_other_train = DataLoader(X_other_features_dataset, batch_size=batch_size)
dataloader_taxi_pred = DataLoader(X_taxi_id_test_public_dataset, batch_size=batch_size)
dataloader_other_pred = DataLoader(X_other_features_test_public_dataset, batch_size=batch_size)

In [5]:
class RNN(nn.Module):
    def __init__(self, input_size, embedding_size, other_features_size, output_size, hidden_size):
        super(RNN, self).__init__()
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(input_size=(embedding_size + other_features_size), hidden_size=hidden_size, batch_first=True)
        self.decoder = nn.Linear(hidden_size, output_size)
        # self.softmax = nn.Softmax()

    def forward(self, input_seq, other_features, hidden_state):
        embedding = self.embedding(input_seq)
        combined_input = torch.cat((embedding, other_features), dim=1)
        output, hidden_state = self.rnn(combined_input, hidden_state)
        output = self.decoder(output)
        # output = self.softmax(output)
        return output, (hidden_state[0].detach(), hidden_state[1].detach())

In [6]:
from rnn_utils import num_parameters

rnn = RNN(input_size=unique_taxi_id_size, embedding_size=unique_taxi_id_size, other_features_size=8, output_size=1, hidden_size=200) # 8 additional features, output to a single value
if torch.cuda.is_available():
    rnn.cuda()
 
num_parameters(rnn)

import torch.optim as optim

criterion = nn.MSELoss()
optimizer = optim.Adam(rnn.parameters(), lr=1e-4)

In [7]:
epochs = 10

for i_epoch in range(1, epochs+1):
        
    n = 0
    running_loss = 0
    
    for (taxi_id_seq_batch, labels_batch), other_features_seq_batch in zip(dataloader_taxi_train, dataloader_other_train):
        optimizer.zero_grad()

        taxi_id_seq = taxi_id_seq_batch.to(device)  # Access the first (and only) element of the list
        other_features_seq = other_features_seq_batch[0].to(device)  # Access the first (and only) element of the list
        labels = labels_batch.to(device)
        
        hidden_state = None
        
        # Forward pass
        outputs, hidden_state = rnn(taxi_id_seq, other_features_seq, hidden_state)
        
        # Calculate the loss
        loss = criterion(outputs.squeeze(), labels.float())
        running_loss += loss.item()
        n += 1
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
    # print loss after every epoch
    print("Epoch: {0} \t Loss: {1:.8f}".format(i_epoch, math.sqrt(running_loss/n)))

Epoch: 1 	 Loss: 903.13415014
Epoch: 2 	 Loss: 760.99689677
Epoch: 3 	 Loss: 695.50881834
Epoch: 4 	 Loss: 685.27308394
Epoch: 5 	 Loss: 684.70472143
Epoch: 6 	 Loss: 684.49031175
Epoch: 7 	 Loss: 684.42446881
Epoch: 8 	 Loss: 684.36454175
Epoch: 9 	 Loss: 684.30743713
Epoch: 10 	 Loss: 684.25262404


In [14]:
predictions = []

rnn.eval()
with torch.no_grad():
    for taxi_id_seq_batch, other_features_seq_batch in zip(dataloader_taxi_pred, dataloader_other_pred):
            taxi_id_seq = taxi_id_seq_batch.to(device)  # Access the first (and only) element of the list
            other_features_seq = other_features_seq_batch.to(device)  # Access the first (and only) element of the list
            
            hidden_state = None
            
            # Forward pass
            outputs, hidden_state = rnn(taxi_id_seq, other_features_seq, hidden_state)

            # Append predictions to the list
            predictions.append(outputs.cpu().numpy())
            
predictions = np.concatenate(predictions)

In [16]:
from rnn_utils import test_prediction_to_csv
test_prediction_to_csv(predictions, "LSTM_200_lr-1e-4_Adam.csv", test_data)