In [1]:
import torch
import torch.nn as nn
import torchvision
import pandas as pd
import numpy as np
import zipfile
import io
import pandas as pd
import math
from IPython.display import display
from collections import defaultdict

zipped_data_path = "../data/clean_data/class-competition-not-one-hot-encoders.zip"

dataframes = defaultdict(pd.DataFrame)
with zipfile.ZipFile(zipped_data_path, "r") as zipped:
    for filename in zipped.namelist():
        if filename.endswith(".csv"):
            with zipped.open(filename) as f:
                dataframes.update({ filename : pd.read_csv(io.TextIOWrapper(f)) })

                # Lets take a look at the files
                print(f"FILE: {filename}")
                # If you want to see file info uncomment this:
                # display(dataframes[filename].info())
                # display(dataframes[filename].head())

train_data = dataframes["train.csv"]
test_data  = dataframes["test_public.csv"]

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

FILE: train.csv
FILE: test_public.csv
cuda:0


In [8]:
ALL_FEATURES = ['TRIP_ID', 'ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID', 'MISSING_DATA', 'TIMESTAMP'
 'POLYLINE', 'TRAVEL_TIME', 'YR', 'MON', 'DAY', 'HR', 'WK', 'CALL_TYPE_A', 'CALL_TYPE_B', 'CALL_TYPE_C']

# I just want to train on a couple features
FEATURES_SUITED_FOR_ESTIMATION = ['TAXI_ID', 'TRAVEL_TIME', 'TIMESTAMP', 'YR', 'MON', 'DAY', 'HR', 'WK', 'CALL_TYPE_A', 'CALL_TYPE_B', 'CALL_TYPE_C']

X = train_data.loc[:, train_data.columns.isin(FEATURES_SUITED_FOR_ESTIMATION)]

test_features = test_data.loc[:, test_data.columns.isin(FEATURES_SUITED_FOR_ESTIMATION)]

In [9]:
taxi_ids = sorted(list(set(X['TAXI_ID'].unique())))
num_taxi_ids = len(taxi_ids)

# id to index and index to id maps
taxi_id_to_ix = { id:i for i, id in enumerate(taxi_ids) }

X['TAXI_ID'] = X['TAXI_ID'].map(taxi_id_to_ix)
test_features['TAXI_ID'] = test_features['TAXI_ID'].map(taxi_id_to_ix)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['TAXI_ID'] = X['TAXI_ID'].map(taxi_id_to_ix)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_features['TAXI_ID'] = test_features['TAXI_ID'].map(taxi_id_to_ix)


In [27]:
data = X['TAXI_ID'].to_list()
travel_time = X['TRAVEL_TIME'].to_list()
data = torch.tensor(data).to(device)
travel_time = torch.tensor(travel_time, dtype=torch.float).to(device)

test_data = test_features['TAXI_ID'].to_list()
test_data = torch.tensor(test_data).to(device)
test_data = test_data.unsqueeze(0)

In [15]:
class RNN(nn.Module):
    def __init__(self, input_size, embedding_size, output_size, hidden_size):
        super(RNN, self).__init__()
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(input_size=input_size, hidden_size=hidden_size)
        self.decoder = nn.Linear(hidden_size, output_size)
        #self.softmax = nn.Softmax()
    
    def forward(self, input_seq, hidden_state):
        embedding = self.embedding(input_seq)
        output, hidden_state = self.rnn(embedding, hidden_state)
        output = self.decoder(output)
        #output = self.softmax(output)
        return output, (hidden_state[0].detach(), hidden_state[1].detach())

In [25]:
model = RNN(input_size=num_taxi_ids, embedding_size=num_taxi_ids, output_size=1, hidden_size=100).to(device)
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [26]:
epochs = 10
sequence_len = 1000
nth_sequence = 100

for i_epoch in range(1, epochs+1):
        
    n = 0
    running_loss = 0
    
    for i in range(sequence_len,len(data)-1, nth_sequence):
        hidden_state = None
        input_seq = data[i-sequence_len : i]
        target_seq = travel_time[i-sequence_len+1 : i+1]
        
        # forward pass
        output, _ = model(input_seq, hidden_state)
        
        # compute loss
        loss = loss_fn(torch.squeeze(output), torch.squeeze(target_seq))
        running_loss += loss.item()
        n += 1
        
        # compute gradients and take optimizer step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    # print loss after every epoch
    print("Epoch: {0} \t Loss: {1:.8f}".format(i_epoch, math.sqrt(running_loss/n)))

Epoch: 1 	 Loss: 927.40137336
Epoch: 2 	 Loss: 823.81673755
Epoch: 3 	 Loss: 745.44166202
Epoch: 4 	 Loss: 699.49922783
Epoch: 5 	 Loss: 685.72254272
Epoch: 6 	 Loss: 684.89187497
Epoch: 7 	 Loss: 684.87030508
Epoch: 8 	 Loss: 684.65759902
Epoch: 9 	 Loss: 683.80632996
Epoch: 10 	 Loss: 681.97024941


In [28]:
with torch.no_grad():
    predicted_output, _ = model(test_data, None)

# Convert the predicted output to a numpy array
predictions = predicted_output.cpu().numpy()

In [29]:
predictions

array([[[588.9944 ],
        [589.1085 ],
        [588.58716],
        [586.09094],
        [589.2667 ],
        [589.47687],
        [578.4685 ],
        [589.48627],
        [584.63794],
        [589.4938 ],
        [589.2628 ],
        [587.327  ],
        [589.444  ],
        [589.47455],
        [589.1253 ],
        [588.0519 ],
        [589.4844 ],
        [589.3273 ],
        [589.369  ],
        [589.37683],
        [589.47266],
        [586.4623 ],
        [589.4935 ],
        [589.46484],
        [589.4825 ],
        [589.4445 ],
        [580.09216],
        [589.49365],
        [586.6155 ],
        [587.43225],
        [588.2897 ],
        [589.48846],
        [588.188  ],
        [575.5845 ],
        [589.4712 ],
        [588.2638 ],
        [589.4618 ],
        [589.49414],
        [588.8816 ],
        [585.5627 ],
        [589.4916 ],
        [589.49384],
        [587.68274],
        [589.4897 ],
        [588.90454],
        [571.3799 ],
        [589.3508 ],
        [589.