In [1]:
import torch
import torchvision
import pandas as pd
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torch.optim as optim
import tqdm 
import copy
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from datetime import timedelta
from datetime import datetime

In [2]:
# Load the raw training trips; the path is relative to the notebook's working directory.
train_csv_path = "kaggle_data/train.csv"
df_tr = pd.read_csv(train_csv_path)
def polyline_to_trip_duration(polyline):
    """Derive a trip duration from a serialized POLYLINE string.

    The number of "[" characters is counted, reduced by two, clamped at zero
    and multiplied by 15.

    Args:
        polyline: String form of the trajectory (anything with ``.count``).

    Returns:
        Non-negative integer duration, always a multiple of 15.
    """
    # NOTE(review): presumably one "[" wraps the whole list and each GPS fix
    # adds one more, with fixes 15 s apart - confirm against the dataset notes.
    bracket_count = polyline.count("[")
    interval_count = bracket_count - 2
    if interval_count < 0:
        interval_count = 0
    return interval_count * 15

# Regression target: one duration value per POLYLINE string.
trip_durations = df_tr["POLYLINE"].apply(polyline_to_trip_duration)
df_tr["LEN"] = trip_durations

def parse_time(x):
    """Break a row's TIMESTAMP into calendar components.

    Args:
        x: Row-like object exposing a "TIMESTAMP" entry that
            ``datetime.fromtimestamp`` accepts.

    Returns:
        Tuple ``(year, month, day, hour, weekday)`` where ``weekday`` is the
        value of ``datetime.weekday()``.
    """
    # NOTE(review): fromtimestamp() converts using the machine's local
    # timezone, so hour/day features depend on where the notebook runs.
    moment = datetime.fromtimestamp(x["TIMESTAMP"])
    return (moment.year, moment.month, moment.day, moment.hour, moment.weekday())

# Expand each row's TIMESTAMP into five calendar columns via parse_time.
time_source = df_tr[["TIMESTAMP"]]
time_parts = time_source.apply(parse_time, axis=1, result_type="expand")
df_tr[["YR", "MON", "DAY", "HR", "WK"]] = time_parts

In [3]:
# Keep only the ORIGIN_CALL ids that occur more than 1000 times; these are the
# ids that later get their own one-hot column.
# The frequency table is computed once and reused for both the mask and the lookup.
call_counts = df_tr['ORIGIN_CALL'].value_counts()
values = call_counts > 1000
ogcalls = call_counts[values].index

In [9]:
import holidays
from datetime import date 

In [10]:
# Holiday calendar built from the `holidays` package (PT entry); it is queried
# below with `in` on date objects.
ptholi = holidays.PT()


def ptholiday(x):
    """Encode how close a timestamp's date is to a holiday in `ptholi`.

    Args:
        x: Timestamp accepted by ``datetime.fromtimestamp``.

    Returns:
        2 when the date itself is in `ptholi`, 1 when the following day is,
        otherwise 0.
    """
    # NOTE(review): the date is derived in the machine's local timezone.
    current_day = datetime.fromtimestamp(x).date()
    if current_day in ptholi:
        return 2
    following_day = current_day + timedelta(days=1)
    return 1 if following_day in ptholi else 0

In [11]:
# Holiday indicator (0 / 1 / 2) for every training trip.
holiday_flags = df_tr['TIMESTAMP'].apply(ptholiday)
df_tr['holi'] = holiday_flags

In [12]:
def fillz(x):
    """Return the row's ORIGIN_STAND, or 0 when the row has a missing value.

    Args:
        x: Row-like pandas object supporting ``isna()`` and lookup by
            'ORIGIN_STAND'.

    Returns:
        0 if any entry of ``x`` is NA, otherwise ``x['ORIGIN_STAND']``.
    """
    has_missing = x.isna().any()
    return 0 if has_missing else x['ORIGIN_STAND']

In [13]:
# Replace missing stand ids with 0. This is the vectorised equivalent of
# applying `fillz` to every single-column row, without the per-row Python call.
df_tr['ORIGIN_STAND'] = df_tr['ORIGIN_STAND'].fillna(0)

In [14]:
# One-hot encode ORIGIN_CALL, restricted to the frequent ids in `ogcalls`.
# handle_unknown='ignore' makes every other id encode as an all-zero row.
call = list(ogcalls)
caenc = OneHotEncoder(handle_unknown='ignore')
caenc.fit(np.array(call).reshape(-1, 1))
call = caenc.categories_
# The encoder was fitted on a plain ndarray, so transform one as well;
# passing a DataFrame here triggers sklearn's feature-name mismatch warning.
catemp = pd.DataFrame(caenc.transform(df_tr[['ORIGIN_CALL']].to_numpy()).toarray())
catemp.columns = call

In [15]:
# One-hot encode ORIGIN_STAND over the fixed id range 1.0 .. 63.0.
# Any other value encodes as an all-zero row because of handle_unknown='ignore'.
ogc = [float(stand_id) for stand_id in range(1, 64)]
ogcenc = OneHotEncoder(handle_unknown='ignore')
ogcenc.fit(np.array(ogc).reshape(-1, 1))
ogc = ogcenc.categories_
# The encoder was fitted on a plain ndarray, so transform one as well;
# passing a DataFrame here triggers sklearn's feature-name mismatch warning.
otemp = pd.DataFrame(ogcenc.transform(df_tr[['ORIGIN_STAND']].to_numpy()).toarray())
otemp.columns = ogc

In [21]:
# One-hot encode the weekday column (WK); categories are learned from df_tr.
dayenc = OneHotEncoder().fit(df_tr[['WK']])
dats = dayenc.categories_
weekday_matrix = dayenc.transform(df_tr[['WK']]).toarray()
dtemp = pd.DataFrame(weekday_matrix)
dtemp.columns = dats

In [17]:
# One-hot encode the month column (MON); categories are learned from df_tr.
monenc = OneHotEncoder().fit(df_tr[['MON']])
mons = monenc.categories_
month_matrix = monenc.transform(df_tr[['MON']]).toarray()
mtemp = pd.DataFrame(month_matrix)
mtemp.columns = mons

In [18]:
# One-hot encode the hour column (HR); categories are learned from df_tr.
hrenc = OneHotEncoder().fit(df_tr[['HR']])
hr = hrenc.categories_
hour_matrix = hrenc.transform(df_tr[['HR']]).toarray()
htemp = pd.DataFrame(hour_matrix)
htemp.columns = hr

In [19]:
# One-hot encode the CALL_TYPE column; categories are learned from df_tr.
callenc = OneHotEncoder().fit(df_tr[['CALL_TYPE']])
calls = callenc.categories_
call_type_matrix = callenc.transform(df_tr[['CALL_TYPE']]).toarray()
ctemp = pd.DataFrame(call_type_matrix)
ctemp.columns = calls

In [22]:
# Assemble the design matrix column-wise: holiday flag first, then the one-hot
# blocks in the order call id, stand, weekday, month, hour, call type.
feature_frames = [df_tr.reset_index()[['holi']], catemp, otemp, dtemp, mtemp, htemp, ctemp]
X = pd.concat(feature_frames, axis=1)

In [23]:
# Convert features and target to float ndarrays.
# np.asarray accepts both a DataFrame and an ndarray, so re-running this cell
# after X has already been converted no longer fails on a missing .to_numpy().
X = np.asarray(X, dtype=float)
y = np.asarray(df_tr['LEN'], dtype=float)

In [24]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [38]:
from sklearn.model_selection import train_test_split
# 70/30 shuffled split. A fixed random_state makes the partition (and therefore
# every reported train/test metric) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, shuffle=True, random_state=42
)

# Move everything to the selected device as float32; targets are reshaped to a
# column so they line up with the model's (N, 1) output.
X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1).to(device)
X_test = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test = torch.tensor(y_test, dtype=torch.float32).reshape(-1, 1).to(device)

In [39]:
# Training-loop configuration.
n_epochs = 50      # passes over the training set
batch_size = 256   # rows per mini-batch
# Start offset of every mini-batch within X_train.
batch_start = torch.arange(start=0, end=len(X_train), step=batch_size)

In [40]:
# Bookkeeping containers: best checkpoint so far plus per-epoch loss histories.
save = dict(w=None, history=[], mse=np.inf)
best_weights = save['w']
best_mse = save['mse']  # starts at infinity
history_train, history_test = [], []

In [25]:
# Fully connected regressor: three Linear -> ReLU -> Dropout(0.3) stages followed
# by a single-output Linear head. Module order is kept exactly as before so the
# Sequential indices (and therefore state_dict keys) are unchanged.
# NOTE(review): 116 presumably equals the number of columns in X - confirm.
hidden_layers = []
for fan_in, fan_out in [(116, 256), (256, 64), (64, 16)]:
    hidden_layers += [nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(p=0.3)]
model = nn.Sequential(*hidden_layers, nn.Linear(16, 1)).to(device)

In [86]:
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)
# map_location remaps the stored tensors onto the active device, so a
# checkpoint saved on a GPU also loads on a CPU-only machine.
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
model.load_state_dict(torch.load('actual_final_weight.pt', map_location=device))

<All keys matched successfully>

In [87]:
# Load the public test trips; the path is relative to the notebook's working directory.
test_csv_path = 'kaggle_data/test_public.csv'
test = pd.read_csv(test_csv_path)

In [88]:
# Keep the trip ids as an independent frame: an explicit copy means adding the
# prediction column later does not raise SettingWithCopyWarning.
tid = test[['TRIP_ID']].copy()
# Expand TIMESTAMP into the same calendar columns used for training.
test[["YR", "MON", "DAY", "HR", "WK"]] = test[["TIMESTAMP"]].apply(parse_time, axis=1, result_type="expand")

In [89]:
# Holiday indicator (0 / 1 / 2) for every test trip.
test_holiday_flags = test['TIMESTAMP'].apply(ptholiday)
test['holi'] = test_holiday_flags

In [90]:
# Replace missing stand ids with 0. This is the vectorised equivalent of
# applying `fillz` to every single-column row, without the per-row Python call.
test['ORIGIN_STAND'] = test['ORIGIN_STAND'].fillna(0)

In [91]:
# Encode the test frame with the encoders fitted earlier in the notebook, so
# the column layout matches the training features.
call = caenc.categories_
# caenc and ogcenc were fitted on plain ndarrays, so they are given ndarrays
# here too (a DataFrame triggers sklearn's feature-name mismatch warning).
catest = pd.DataFrame(caenc.transform(test[['ORIGIN_CALL']].to_numpy()).toarray())
catest.columns = call
otest = pd.DataFrame(ogcenc.transform(test[['ORIGIN_STAND']].to_numpy()).toarray())
otest.columns = ogc
dtest = pd.DataFrame(dayenc.transform(test[['WK']]).toarray())
dtest.columns = dats
mtest = pd.DataFrame(monenc.transform(test[['MON']]).toarray())
mtest.columns = mons
htest = pd.DataFrame(hrenc.transform(test[['HR']]).toarray())
htest.columns = hr
ctest = pd.DataFrame(callenc.transform(test[['CALL_TYPE']]).toarray())
# Label the call-type block like every other block (this was missing before).
ctest.columns = calls

In [92]:
# Build the test design matrix in the same column order as the training one
# and move it to the model's device as a float32 tensor.
test_feature_frames = [test[['holi']], catest, otest, dtest, mtest, htest, ctest]
test_matrix = pd.concat(test_feature_frames, axis=1).to_numpy().astype(float)
test = torch.tensor(test_matrix, dtype=torch.float32).to(device)

In [93]:
# Inference: eval mode is enabled first, and gradient tracking is switched off
# for the forward pass.
model.eval()
with torch.no_grad():
    pred = model(test)

In [94]:
# Flatten the (N, 1) model output to a 1-D array on the host.
# torch.as_tensor accepts a tensor or an ndarray, so re-running this cell after
# `pred` has already been converted no longer raises.
pred = torch.as_tensor(pred).detach().cpu().numpy().reshape(-1)
# assign() returns a new frame, so nothing is written into a slice of `test`
# (the in-place column assignment raised SettingWithCopyWarning).
tid = tid.assign(TRAVEL_TIME=pred)
tid.to_csv('predictions.csv', index=False)

In [95]:
# Show the submission frame (TRIP_ID plus predicted TRAVEL_TIME) as the cell output.
tid

Unnamed: 0,TRIP_ID,TRAVEL_TIME
0,T1,716.851318
1,T2,718.458252
2,T3,716.851318
3,T4,703.663574
4,T5,698.842773
...,...,...
315,T323,676.373047
316,T324,695.800415
317,T325,750.569214
318,T326,676.373047
