In [1]:
import pandas as pd
import torch
import numpy as np
import torch.nn as nn
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, validation_curve
import lightgbm as lgb
import xgboost as xgb

import matplotlib.pyplot as plt

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
df = pd.read_csv("train.csv")

In [5]:
df["target"] = df["POLYLINE"].apply(lambda x : np.float32(max((x.count("[") - 2) *15, 0)))
len(df)

1710670

In [6]:
mean, std = df["target"].mean(), df["target"].std()
median = df["target"].median()
df = df[df["target"] < mean + 3 * std]
df = df[df["target"] >= 4]
len(df)

1656261

In [7]:
df = df.loc[df['MISSING_DATA'] == False]
df = df.loc[df['target'] > 0]
len(df)

1656255

In [8]:
from datetime import datetime
def parse_time(x):
    dt = datetime.utcfromtimestamp(x)
    return dt.year, dt.month, dt.day, dt.hour+1, dt.weekday() #monday: 0, sunday: 6

In [9]:
times = df['TIMESTAMP'].apply(parse_time)
df['year'] = [x for x,y,z,w,a in times]
df['month'] = [y for x,y,z,w,a in times]
df['day'] = [z for x,y,z,w,a in times]
df['hour'] = [w for x,y,z,w,a in times]
df['weekday'] = [a for x,y,z,w,a in times]
df.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,target,year,month,day,hour,weekday
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[...",330.0,2013,7,1,1,0
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[...",270.0,2013,7,1,1,0
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...",960.0,2013,7,1,1,0
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[...",630.0,2013,7,1,1,0
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...",420.0,2013,7,1,1,0


In [10]:
df_1 = df[ ((df['day'] == 14) & (df['month'] == 8)) 
          | ((df['day'] == 4) & (df['month'] == 10)) |
          ((df['day'] == 9) & (df['month'] == 6)) |
          ((df['day'] == 24) & (df['month'] == 4)) |
          ((df['day'] == 31) & (df['month'] == 4)) |
          ((df['day'] == 31) & (df['month'] == 10)) |
          ((df['day'] == 31) & (df['month'] == 11)) |
          ((df['day'] == 7) & (df['month'] == 12))
         ]
print(len(df_1))
#df_1 = df_1[df_1['hour'] >= 12]
#plt.plot(df_1_1['target'].tolist())
#df_1_1['target'].describe()
df_1 = df_1.assign(DAY_TYPE = 'A')

29980


In [11]:
df_2 = df[ ((df['weekday'] <= 4)) ]
df_2 = df_2[ ((df_2['month'] == 9) | (df_2['month'] == 10)) ]
df_2 = df_2[df_2['hour'] >= 8]
df_2 = df_2[df_2['hour'] <= 10]
df_2 = df_2.assign(DAY_TYPE = 'B')
len(df_2)

38945

In [12]:
df_3 = df[ ((df['weekday'] <= 4)) ]
df_3 = df_3[ ((df_3['month'] == 5) | (df_3['month'] == 6)) ]
df_3 = df_3[df_3['hour'] >= 17]
df_3 = df_3[df_3['hour'] <= 19]
df_3 = df_3.assign(DAY_TYPE = 'C')
len(df_3)

31819

In [13]:
df_4 = df[ ((df['weekday'] == 5)) ]
df_4 = df_4[df_4['hour'] >= 3]
df_4 = df_4[df_4['hour'] <= 5]
df_4 = df_4.assign(DAY_TYPE = 'D')
len(df_4)

42472

In [14]:
df_5 = df[ ((df['day'] >= 20) & (df['month'] == 12) & (df['day'] <= 23)) ]
df_5 = df_5.assign(DAY_TYPE = 'E')
len(df_5)

22172

In [15]:
df_train = pd.concat([df_1, df_2, df_3, df_4, df_5], axis = 0)
len(df_train)

165388

In [16]:
df = df_train

In [17]:
df = df.drop(columns=['TRIP_ID','MISSING_DATA'])
df.head()

Unnamed: 0,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,POLYLINE,target,year,month,day,hour,weekday
200188,B,,34.0,20000010,1376438529,A,"[[-8.615691,41.140629],[-8.615385,41.140764],[...",480.0,2013,8,14,1,2
200202,C,,,20000304,1376438909,A,"[[-8.594631,41.150664],[-8.595468,41.15016],[-...",300.0,2013,8,14,1,2
200204,B,,34.0,20000572,1376438510,A,"[[-8.61561,41.140827],[-8.615592,41.140809],[-...",420.0,2013,8,14,1,2
200210,C,,,20000570,1376439240,A,"[[-8.610435,41.153589],[-8.611047,41.153634],[...",465.0,2013,8,14,1,2
200223,B,,9.0,20000173,1376438918,A,"[[-8.606439,41.144679],[-8.60643,41.14467],[-8...",450.0,2013,8,14,1,2


In [102]:
df['month'] = df['month'] + 33
df['weekday'] = df['weekday'] + 25

In [103]:
df['month']

200188    41
200202    41
200204    41
200210    41
200223    41
          ..
851041    45
854954    45
860307    45
866700    45
978668    45
Name: month, Length: 165388, dtype: int64

In [104]:
vocab_size = len(df['month'].unique()) + len(df['weekday'].unique()) + len(df['hour'].unique()) + len(df['target'].unique())
vocab_size

227

In [105]:
vocabs = set()
for s in df['month'].unique():
    vocabs.add(s)
for s in df['weekday'].unique():
    vocabs.add(s)
for s in df['hour'].unique():
    vocabs.add(s)
for s in df['target'].unique():
    vocabs.add(s)
v_s = sorted(list(vocabs))
v_to_ix = { ch:i for i,ch in enumerate(vocabs) }
ix_to_v = { i:ch for i,ch in enumerate(vocabs) }

In [106]:
v_num = []
for ind, row in df.iterrows():
    newS = [v_to_ix[row['month']], v_to_ix[row['weekday']], v_to_ix[row['hour']], v_to_ix[row['target']] ]
    v_num.append(newS)

In [107]:
tensors = []
for s in v_num:
    tensors.append(torch.tensor(s).to(device))

In [119]:
class LSTM(nn.Module):
    def __init__(self, input_size, embedding_size, output_size, hidden_size):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(input_size=input_size, hidden_size=hidden_size)
        self.decoder = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(p = 0.25)
    
    def forward(self, input_seq, hidden_state):
        embedding = self.embedding(input_seq)
        output, hidden_state = self.rnn(embedding, hidden_state)
        output = self.decoder(output)
        output = self.dropout(output)
        return output, (hidden_state[0].detach(), hidden_state[1].detach())

In [123]:
model = LSTM(input_size=vocab_size, embedding_size=vocab_size, output_size=vocab_size, hidden_size=256).to(device)
#loss_fn = nn.functional.nll_loss
loss_fn = nn.CrossEntropyLoss()


In [124]:
from tqdm import tqdm

In [125]:
losses = []
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
for i_epoch in tqdm(range(10)):
    
    running_loss = 0
    for data in tensors:

        hidden_state = None
        input_seq = data[:3]
        target_seq = data[1:]

        # forward pass
        output, _ = model(input_seq, hidden_state)

        # compute loss
        loss = loss_fn(torch.squeeze(output), torch.squeeze(target_seq))
        running_loss += loss.item()

        # compute gradients and take optimizer step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    losses.append(running_loss)
    # print loss
    if i_epoch % 1 == 0:
        print('Loss: ', running_loss)

 10%|█         | 1/10 [04:24<39:42, 264.75s/it]

Epoch:  0
Loss:  510238.04800879955


 20%|██        | 2/10 [08:49<35:19, 264.94s/it]

Epoch:  1
Loss:  474628.75629496574


 30%|███       | 3/10 [13:15<30:58, 265.45s/it]

Epoch:  2
Loss:  464685.8612962961


 40%|████      | 4/10 [17:41<26:31, 265.33s/it]

Epoch:  3
Loss:  460017.9900778532


 50%|█████     | 5/10 [22:06<22:06, 265.31s/it]

Epoch:  4
Loss:  457500.83774638176


 60%|██████    | 6/10 [26:31<17:41, 265.44s/it]

Epoch:  5
Loss:  455201.5939184427


 70%|███████   | 7/10 [30:56<13:15, 265.11s/it]

Epoch:  6
Loss:  453625.3218514919


 70%|███████   | 7/10 [32:32<13:56, 278.95s/it]


KeyboardInterrupt: 

In [126]:
torch.save(model.state_dict(), 'lstm_2')

In [127]:
#model = torch.load('lstm_1')

In [128]:
#df.to_csv('cleaned.csv',index=False)

In [129]:
df_test = pd.read_csv("test_public.csv")

In [130]:
times = df_test['TIMESTAMP'].apply(parse_time)
df_test['year'] = [x for x,y,z,w,a in times]
df_test['month'] = [y for x,y,z,w,a in times]
df_test['day'] = [z for x,y,z,w,a in times]
df_test['hour'] = [w for x,y,z,w,a in times]
df_test['weekday'] = [a for x,y,z,w,a in times]
df_test.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,year,month,day,hour,weekday
0,T1,B,,15.0,20000542,1408039037,A,False,2014,8,14,18,3
1,T2,B,,57.0,20000108,1408038611,A,False,2014,8,14,18,3
2,T3,B,,15.0,20000370,1408038568,A,False,2014,8,14,18,3
3,T4,B,,53.0,20000492,1408039090,A,False,2014,8,14,18,3
4,T5,B,,18.0,20000621,1408039177,A,False,2014,8,14,18,3


In [131]:
len(df_test)

320

In [132]:
df_test['month'] = df_test['month'] + 33
df_test['weekday'] = df_test['weekday'] + 25

In [133]:
v_num1 = []
for ind, row in df_test.iterrows():
    newS = [v_to_ix[row['month']], v_to_ix[row['weekday']], v_to_ix[row['hour']] ]
    v_num1.append(newS)
tensors_test = []
for s in v_num1:
    tensors_test.append(torch.tensor(s).to(device))

In [134]:
tensors_test[0]

tensor([40, 27, 17], device='cuda:0')

In [135]:
preds = []
for t in tensors_test:
    hidden_init = None
    output, _ = model(t, hidden_init)
    output = output[-1]
    output = nn.functional.softmax(output).cpu().detach().numpy()
    prediction = np.argmax(output)
    preds.append(ix_to_v[prediction])

In [136]:
len(preds)

320

In [137]:
output_csv = pd.read_csv("sampleSubmission.csv")
output_csv['TRAVEL_TIME'] = preds
#output_csv['TRAVEL_TIME'] = preds
output_csv.tail()
output_csv.to_csv("my_pred_lstm.csv", index=None)


In [138]:
sum(p.numel() for p in model.parameters())

606508