In [1]:
import pandas as pd
import torch
import numpy as np
import torch.nn as nn
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, validation_curve
import lightgbm as lgb
import xgboost as xgb

import matplotlib.pyplot as plt

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
df = pd.read_csv("train.csv")

In [5]:
df["target"] = df["POLYLINE"].apply(lambda x : np.float32(max((x.count("[") - 2) *15, 0)))
len(df)

1710670

In [6]:
mean, std = df["target"].mean(), df["target"].std()
median = df["target"].median()
df = df[df["target"] < mean + 3 * std]
df = df[df["target"] >= 4]
len(df)

1656261

In [7]:
df = df.loc[df['MISSING_DATA'] == False]
df = df.loc[df['target'] > 0]
len(df)

1656255

In [8]:
from datetime import datetime
def parse_time(x):
    dt = datetime.utcfromtimestamp(x)
    return dt.year, dt.month, dt.day, dt.hour+1, dt.weekday() #monday: 0, sunday: 6

In [9]:
times = df['TIMESTAMP'].apply(parse_time)
df['year'] = [x for x,y,z,w,a in times]
df['month'] = [y for x,y,z,w,a in times]
df['day'] = [z for x,y,z,w,a in times]
df['hour'] = [w for x,y,z,w,a in times]
df['weekday'] = [a for x,y,z,w,a in times]
df.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,target,year,month,day,hour,weekday
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[...",330.0,2013,7,1,1,0
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[...",270.0,2013,7,1,1,0
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...",960.0,2013,7,1,1,0
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[...",630.0,2013,7,1,1,0
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...",420.0,2013,7,1,1,0


In [10]:
df_1 = df[ ((df['day'] == 14) & (df['month'] == 8)) 
          | ((df['day'] == 4) & (df['month'] == 10)) |
          ((df['day'] == 9) & (df['month'] == 6)) |
          ((df['day'] == 24) & (df['month'] == 4)) |
          ((df['day'] == 31) & (df['month'] == 4)) |
          ((df['day'] == 31) & (df['month'] == 10)) |
          ((df['day'] == 31) & (df['month'] == 11)) |
          ((df['day'] == 7) & (df['month'] == 12))
         ]
print(len(df_1))
#df_1 = df_1[df_1['hour'] >= 12]
#plt.plot(df_1_1['target'].tolist())
#df_1_1['target'].describe()
df_1 = df_1.assign(DAY_TYPE = 'A')

29980


In [11]:
df_2 = df[ ((df['weekday'] <= 4)) ]
df_2 = df_2[ ((df_2['month'] == 9) | (df_2['month'] == 10)) ]
df_2 = df_2[df_2['hour'] >= 8]
df_2 = df_2[df_2['hour'] <= 10]
df_2 = df_2.assign(DAY_TYPE = 'B')
len(df_2)

38945

In [12]:
df_3 = df[ ((df['weekday'] <= 4)) ]
df_3 = df_3[ ((df_3['month'] == 5) | (df_3['month'] == 6)) ]
df_3 = df_3[df_3['hour'] >= 17]
df_3 = df_3[df_3['hour'] <= 19]
df_3 = df_3.assign(DAY_TYPE = 'C')
len(df_3)

31819

In [13]:
df_4 = df[ ((df['weekday'] == 5)) ]
df_4 = df_4[df_4['hour'] >= 3]
df_4 = df_4[df_4['hour'] <= 5]
df_4 = df_4.assign(DAY_TYPE = 'D')
len(df_4)

42472

In [14]:
df_5 = df[ ((df['day'] >= 20) & (df['month'] == 12) & (df['day'] <= 23)) ]
df_5 = df_5.assign(DAY_TYPE = 'E')
len(df_5)

22172

In [15]:
df_train = pd.concat([df_1, df_2, df_3, df_4, df_5], axis = 0)
len(df_train)

165388

In [16]:
df = df_train

In [17]:
df = df.drop(columns=['TRIP_ID','MISSING_DATA'])
df.head()

Unnamed: 0,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,POLYLINE,target,year,month,day,hour,weekday
200188,B,,34.0,20000010,1376438529,A,"[[-8.615691,41.140629],[-8.615385,41.140764],[...",480.0,2013,8,14,1,2
200202,C,,,20000304,1376438909,A,"[[-8.594631,41.150664],[-8.595468,41.15016],[-...",300.0,2013,8,14,1,2
200204,B,,34.0,20000572,1376438510,A,"[[-8.61561,41.140827],[-8.615592,41.140809],[-...",420.0,2013,8,14,1,2
200210,C,,,20000570,1376439240,A,"[[-8.610435,41.153589],[-8.611047,41.153634],[...",465.0,2013,8,14,1,2
200223,B,,9.0,20000173,1376438918,A,"[[-8.606439,41.144679],[-8.60643,41.14467],[-8...",450.0,2013,8,14,1,2


In [18]:
df['TAXI_ID'] -= 20000000

In [19]:
df['quarterHr'] = ((df['hour']-1) / 6).astype(int)

In [20]:
df['call_A'] = df["CALL_TYPE"].apply(lambda x : int(x == 'A'))
df['call_B'] = df["CALL_TYPE"].apply(lambda x : int(x == 'B'))
df['call_C'] = df["CALL_TYPE"].apply(lambda x : int(x == 'C'))

In [21]:
df['day_A'] = df["DAY_TYPE"].apply(lambda x : int(x == 'A'))
df['day_B'] = df["DAY_TYPE"].apply(lambda x : int(x == 'B'))
df['day_C'] = df["DAY_TYPE"].apply(lambda x : int(x == 'C'))
df['day_D'] = df["DAY_TYPE"].apply(lambda x : int(x == 'D'))
df['day_E'] = df["DAY_TYPE"].apply(lambda x : int(x == 'E'))

train_id_mean = df['TAXI_ID'].mean()
train_id_std = df['TAXI_ID'].std()
df['TAXI_ID'] = (df['TAXI_ID'] - train_id_mean)/ train_id_std

In [22]:
df_slice = df[df['ORIGIN_STAND'].isna()]
stand_nan_mean = df_slice['target'].mean()
stand_nan_mean

718.4076538085938

In [23]:
stands = set()
stand_to_mean = {}
df_temp = df[~df['ORIGIN_STAND'].isna()]
for ind, r in df_temp.iterrows():
    stands.add(r['ORIGIN_STAND'])
for i in stands:
    stand_to_mean[i] = df[df['ORIGIN_STAND'] == i]['target'].mean()

In [24]:
def apply_mean_stand(stand):
    if pd.isna(stand):
        return stand_nan_mean
    else:
        return stand_to_mean[stand]

In [25]:
taxi_ids = set()
for ind, r in df.iterrows():
    taxi_ids.add(r['TAXI_ID'])
taxi_id_to_mean = {}
for i in taxi_ids:
    taxi_id_to_mean[i] = df[df['TAXI_ID'] == i]['target'].mean()

In [26]:
def apply_mean(taxi_id):
    return taxi_id_to_mean[taxi_id]

In [27]:
df['TAXI_ID_MEAN'] = df['TAXI_ID'].apply(apply_mean)
df['STAND_MEAN'] = df['ORIGIN_STAND'].apply(apply_mean_stand)
df.head()

Unnamed: 0,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,POLYLINE,target,year,month,...,call_A,call_B,call_C,day_A,day_B,day_C,day_D,day_E,TAXI_ID_MEAN,STAND_MEAN
200188,B,,34.0,10,1376438529,A,"[[-8.615691,41.140629],[-8.615385,41.140764],[...",480.0,2013,8,...,0,1,0,1,0,0,0,0,687.091064,671.156128
200202,C,,,304,1376438909,A,"[[-8.594631,41.150664],[-8.595468,41.15016],[-...",300.0,2013,8,...,0,0,1,1,0,0,0,0,699.678162,718.407654
200204,B,,34.0,572,1376438510,A,"[[-8.61561,41.140827],[-8.615592,41.140809],[-...",420.0,2013,8,...,0,1,0,1,0,0,0,0,670.640015,671.156128
200210,C,,,570,1376439240,A,"[[-8.610435,41.153589],[-8.611047,41.153634],[...",465.0,2013,8,...,0,0,1,1,0,0,0,0,684.03894,718.407654
200223,B,,9.0,173,1376438918,A,"[[-8.606439,41.144679],[-8.60643,41.14467],[-8...",450.0,2013,8,...,0,1,0,1,0,0,0,0,588.278809,765.755249


In [28]:
train_id_mean = df['TAXI_ID_MEAN'].mean()
train_id_std = df['TAXI_ID_MEAN'].std()

In [29]:
train_stand_mean = df['STAND_MEAN'].mean()
train_stand_std = df['STAND_MEAN'].std()

In [30]:
def normalize(col_name):
    df[col_name] = np.float32((df[col_name] - df[col_name].mean())/df[col_name].std())

In [31]:
def parse_num(str, index):
    comma_pos = str.find(',')
    if comma_pos == -1:
        return None
    if index == 0:
        return np.float32(str[2:comma_pos])
    else:
        return np.float32(str[comma_pos + 1:str.find(']')])

In [32]:
df['Init_longitude'] = df['POLYLINE'].apply(lambda x : parse_num(x,0))
df['Init_latitude'] = df['POLYLINE'].apply(lambda x : parse_num(x,1))
df = df.loc[df['Init_longitude'] != None]
df = df.loc[df['Init_latitude'] != None]
train_long_mean = df['Init_longitude'].mean()
train_lat_mean = df['Init_latitude'].mean()
train_long_std = df['Init_longitude'].std()
train_lat_std = df['Init_latitude'].std()
normalize('Init_longitude')
normalize('Init_latitude')
len(df)

165388

In [33]:
normalize('STAND_MEAN')
normalize('TAXI_ID_MEAN')

In [34]:
def treat_nan(x):
    if pd.isna(x):
        return -1
    else:
        return x

In [35]:
df['ORIGIN_STAND'] = df['ORIGIN_STAND'].apply(treat_nan)

In [36]:
df.head()

Unnamed: 0,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,POLYLINE,target,year,month,...,call_C,day_A,day_B,day_C,day_D,day_E,TAXI_ID_MEAN,STAND_MEAN,Init_longitude,Init_latitude
200188,B,,34.0,10,1376438529,A,"[[-8.615691,41.140629],[-8.615385,41.140764],[...",480.0,2013,8,...,0,1,0,0,0,0,-0.113276,-0.444278,0.223187,0.521428
200202,C,,-1.0,304,1376438909,A,"[[-8.594631,41.150664],[-8.595468,41.15016],[-...",300.0,2013,8,...,1,1,0,0,0,0,0.037287,0.381993,1.067421,0.778478
200204,B,,34.0,572,1376438510,A,"[[-8.61561,41.140827],[-8.615592,41.140809],[-...",420.0,2013,8,...,0,1,0,0,0,0,-0.310059,-0.444278,0.226437,0.526508
200210,C,,-1.0,570,1376439240,A,"[[-8.610435,41.153589],[-8.611047,41.153634],[...",465.0,2013,8,...,1,1,0,0,0,0,-0.149785,0.381993,0.433911,0.853316
200223,B,,9.0,173,1376438918,A,"[[-8.606439,41.144679],[-8.60643,41.14467],[-8...",450.0,2013,8,...,0,1,0,0,0,0,-1.295242,1.209945,0.594095,0.625186


In [37]:
vocab_size = len(df['month'].unique()) + len(df['weekday'].unique()) + len(df['hour'].unique()) + len(df['target'].unique())
vocab_size

227

vocabs = set()
for s in df['month'].unique():
    vocabs.add(s)
for s in df['weekday'].unique():
    vocabs.add(s)
for s in df['hour'].unique():
    vocabs.add(s)
for s in df['target'].unique():
    vocabs.add(s)
v_s = sorted(list(vocabs))
v_to_ix = { ch:i for i,ch in enumerate(chars) }


In [38]:
lstm_set = torch.reshape(torch.IntTensor(df['month'].values),(-1,1))
col_list_lstm = ['weekday', 'hour','target']
for col_name in col_list_lstm:
#     print(train_set.dtype)
    lstm_set = torch.cat((lstm_set,torch.reshape(torch.from_numpy(df[col_name].values),(-1,1))),1)
lstm_set = lstm_set.type(torch.LongTensor)

In [54]:
class LSTM(nn.Module):
    def __init__(self, input_size, embedding_size, output_size, hidden_size):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers = 3)
        self.decoder = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(p = 0.5)
    
    def forward(self, input_seq, hidden_state):
        embedding = self.embedding(input_seq)
        output, hidden_state = self.rnn(embedding, hidden_state)
        output = self.decoder(output)
        output = self.dropout(output)
        return output, (hidden_state[0].detach(), hidden_state[1].detach())

In [55]:
model = LSTM(input_size=vocab_size, embedding_size=vocab_size, output_size=vocab_size, hidden_size=256)
loss_fn = nn.functional.nll_loss
#loss_fn = nn.CrossEntropyLoss()


In [53]:
lstm_set[0]

tensor([  8,   2,   1, 480])

In [56]:
from tqdm import tqdm

In [59]:
epochs = 15
losses = []
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
for i_epoch in tqdm(range(15)):
    
    running_loss = 0
    for data in lstm_set:

        hidden_state = None
        input_seq = data[:3]
        target_seq = data[1:]

        # forward pass
        output, _ = model(input_seq, hidden_state)
        print(len(output))
        print(len(torch.squeeze(output)))
        print(len(torch.squeeze(target_seq)))
        #print(torch.squeeze(output))
        #print(target_seq)
        # compute loss
        loss = loss_fn(torch.squeeze(output), torch.squeeze(target_seq))
        running_loss += loss.item()

        # compute gradients and take optimizer step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    losses.append(running_loss)
    # print loss
    if i_epoch % 1 == 0:
        print("Epoch: {0} \t Loss: {1:.8f}".format(i_epoch, running_loss/n))

  0%|          | 0/15 [00:00<?, ?it/s]

3
3
3





IndexError: Target 480 is out of bounds.

In [None]:
#df.to_csv('cleaned.csv',index=False)

In [None]:
stands = pd.read_csv("metaData_taxistandsID_name_GPSlocation.csv")
stand_dict = {}
stands.head()
for r, s in stands.iterrows():
    stand_dict[s['ID']] = (np.float32(s['Longitude']), np.float32(s['Latitude']))

In [None]:
df_test = pd.read_csv("test_public.csv")

In [None]:
df_test = df_test.drop(columns=['ORIGIN_CALL','MISSING_DATA'])
df_test.head()

In [None]:
times = df_test['TIMESTAMP'].apply(parse_time)
df_test['year'] = [x for x,y,z,w,a in times]
df_test['month'] = [y for x,y,z,w,a in times]
df_test['day'] = [z for x,y,z,w,a in times]
df_test['hour'] = [w for x,y,z,w,a in times]
df_test['weekday'] = [a for x,y,z,w,a in times]
df_test.head()

In [None]:
len(df_test)

In [None]:
df_test1 = df_test[(df_test['day'] == 14) & (df_test['month'] == 8)]
#df_test1['DAY_TYPE'] = 'C'
#8/14, one day before holiday, 18pm
print(len(df_test1))
df_test1 = df_test1.assign(DAY_TYPE = 'A')
#df_test1.head()

In [None]:
df_test2 = df_test[ ((df_test['day'] == 30) & (df_test['month'] == 9))] 
print(len(df_test2))
df_test2 = df_test2.assign(DAY_TYPE = 'B')
#9/30, normal weekday 7-9am

In [None]:
df_test3 = df_test[(df_test['day'] == 6) & (df_test['month'] == 10)]
print(len(df_test3))
df_test3 = df_test3.assign(DAY_TYPE = 'C')
#10/6, normal weekday, 18pm

In [None]:
df_test4 = df_test[(df_test['day'] == 1) & (df_test['month'] == 11)]
print(len(df_test4))
df_test4 = df_test4.assign(DAY_TYPE = 'D')

#11/1, Saturday 4am (??)

In [None]:
df_test5 = df_test[(df_test['day'] == 21) & (df_test['month'] == 12)]
print(len(df_test5))
df_test5 = df_test5.assign(DAY_TYPE = 'E')

#12/21, sunday 3pm before christmas

In [None]:
df_test = pd.concat([df_test1, df_test2, df_test3, df_test4, df_test5], axis = 0)
df_test

In [None]:

df_test['call_A'] = df_test["CALL_TYPE"].apply(lambda x : int(x == 'A'))
df_test['call_B'] = df_test["CALL_TYPE"].apply(lambda x : int(x == 'B'))
df_test['call_C'] = df_test["CALL_TYPE"].apply(lambda x : int(x == 'C'))
df_test['day_A'] = df_test["DAY_TYPE"].apply(lambda x : int(x == 'A'))
df_test['day_B'] = df_test["DAY_TYPE"].apply(lambda x : int(x == 'B'))
df_test['day_C'] = df_test["DAY_TYPE"].apply(lambda x : int(x == 'C'))
df_test['day_D'] = df_test["DAY_TYPE"].apply(lambda x : int(x == 'D'))
df_test['day_E'] = df_test["DAY_TYPE"].apply(lambda x : int(x == 'E'))


In [None]:
df_test['STAND_MEAN'] = df_test['ORIGIN_STAND'].apply(apply_mean_stand)
df_test['STAND_MEAN'] = np.float32((df_test['STAND_MEAN'] - train_stand_mean)/train_stand_std)

In [None]:
def apply_mean_test(taxi_id):
    if taxi_id in taxi_id_to_mean:
        return taxi_id_to_mean[taxi_id]
    else:
        return train_id_mean

In [None]:

df_test['TAXI_ID_MEAN'] = df_test['TAXI_ID'].apply(apply_mean_test)
df_test['TAXI_ID_MEAN'] = np.float32((df_test['TAXI_ID_MEAN'] - train_id_mean)/train_id_std)

In [None]:
def apply_long(x): #in: origin stand
    if not pd.isna(x):
        return stand_dict[x][0]
    else:
        return x

In [None]:
def apply_lat(x):
    if not pd.isna(x):
        return stand_dict[x][1]
    else:
        return x

In [None]:
df_test['Init_longitude'] = df_test['ORIGIN_STAND'].apply(apply_long)
df_test['Init_latitude'] = df_test['ORIGIN_STAND'].apply(apply_lat)

In [None]:
df_test['quarterHr'] = ((df_test['hour']-1) / 6).astype(int)
df_test['TAXI_ID'] -= 20000000


In [None]:
df_test[:10]

In [None]:
df_test['ORIGIN_STAND'] = df_test['ORIGIN_STAND'].apply(treat_nan)

In [None]:

df_test['Init_longitude'] = (df_test['Init_longitude'] - train_long_mean) / train_long_std
df_test['Init_latitude'] = (df_test['Init_latitude'] - train_lat_mean) / train_lat_std

In [None]:
df_test[:10]

In [None]:

#    df[col_name] = np.float32((df[col_name] - df[col_name].mean())/df[col_name].std())
test_set = torch.reshape(torch.tensor(df_test['call_A'].values),(-1,1))
col_list = ['call_B', 'call_C', 'day_A', 'day_B', 'day_C', 'day_D', 'day_E']
            #,'Init_longitude', 'Init_latitude', 'TAXI_ID_MEAN']
#col_list = ['call_B', 'call_C','TAXI_ID', 'Init_longitude', 'Init_latitude']
for col_name in col_list:
#     print(train_set.dtype)
    test_set = torch.cat((test_set,torch.reshape(torch.from_numpy(df_test[col_name].values),(-1,1))),1)
test_set = test_set.type('torch.FloatTensor')

In [None]:
def apply_embed_test(x, i):
    t = torch.reshape(torch.from_numpy(x.values),(-1,1))
    embedding = nn.Embedding(i+1, 10)

    res = embedding(t)
    return res

In [None]:
ae = apply_embed_test(df_test['TAXI_ID'], df['TAXI_ID'].max())
ae1 = apply_embed_test(df_test['quarterHr'], df['quarterHr'].max())
ae2 = apply_embed_test(df_test['day'], df['day'].max())
ae3 = apply_embed_test(df_test['weekday'], df['weekday'].max())

In [None]:
c = torch.cat((ae,ae1,ae2,ae3), dim = 1)
embed_set = []
for i in range(len(c)):
    embed_set.append(torch.flatten(c[i]))
et = torch.stack(embed_set)
c = torch.cat( (test_set, et), dim = 1)
len(c[0]), c[0]


In [None]:
test_set = c
test_set = test_set.type('torch.FloatTensor')
test_set = test_set.clone().detach()

In [None]:
test_set[0]

In [None]:
preds = mlp_model(test_set.to(device)).cpu().detach().numpy()
#preds = best_lgbm.predict(test_set)

In [None]:
def round_to_multiple(number, multiple):
    return multiple * round(number / multiple)

In [None]:
for i in range(len(preds)):
    preds[i] = round_to_multiple(preds[i], 15)

In [None]:
output_csv = pd.read_csv("sampleSubmission.csv")
output_csv['TRAVEL_TIME'] = preds
#output_csv['TRAVEL_TIME'] = preds
output_csv.tail()
output_csv.to_csv("my_pred.csv", index=None)


In [None]:
sum(p.numel() for p in mlp_model.parameters())