In [136]:
import pandas as pd
import torch
import numpy as np
import torch.nn as nn
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, validation_curve
import lightgbm as lgb
import xgboost as xgb
import matplotlib.pyplot as plt

In [137]:
import warnings
warnings.filterwarnings('ignore')

In [138]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [139]:
df = pd.read_csv("train.csv")

In [140]:
df["target"] = df["POLYLINE"].apply(lambda x : np.float32(max((x.count("[") - 2) *15, 0)))
len(df)

1710670

In [141]:
mean, std = df["target"].mean(), df["target"].std()
median = df["target"].median()
#df = df[df["target"] < mean + 3 * std]
df = df[df["target"] < 15 * 185]
df = df[df["target"] >= 4]
len(df)

1656261

In [142]:
df = df.loc[df['MISSING_DATA'] == False]
df = df.loc[df['target'] > 0]
len(df)

1656255

In [143]:
from datetime import datetime
def parse_time(x):
    dt = datetime.utcfromtimestamp(x)
    return dt.year, dt.month, dt.day, dt.hour+1, dt.weekday() #monday: 0, sunday: 6

In [144]:
times = df['TIMESTAMP'].apply(parse_time)
df['year'] = [x for x,y,z,w,a in times]
df['month'] = [y for x,y,z,w,a in times]
df['day'] = [z for x,y,z,w,a in times]
df['hour'] = [w for x,y,z,w,a in times]
df['weekday'] = [a for x,y,z,w,a in times]
df.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,target,year,month,day,hour,weekday
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[...",330.0,2013,7,1,1,0
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[...",270.0,2013,7,1,1,0
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...",960.0,2013,7,1,1,0
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[...",630.0,2013,7,1,1,0
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...",420.0,2013,7,1,1,0


In [145]:
df_1 = df[ ((df['day'] == 14) & (df['month'] == 8)) 
          | ((df['day'] == 4) & (df['month'] == 10)) |
          ((df['day'] == 9) & (df['month'] == 6)) |
          ((df['day'] == 24) & (df['month'] == 4)) |
          ((df['day'] == 31) & (df['month'] == 4)) |
          ((df['day'] == 31) & (df['month'] == 10)) |
          ((df['day'] == 31) & (df['month'] == 11)) |
          ((df['day'] == 7) & (df['month'] == 12))
         ]
print(len(df_1))
#df_1 = df_1[df_1['hour'] >= 12]
#plt.plot(df_1_1['target'].tolist())
#df_1_1['target'].describe()
df_1 = df_1.assign(DAY_TYPE = 'A')

29980


In [146]:
df_2 = df[ ((df['weekday'] <= 4)) ]
df_2 = df_2[ ((df_2['month'] == 9) | (df_2['month'] == 10)) ]
df_2 = df_2[df_2['hour'] >= 8]
df_2 = df_2[df_2['hour'] <= 10]
df_2 = df_2.assign(DAY_TYPE = 'B')
len(df_2)

38945

In [147]:
df_3 = df[ ((df['weekday'] <= 4)) ]
df_3 = df_3[ ((df_3['month'] == 5) | (df_3['month'] == 6)) ]
df_3 = df_3[df_3['hour'] >= 17]
df_3 = df_3[df_3['hour'] <= 19]
df_3 = df_3.assign(DAY_TYPE = 'C')
len(df_3)

31819

In [148]:
df_4 = df[ ((df['weekday'] == 5)) ]
df_4 = df_4[df_4['hour'] >= 3]
df_4 = df_4[df_4['hour'] <= 5]
df_4 = df_4.assign(DAY_TYPE = 'D')
len(df_4)

42472

In [149]:
df_5 = df[ ((df['day'] >= 20) & (df['month'] == 12) & (df['day'] <= 23)) ]
df_5 = df_5.assign(DAY_TYPE = 'E')
len(df_5)

22172

In [150]:
df_train = pd.concat([df_1, df_2, df_3, df_4, df_5], axis = 0)
len(df_train)

165388

In [151]:
df = df_train

In [152]:
df = df.drop(columns=['TRIP_ID','MISSING_DATA'])
df.head()

Unnamed: 0,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,POLYLINE,target,year,month,day,hour,weekday
200188,B,,34.0,20000010,1376438529,A,"[[-8.615691,41.140629],[-8.615385,41.140764],[...",480.0,2013,8,14,1,2
200202,C,,,20000304,1376438909,A,"[[-8.594631,41.150664],[-8.595468,41.15016],[-...",300.0,2013,8,14,1,2
200204,B,,34.0,20000572,1376438510,A,"[[-8.61561,41.140827],[-8.615592,41.140809],[-...",420.0,2013,8,14,1,2
200210,C,,,20000570,1376439240,A,"[[-8.610435,41.153589],[-8.611047,41.153634],[...",465.0,2013,8,14,1,2
200223,B,,9.0,20000173,1376438918,A,"[[-8.606439,41.144679],[-8.60643,41.14467],[-8...",450.0,2013,8,14,1,2


In [153]:
df['TAXI_ID'] -= 20000000

In [168]:
df['quarterHr'] = ((df['hour']-1) / 6).astype(int)

In [169]:
df['call_A'] = df["CALL_TYPE"].apply(lambda x : int(x == 'A'))
df['call_B'] = df["CALL_TYPE"].apply(lambda x : int(x == 'B'))
df['call_C'] = df["CALL_TYPE"].apply(lambda x : int(x == 'C'))

In [170]:
df['day_A'] = df["DAY_TYPE"].apply(lambda x : int(x == 'A'))
df['day_B'] = df["DAY_TYPE"].apply(lambda x : int(x == 'B'))
df['day_C'] = df["DAY_TYPE"].apply(lambda x : int(x == 'C'))
df['day_D'] = df["DAY_TYPE"].apply(lambda x : int(x == 'D'))
df['day_E'] = df["DAY_TYPE"].apply(lambda x : int(x == 'E'))

train_id_mean = df['TAXI_ID'].mean()
train_id_std = df['TAXI_ID'].std()
df['TAXI_ID'] = (df['TAXI_ID'] - train_id_mean)/ train_id_std

In [171]:
df_slice = df[df['ORIGIN_STAND'].isna()]
stand_nan_mean = df_slice['target'].mean()
stand_nan_mean

718.4076538085938

In [172]:
stands = set()
stand_to_mean = {}
df_temp = df[~df['ORIGIN_STAND'].isna()]
for ind, r in df_temp.iterrows():
    stands.add(r['ORIGIN_STAND'])
for i in stands:
    stand_to_mean[i] = df[df['ORIGIN_STAND'] == i]['target'].mean()

In [173]:
def apply_mean_stand(stand):
    if pd.isna(stand):
        return stand_nan_mean
    else:
        return stand_to_mean[stand]

In [174]:
taxi_ids = set()
for ind, r in df.iterrows():
    taxi_ids.add(r['TAXI_ID'])
taxi_id_to_mean = {}
for i in taxi_ids:
    taxi_id_to_mean[i] = df[df['TAXI_ID'] == i]['target'].mean()

In [175]:
def apply_mean(taxi_id):
    return taxi_id_to_mean[taxi_id]

In [176]:
df['TAXI_ID_MEAN'] = df['TAXI_ID'].apply(apply_mean)
df['STAND_MEAN'] = df['ORIGIN_STAND'].apply(apply_mean_stand)
df.head()

Unnamed: 0,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,POLYLINE,target,year,month,...,call_A,call_B,call_C,day_A,day_B,day_C,day_D,day_E,TAXI_ID_MEAN,STAND_MEAN
200188,B,,34.0,10,1376438529,A,"[[-8.615691,41.140629],[-8.615385,41.140764],[...",480.0,2013,8,...,0,1,0,1,0,0,0,0,687.091064,671.156128
200202,C,,,304,1376438909,A,"[[-8.594631,41.150664],[-8.595468,41.15016],[-...",300.0,2013,8,...,0,0,1,1,0,0,0,0,699.678162,718.407654
200204,B,,34.0,572,1376438510,A,"[[-8.61561,41.140827],[-8.615592,41.140809],[-...",420.0,2013,8,...,0,1,0,1,0,0,0,0,670.640015,671.156128
200210,C,,,570,1376439240,A,"[[-8.610435,41.153589],[-8.611047,41.153634],[...",465.0,2013,8,...,0,0,1,1,0,0,0,0,684.03894,718.407654
200223,B,,9.0,173,1376438918,A,"[[-8.606439,41.144679],[-8.60643,41.14467],[-8...",450.0,2013,8,...,0,1,0,1,0,0,0,0,588.278809,765.755249


In [177]:
train_id_mean = df['TAXI_ID_MEAN'].mean()
train_id_std = df['TAXI_ID_MEAN'].std()

In [178]:
train_stand_mean = df['STAND_MEAN'].mean()
train_stand_std = df['STAND_MEAN'].std()

In [179]:
def normalize(col_name):
    df[col_name] = np.float32((df[col_name] - df[col_name].mean())/df[col_name].std())

In [180]:
def parse_num(str, index):
    comma_pos = str.find(',')
    if comma_pos == -1:
        return None
    if index == 0:
        return np.float32(str[2:comma_pos])
    else:
        return np.float32(str[comma_pos + 1:str.find(']')])

In [181]:
df['Init_longitude'] = df['POLYLINE'].apply(lambda x : parse_num(x,0))
df['Init_latitude'] = df['POLYLINE'].apply(lambda x : parse_num(x,1))
df = df.loc[df['Init_longitude'] != None]
df = df.loc[df['Init_latitude'] != None]
train_long_mean = df['Init_longitude'].mean()
train_lat_mean = df['Init_latitude'].mean()
train_long_std = df['Init_longitude'].std()
train_lat_std = df['Init_latitude'].std()
normalize('Init_longitude')
normalize('Init_latitude')
len(df)

165388

In [182]:
normalize('STAND_MEAN')
normalize('TAXI_ID_MEAN')

In [183]:
df.head()

Unnamed: 0,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,POLYLINE,target,year,month,...,call_C,day_A,day_B,day_C,day_D,day_E,TAXI_ID_MEAN,STAND_MEAN,Init_longitude,Init_latitude
200188,B,,34.0,10,1376438529,A,"[[-8.615691,41.140629],[-8.615385,41.140764],[...",480.0,2013,8,...,0,1,0,0,0,0,-0.113276,-0.444278,0.223187,0.521428
200202,C,,,304,1376438909,A,"[[-8.594631,41.150664],[-8.595468,41.15016],[-...",300.0,2013,8,...,1,1,0,0,0,0,0.037287,0.381993,1.067421,0.778478
200204,B,,34.0,572,1376438510,A,"[[-8.61561,41.140827],[-8.615592,41.140809],[-...",420.0,2013,8,...,0,1,0,0,0,0,-0.310059,-0.444278,0.226437,0.526508
200210,C,,,570,1376439240,A,"[[-8.610435,41.153589],[-8.611047,41.153634],[...",465.0,2013,8,...,1,1,0,0,0,0,-0.149785,0.381993,0.433911,0.853316
200223,B,,9.0,173,1376438918,A,"[[-8.606439,41.144679],[-8.60643,41.14467],[-8...",450.0,2013,8,...,0,1,0,0,0,0,-1.295242,1.209945,0.594095,0.625186


In [191]:
train_set = torch.reshape(torch.tensor(df['call_A'].values),(-1,1))
#,'TAXI_ID_MEAN'
col_list = ['call_B', 'call_C', 'day_A', 'day_B', 'day_C', 'day_D', 'day_E']
#            ,'Init_longitude', 'Init_latitude']
for col_name in col_list:
#     print(train_set.dtype)
    train_set = torch.cat((train_set,torch.reshape(torch.from_numpy(df[col_name].values),(-1,1))),1)

In [200]:
train_set[0]

tensor([0, 1, 0, 1, 0, 0, 0, 0])

In [206]:
col_embed = ['ORIGIN_STAND', 'ORIGIN_CALL', 'quarterHr', 'day', 'weekday']

In [326]:
def apply_embed(x):
    t = torch.reshape(torch.from_numpy(x.values),(-1,1))
    embedding = nn.Embedding(np.max(x.values) + 1, 10)

    res = embedding(t)
    return res

In [327]:
ae = apply_embed(df['TAXI_ID'])
ae1 = apply_embed(df['quarterHr'])
ae2 = apply_embed(df['day'])
ae3 = apply_embed(df['weekday'])

In [328]:
ae[0], ae1[0], ae2[0], ae3[0]

(tensor([[ 2.1649, -1.4727, -0.1742,  0.8401,  0.0595, -1.0268, -1.2760,  1.9809,
           0.9022, -0.4376]], grad_fn=<SelectBackward0>),
 tensor([[ 1.0124, -0.8943, -0.3840,  0.2604,  0.1164,  0.6309, -0.8133, -0.5425,
          -2.7334, -0.5897]], grad_fn=<SelectBackward0>),
 tensor([[-0.0063,  0.3544,  0.3112, -1.2755, -0.3064,  0.3576,  0.1012,  1.8000,
          -0.2142,  0.5763]], grad_fn=<SelectBackward0>),
 tensor([[ 0.3101, -0.6469, -0.5186, -0.3889,  1.1985,  0.3607,  0.4765,  1.9344,
          -1.7221,  1.2430]], grad_fn=<SelectBackward0>))

In [329]:
c = torch.cat((ae,ae1,ae2,ae3), dim = 1)
embed_set = []
for i in range(len(c)):
    embed_set.append(torch.flatten(c[i]))
embed_set[0]

tensor([ 2.1649, -1.4727, -0.1742,  0.8401,  0.0595, -1.0268, -1.2760,  1.9809,
         0.9022, -0.4376,  1.0124, -0.8943, -0.3840,  0.2604,  0.1164,  0.6309,
        -0.8133, -0.5425, -2.7334, -0.5897, -0.0063,  0.3544,  0.3112, -1.2755,
        -0.3064,  0.3576,  0.1012,  1.8000, -0.2142,  0.5763,  0.3101, -0.6469,
        -0.5186, -0.3889,  1.1985,  0.3607,  0.4765,  1.9344, -1.7221,  1.2430],
       grad_fn=<ReshapeAliasBackward0>)

In [None]:
et = torch.stack(embed_set)

In [332]:
c = torch.cat( (train_set, et), dim = 1)
len(c[0]), c[0]

(48,
 tensor([ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          2.1649, -1.4727, -0.1742,  0.8401,  0.0595, -1.0268, -1.2760,  1.9809,
          0.9022, -0.4376,  1.0124, -0.8943, -0.3840,  0.2604,  0.1164,  0.6309,
         -0.8133, -0.5425, -2.7334, -0.5897, -0.0063,  0.3544,  0.3112, -1.2755,
         -0.3064,  0.3576,  0.1012,  1.8000, -0.2142,  0.5763,  0.3101, -0.6469,
         -0.5186, -0.3889,  1.1985,  0.3607,  0.4765,  1.9344, -1.7221,  1.2430],
        grad_fn=<SelectBackward0>))

In [351]:
train_set = c

In [364]:
train_set = train_set.type('torch.FloatTensor')
train_set = train_set.clone().detach()

In [353]:
target_set = torch.reshape(torch.tensor(df['target'].values),(-1,1))

In [354]:
train_set[0]

tensor([ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         2.1649, -1.4727, -0.1742,  0.8401,  0.0595, -1.0268, -1.2760,  1.9809,
         0.9022, -0.4376,  1.0124, -0.8943, -0.3840,  0.2604,  0.1164,  0.6309,
        -0.8133, -0.5425, -2.7334, -0.5897, -0.0063,  0.3544,  0.3112, -1.2755,
        -0.3064,  0.3576,  0.1012,  1.8000, -0.2142,  0.5763,  0.3101, -0.6469,
        -0.5186, -0.3889,  1.1985,  0.3607,  0.4765,  1.9344, -1.7221,  1.2430],
       grad_fn=<SelectBackward0>)

In [355]:
class MLP_Regressor(torch.nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim = 1):
        super(MLP_Regressor, self).__init__() #Refers to the fact that this is a subclass of nn.Module and is inheriting all methods
        self.in_dim = in_dim
        self.hidden_dim = hidden_dim
        self.out_dim = 1
                
        self.fc = torch.nn.Sequential(
            nn.Dropout(),
            nn.Linear(self.in_dim, self.hidden_dim),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(self.hidden_dim, self.hidden_dim * 2),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(self.hidden_dim * 2, self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.out_dim)
        )

    
    def forward(self, x):
        x = self.fc(x)
        x = x.squeeze(1)
        return x

In [356]:
targets = torch.reshape(torch.from_numpy(df['target'].values),(-1,1))

In [357]:
len(targets)

165388

In [366]:
batch_size = 64
lst_all = list(zip(train_set, targets))
np.random.shuffle(lst_all)
lst_train = lst_all[:int(len(lst_all) * 0.8)]
lst_valid = lst_all[int(len(lst_all) * 0.8):]
len_train = len(lst_train)
print(len(lst_train))
print(len(lst_valid))
#print(lst_train[0])
trainloader = torch.utils.data.DataLoader(lst_train , batch_size=batch_size, shuffle=True, num_workers=2)
validloader = torch.utils.data.DataLoader(lst_valid , batch_size=batch_size, shuffle=False, num_workers=2)

132310
33078


In [367]:
def validate(dataloader, model, criterion):
    valid_losses = []
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            loss = get_loss(X, y, model, criterion)
            valid_losses.append(loss.item())
    
    return np.mean(valid_losses)

In [368]:
mlp_model = MLP_Regressor(48, 64, 1).to(device)
lr_mlp = 5e-6
optimizer = torch.optim.Adam(mlp_model.parameters(), lr = lr_mlp)
criterion = nn.MSELoss()

In [369]:
def get_loss(X, y, model, criterion):
    y_pred = model(X)
    loss = criterion(y_pred, y)
    return torch.sqrt(loss)

In [370]:
models = []

In [371]:
from tqdm import tqdm
for epoch in tqdm(range(5)):

    running_loss = 0.0
    losses = []
    for X, y in trainloader:
        X, y = X.to(device), y.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        loss = get_loss(X, y, mlp_model, criterion)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()

        running_loss += loss.cpu().detach().numpy()

    models.append(mlp_model)
    print(f'RMSE: {running_loss / len(losses)}')
    print(f'Total loss: {running_loss}')
print('Finished Training')
v_loss = validate(validloader, mlp_model, criterion)
print(f'Valid RMSE: {v_loss}')

 20%|██        | 1/5 [00:23<01:33, 23.47s/it]

RMSE: 806.6586873628185
Total loss: 1668170.1654663086


 40%|████      | 2/5 [00:47<01:10, 23.52s/it]

RMSE: 803.4854707404309
Total loss: 1661607.953491211


 60%|██████    | 3/5 [01:09<00:46, 23.08s/it]

RMSE: 793.624596011016
Total loss: 1641215.6645507812


 80%|████████  | 4/5 [01:32<00:23, 23.10s/it]

RMSE: 772.3105236474042
Total loss: 1597138.162902832


100%|██████████| 5/5 [01:56<00:00, 23.21s/it]

RMSE: 735.608919302315
Total loss: 1521239.2451171875
Finished Training





Valid RMSE: 705.4257298954682


In [372]:
for mlp_model in models:
    v_loss = validate(validloader, mlp_model, criterion)
    print(f'Valid RMSE: {v_loss}')

Valid RMSE: 705.4046585997945
Valid RMSE: 705.7106848593146
Valid RMSE: 705.134582755644
Valid RMSE: 705.217207475142
Valid RMSE: 705.6088426837147


In [209]:
mlp_model = models[4]

In [56]:
model_scripted = torch.jit.script(mlp_model)
#model_scripted.save('whyworking.pt')

In [57]:
#m = torch.jit.load('whyworking.pt')
#m.eval()

In [511]:
param_grid = {
    'n_estimators': [50, 100, 300, 500],
    'num_leaves': [17, 31, 45],
    'max_depth': [-1, 5, 10, 15],
    'learning_rate': [1e-3, 1e-2, 0.1, 0.2]
}
lgbm = lgb.LGBMRegressor()
lgbmcv = GridSearchCV(lgbm, param_grid = param_grid, scoring = 'neg_root_mean_squared_error', cv = 5)
lgbmcv.fit(train_set, target_set)

GridSearchCV(cv=5, estimator=LGBMRegressor(),
             param_grid={'learning_rate': [0.001, 0.01, 0.1, 0.2],
                         'max_depth': [-1, 5, 10, 15],
                         'n_estimators': [50, 100, 300, 500],
                         'num_leaves': [17, 31, 45]},
             scoring='neg_root_mean_squared_error')

In [512]:
best_lgbm = lgbmcv.best_estimator_
pred = best_lgbm.predict(train_set)
targets = df['target'].values
np.sqrt(np.mean((pred-targets)**2))

374.4366066672457

In [409]:
stands = pd.read_csv("metaData_taxistandsID_name_GPSlocation.csv")
stand_dict = {}
stands.head()
for r, s in stands.iterrows():
    stand_dict[s['ID']] = (np.float32(s['Longitude']), np.float32(s['Latitude']))

In [410]:
df_test = pd.read_csv("test_public.csv")

In [411]:
df_test = df_test.drop(columns=['ORIGIN_CALL','MISSING_DATA'])
df_test.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE
0,T1,B,15.0,20000542,1408039037,A
1,T2,B,57.0,20000108,1408038611,A
2,T3,B,15.0,20000370,1408038568,A
3,T4,B,53.0,20000492,1408039090,A
4,T5,B,18.0,20000621,1408039177,A


In [412]:
times = df_test['TIMESTAMP'].apply(parse_time)
df_test['year'] = [x for x,y,z,w,a in times]
df_test['month'] = [y for x,y,z,w,a in times]
df_test['day'] = [z for x,y,z,w,a in times]
df_test['hour'] = [w for x,y,z,w,a in times]
df_test['weekday'] = [a for x,y,z,w,a in times]
df_test.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,year,month,day,hour,weekday
0,T1,B,15.0,20000542,1408039037,A,2014,8,14,18,3
1,T2,B,57.0,20000108,1408038611,A,2014,8,14,18,3
2,T3,B,15.0,20000370,1408038568,A,2014,8,14,18,3
3,T4,B,53.0,20000492,1408039090,A,2014,8,14,18,3
4,T5,B,18.0,20000621,1408039177,A,2014,8,14,18,3


In [413]:
len(df_test)

320

In [414]:
df_test1 = df_test[(df_test['day'] == 14) & (df_test['month'] == 8)]
#df_test1['DAY_TYPE'] = 'C'
#8/14, one day before holiday, 18pm
print(len(df_test1))
df_test1 = df_test1.assign(DAY_TYPE = 'A')
#df_test1.head()

74


In [415]:
df_test2 = df_test[ ((df_test['day'] == 30) & (df_test['month'] == 9))] 
print(len(df_test2))
df_test2 = df_test2.assign(DAY_TYPE = 'B')
#9/30, normal weekday 7-9am

77


In [416]:
df_test3 = df_test[(df_test['day'] == 6) & (df_test['month'] == 10)]
print(len(df_test3))
df_test3 = df_test3.assign(DAY_TYPE = 'C')
#10/6, normal weekday, 18pm

77


In [417]:
df_test4 = df_test[(df_test['day'] == 1) & (df_test['month'] == 11)]
print(len(df_test4))
df_test4 = df_test4.assign(DAY_TYPE = 'D')

#11/1, Saturday 4am (??)

62


In [418]:
df_test5 = df_test[(df_test['day'] == 21) & (df_test['month'] == 12)]
print(len(df_test5))
df_test5 = df_test5.assign(DAY_TYPE = 'E')

#12/21, sunday 3pm before christmas

30


In [419]:
df_test = pd.concat([df_test1, df_test2, df_test3, df_test4, df_test5], axis = 0)
df_test

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,year,month,day,hour,weekday
0,T1,B,15.0,20000542,1408039037,A,2014,8,14,18,3
1,T2,B,57.0,20000108,1408038611,A,2014,8,14,18,3
2,T3,B,15.0,20000370,1408038568,A,2014,8,14,18,3
3,T4,B,53.0,20000492,1408039090,A,2014,8,14,18,3
4,T5,B,18.0,20000621,1408039177,A,2014,8,14,18,3
...,...,...,...,...,...,...,...,...,...,...,...
315,T323,A,,20000430,1419171485,E,2014,12,21,15,6
316,T324,B,53.0,20000020,1419170802,E,2014,12,21,15,6
317,T325,C,,20000207,1419172121,E,2014,12,21,15,6
318,T326,A,,20000667,1419171980,E,2014,12,21,15,6


In [420]:

df_test['call_A'] = df_test["CALL_TYPE"].apply(lambda x : int(x == 'A'))
df_test['call_B'] = df_test["CALL_TYPE"].apply(lambda x : int(x == 'B'))
df_test['call_C'] = df_test["CALL_TYPE"].apply(lambda x : int(x == 'C'))
df_test['day_A'] = df_test["DAY_TYPE"].apply(lambda x : int(x == 'A'))
df_test['day_B'] = df_test["DAY_TYPE"].apply(lambda x : int(x == 'B'))
df_test['day_C'] = df_test["DAY_TYPE"].apply(lambda x : int(x == 'C'))
df_test['day_D'] = df_test["DAY_TYPE"].apply(lambda x : int(x == 'D'))
df_test['day_E'] = df_test["DAY_TYPE"].apply(lambda x : int(x == 'E'))


In [421]:
df_test['STAND_MEAN'] = df_test['ORIGIN_STAND'].apply(apply_mean_stand)
df_test['STAND_MEAN'] = np.float32((df_test['STAND_MEAN'] - train_stand_mean)/train_stand_std)

In [422]:
def apply_mean_test(taxi_id):
    if taxi_id in taxi_id_to_mean:
        return taxi_id_to_mean[taxi_id]
    else:
        return train_id_mean

In [423]:

df_test['TAXI_ID_MEAN'] = df_test['TAXI_ID'].apply(apply_mean_test)
df_test['TAXI_ID_MEAN'] = np.float32((df_test['TAXI_ID_MEAN'] - train_id_mean)/train_id_std)

In [424]:
def apply_long(x): #in: origin stand
    if not pd.isna(x):
        return stand_dict[x][0]
    else:
        return x

In [425]:
def apply_lat(x):
    if not pd.isna(x):
        return stand_dict[x][1]
    else:
        return x

In [426]:
df_test['Init_longitude'] = df_test['ORIGIN_STAND'].apply(apply_long)
df_test['Init_latitude'] = df_test['ORIGIN_STAND'].apply(apply_lat)

In [427]:
df_test['quarterHr'] = ((df_test['hour']-1) / 6).astype(int)
df_test['TAXI_ID'] -= 20000000


In [428]:
df_test[:10]

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,year,month,day,hour,...,day_A,day_B,day_C,day_D,day_E,STAND_MEAN,TAXI_ID_MEAN,Init_longitude,Init_latitude,quarterHr
0,T1,B,15.0,542,1408039037,A,2014,8,14,18,...,1,0,0,0,0,2.162526,0.0,-8.585876,41.148628,2
1,T2,B,57.0,108,1408038611,A,2014,8,14,18,...,1,0,0,0,0,-1.140498,0.0,-8.610707,41.145718,2
2,T3,B,15.0,370,1408038568,A,2014,8,14,18,...,1,0,0,0,0,2.162526,0.0,-8.585876,41.148628,2
3,T4,B,53.0,492,1408039090,A,2014,8,14,18,...,1,0,0,0,0,-1.079227,0.0,-8.614013,41.141209,2
4,T5,B,18.0,621,1408039177,A,2014,8,14,18,...,1,0,0,0,0,0.548437,0.0,-8.619603,41.148319,2
5,T6,A,,607,1408037146,A,2014,8,14,18,...,1,0,0,0,0,0.381993,0.0,,,2
6,T7,B,15.0,310,1408038846,A,2014,8,14,18,...,1,0,0,0,0,2.162526,0.0,-8.585876,41.148628,2
7,T8,A,,619,1408038948,A,2014,8,14,18,...,1,0,0,0,0,0.381993,0.0,,,2
8,T9,B,9.0,503,1408038563,A,2014,8,14,18,...,1,0,0,0,0,1.209945,0.0,-8.60572,41.144253,2
9,T10,B,15.0,327,1408038021,A,2014,8,14,18,...,1,0,0,0,0,2.162526,0.0,-8.585876,41.148628,2


In [429]:
for i, row in df_test.iterrows():
    if pd.isna(row['ORIGIN_STAND']):
        if row['DAY_TYPE'] == 'A':
            df_t = df_1[ ((df_1['day'] == 14) & (df_1['month'] == 8))]
            df_t = df_t[df_t['hour'] == 18]
        elif row['DAY_TYPE'] == 'B':
            df_t = df_2[((df_2['day'] == 30) & (df_2['month'] == 9))]
            df_t = df_2[df_2['hour'] == 9]

        elif row['DAY_TYPE'] == 'C':
            df_t = df_3[(df_3['day'] == 6) & (df_3['month'] == 10)]
            df_t = df_3[df_3['hour'] == 18]

        elif row['DAY_TYPE'] == 'D':
            df_t = df_4[(df_4['day'] == 1) & (df_4['month'] == 11)]
            df_t = df_4[df_4['hour'] == 4]

        else:
            df_t = df_5[(df_5['day'] == 21) & (df_5['month'] == 12)]
            df_t = df_5[df_5['hour'] == 15]

        df_t['Init_longitude'] = df_t['POLYLINE'].apply(lambda x : parse_num(x,0))
        df_t['Init_latitude'] = df_t['POLYLINE'].apply(lambda x : parse_num(x,1))
        s = df_t.sample()
        df_test.at[i,'Init_longitude'] = s['Init_longitude']
        df_test.at[i,'Init_latitude'] = s['Init_latitude']

In [430]:

df_test['Init_longitude'] = (df_test['Init_longitude'] - train_long_mean) / train_long_std
df_test['Init_latitude'] = (df_test['Init_latitude'] - train_lat_mean) / train_lat_std

In [431]:
df_test[:10]

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,year,month,day,hour,...,day_A,day_B,day_C,day_D,day_E,STAND_MEAN,TAXI_ID_MEAN,Init_longitude,Init_latitude,quarterHr
0,T1,B,15.0,542,1408039037,A,2014,8,14,18,...,1,0,0,0,0,2.162526,0.0,1.418373,0.726306,2
1,T2,B,57.0,108,1408038611,A,2014,8,14,18,...,1,0,0,0,0,-1.140498,0.0,0.422977,0.65176,2
2,T3,B,15.0,370,1408038568,A,2014,8,14,18,...,1,0,0,0,0,2.162526,0.0,1.418373,0.726306,2
3,T4,B,53.0,492,1408039090,A,2014,8,14,18,...,1,0,0,0,0,-1.079227,0.0,0.290472,0.536278,2
4,T5,B,18.0,621,1408039177,A,2014,8,14,18,...,1,0,0,0,0,0.548437,0.0,0.066367,0.718392,2
5,T6,A,,607,1408037146,A,2014,8,14,18,...,1,0,0,0,0,0.381993,0.0,-0.748047,1.010126,2
6,T7,B,15.0,310,1408038846,A,2014,8,14,18,...,1,0,0,0,0,2.162526,0.0,1.418373,0.726306,2
7,T8,A,,619,1408038948,A,2014,8,14,18,...,1,0,0,0,0,0.381993,0.0,1.427128,0.724254,2
8,T9,B,9.0,503,1408038563,A,2014,8,14,18,...,1,0,0,0,0,1.209945,0.0,0.62292,0.614243,2
9,T10,B,15.0,327,1408038021,A,2014,8,14,18,...,1,0,0,0,0,2.162526,0.0,1.418373,0.726306,2


In [432]:

#    df[col_name] = np.float32((df[col_name] - df[col_name].mean())/df[col_name].std())
test_set = torch.reshape(torch.tensor(df_test['call_A'].values),(-1,1))
col_list = ['call_B', 'call_C', 'day_A', 'day_B', 'day_C', 'day_D', 'day_E']
            #,'Init_longitude', 'Init_latitude', 'TAXI_ID_MEAN']
#col_list = ['call_B', 'call_C','TAXI_ID', 'Init_longitude', 'Init_latitude']
for col_name in col_list:
#     print(train_set.dtype)
    test_set = torch.cat((test_set,torch.reshape(torch.from_numpy(df_test[col_name].values),(-1,1))),1)
test_set = test_set.type('torch.FloatTensor')

In [433]:
def apply_embed_test(x, i):
    t = torch.reshape(torch.from_numpy(x.values),(-1,1))
    embedding = nn.Embedding(i+1, 10)

    res = embedding(t)
    return res

In [434]:
ae = apply_embed_test(df_test['TAXI_ID'], df['TAXI_ID'].max())
ae1 = apply_embed_test(df_test['quarterHr'], df['quarterHr'].max())
ae2 = apply_embed_test(df_test['day'], df['day'].max())
ae3 = apply_embed_test(df_test['weekday'], df['weekday'].max())

In [435]:
c = torch.cat((ae,ae1,ae2,ae3), dim = 1)
embed_set = []
for i in range(len(c)):
    embed_set.append(torch.flatten(c[i]))
et = torch.stack(embed_set)
c = torch.cat( (test_set, et), dim = 1)
len(c[0]), c[0]


(48,
 tensor([ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         -0.2789, -0.5563, -0.8106, -0.8256, -1.1711, -2.4556, -0.0807, -1.7410,
          0.1912,  0.3398, -0.3311, -1.6760,  0.4539,  0.7891,  0.8486, -0.9809,
          0.5788,  0.6922, -0.1762, -0.7043, -1.5716,  1.2989,  0.4992,  0.9439,
          0.1908,  0.6702,  0.8191, -1.1565,  1.4124, -1.8307, -0.0453,  1.0519,
          0.7215, -0.2864,  2.7228, -0.5146,  1.1997, -0.4587, -0.6121, -0.0514],
        grad_fn=<SelectBackward0>))

In [436]:
test_set = c
test_set = test_set.type('torch.FloatTensor')
test_set = test_set.clone().detach()

In [437]:
test_set[0]

tensor([ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000,
        -0.2789, -0.5563, -0.8106, -0.8256, -1.1711, -2.4556, -0.0807, -1.7410,
         0.1912,  0.3398, -0.3311, -1.6760,  0.4539,  0.7891,  0.8486, -0.9809,
         0.5788,  0.6922, -0.1762, -0.7043, -1.5716,  1.2989,  0.4992,  0.9439,
         0.1908,  0.6702,  0.8191, -1.1565,  1.4124, -1.8307, -0.0453,  1.0519,
         0.7215, -0.2864,  2.7228, -0.5146,  1.1997, -0.4587, -0.6121, -0.0514])

In [438]:
preds = mlp_model(test_set.to(device)).cpu().detach().numpy()
#preds = best_lgbm.predict(test_set)

In [439]:
def round_to_multiple(number, multiple):
    return multiple * round(number / multiple)

In [440]:
for i in range(len(preds)):
    preds[i] = round_to_multiple(preds[i], 15)

In [441]:
output_csv = pd.read_csv("sampleSubmission.csv")
output_csv['TRAVEL_TIME'] = preds
#output_csv['TRAVEL_TIME'] = preds
output_csv.tail()
output_csv.to_csv("my_pred.csv", index=None)


In [442]:
#visualizing
#lst = [(1,2), (3,4)]
#plt.plot(lst)