In [2]:
import pandas as pd
import torch
import numpy as np
import torch.nn as nn
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, validation_curve
import lightgbm as lgb
import xgboost as xgb
import matplotlib.pyplot as plt

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
df = pd.read_csv("train.csv")

In [5]:
df["target"] = df["POLYLINE"].apply(lambda x : np.float32(max((x.count("[") - 2) *15, 0)))
len(df)

1710670

In [6]:
mean, std = df["target"].mean(), df["target"].std()
median = df["target"].median()
#df = df[df["target"] < mean + 3 * std]
df = df[df["target"] < 15 * 185]
df = df[df["target"] >= 4]
len(df)

1656261

In [7]:
df = df.loc[df['MISSING_DATA'] == False]
df = df.loc[df['target'] > 0]
len(df)

1656255

In [8]:
from datetime import datetime
def parse_time(x):
    dt = datetime.utcfromtimestamp(x)
    return dt.year, dt.month, dt.day, dt.hour+1, dt.weekday() #monday: 0, sunday: 6

In [9]:
times = df['TIMESTAMP'].apply(parse_time)
df['year'] = [x for x,y,z,w,a in times]
df['month'] = [y for x,y,z,w,a in times]
df['day'] = [z for x,y,z,w,a in times]
df['hour'] = [w for x,y,z,w,a in times]
df['weekday'] = [a for x,y,z,w,a in times]
df.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,target,year,month,day,hour,weekday
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[...",330.0,2013,7,1,1,0
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[...",270.0,2013,7,1,1,0
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...",960.0,2013,7,1,1,0
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[...",630.0,2013,7,1,1,0
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...",420.0,2013,7,1,1,0


In [10]:
len(df[df['TAXI_ID'] - 20000000 > 1000])

0

In [11]:
df_1 = df[ ((df['day'] == 14) & (df['month'] == 8)) |
          ((df['day'] == 4) & (df['month'] == 10)) |
          ((df['day'] == 9) & (df['month'] == 6)) 
         ]
print(len(df_1))
df_1_1 = df_1[df_1['hour'] >= 12]
print(len(df_1_1))
#plt.plot(df_1_1['target'].tolist())
df_1_1['target'].describe()
df_1 = df_1.assign(DAY_TYPE = 'A')

14556
9206


In [12]:
df_2 = df[ ((df['weekday'] <= 4)) ]
df_2 = df_2[df_2['hour'] >= 8]
df_2 = df_2[df_2['hour'] <= 9]
df_2 = df_2.assign(DAY_TYPE = 'B')
len(df_2)

125048

In [13]:
df_3 = df[ ((df['weekday'] <= 4)) ]
df_3 = df_3[df_3['hour'] >= 17]
df_3 = df_3[df_3['hour'] <= 18]
df_3 = df_3.assign(DAY_TYPE = 'C')
len(df_3)

132626

In [14]:
df_4 = df[ ((df['weekday'] > 4)) ]
df_4 = df_4[df_4['hour'] >= 0]
df_4 = df_4[df_4['hour'] <= 5]
df_4 = df_4.assign(DAY_TYPE = 'D')
len(df_4)

140971

In [15]:
df_5 = df[ ((df['day'] >= 20) & (df['month'] == 12) & (df['day'] <= 23)) ]
df_5 = df_5.assign(DAY_TYPE = 'E')
len(df_5)

22172

In [16]:
df_train = pd.concat([df_1, df_2, df_3, df_4, df_5], axis = 0)
len(df_train)

435373

In [17]:
df = df_train

In [18]:
df = df.drop(columns=['TRIP_ID', 'ORIGIN_CALL', 'TIMESTAMP', 'MISSING_DATA'])
df.head()

Unnamed: 0,CALL_TYPE,ORIGIN_STAND,TAXI_ID,DAY_TYPE,POLYLINE,target,year,month,day,hour,weekday
200188,B,34.0,20000010,A,"[[-8.615691,41.140629],[-8.615385,41.140764],[...",480.0,2013,8,14,1,2
200202,C,,20000304,A,"[[-8.594631,41.150664],[-8.595468,41.15016],[-...",300.0,2013,8,14,1,2
200204,B,34.0,20000572,A,"[[-8.61561,41.140827],[-8.615592,41.140809],[-...",420.0,2013,8,14,1,2
200210,C,,20000570,A,"[[-8.610435,41.153589],[-8.611047,41.153634],[...",465.0,2013,8,14,1,2
200223,B,9.0,20000173,A,"[[-8.606439,41.144679],[-8.60643,41.14467],[-8...",450.0,2013,8,14,1,2


In [19]:
df['call_A'] = df["CALL_TYPE"].apply(lambda x : int(x == 'A'))
df['call_B'] = df["CALL_TYPE"].apply(lambda x : int(x == 'B'))
df['call_C'] = df["CALL_TYPE"].apply(lambda x : int(x == 'C'))

In [20]:
df['day_A'] = df["DAY_TYPE"].apply(lambda x : int(x == 'A'))
df['day_B'] = df["DAY_TYPE"].apply(lambda x : int(x == 'B'))
df['day_C'] = df["DAY_TYPE"].apply(lambda x : int(x == 'C'))
df['day_D'] = df["DAY_TYPE"].apply(lambda x : int(x == 'D'))
df['day_E'] = df["DAY_TYPE"].apply(lambda x : int(x == 'E'))

train_id_mean = df['TAXI_ID'].mean()
train_id_std = df['TAXI_ID'].std()
df['TAXI_ID'] = (df['TAXI_ID'] - train_id_mean)/ train_id_std

In [21]:
df_slice = df[df['ORIGIN_STAND'].isna()]
stand_nan_mean = df_slice['target'].mean()
stand_nan_mean

738.8215942382812

In [22]:
stands = set()
stand_to_mean = {}
df_temp = df[~df['ORIGIN_STAND'].isna()]
for ind, r in df_temp.iterrows():
    stands.add(r['ORIGIN_STAND'])
for i in stands:
    stand_to_mean[i] = df[df['ORIGIN_STAND'] == i]['target'].mean()

In [23]:
def apply_mean_stand(stand):
    if pd.isna(stand):
        return stand_nan_mean
    else:
        return stand_to_mean[stand]

In [24]:
taxi_ids = set()
for ind, r in df.iterrows():
    taxi_ids.add(r['TAXI_ID'])
taxi_id_to_mean = {}
for i in taxi_ids:
    taxi_id_to_mean[i] = df[df['TAXI_ID'] == i]['target'].mean()

In [25]:
def apply_mean(taxi_id):
    return taxi_id_to_mean[taxi_id]

In [26]:
df['TAXI_ID_MEAN'] = df['TAXI_ID'].apply(apply_mean)
df['STAND_MEAN'] = df['ORIGIN_STAND'].apply(apply_mean_stand)
df.head()

Unnamed: 0,CALL_TYPE,ORIGIN_STAND,TAXI_ID,DAY_TYPE,POLYLINE,target,year,month,day,hour,...,call_A,call_B,call_C,day_A,day_B,day_C,day_D,day_E,TAXI_ID_MEAN,STAND_MEAN
200188,B,34.0,20000010,A,"[[-8.615691,41.140629],[-8.615385,41.140764],[...",480.0,2013,8,14,1,...,0,1,0,1,0,0,0,0,696.652283,669.9776
200202,C,,20000304,A,"[[-8.594631,41.150664],[-8.595468,41.15016],[-...",300.0,2013,8,14,1,...,0,0,1,1,0,0,0,0,706.098267,738.821594
200204,B,34.0,20000572,A,"[[-8.61561,41.140827],[-8.615592,41.140809],[-...",420.0,2013,8,14,1,...,0,1,0,1,0,0,0,0,679.791077,669.9776
200210,C,,20000570,A,"[[-8.610435,41.153589],[-8.611047,41.153634],[...",465.0,2013,8,14,1,...,0,0,1,1,0,0,0,0,690.751526,738.821594
200223,B,9.0,20000173,A,"[[-8.606439,41.144679],[-8.60643,41.14467],[-8...",450.0,2013,8,14,1,...,0,1,0,1,0,0,0,0,615.996399,785.105835


In [27]:
train_id_mean = df['TAXI_ID_MEAN'].mean()
train_id_std = df['TAXI_ID_MEAN'].std()

In [28]:
train_stand_mean = df['STAND_MEAN'].mean()
train_stand_std = df['STAND_MEAN'].std()

In [29]:
def normalize(col_name):
    df[col_name] = np.float32((df[col_name] - df[col_name].mean())/df[col_name].std())

In [30]:
def parse_num(str, index):
    comma_pos = str.find(',')
    if comma_pos == -1:
        return None
    if index == 0:
        return np.float32(str[2:comma_pos])
    else:
        return np.float32(str[comma_pos + 1:str.find(']')])

In [31]:
df['Init_longitude'] = df['POLYLINE'].apply(lambda x : parse_num(x,0))
df['Init_latitude'] = df['POLYLINE'].apply(lambda x : parse_num(x,1))
df = df.loc[df['Init_longitude'] != None]
df = df.loc[df['Init_latitude'] != None]
train_long_mean = df['Init_longitude'].mean()
train_lat_mean = df['Init_latitude'].mean()
train_long_std = df['Init_longitude'].std()
train_lat_std = df['Init_latitude'].std()
normalize('Init_longitude')
normalize('Init_latitude')
len(df)

435373

In [32]:
normalize('STAND_MEAN')
normalize('TAXI_ID_MEAN')

In [33]:
df.head()

Unnamed: 0,CALL_TYPE,ORIGIN_STAND,TAXI_ID,DAY_TYPE,POLYLINE,target,year,month,day,hour,...,call_C,day_A,day_B,day_C,day_D,day_E,TAXI_ID_MEAN,STAND_MEAN,Init_longitude,Init_latitude
200188,B,34.0,20000010,A,"[[-8.615691,41.140629],[-8.615385,41.140764],[...",480.0,2013,8,14,1,...,0,1,0,0,0,0,-0.233646,-0.773771,-0.592812,0.644744
200202,C,,20000304,A,"[[-8.594631,41.150664],[-8.595468,41.15016],[-...",300.0,2013,8,14,1,...,1,1,0,0,0,0,-0.118268,0.388918,0.01775,0.837508
200204,B,34.0,20000572,A,"[[-8.61561,41.140827],[-8.615592,41.140809],[-...",420.0,2013,8,14,1,...,0,1,0,0,0,0,-0.439596,-0.773771,-0.590462,0.648554
200210,C,,20000570,A,"[[-8.610435,41.153589],[-8.611047,41.153634],[...",465.0,2013,8,14,1,...,1,1,0,0,0,0,-0.30572,0.388918,-0.440413,0.89363
200223,B,9.0,20000173,A,"[[-8.606439,41.144679],[-8.60643,41.14467],[-8...",450.0,2013,8,14,1,...,0,1,0,0,0,0,-1.218813,1.170601,-0.324566,0.722553


In [34]:
train_set = torch.reshape(torch.tensor(df['call_A'].values),(-1,1))
#,'TAXI_ID_MEAN'
col_list = ['call_B', 'call_C', 'day_A', 'day_B', 'day_C', 'day_D', 'day_E'
            ,'Init_longitude', 'Init_latitude', 'TAXI_ID_MEAN']
for col_name in col_list:
#     print(train_set.dtype)
    train_set = torch.cat((train_set,torch.reshape(torch.from_numpy(df[col_name].values),(-1,1))),1)

In [35]:
train_set = train_set.type('torch.FloatTensor')

In [36]:
target_set = torch.reshape(torch.tensor(df['target'].values),(-1,1))

In [37]:
train_set[0]

tensor([ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000,
        -0.5928,  0.6447, -0.2336])

In [38]:
class MLP_Regressor(torch.nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim = 1):
        super(MLP_Regressor, self).__init__() #Refers to the fact that this is a subclass of nn.Module and is inheriting all methods
        self.in_dim = in_dim
        self.hidden_dim = hidden_dim
        self.out_dim = 1
                
        self.fc = torch.nn.Sequential(
            nn.Linear(self.in_dim, self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.hidden_dim * 2),
            nn.ReLU(),
            nn.Linear(self.hidden_dim * 2, self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.out_dim)
        )

    
    def forward(self, x):
        x = self.fc(x)
        x = x.squeeze(1)
        return x

In [39]:
targets = torch.reshape(torch.from_numpy(df['target'].values),(-1,1))

In [40]:
len(targets)

435373

In [42]:
batch_size = 128
lst_all = list(zip(train_set, targets))
np.random.shuffle(lst_all)
lst_train = lst_all[:int(len(lst_all) * 0.9)]
lst_valid = lst_all[int(len(lst_all) * 0.9):]
len_train = len(lst_train)
print(len(lst_train))
print(len(lst_valid))
#print(lst_train[0])
trainloader = torch.utils.data.DataLoader(lst_train , batch_size=batch_size, shuffle=True, num_workers=2)
validloader = torch.utils.data.DataLoader(lst_valid , batch_size=batch_size, shuffle=False, num_workers=2)

391835
43538


In [49]:
mlp_model = MLP_Regressor(11, 64, 1).to(device)
lr_mlp = 5e-6
optimizer = torch.optim.Adam(mlp_model.parameters(), lr = lr_mlp)
criterion = nn.MSELoss()

In [50]:
def get_loss(X, y, model, criterion):
    y_pred = model(X)
    loss = criterion(y_pred, y)
    return torch.sqrt(loss)

In [51]:
models = []

In [53]:
def validate(dataloader, model, criterion):
    valid_losses = []
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            loss = get_loss(X, y, model, criterion)
            valid_losses.append(loss.item())
    
    return np.mean(valid_losses)

In [52]:
from tqdm import tqdm
for epoch in tqdm(range(5)):

    running_loss = 0.0
    losses = []
    for X, y in trainloader:
        X, y = X.to(device), y.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        loss = get_loss(X, y, mlp_model, criterion)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()

        running_loss += loss.cpu().detach().numpy()

    models.append(torch.jit.script(mlp_model))
    print(f'RMSE: {running_loss / len(losses)}')
    print(f'Total loss: {running_loss}')
print('Finished Training')


 20%|██        | 1/5 [00:46<03:05, 46.47s/it]

RMSE: 825.6171279000271
Total loss: 2528039.645629883


 40%|████      | 2/5 [01:32<02:18, 46.07s/it]

RMSE: 820.5610161799375
Total loss: 2512557.8315429688


 60%|██████    | 3/5 [02:20<01:34, 47.20s/it]

RMSE: 806.010960753956
Total loss: 2468005.5618286133


 80%|████████  | 4/5 [03:08<00:47, 47.27s/it]

RMSE: 776.2574492590946
Total loss: 2376900.3096313477


100%|██████████| 5/5 [03:54<00:00, 46.80s/it]

RMSE: 725.939240458742
Total loss: 2222825.954284668
Finished Training





NameError: name 'validate' is not defined

In [54]:
v_loss = validate(validloader, mlp_model, criterion)
print(f'Valid RMSE: {v_loss}')

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x0000022FEEBDB9D0>
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\torch\utils\data\dataloader.py", line 1479, in __del__
    self._shutdown_workers()
  File "C:\ProgramData\Anaconda3\lib\site-packages\torch\utils\data\dataloader.py", line 1437, in _shutdown_workers
    if self._persistent_workers or self._workers_status[worker_id]:
AttributeError: '_MultiProcessingDataLoaderIter' object has no attribute '_workers_status'
  return F.mse_loss(input, target, reduction=self.reduction)


Valid RMSE: 693.2578767569534


In [42]:
model_scripted = torch.jit.script(mlp_model)
#model_scripted.save('whyworking.pt')

In [None]:
#mlp_model = torch.jit.load('mlp_model.pt')
#mlp_model.eval()

In [240]:
import warnings
warnings.filterwarnings('ignore')

param_grid = {
    'n_estimators': [50, 100, 300, 500],
    #'num_leaves': [17, 31, 45],
    'max_depth': [-1, 5, 10, 15],
    'learning_rate': [1e-3, 1e-2, 0.1, 0.2]
}
lgbm = lgb.LGBMRegressor()
lgbmcv = GridSearchCV(lgbm, param_grid = param_grid, scoring = 'neg_root_mean_squared_error', cv = 8)
lgbmcv.fit(train_set, target_set)

best_lgbm = lgbmcv.best_estimator_
pred = best_lgbm.predict(train_set)
targets = df['target'].values
np.sqrt(np.mean((pred-targets)**2))

In [55]:
stands = pd.read_csv("metaData_taxistandsID_name_GPSlocation.csv")
stand_dict = {}
stands.head()
for r, s in stands.iterrows():
    stand_dict[s['ID']] = (np.float32(s['Longitude']), np.float32(s['Latitude']))

In [56]:
df_test = pd.read_csv("test_public.csv")

In [57]:
df_test = df_test.drop(columns=['ORIGIN_CALL','MISSING_DATA'])
df_test.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE
0,T1,B,15.0,20000542,1408039037,A
1,T2,B,57.0,20000108,1408038611,A
2,T3,B,15.0,20000370,1408038568,A
3,T4,B,53.0,20000492,1408039090,A
4,T5,B,18.0,20000621,1408039177,A


In [58]:
len(df_test['TAXI_ID'].unique())

244

In [59]:
parse_time(1408039037)

(2014, 8, 14, 18, 3)

In [60]:
times = df_test['TIMESTAMP'].apply(parse_time)
df_test['year'] = [x for x,y,z,w,a in times]
df_test['month'] = [y for x,y,z,w,a in times]
df_test['day'] = [z for x,y,z,w,a in times]
df_test['hour'] = [w for x,y,z,w,a in times]
df_test['weekday'] = [a for x,y,z,w,a in times]
df_test.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,year,month,day,hour,weekday
0,T1,B,15.0,20000542,1408039037,A,2014,8,14,18,3
1,T2,B,57.0,20000108,1408038611,A,2014,8,14,18,3
2,T3,B,15.0,20000370,1408038568,A,2014,8,14,18,3
3,T4,B,53.0,20000492,1408039090,A,2014,8,14,18,3
4,T5,B,18.0,20000621,1408039177,A,2014,8,14,18,3


In [61]:
len(df_test)

320

In [62]:
df_test1 = df_test[(df_test['day'] == 14) & (df_test['month'] == 8)]
#df_test1['DAY_TYPE'] = 'C'
#8/14, one day before holiday, 18pm
print(len(df_test1))
df_test1 = df_test1.assign(DAY_TYPE = 'A')
#df_test1.head()

74


In [63]:
df_test2 = df_test[ ((df_test['day'] == 30) & (df_test['month'] == 9))] 
print(len(df_test2))
df_test2 = df_test2.assign(DAY_TYPE = 'B')
#9/30, normal weekday 7-9am

77


In [64]:
df_test3 = df_test[(df_test['day'] == 6) & (df_test['month'] == 10)]
print(len(df_test3))
df_test3 = df_test3.assign(DAY_TYPE = 'C')
#10/6, normal weekday, 18pm

77


In [65]:
df_test4 = df_test[(df_test['day'] == 1) & (df_test['month'] == 11)]
print(len(df_test4))
df_test4 = df_test4.assign(DAY_TYPE = 'D')

#11/1, Saturday 4am (??)

62


In [66]:
df_test5 = df_test[(df_test['day'] == 21) & (df_test['month'] == 12)]
print(len(df_test5))
df_test5 = df_test5.assign(DAY_TYPE = 'E')

#12/21, sunday 3pm before christmas

30


In [67]:
df_test = pd.concat([df_test1, df_test2, df_test3, df_test4, df_test5], axis = 0)
df_test

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,year,month,day,hour,weekday
0,T1,B,15.0,20000542,1408039037,A,2014,8,14,18,3
1,T2,B,57.0,20000108,1408038611,A,2014,8,14,18,3
2,T3,B,15.0,20000370,1408038568,A,2014,8,14,18,3
3,T4,B,53.0,20000492,1408039090,A,2014,8,14,18,3
4,T5,B,18.0,20000621,1408039177,A,2014,8,14,18,3
...,...,...,...,...,...,...,...,...,...,...,...
315,T323,A,,20000430,1419171485,E,2014,12,21,15,6
316,T324,B,53.0,20000020,1419170802,E,2014,12,21,15,6
317,T325,C,,20000207,1419172121,E,2014,12,21,15,6
318,T326,A,,20000667,1419171980,E,2014,12,21,15,6


In [68]:

df_test['call_A'] = df_test["CALL_TYPE"].apply(lambda x : int(x == 'A'))
df_test['call_B'] = df_test["CALL_TYPE"].apply(lambda x : int(x == 'B'))
df_test['call_C'] = df_test["CALL_TYPE"].apply(lambda x : int(x == 'C'))
df_test['day_A'] = df_test["DAY_TYPE"].apply(lambda x : int(x == 'A'))
df_test['day_B'] = df_test["DAY_TYPE"].apply(lambda x : int(x == 'B'))
df_test['day_C'] = df_test["DAY_TYPE"].apply(lambda x : int(x == 'C'))
df_test['day_D'] = df_test["DAY_TYPE"].apply(lambda x : int(x == 'D'))
df_test['day_E'] = df_test["DAY_TYPE"].apply(lambda x : int(x == 'E'))


In [69]:
df_test['STAND_MEAN'] = df_test['ORIGIN_STAND'].apply(apply_mean_stand)
df_test['STAND_MEAN'] = np.float32((df_test['STAND_MEAN'] - train_stand_mean)/train_stand_std)

In [70]:
def apply_mean_test(taxi_id):
    if taxi_id in taxi_id_to_mean:
        return taxi_id_to_mean[taxi_id]
    else:
        return train_id_mean

In [71]:

df_test['TAXI_ID_MEAN'] = df_test['TAXI_ID'].apply(apply_mean_test)
df_test['TAXI_ID_MEAN'] = np.float32((df_test['TAXI_ID_MEAN'] - train_id_mean)/train_id_std)

In [72]:
def apply_long(x): #in: origin stand
    if not pd.isna(x):
        return stand_dict[x][0]
    else:
        return x

In [73]:
def apply_lat(x):
    if not pd.isna(x):
        return stand_dict[x][1]
    else:
        return x

In [74]:
df_test['Init_longitude'] = df_test['ORIGIN_STAND'].apply(apply_long)
df_test['Init_latitude'] = df_test['ORIGIN_STAND'].apply(apply_lat)

In [75]:
df_test[:10]

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,year,month,day,hour,...,call_C,day_A,day_B,day_C,day_D,day_E,STAND_MEAN,TAXI_ID_MEAN,Init_longitude,Init_latitude
0,T1,B,15.0,20000542,1408039037,A,2014,8,14,18,...,0,1,0,0,0,0,2.324633,0.099967,-8.585876,41.148628
1,T2,B,57.0,20000108,1408038611,A,2014,8,14,18,...,0,1,0,0,0,0,-0.921834,-0.30738,-8.610707,41.145718
2,T3,B,15.0,20000370,1408038568,A,2014,8,14,18,...,0,1,0,0,0,0,2.324633,-0.754649,-8.585876,41.148628
3,T4,B,53.0,20000492,1408039090,A,2014,8,14,18,...,0,1,0,0,0,0,-0.922573,-1.020312,-8.614013,41.141209
4,T5,B,18.0,20000621,1408039177,A,2014,8,14,18,...,0,1,0,0,0,0,1.134218,-0.788176,-8.619603,41.148319
5,T6,A,,20000607,1408037146,A,2014,8,14,18,...,0,1,0,0,0,0,0.388918,2.51091,,
6,T7,B,15.0,20000310,1408038846,A,2014,8,14,18,...,0,1,0,0,0,0,2.324633,-0.653464,-8.585876,41.148628
7,T8,A,,20000619,1408038948,A,2014,8,14,18,...,0,1,0,0,0,0,0.388918,0.596003,,
8,T9,B,9.0,20000503,1408038563,A,2014,8,14,18,...,0,1,0,0,0,0,1.170601,-1.229139,-8.60572,41.144253
9,T10,B,15.0,20000327,1408038021,A,2014,8,14,18,...,0,1,0,0,0,0,2.324633,0.632277,-8.585876,41.148628


In [76]:
for i, row in df_test.iterrows():
    if pd.isna(row['ORIGIN_STAND']):
        if row['DAY_TYPE'] == 'A':
            df_t = df_1[ ((df_1['day'] == 14) & (df_1['month'] == 8))]
            df_t = df_t[df_t['hour'] == 18]
        elif row['DAY_TYPE'] == 'B':
            df_t = df_2[((df_2['day'] == 30) & (df_2['month'] == 9))]
            df_t = df_2[df_2['hour'] == 9]

        elif row['DAY_TYPE'] == 'C':
            df_t = df_3[(df_3['day'] == 6) & (df_3['month'] == 10)]
            df_t = df_3[df_3['hour'] == 18]

        elif row['DAY_TYPE'] == 'D':
            df_t = df_4[(df_4['day'] == 1) & (df_4['month'] == 11)]
            df_t = df_4[df_4['hour'] == 4]

        else:
            df_t = df_5[(df_5['day'] == 21) & (df_5['month'] == 12)]
            df_t = df_5[df_5['hour'] == 15]

        df_t['Init_longitude'] = df_t['POLYLINE'].apply(lambda x : parse_num(x,0))
        df_t['Init_latitude'] = df_t['POLYLINE'].apply(lambda x : parse_num(x,1))
        s = df_t.sample() #change to mean
        df_test.at[i,'Init_longitude'] = s['Init_longitude']
        df_test.at[i,'Init_latitude'] = s['Init_latitude']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_t['Init_longitude'] = df_t['POLYLINE'].apply(lambda x : parse_num(x,0))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_t['Init_latitude'] = df_t['POLYLINE'].apply(lambda x : parse_num(x,1))


In [77]:

df_test['Init_longitude'] = (df_test['Init_longitude'] - train_long_mean) / train_long_std
df_test['Init_latitude'] = (df_test['Init_latitude'] - train_lat_mean) / train_lat_std

In [78]:
df_test[:10]

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,year,month,day,hour,...,call_C,day_A,day_B,day_C,day_D,day_E,STAND_MEAN,TAXI_ID_MEAN,Init_longitude,Init_latitude
0,T1,B,15.0,20000542,1408039037,A,2014,8,14,18,...,0,1,0,0,0,0,2.324633,0.099967,0.271564,0.798384
1,T2,B,57.0,20000108,1408038611,A,2014,8,14,18,...,0,1,0,0,0,0,-0.921834,-0.30738,-0.448321,0.742481
2,T3,B,15.0,20000370,1408038568,A,2014,8,14,18,...,0,1,0,0,0,0,2.324633,-0.754649,0.271564,0.798384
3,T4,B,53.0,20000492,1408039090,A,2014,8,14,18,...,0,1,0,0,0,0,-0.922573,-1.020312,-0.54415,0.655881
4,T5,B,18.0,20000621,1408039177,A,2014,8,14,18,...,0,1,0,0,0,0,1.134218,-0.788176,-0.706226,0.792449
5,T6,A,,20000607,1408037146,A,2014,8,14,18,...,0,1,0,0,0,0,0.388918,2.51091,-0.713083,0.786515
6,T7,B,15.0,20000310,1408038846,A,2014,8,14,18,...,0,1,0,0,0,0,2.324633,-0.653464,0.271564,0.798384
7,T8,A,,20000619,1408038948,A,2014,8,14,18,...,0,1,0,0,0,0,0.388918,0.596003,-0.638736,0.93737
8,T9,B,9.0,20000503,1408038563,A,2014,8,14,18,...,0,1,0,0,0,0,1.170601,-1.229139,-0.303719,0.714347
9,T10,B,15.0,20000327,1408038021,A,2014,8,14,18,...,0,1,0,0,0,0,2.324633,0.632277,0.271564,0.798384


In [79]:

#    df[col_name] = np.float32((df[col_name] - df[col_name].mean())/df[col_name].std())
test_set = torch.reshape(torch.tensor(df_test['call_A'].values),(-1,1))
col_list = ['call_B', 'call_C', 'day_A', 'day_B', 'day_C', 'day_D', 'day_E'
            ,'Init_longitude', 'Init_latitude', 'TAXI_ID_MEAN']
#col_list = ['call_B', 'call_C','TAXI_ID', 'Init_longitude', 'Init_latitude']
for col_name in col_list:
#     print(train_set.dtype)
    test_set = torch.cat((test_set,torch.reshape(torch.from_numpy(df_test[col_name].values),(-1,1))),1)
test_set = test_set.type('torch.FloatTensor')

In [80]:
test_set[0]

tensor([0.0000, 1.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.2716,
        0.7984, 0.1000])

In [81]:
preds = mlp_model(test_set.to(device))
#preds = best_lgbm.predict(test_set)

In [82]:
output_csv = pd.read_csv("sampleSubmission.csv")
output_csv['TRAVEL_TIME'] = preds.cpu().detach().numpy()
#output_csv['TRAVEL_TIME'] = preds
output_csv.tail()
output_csv.to_csv("my_pred.csv", index=None)


In [83]:
#visualizing
#lst = [(1,2), (3,4)]
#plt.plot(lst)