In [290]:
import pandas as pd
import torch
import numpy as np
import torch.nn as nn
import warnings
warnings.filterwarnings('ignore')

In [291]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [292]:
df = pd.read_csv("train.csv")

In [293]:
df.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."


In [294]:
df['ORIGIN_STAND'].isna().sum() / len(df)

0.5285011135987654

In [295]:
df["target"] = df["POLYLINE"].apply(lambda x : np.float32(max((x.count("[") - 2) *15, 0)))

In [296]:
df['call_A'] = df["CALL_TYPE"].apply(lambda x : int(x == 'A'))
df['call_B'] = df["CALL_TYPE"].apply(lambda x : int(x == 'B'))
df['call_C'] = df["CALL_TYPE"].apply(lambda x : int(x == 'C'))

In [297]:
df['day_A'] = df["DAY_TYPE"].apply(lambda x : int(x == 'A'))
df['day_B'] = df["DAY_TYPE"].apply(lambda x : int(x == 'B'))
df['day_C'] = df["DAY_TYPE"].apply(lambda x : int(x == 'C'))

In [298]:
df = df.loc[df['MISSING_DATA'] == False]
df = df.loc[df['target'] > 0]


In [299]:
mean, std = df["target"].mean(), df["target"].std()
median = df["target"].median()
print(f"{mean=} {median=} {std=}")
df = df[df["target"] < mean + 1.5 * std]


mean=732.0843505859375 median=615.0 std=683.1688232421875


In [300]:
def one_hot(str):
    if str == 'A':
        return [1,0,0]
    elif str == 'B':
        return [0,1,0]
    elif str == 'C':
        return [0,0,1]
    else:
        return None

In [301]:
from datetime import datetime
def parse_hour(x):
    dt = datetime.fromtimestamp(x)
    return dt.hour

In [302]:
def parse_day(x):
    dt = datetime.fromtimestamp(x)
    return dt.weekday()

In [303]:
df['TIME'] = df['TIMESTAMP'].apply(parse_hour)
df['DAY'] = df['TIMESTAMP'].apply(parse_day)

In [304]:
df_slice = df[df['ORIGIN_STAND'].isna()]
stand_nan_mean = df_slice['target'].mean()
stand_nan_mean

671.5037231445312

In [305]:
taxi_ids = set()
for ind, r in df.iterrows():
    taxi_ids.add(r['TAXI_ID'])
taxi_id_to_mean = {}
for i in taxi_ids:
    taxi_id_to_mean[i] = df[df['TAXI_ID'] == i]['target'].mean()

In [306]:
stands = set()
stand_to_mean = {}
df_temp = df[~df['ORIGIN_STAND'].isna()]
for ind, r in df_temp.iterrows():
    stands.add(r['ORIGIN_STAND'])
for i in stands:
    stand_to_mean[i] = df[df['ORIGIN_STAND'] == i]['target'].mean()

In [307]:
def apply_mean(taxi_id):
    return taxi_id_to_mean[taxi_id]

In [308]:
def apply_mean_stand(stand):
    if pd.isna(stand):
        return stand_nan_mean
    else:
        return stand_to_mean[stand]

In [309]:
pd.isna(df.loc[0]['ORIGIN_STAND'])

True

In [310]:
df['TAXI_ID_MEAN'] = df['TAXI_ID'].apply(apply_mean)
df['STAND_MEAN'] = df['ORIGIN_STAND'].apply(apply_mean_stand)
df.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,target,call_A,call_B,call_C,day_A,day_B,day_C,TIME,DAY,TAXI_ID_MEAN,STAND_MEAN
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[...",330.0,0,0,1,1,0,0,17,6,651.581116,671.503723
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[...",270.0,0,1,0,1,0,0,17,6,664.299683,628.093567
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...",960.0,0,0,1,1,0,0,17,6,631.768311,671.503723
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[...",630.0,0,0,1,1,0,0,17,6,588.854858,671.503723
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...",420.0,0,0,1,1,0,0,17,6,663.555298,671.503723


In [311]:
#df['mon'] = df["DAY"].apply(lambda x : int(x == 0))
#df['tues'] = df["DAY"].apply(lambda x : int(x == 1))
#df['wednes'] = df["DAY"].apply(lambda x : int(x == 2))
#df['thurs'] = df["DAY"].apply(lambda x : int(x == 3))
#df['fri'] = df["DAY"].apply(lambda x : int(x == 4))
#df['satur'] = df["DAY"].apply(lambda x : int(x == 5))
#df['sun'] = df["DAY"].apply(lambda x : int(x == 6))
df['weekday'] = df["DAY"].apply(lambda x : int(x <= 4))
#df['weekend'] = df["DAY"].apply(lambda x : int(x >= 5))

In [312]:
def apply_rush_hour(x):
    if (8 <= x and x <= 10) or (17 <= x and x <= 19):
        return 1
    return 0

In [313]:
df['rush'] = df['TIME'].apply(apply_rush_hour)

In [314]:
def normalize(col_name):
    df[col_name] = np.float32((df[col_name] - df[col_name].mean())/df[col_name].std())

In [315]:
def parse_num(str, index):
    comma_pos = str.find(',')
    if comma_pos == -1:
        return None
    if index == 0:
        return np.float32(str[2:comma_pos])
    else:
        return np.float32(str[comma_pos + 1:str.find(']')])

In [316]:
df['Init_longitude'] = df['POLYLINE'].apply(lambda x : parse_num(x,0))
df['Init_latitude'] = df['POLYLINE'].apply(lambda x : parse_num(x,1))
df = df.loc[df['Init_longitude'] != None]
df = df.loc[df['Init_latitude'] != None]

In [317]:
train_long_mean = df['Init_longitude'].mean()
train_lat_mean = df['Init_latitude'].mean()
train_long_std = df['Init_longitude'].std()
train_lat_std = df['Init_latitude'].std()

train_taxi_mean = df['TAXI_ID_MEAN'].mean()
train_taxi_std = df['TAXI_ID_MEAN'].std()

train_stand_mean = df['STAND_MEAN'].mean()
train_stand_std = df['STAND_MEAN'].std()

In [318]:
normalize('Init_longitude')
normalize('Init_latitude')
normalize('TAXI_ID_MEAN')
normalize('STAND_MEAN')

In [319]:
df.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,target,...,day_B,day_C,TIME,DAY,TAXI_ID_MEAN,STAND_MEAN,weekday,rush,Init_longitude,Init_latitude
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[...",330.0,...,0,0,17,6,-0.017379,0.419243,0,1,0.959561,0.959297
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[...",270.0,...,0,0,17,6,0.203464,-0.540655,0,1,0.770695,1.005467
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...",960.0,...,0,0,17,6,-0.361406,0.419243,0,1,1.010145,0.956657
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[...",630.0,...,0,0,17,6,-1.10655,0.419243,0,1,1.351156,0.985725
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...",420.0,...,0,0,17,6,0.190539,0.419243,0,1,0.715939,1.057281


In [320]:
train_set = torch.reshape(torch.tensor(df['call_A'].values),(-1,1))
#col_list = ['call_B', 'call_C','day_A', 'day_B','day_C','TIME','DAY','Init_longitude', 'Init_latitude', 'TAXI_ID_MEAN']
col_list = ['call_B', 'call_C','day_A', 'day_B','day_C','rush','weekday','Init_longitude', 'Init_latitude', 'TAXI_ID_MEAN', 'STAND_MEAN']
for col_name in col_list:
#     print(train_set.dtype)
    train_set = torch.cat((train_set,torch.reshape(torch.from_numpy(df[col_name].values),(-1,1))),1)

In [321]:
train_set = train_set.type('torch.FloatTensor')

In [322]:
target_set = torch.reshape(torch.tensor(df['target'].values),(-1,1))

In [323]:
train_set[0]

tensor([ 0.0000,  0.0000,  1.0000,  1.0000,  0.0000,  0.0000,  1.0000,  0.0000,
         0.9596,  0.9593, -0.0174,  0.4192])

train_set_linear = torch.reshape(torch.tensor(df['call_A'].values),(-1,1))
col_list = ['call_B', 'call_C','TAXI_ID', 'Init_longitude', 'Init_latitude']
for col_name in col_list:
#     print(train_set.dtype)
    train_set_linear = torch.cat((train_set_linear,torch.reshape(torch.from_numpy(df[col_name].values),(-1,1))),1)

train_set_linear = train_set_linear.type('torch.FloatTensor')

model = torch.nn.Sequential(
  torch.nn.Linear(6, 1),
).to(device)

# Define Loss Function / Objective Function
loss_fn = torch.nn.MSELoss()

# Define optimizer (this will perform your parameter updates use)
lr = 5e-3
opt = torch.optim.SGD(model.parameters(), lr=lr)
# torch.optim.Adam

train_err = []
test_err = []
parameters = []
for i in range(10):
    model.train()

    y_pred = model(train_set_linear.to(device)) # Compute model outputs
    loss = loss_fn(y_pred, target_set.to(device)) # Compute MSE
    opt.zero_grad() # Must reset the gradients every step. Otherwise, gradients from previous iterations would cause interference!!!
    loss.backward() # Compute gradients of all parameters (our model) with respect to our computed loss value (a singular value)
    opt.step() # One gradient step

    train_err.append(loss.item())

    model.eval()
#     with torch.no_grad():
#     test_err.append(loss_fn(model(X_test), y_test).item())

model[0].weight

In [None]:
class MLP_Regressor(torch.nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim = 1):
        super(MLP_Regressor, self).__init__() #Refers to the fact that this is a subclass of nn.Module and is inheriting all methods
        
        out_dim = 1
        
        self.model = torch.nn.Sequential( #an ordered container of modules
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
        )   
        self.in_dim = in_dim
        self.hidden_dim = hidden_dim
        self.out_dim = out_dim
        self.linear1 = nn.Linear(self.in_dim, self.hidden_dim)
        self.linear2 = nn.Linear(self.hidden_dim, self.out_dim)
        
                
        self.fc = torch.nn.Sequential(
            nn.Linear(self.in_dim, self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.hidden_dim * 4),
            nn.ReLU(),
            nn.Linear(self.hidden_dim * 4, self.hidden_dim * 2),
            nn.ReLU(),
            nn.Linear(self.hidden_dim * 2, self.out_dim)
        )

    
    def forward(self, x):
        x = self.fc(x)
        x = x.squeeze(1)
        return x

In [None]:
targets_mlp = torch.reshape(torch.from_numpy(df['target'].values),(-1,1))

In [None]:
targets_mlp[:5]

In [None]:
batch_size = 1024
lst_train = list(zip(train_set, targets_mlp))
len_train = len(lst_train)
print(len(lst_train))
print(lst_train[0])
trainloader = torch.utils.data.DataLoader(lst_train , batch_size=batch_size, shuffle=False, num_workers=2)

In [None]:
mlp_model = MLP_Regressor(9, 16, 1).to(device)
lr_mlp = 1e-5
optimizer = torch.optim.Adam(mlp_model.parameters(), lr = lr_mlp)
criterion = nn.MSELoss()

In [None]:
def get_loss(X, y, model, criterion):
    y_pred = model(X)
    loss = criterion(y_pred, y)
    return loss

In [None]:
losses = []

In [None]:
from tqdm import tqdm
epoch_loss = []
for epoch in tqdm(range(15)):

    running_loss = 0.0

    for X, y in trainloader:
        X, y = X.to(device), y.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        loss = get_loss(X, y, mlp_model, criterion)
        #RMSE
        loss = torch.sqrt(loss)
        losses.append(loss)
        loss.backward()
        optimizer.step()

        running_loss += loss.cpu().detach().numpy()

    epoch_loss.append(running_loss)
    print(f'Avg loss: {running_loss / batch_size}')
    print(f'Total loss: {running_loss}')
print('Finished Training')

In [324]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, validation_curve
import lightgbm as lgb
import xgboost as xgb
import matplotlib.pyplot as plt

In [327]:
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    #'num_leaves': [17, 31, 45],
    'max_depth': [-1, 5, 10, 15],
    'learning_rate': [1e-3, 1e-2, 0.1, 0.15, 0.2]
}
lgbm = lgb.LGBMRegressor()
lgbmcv = GridSearchCV(lgbm, param_grid = param_grid, scoring = 'neg_root_mean_squared_error', cv = 5)
lgbmcv.fit(train_set, target_set)

GridSearchCV(cv=5, estimator=LGBMRegressor(),
             param_grid={'learning_rate': [0.001, 0.01, 0.1, 0.15, 0.2],
                         'max_depth': [-1, 5, 10, 15],
                         'n_estimators': [50, 100, 200, 300]},
             scoring='neg_root_mean_squared_error')

In [328]:
best_lgbm = lgbmcv.best_estimator_
best_lgbm.fit(train_set, target_set)

LGBMRegressor(learning_rate=0.2, max_depth=15, n_estimators=300)

In [329]:
#best_lgbm = lgb.LGBMRegressor(n_estimators=200)
#best_lgbm.fit(train_set, target_set)
pred = best_lgbm.predict(train_set)
targets = df['target'].values
np.sqrt(np.mean((pred-targets)**2))

311.3749623580363

In [330]:
stands = pd.read_csv("metaData_taxistandsID_name_GPSlocation.csv")
stand_dict = {}
stands.head()
for r, s in stands.iterrows():
    stand_dict[s['ID']] = (np.float32(s['Longitude']), np.float32(s['Latitude']))

In [331]:
df_test = pd.read_csv("test_public.csv")
df_test['call_A'] = df_test["CALL_TYPE"].apply(lambda x : int(x == 'A'))
df_test['call_B'] = df_test["CALL_TYPE"].apply(lambda x : int(x == 'B'))
df_test['call_C'] = df_test["CALL_TYPE"].apply(lambda x : int(x == 'C'))
df_test['day_A'] = df_test["DAY_TYPE"].apply(lambda x : int(x == 'A'))
df_test['day_B'] = df_test["DAY_TYPE"].apply(lambda x : int(x == 'B'))
df_test['day_C'] = df_test["DAY_TYPE"].apply(lambda x : int(x == 'C'))
#df_test['TAXI_ID'] = (df_test['TAXI_ID'] - train_id_mean)/train_id_std
df_test['TIME'] = df_test['TIMESTAMP'].apply(parse_hour)
df_test['DAY'] = df_test['TIMESTAMP'].apply(parse_day)

In [332]:
df_test['weekday'] = df_test["DAY"].apply(lambda x : int(x <= 4))
df_test['rush'] = df_test['TIME'].apply(apply_rush_hour)

In [333]:
def apply_mean_test(taxi_id):
    if taxi_id in taxi_id_to_mean:
        return taxi_id_to_mean[taxi_id]
    else:
        return train_id_mean

In [334]:
df_test.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,call_A,call_B,call_C,day_A,day_B,day_C,TIME,DAY,weekday,rush
0,T1,B,,15.0,20000542,1408039037,A,False,0,1,0,1,0,0,10,3,1,1
1,T2,B,,57.0,20000108,1408038611,A,False,0,1,0,1,0,0,10,3,1,1
2,T3,B,,15.0,20000370,1408038568,A,False,0,1,0,1,0,0,10,3,1,1
3,T4,B,,53.0,20000492,1408039090,A,False,0,1,0,1,0,0,10,3,1,1
4,T5,B,,18.0,20000621,1408039177,A,False,0,1,0,1,0,0,10,3,1,1


In [335]:
def apply_long(x): #in: origin stand
    if not pd.isna(x):
        return stand_dict[x][0]
    else:
        return train_long_mean

In [336]:
def apply_lat(x):
    if not pd.isna(x):
        return stand_dict[x][1]
    else:
        return train_lat_mean

In [337]:
df_test['Init_longitude'] = df_test['ORIGIN_STAND'].apply(apply_long)
df_test['Init_latitude'] = df_test['ORIGIN_STAND'].apply(apply_lat)

In [338]:
df_test['Init_longitude'] = (df_test['Init_longitude'] - train_long_mean) / train_long_std
df_test['Init_latitude'] = (df_test['Init_latitude'] - train_lat_mean) / train_lat_std

In [339]:
df_test['TAXI_ID_MEAN'] = df_test['TAXI_ID'].apply(apply_mean_test)
df_test['STAND_MEAN'] = df_test['ORIGIN_STAND'].apply(apply_mean_stand)
df_test['TAXI_ID_MEAN'] = np.float32((df_test['TAXI_ID_MEAN'] - train_taxi_mean)/train_taxi_std)
df_test['STAND_MEAN'] = np.float32((df_test['STAND_MEAN'] - train_stand_mean)/train_stand_std)

In [340]:
df_test.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,call_A,call_B,...,day_B,day_C,TIME,DAY,weekday,rush,Init_longitude,Init_latitude,TAXI_ID_MEAN,STAND_MEAN
0,T1,B,,15.0,20000542,1408039037,A,False,0,1,...,0,0,10,3,1,1,1.251414,0.977394,0.168395,1.832971
1,T2,B,,57.0,20000108,1408038611,A,False,0,1,...,0,0,10,3,1,1,1.030243,0.970096,-0.104095,-0.539274
2,T3,B,,15.0,20000370,1408038568,A,False,0,1,...,0,0,10,3,1,1,1.251414,0.977394,-0.909764,1.832971
3,T4,B,,53.0,20000492,1408039090,A,False,0,1,...,0,0,10,3,1,1,1.000801,0.95879,-0.981362,-1.109691
4,T5,B,,18.0,20000621,1408039177,A,False,0,1,...,0,0,10,3,1,1,0.951007,0.976619,-0.912844,0.744761


In [341]:
#    df[col_name] = np.float32((df[col_name] - df[col_name].mean())/df[col_name].std())
test_set = torch.reshape(torch.tensor(df_test['call_A'].values),(-1,1))
col_list = ['call_B', 'call_C','day_A', 'day_B','day_C','rush','weekday','Init_longitude', 'Init_latitude', 'TAXI_ID_MEAN', 'STAND_MEAN']
for col_name in col_list:
    test_set = torch.cat((test_set,torch.reshape(torch.from_numpy(df_test[col_name].values),(-1,1))),1)
test_set = test_set.type('torch.FloatTensor')

In [342]:
test_set[:5]

tensor([[ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  1.0000,  1.0000,
          1.2514,  0.9774,  0.1684,  1.8330],
        [ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  1.0000,  1.0000,
          1.0302,  0.9701, -0.1041, -0.5393],
        [ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  1.0000,  1.0000,
          1.2514,  0.9774, -0.9098,  1.8330],
        [ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  1.0000,  1.0000,
          1.0008,  0.9588, -0.9814, -1.1097],
        [ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  1.0000,  1.0000,
          0.9510,  0.9766, -0.9128,  0.7448]])

In [343]:
test_set = test_set
preds = best_lgbm.predict(test_set)
#preds = mlp_model(test_set.to(device))


In [344]:
output_csv = pd.read_csv("sampleSubmission.csv")
output_csv['TRAVEL_TIME'] = preds
#output_csv['TRAVEL_TIME'] = preds.cpu().detach().numpy()
#output_csv['TRAVEL_TIME'] = preds_taxi
#df_test["TRAVEL_TIME"] = 716.43
output_csv.tail()
output_csv.to_csv("my_pred.csv", index=None)
