In [1]:
import pandas as pd
import torch
import numpy as np
import torch.nn as nn
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
# Load the training table; the path is relative to the notebook's working directory.
df = pd.read_csv("train.csv")

In [4]:
df.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."


In [5]:
df['ORIGIN_STAND'].isna().sum() / len(df)

0.5285011135987654

In [6]:
def _trip_seconds(polyline):
    """Derive the regression target from the raw POLYLINE string.

    The string contains one '[' for the outer list plus one per point, so
    ``count("[") - 2`` is the number of point-to-point intervals. Each
    interval is weighted by 15 (presumably a 15-second sampling period --
    TODO confirm against the dataset description). Negative results
    (empty polylines) are clamped to 0.
    """
    n_open_brackets = polyline.count("[")
    return np.float32(max(15 * (n_open_brackets - 2), 0))


df["target"] = df["POLYLINE"].apply(_trip_seconds)

In [7]:
# One 0/1 indicator column per CALL_TYPE letter: call_A, call_B, call_C.
for call_letter in ("A", "B", "C"):
    df[f"call_{call_letter}"] = df["CALL_TYPE"].eq(call_letter).astype("int64")

In [8]:
# One 0/1 indicator column per DAY_TYPE letter: day_A, day_B, day_C.
for day_letter in ("A", "B", "C"):
    df[f"day_{day_letter}"] = df["DAY_TYPE"].eq(day_letter).astype("int64")

In [9]:
# Keep only rows that are not flagged as missing data and whose target is positive.
complete_rows = df['MISSING_DATA'] == False
positive_target = df['target'] > 0
df = df.loc[complete_rows & positive_target]


In [10]:
# Summary statistics of the target after filtering.
target_values = df["target"]
mean = target_values.mean()
std = target_values.std()
median = target_values.median()
print(f"{mean=} {median=} {std=}")
# Optional outlier cut (disabled): keep only targets below mean + 3 * std.
#df = df[df["target"] < mean + 3 * std]


mean=732.0843505859375 median=615.0 std=683.1688232421875


In [11]:
def one_hot(str):
    """Return a fresh 3-element one-hot list for 'A', 'B' or 'C'.

    Any other value yields None. Note that the parameter name shadows the
    built-in ``str`` inside this function; it is kept for compatibility.
    """
    for position, label in enumerate(('A', 'B', 'C')):
        if str == label:
            return [int(slot == position) for slot in range(3)]
    return None

In [12]:
from datetime import datetime
def parse_hour(x, tz=None):
    """Return the hour of day (0-23) for a Unix timestamp given in seconds.

    Parameters
    ----------
    x : int or float
        POSIX timestamp in seconds.
    tz : datetime.tzinfo, optional
        Timezone in which to interpret the timestamp. With the default
        ``None`` the timestamp is converted to the *local timezone of the
        machine running the kernel*, so results differ between machines.
        Pass an explicit tzinfo (e.g. ``timezone.utc``) for reproducible
        features.

    Returns
    -------
    int
        Hour component of the converted datetime.
    """
    return datetime.fromtimestamp(x, tz).hour

In [13]:
def parse_day(x, tz=None):
    """Return the day of week (Monday=0 ... Sunday=6) for a Unix timestamp in seconds.

    Parameters
    ----------
    x : int or float
        POSIX timestamp in seconds.
    tz : datetime.tzinfo, optional
        Timezone in which to interpret the timestamp. With the default
        ``None`` the timestamp is converted to the *local timezone of the
        machine running the kernel*, so results differ between machines.
        Pass an explicit tzinfo (e.g. ``timezone.utc``) for reproducible
        features.

    Returns
    -------
    int
        ``datetime.weekday()`` of the converted datetime.
    """
    return datetime.fromtimestamp(x, tz).weekday()

In [14]:
# Hour-of-day and day-of-week features derived from the Unix TIMESTAMP column.
# NOTE(review): parse_hour/parse_day are called without a timezone, so these
# values follow the local timezone of the machine running the kernel.
trip_start = df['TIMESTAMP']
df['TIME'] = trip_start.map(parse_hour)
df['DAY'] = trip_start.map(parse_day)

In [15]:
# Mean target over the trips that have no ORIGIN_STAND; used as the
# encoding value for a missing stand.
stand_is_missing = df['ORIGIN_STAND'].isna()
df_slice = df[stand_is_missing]
stand_nan_mean = df_slice['target'].mean()
stand_nan_mean

787.2562866210938

In [16]:
taxi_ids = set()
for ind, r in df.iterrows():
    taxi_ids.add(r['TAXI_ID'])
taxi_id_to_mean = {}
for i in taxi_ids:
    taxi_id_to_mean[i] = df[df['TAXI_ID'] == i]['target'].mean()

In [17]:
# Mean target per origin stand, restricted to rows that have a stand.
# A single groupby replaces the row-wise iteration and per-stand re-filtering.
df_temp = df[~df['ORIGIN_STAND'].isna()]
stand_to_mean = df_temp.groupby('ORIGIN_STAND')['target'].mean().to_dict()
# Set of all stand ids that occur in the training frame.
stands = set(stand_to_mean)

In [18]:
def apply_mean(taxi_id):
    """Look up the mean target stored for ``taxi_id`` in ``taxi_id_to_mean``.

    Raises KeyError when the id has no entry.
    """
    mean_for_taxi = taxi_id_to_mean[taxi_id]
    return mean_for_taxi

In [19]:
def apply_mean_stand(stand):
    """Return the mean target for an origin stand.

    A missing stand (NaN/None) maps to ``stand_nan_mean``; any other value is
    looked up in ``stand_to_mean`` and raises KeyError when it has no entry.
    """
    stand_known = not pd.isna(stand)
    if stand_known:
        return stand_to_mean[stand]
    return stand_nan_mean

In [20]:
pd.isna(df.loc[0]['ORIGIN_STAND'])

True

In [21]:
# Mean-target encodings for the taxi id and the origin stand.
df['TAXI_ID_MEAN'] = df['TAXI_ID'].map(apply_mean)
df['STAND_MEAN'] = df['ORIGIN_STAND'].map(apply_mean_stand)
df.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,target,call_A,call_B,call_C,day_A,day_B,day_C,TIME,DAY,TAXI_ID_MEAN,STAND_MEAN
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[...",330.0,0,0,1,1,0,0,17,6,694.473022,787.256287
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[...",270.0,0,1,0,1,0,0,17,6,728.770874,656.767151
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...",960.0,0,0,1,1,0,0,17,6,707.499146,787.256287
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[...",630.0,0,0,1,1,0,0,17,6,808.640808,787.256287
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...",420.0,0,0,1,1,0,0,17,6,758.926819,787.256287


In [22]:
# DAY holds datetime.weekday() values (0 = Monday ... 6 = Sunday). Instead of
# seven per-day indicator columns, two complementary 0/1 flags are used.
day_index = df["DAY"]
df['weekday'] = (day_index <= 4).astype('int64')
df['weekend'] = (day_index >= 5).astype('int64')

In [23]:
def apply_rush_hour(x):
    """Return 1 when hour ``x`` lies in 8-10 or 17-19 (inclusive), else 0."""
    morning_peak = 8 <= x <= 10
    evening_peak = 17 <= x <= 19
    return 1 if (morning_peak or evening_peak) else 0

In [24]:
def apply_no_rush(x):
    """Return 0 when hour ``x`` lies in 8-10 or 17-19 (inclusive), else 1."""
    morning_peak = 8 <= x <= 10
    evening_peak = 17 <= x <= 19
    return 0 if (morning_peak or evening_peak) else 1

In [25]:
# Complementary 0/1 flags derived from the hour-of-day column.
hour_of_day = df['TIME']
df['rush'] = hour_of_day.map(apply_rush_hour)
df['no_rush'] = hour_of_day.map(apply_no_rush)

In [26]:
def normalize(col_name):
    """Z-score the column ``col_name`` of the global ``df`` in place.

    The column is replaced by ``(value - mean) / std`` cast to float32.
    Calling it again re-normalises the already normalised values.
    """
    column = df[col_name]
    centered = column - column.mean()
    df[col_name] = np.float32(centered / column.std())

In [27]:
def parse_num(str, index):
    """Extract one number of the first point from a polyline string.

    The text up to the first comma (skipping the two leading characters) is
    returned for ``index == 0``; otherwise the text between the first comma
    and the first ']' is returned. Both are converted with ``np.float32``.
    Returns None when the string contains no comma. The parameter name
    shadows the built-in ``str`` inside this function.
    """
    first_comma = str.find(',')
    if first_comma < 0:
        return None
    if index == 0:
        start, stop = 2, first_comma
    else:
        start, stop = first_comma + 1, str.find(']')
    return np.float32(str[start:stop])

In [28]:
# First point of each trajectory: element 0 and element 1 of the first pair.
df['Init_longitude'] = df['POLYLINE'].apply(lambda x : parse_num(x,0))
df['Init_latitude'] = df['POLYLINE'].apply(lambda x : parse_num(x,1))
# parse_num returns None when no coordinate can be parsed. Comparing a Series
# with `!= None` is True for every element (including NaN/None) and therefore
# never drops anything, so missing values are removed with notna() instead.
has_start_point = df['Init_longitude'].notna() & df['Init_latitude'].notna()
df = df.loc[has_start_point]

In [29]:
# Training-set mean/std of each continuous feature, captured BEFORE the
# columns are normalised so the same statistics can be applied to test data.
start_longitude = df['Init_longitude']
start_latitude = df['Init_latitude']
train_long_mean, train_long_std = start_longitude.mean(), start_longitude.std()
train_lat_mean, train_lat_std = start_latitude.mean(), start_latitude.std()

taxi_encoding = df['TAXI_ID_MEAN']
train_taxi_mean, train_taxi_std = taxi_encoding.mean(), taxi_encoding.std()

stand_encoding = df['STAND_MEAN']
train_stand_mean, train_stand_std = stand_encoding.mean(), stand_encoding.std()

In [30]:
# Z-score the continuous feature columns in place (normalize mutates df, so
# re-running this cell normalises already normalised values).
for continuous_col in ('Init_longitude', 'Init_latitude', 'TAXI_ID_MEAN', 'STAND_MEAN'):
    normalize(continuous_col)

In [31]:
df.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,target,...,TIME,DAY,TAXI_ID_MEAN,STAND_MEAN,weekday,weekend,rush,no_rush,Init_longitude,Init_latitude
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[...",330.0,...,17,6,-0.269866,0.743967,0,1,1,0,0.96109,0.961183
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[...",270.0,...,17,6,-0.023518,-1.012761,0,1,1,0,0.786753,1.004535
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...",960.0,...,17,6,-0.176305,0.743967,0,1,1,0,1.007784,0.958705
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[...",630.0,...,17,6,0.550158,0.743967,0,1,1,0,1.322562,0.985998
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...",420.0,...,17,6,0.193081,0.743967,0,1,1,0,0.73621,1.053185


In [32]:
# Feature matrix: call_A first, then the columns of col_list in order, each
# as an (n_rows, 1) tensor, concatenated along dim 1.
#col_list = ['call_B', 'call_C','day_A', 'day_B','day_C','TIME','DAY','Init_longitude', 'Init_latitude', 'TAXI_ID_MEAN']
col_list = ['call_B', 'call_C','day_A', 'day_B','day_C','rush', 'no_rush','weekday','weekend','Init_longitude', 'Init_latitude', 'TAXI_ID_MEAN', 'STAND_MEAN']
train_columns = [torch.tensor(df['call_A'].values).reshape(-1, 1)]
train_columns.extend(
    torch.from_numpy(df[feature_name].values).reshape(-1, 1)
    for feature_name in col_list
)
train_set = torch.cat(train_columns, 1)

In [33]:
# Cast the feature matrix to 32-bit floats (CPU tensor).
train_set = train_set.type(torch.FloatTensor)

In [34]:
# Targets as an (n_rows, 1) column tensor.
target_set = torch.tensor(df['target'].values).reshape(-1, 1)

In [35]:
train_set[0]

tensor([ 0.0000,  0.0000,  1.0000,  1.0000,  0.0000,  0.0000,  1.0000,  0.0000,
         0.0000,  1.0000,  0.9611,  0.9612, -0.2699,  0.7440])

In [36]:
len(train_set[0])

14

# ---- Linear-regression baseline -------------------------------------------
# Features: call_A followed by the columns of col_list.
col_list = ['call_B', 'call_C','TAXI_ID', 'Init_longitude', 'Init_latitude']
linear_features = df[['call_A'] + col_list].astype('float64')

# TAXI_ID is a raw identifier with values around 2e7 (see the df.head()
# output). Fed unscaled into SGD with lr=5e-3 it makes the updates blow up,
# and float32 cannot represent ids of that size exactly. It is therefore
# standardised in float64 before the conversion to a float32 tensor.
taxi_id_std = linear_features['TAXI_ID'].std()
if not taxi_id_std > 0:
    taxi_id_std = 1.0  # constant (or single-row) column: avoid dividing by 0/NaN
linear_features['TAXI_ID'] = (linear_features['TAXI_ID'] - linear_features['TAXI_ID'].mean()) / taxi_id_std

train_set_linear = torch.tensor(linear_features.values, dtype=torch.float32)

model = torch.nn.Sequential(
  torch.nn.Linear(6, 1),
).to(device)

# Define Loss Function / Objective Function
loss_fn = torch.nn.MSELoss()

# Define optimizer (this will perform your parameter updates use)
lr = 5e-3
opt = torch.optim.SGD(model.parameters(), lr=lr)
# torch.optim.Adam

train_err = []
test_err = []    # never filled: no held-out set is evaluated in this cell
parameters = []  # never filled in this cell

# Move the full-batch inputs to the device once instead of on every iteration.
linear_inputs = train_set_linear.to(device)
linear_targets = target_set.to(device)

model.train()
for i in range(10):
    y_pred = model(linear_inputs) # Compute model outputs
    loss = loss_fn(y_pred, linear_targets) # Compute MSE
    opt.zero_grad() # Reset gradients so earlier iterations do not accumulate
    loss.backward() # Gradients of the loss w.r.t. all model parameters
    opt.step() # One gradient step

    train_err.append(loss.item())
model.eval()

model[0].weight

In [86]:
class MLP_Regressor(torch.nn.Module):
    """Fully connected regressor.

    Layer widths: in_dim -> 2*hidden_dim -> hidden_dim -> in_dim -> out_dim,
    with a ReLU after every layer except the last.

    Parameters
    ----------
    in_dim : int
        Number of input features.
    hidden_dim : int
        Base width of the hidden layers.
    out_dim : int, default 1
        Number of outputs per sample. The argument is now honoured; it was
        previously overwritten with 1 inside ``__init__``.

    Only ``self.fc`` is registered. The earlier ``model``, ``linear1`` and
    ``linear2`` sub-modules were never used in ``forward`` and only added
    untrained parameters to the optimizer and the state dict.
    """

    def __init__(self, in_dim, hidden_dim, out_dim = 1):
        super().__init__()

        self.in_dim = in_dim
        self.hidden_dim = hidden_dim
        self.out_dim = out_dim

        self.fc = torch.nn.Sequential(
            nn.Linear(self.in_dim, self.hidden_dim * 2),
            nn.ReLU(),
            nn.Linear(self.hidden_dim * 2, self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.in_dim),
            nn.ReLU(),
            nn.Linear(self.in_dim, self.out_dim)
        )

    def forward(self, x):
        """Map a (batch, in_dim) input to predictions.

        Dimension 1 is squeezed, so with ``out_dim == 1`` the result has
        shape (batch,); for larger ``out_dim`` the squeeze is a no-op and
        the shape is (batch, out_dim).
        """
        x = self.fc(x)
        x = x.squeeze(1)
        return x

In [87]:
# Targets for the MLP as an (n_rows, 1) column tensor sharing memory with the
# underlying NumPy array.
targets_mlp = torch.from_numpy(df['target'].values).reshape(-1, 1)

In [93]:
batch_size = 252
# TensorDataset indexes the two tensors lazily and returns (features, target)
# tuples, so len() and [0] behave like the former list(zip(...)) without
# materialising one Python tuple per training row.
lst_train = torch.utils.data.TensorDataset(train_set, targets_mlp)
len_train = len(lst_train)
print(len(lst_train))
print(lst_train[0])
trainloader = torch.utils.data.DataLoader(lst_train , batch_size=batch_size, shuffle=True, num_workers=2)

1674152
(tensor([ 0.0000,  0.0000,  1.0000,  1.0000,  0.0000,  0.0000,  1.0000,  0.0000,
         0.0000,  1.0000,  0.9611,  0.9612, -0.2699,  0.7440]), tensor([330.]))


In [94]:
# MLP with 14 input features, base hidden width 16 and a single output,
# trained with Adam on a mean-squared-error criterion.
criterion = nn.MSELoss()
lr_mlp = 5e-6
mlp_model = MLP_Regressor(in_dim=14, hidden_dim=16, out_dim=1).to(device)
optimizer = torch.optim.Adam(mlp_model.parameters(), lr=lr_mlp)

In [95]:
def get_loss(X, y, model, criterion):
    """Return ``sqrt(criterion(model(X), y))`` -- RMSE when criterion is MSE.

    Parameters
    ----------
    X : torch.Tensor
        Input batch passed to ``model``.
    y : torch.Tensor
        Targets for the batch.
    model : callable
        Maps ``X`` to predictions.
    criterion : callable
        Loss taking ``(prediction, target)``.

    The target is reshaped to the prediction's shape when both hold the same
    number of elements. Without this, a (batch,) prediction compared against
    a (batch, 1) target is broadcast to (batch, batch) and the loss averages
    over every prediction/target pair instead of matching pairs.
    """
    y_pred = model(X)
    if y.shape != y_pred.shape and y.numel() == y_pred.numel():
        y = y.reshape(y_pred.shape)
    loss = criterion(y_pred, y)
    return torch.sqrt(loss)

In [96]:
# Per-batch training losses; the training loop appends one entry per batch.
losses = []

In [None]:
from tqdm import tqdm
epoch_loss = []
n_batches = len(trainloader)
for epoch in tqdm(range(10)):

    running_loss = 0.0

    for X, y in trainloader:
        X, y = X.to(device), y.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        loss = get_loss(X, y, mlp_model, criterion)
        # Store the batch RMSE as a plain float: appending the tensor itself
        # would keep every batch's autograd graph alive for the whole run.
        batch_rmse = loss.item()
        losses.append(batch_rmse)
        loss.backward()
        optimizer.step()

        running_loss += batch_rmse

    epoch_loss.append(running_loss)
    # Mean of the per-batch RMSE values; dividing by the number of batches
    # stays correct when the last batch is smaller than batch_size.
    print(f'RMSE: {running_loss / n_batches}')
    print(f'Total loss: {running_loss}')
print('Finished Training')

  0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, validation_curve
import lightgbm as lgb
import xgboost as xgb
import matplotlib.pyplot as plt

In [None]:
# Exhaustive 5-fold grid search over the LightGBM regressor, scored by
# negative RMSE (4 * 4 * 5 = 80 parameter combinations).
param_grid = dict(
    n_estimators=[50, 100, 200, 300],
    #num_leaves=[17, 31, 45],
    max_depth=[-1, 5, 10, 15],
    learning_rate=[1e-3, 1e-2, 0.1, 0.15, 0.2],
)
lgbm = lgb.LGBMRegressor()
lgbmcv = GridSearchCV(
    estimator=lgbm,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
)
lgbmcv.fit(train_set, target_set)

In [None]:
# The search is built without a `refit` argument, so GridSearchCV's default
# (refit=True) has already retrained the best configuration on the full
# training data. best_estimator_ is ready to use; fitting it again would only
# repeat that work.
best_lgbm = lgbmcv.best_estimator_

In [69]:
# Training-set RMSE of the selected LightGBM model.
# Alternative without the grid search:
#best_lgbm = lgb.LGBMRegressor(n_estimators=200)
#best_lgbm.fit(train_set, target_set)
pred = best_lgbm.predict(train_set)
targets = df['target'].values
squared_errors = (pred - targets) ** 2
np.sqrt(np.mean(squared_errors))

NameError: name 'best_lgbm' is not defined

In [71]:
# Stand id -> (longitude, latitude) as float32, read from the stand metadata.
# Note: this rebinds `stands`, which earlier held the set of stand ids.
stands = pd.read_csv("metaData_taxistandsID_name_GPSlocation.csv")
stand_dict = {
    stand_id: (np.float32(longitude), np.float32(latitude))
    for stand_id, longitude, latitude in zip(
        stands['ID'], stands['Longitude'], stands['Latitude']
    )
}

In [72]:
# Load the public test set and rebuild the categorical and time features
# with the same column names as in training.
df_test = pd.read_csv("test_public.csv")
for prefix, source_col in (("call", "CALL_TYPE"), ("day", "DAY_TYPE")):
    for letter in ("A", "B", "C"):
        df_test[f"{prefix}_{letter}"] = df_test[source_col].eq(letter).astype("int64")
test_timestamps = df_test['TIMESTAMP']
df_test['TIME'] = test_timestamps.map(parse_hour)
df_test['DAY'] = test_timestamps.map(parse_day)

In [73]:
# Weekday/weekend and rush/no-rush flags for the test set.
test_day_index = df_test["DAY"]
df_test['weekday'] = (test_day_index <= 4).astype('int64')
df_test['weekend'] = (test_day_index >= 5).astype('int64')
test_hour = df_test['TIME']
df_test['rush'] = test_hour.map(apply_rush_hour)
df_test['no_rush'] = test_hour.map(apply_no_rush)

In [74]:
def apply_mean_test(taxi_id):
    """Return the training mean target for ``taxi_id``, with a fallback.

    Taxi ids that have no entry in ``taxi_id_to_mean`` fall back to
    ``train_taxi_mean``, the mean of the training TAXI_ID_MEAN column taken
    before normalisation. The previous fallback referenced
    ``train_id_mean``, which is never defined, so any unseen taxi id raised
    NameError.
    """
    return taxi_id_to_mean.get(taxi_id, train_taxi_mean)

In [75]:
df_test.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,call_A,call_B,call_C,day_A,day_B,day_C,TIME,DAY,weekday,weekend,rush,no_rush
0,T1,B,,15.0,20000542,1408039037,A,False,0,1,0,1,0,0,10,3,1,0,1,0
1,T2,B,,57.0,20000108,1408038611,A,False,0,1,0,1,0,0,10,3,1,0,1,0
2,T3,B,,15.0,20000370,1408038568,A,False,0,1,0,1,0,0,10,3,1,0,1,0
3,T4,B,,53.0,20000492,1408039090,A,False,0,1,0,1,0,0,10,3,1,0,1,0
4,T5,B,,18.0,20000621,1408039177,A,False,0,1,0,1,0,0,10,3,1,0,1,0


In [76]:
def apply_long(x): #in: origin stand
    """Longitude for origin stand ``x``.

    A missing stand (NaN/None) yields ``train_long_mean``; otherwise the
    first element of ``stand_dict[x]`` is returned (KeyError if absent).
    """
    if pd.isna(x):
        return train_long_mean
    longitude = stand_dict[x][0]
    return longitude

In [77]:
def apply_lat(x):
    """Latitude for origin stand ``x``.

    A missing stand (NaN/None) yields ``train_lat_mean``; otherwise the
    second element of ``stand_dict[x]`` is returned (KeyError if absent).
    """
    if pd.isna(x):
        return train_lat_mean
    latitude = stand_dict[x][1]
    return latitude

In [78]:
# The start coordinates of a test trip are taken from its origin stand
# (training mean when the stand is missing).
test_origin_stand = df_test['ORIGIN_STAND']
df_test['Init_longitude'] = test_origin_stand.map(apply_long)
df_test['Init_latitude'] = test_origin_stand.map(apply_lat)

In [79]:
# Scale the test coordinates with the training mean/std.
for coord_col, center, spread in (
    ('Init_longitude', train_long_mean, train_long_std),
    ('Init_latitude', train_lat_mean, train_lat_std),
):
    df_test[coord_col] = (df_test[coord_col] - center) / spread

In [80]:
# Mean-target encodings for the test set, scaled with the training mean/std
# and stored as float32.
taxi_encoding_raw = df_test['TAXI_ID'].apply(apply_mean_test)
stand_encoding_raw = df_test['ORIGIN_STAND'].apply(apply_mean_stand)
df_test['TAXI_ID_MEAN'] = np.float32((taxi_encoding_raw - train_taxi_mean) / train_taxi_std)
df_test['STAND_MEAN'] = np.float32((stand_encoding_raw - train_stand_mean) / train_stand_std)

In [81]:
df_test.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,call_A,call_B,...,TIME,DAY,weekday,weekend,rush,no_rush,Init_longitude,Init_latitude,TAXI_ID_MEAN,STAND_MEAN
0,T1,B,,15.0,20000542,1408039037,A,False,0,1,...,10,3,1,0,1,0,1.230492,0.978175,0.083549,0.780853
1,T2,B,,57.0,20000108,1408038611,A,False,0,1,...,10,3,1,0,1,0,1.026335,0.971323,-0.262844,-0.792781
2,T3,B,,15.0,20000370,1408038568,A,False,0,1,...,10,3,1,0,1,0,1.230492,0.978175,-0.737079,0.780853
3,T4,B,,53.0,20000492,1408039090,A,False,0,1,...,10,3,1,0,1,0,0.999158,0.960707,-0.676633,-1.26664
4,T5,B,,18.0,20000621,1408039177,A,False,0,1,...,10,3,1,0,1,0,0.953194,0.977448,-0.687932,-0.097428


In [82]:
# Test feature matrix with the same column order as the training matrix:
# call_A first, then the columns of col_list, cast to 32-bit floats.
col_list = ['call_B', 'call_C','day_A', 'day_B','day_C','rush', 'no_rush','weekday','weekend','Init_longitude', 'Init_latitude', 'TAXI_ID_MEAN', 'STAND_MEAN']
test_columns = [torch.tensor(df_test['call_A'].values).reshape(-1, 1)]
test_columns.extend(
    torch.from_numpy(df_test[feature_name].values).reshape(-1, 1)
    for feature_name in col_list
)
test_set = torch.cat(test_columns, 1).type(torch.FloatTensor)

In [83]:
test_set[:5]

tensor([[ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  1.0000,  0.0000,
          1.0000,  0.0000,  1.2305,  0.9782,  0.0835,  0.7809],
        [ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  1.0000,  0.0000,
          1.0000,  0.0000,  1.0263,  0.9713, -0.2628, -0.7928],
        [ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  1.0000,  0.0000,
          1.0000,  0.0000,  1.2305,  0.9782, -0.7371,  0.7809],
        [ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  1.0000,  0.0000,
          1.0000,  0.0000,  0.9992,  0.9607, -0.6766, -1.2666],
        [ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0000,  1.0000,  0.0000,
          1.0000,  0.0000,  0.9532,  0.9774, -0.6879, -0.0974]])

In [84]:
# Inference only: switch the model to eval mode and disable autograd so no
# computation graph is built for the predictions.
#preds = best_lgbm.predict(test_set)
mlp_model.eval()
with torch.no_grad():
    preds = mlp_model(test_set.to(device))


In [85]:
# Fill the sample submission with the MLP predictions and write it out
# without the index column.
output_csv = pd.read_csv("sampleSubmission.csv")
travel_time = preds.detach().cpu().numpy()
output_csv['TRAVEL_TIME'] = travel_time
output_csv.to_csv("my_pred.csv", index=False)
