In [1]:
import warnings
from datetime import datetime, timezone

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
warnings.filterwarnings('ignore')

In [2]:
# Run on the GPU when CUDA is usable, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
# Load the raw training table from the parent directory.
df = pd.read_csv(filepath_or_buffer="../train.csv")

In [4]:
# Preview the first five rows of the frame.
df.head(5)

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."


In [5]:
# Fraction of rows whose ORIGIN_STAND is missing (mean of the boolean mask).
df['ORIGIN_STAND'].isna().mean()

0.5285011135987654

In [6]:
# target = 15 * (number of "[" characters in POLYLINE - 2), floored at 0, as float32.
df["target"] = df["POLYLINE"].map(
    lambda polyline: np.float32(max(15 * (polyline.count("[") - 2), 0))
)

In [7]:
# 0/1 indicator columns for each CALL_TYPE letter (vectorised comparison).
df['call_A'] = (df["CALL_TYPE"] == 'A').astype('int64')
df['call_B'] = (df["CALL_TYPE"] == 'B').astype('int64')
df['call_C'] = (df["CALL_TYPE"] == 'C').astype('int64')

In [8]:
# 0/1 indicator columns for each DAY_TYPE letter (vectorised comparison).
df['day_A'] = (df["DAY_TYPE"] == 'A').astype('int64')
df['day_B'] = (df["DAY_TYPE"] == 'B').astype('int64')
df['day_C'] = (df["DAY_TYPE"] == 'C').astype('int64')

In [9]:
# Keep only rows that are not flagged MISSING_DATA and have a positive target.
# The explicit .copy() makes the result an independent frame: later cells add
# and overwrite columns on `df`, which on a bare .loc[] selection triggers
# SettingWithCopyWarning (silenced here by the global warnings filter).
valid_rows = df['MISSING_DATA'].eq(False) & (df['target'] > 0)
df = df.loc[valid_rows].copy()


In [10]:
# Summary statistics of the target column.
mean = df["target"].mean()
median = df["target"].median()
std = df["target"].std()
print(f"{mean=} {median=} {std=}")
# Optional outlier trim (disabled): df = df[df["target"] < mean + 3 * std]


mean=732.0452 median=615.0 std=683.1688232421875


In [11]:
def one_hot(str):
    """Encode a category letter as a 3-element indicator list.

    'A', 'B' and 'C' map to a 1 in position 0, 1 and 2 respectively;
    any other value yields None.
    """
    for position, letter in enumerate(('A', 'B', 'C')):
        if str == letter:
            encoding = [0, 0, 0]
            encoding[position] = 1
            return encoding
    return None

In [12]:
from datetime import datetime
def parse_hour(x):
    """Return the hour of day (0-23) of a Unix timestamp, interpreted in UTC.

    Args:
        x: Seconds since the Unix epoch.

    Returns:
        int: Hour component of the timestamp in UTC.
    """
    # A bare fromtimestamp() converts to the host's local time zone, so the
    # feature would change from machine to machine; pin the zone explicitly.
    # NOTE(review): if local (e.g. Portuguese) time is wanted instead, convert
    # with an explicit zone rather than relying on the host setting.
    dt = datetime.fromtimestamp(x, tz=timezone.utc)
    return dt.hour

In [13]:
def parse_month(x):
    """Return the month (1-12) of a Unix timestamp, interpreted in UTC.

    Args:
        x: Seconds since the Unix epoch.

    Returns:
        int: Month component of the timestamp in UTC.
    """
    # Explicit UTC keeps the result independent of the host's local time zone,
    # which a bare fromtimestamp() would otherwise use.
    dt = datetime.fromtimestamp(x, tz=timezone.utc)
    return dt.month

In [14]:
def parse_year(x):
    """Return the calendar year of a Unix timestamp, interpreted in UTC.

    Args:
        x: Seconds since the Unix epoch.

    Returns:
        int: Year component of the timestamp in UTC.
    """
    # Explicit UTC keeps the result independent of the host's local time zone,
    # which a bare fromtimestamp() would otherwise use.
    dt = datetime.fromtimestamp(x, tz=timezone.utc)
    return dt.year

In [15]:
def parse_day(x):
    """Return the day of week of a Unix timestamp, interpreted in UTC.

    Args:
        x: Seconds since the Unix epoch.

    Returns:
        int: ``datetime.weekday()`` value, 0 for Monday through 6 for Sunday.
    """
    # Explicit UTC keeps the result independent of the host's local time zone,
    # which a bare fromtimestamp() would otherwise use.
    dt = datetime.fromtimestamp(x, tz=timezone.utc)
    return dt.weekday()

In [16]:
# Calendar features derived element-wise from the Unix TIMESTAMP column.
df['TIME'] = df['TIMESTAMP'].map(parse_hour)
df['DAY'] = df['TIMESTAMP'].map(parse_day)
df['MONTH'] = df['TIMESTAMP'].map(parse_month)
df['YEAR'] = df['TIMESTAMP'].map(parse_year)

In [17]:
# Mean target over the rows that have no ORIGIN_STAND; used as the stand-level
# feature value for such rows.
df_slice = df.loc[df['ORIGIN_STAND'].isna()]
stand_nan_mean = df_slice['target'].mean()
stand_nan_mean

787.3528

In [18]:
# Mean target per TAXI_ID, computed in a single grouped pass instead of
# iterating every row and re-scanning the whole frame once per taxi.
# Values may differ from the per-taxi Series.mean() in the last float32 digits
# because the grouped reduction accumulates differently.
taxi_means = df.groupby('TAXI_ID')['target'].mean()
# zip over the raw array keeps the values in the column's numpy dtype.
taxi_id_to_mean = dict(zip(taxi_means.index, taxi_means.to_numpy()))
taxi_ids = set(taxi_id_to_mean)

In [19]:
# Mean target per ORIGIN_STAND over the rows that have a stand, computed in a
# single grouped pass instead of iterating every row and re-scanning the whole
# frame once per stand. Values may differ from the per-stand Series.mean() in
# the last float32 digits because the grouped reduction accumulates differently.
df_temp = df[~df['ORIGIN_STAND'].isna()]
stand_means = df_temp.groupby('ORIGIN_STAND')['target'].mean()
# zip over the raw array keeps the values in the column's numpy dtype.
stand_to_mean = dict(zip(stand_means.index, stand_means.to_numpy()))
stands = set(stand_to_mean)

In [20]:
def apply_mean(taxi_id):
    """Look up the mean target recorded for `taxi_id` in `taxi_id_to_mean`.

    Raises KeyError when the id has no entry.
    """
    taxi_mean = taxi_id_to_mean[taxi_id]
    return taxi_mean

In [21]:
def apply_mean_stand(stand):
    """Return the mean target associated with an origin stand.

    A missing stand (NaN/None) maps to `stand_nan_mean`; any other value is
    looked up in `stand_to_mean` and raises KeyError when it has no entry.
    """
    # Guard clause: rows without a stand share one precomputed mean.
    if pd.isna(stand):
        return stand_nan_mean
    return stand_to_mean[stand]

In [22]:
# Sanity check: is ORIGIN_STAND missing for the row labelled 0?
pd.isna(df.loc[0, 'ORIGIN_STAND'])

True

In [23]:
# Attach the per-taxi and per-stand mean targets as feature columns.
df['TAXI_ID_MEAN'] = df['TAXI_ID'].map(apply_mean)
df['STAND_MEAN'] = df['ORIGIN_STAND'].map(apply_mean_stand)
df.head(5)

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,target,...,call_C,day_A,day_B,day_C,TIME,DAY,MONTH,YEAR,TAXI_ID_MEAN,STAND_MEAN
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[...",330.0,...,1,1,0,0,0,0,7,2013,694.473022,787.352783
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[...",270.0,...,0,1,0,0,0,0,7,2013,728.770874,656.767151
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...",960.0,...,1,1,0,0,0,0,7,2013,707.499146,787.352783
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[...",630.0,...,1,1,0,0,0,0,7,2013,808.640808,787.352783
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...",420.0,...,1,1,0,0,0,0,7,2013,758.926819,787.352783


In [24]:
# Distinct DAY_TYPE values present in the frame.
pd.unique(df['DAY_TYPE'])

array(['A'], dtype=object)

In [25]:
# Weekday / weekend indicators from DAY (datetime.weekday(): 0 = Monday ... 6 = Sunday).
# A per-day one-hot encoding (mon ... sun) was tried previously and is not used.
df['weekday'] = (df["DAY"] <= 4).astype('int64')
df['weekend'] = (df["DAY"] >= 5).astype('int64')

In [26]:
def apply_rush_hour(x):
    """Return 1 when hour `x` lies in 8..10 or 17..19 (inclusive), else 0."""
    morning_peak = 8 <= x <= 10
    evening_peak = 17 <= x <= 19
    return 1 if (morning_peak or evening_peak) else 0

In [27]:
def apply_no_rush(x):
    """Return 0 when hour `x` lies in 8..10 or 17..19 (inclusive), else 1."""
    morning_peak = 8 <= x <= 10
    evening_peak = 17 <= x <= 19
    return 0 if (morning_peak or evening_peak) else 1

In [28]:
# Complementary rush-hour indicators derived from the hour-of-day column.
df['rush'] = df['TIME'].map(apply_rush_hour)
df['no_rush'] = df['TIME'].map(apply_no_rush)

In [29]:
def normalize(col_name):
    """Standardise column `col_name` of the global `df` in place.

    The column is replaced by (value - column mean) / column std, stored as float32.
    """
    column = df[col_name]
    centered = column - column.mean()
    df[col_name] = np.float32(centered / column.std())

In [30]:
def parse_num(str, index):
    """Extract one number of the first "[[a,b]..." pair from a polyline string.

    Args:
        str: Polyline text; the first number is read from offset 2 up to the
            first comma, the second from after that comma up to the first "]".
        index: 0 selects the first number, any other value the second.

    Returns:
        np.float32 of the selected number, or None when the text has no comma.
    """
    first_comma = str.find(',')
    if first_comma == -1:
        return None
    if index == 0:
        start, stop = 2, first_comma
    else:
        start, stop = first_comma + 1, str.find(']')
    return np.float32(str[start:stop])

In [31]:
# First longitude / latitude of each polyline (parse_num returns None when
# the text contains no comma).
df['Init_longitude'] = df['POLYLINE'].apply(lambda x : parse_num(x,0))
df['Init_latitude'] = df['POLYLINE'].apply(lambda x : parse_num(x,1))
# Drop rows whose coordinates could not be parsed. `series != None` compares
# against a null scalar and is True for every row in pandas, so it never
# filters anything; notna() is the check that actually removes None/NaN.
has_coordinates = df['Init_longitude'].notna() & df['Init_latitude'].notna()
df = df.loc[has_coordinates]

In [32]:
# Mean / std of each continuous training feature, captured before the columns
# are standardised so the same constants can be reused on other data.
train_long_mean, train_long_std = df['Init_longitude'].mean(), df['Init_longitude'].std()
train_lat_mean, train_lat_std = df['Init_latitude'].mean(), df['Init_latitude'].std()

train_month_mean, train_month_std = df['MONTH'].mean(), df['MONTH'].std()
train_year_mean, train_year_std = df['YEAR'].mean(), df['YEAR'].std()

train_taxi_mean, train_taxi_std = df['TAXI_ID_MEAN'].mean(), df['TAXI_ID_MEAN'].std()

train_stand_mean, train_stand_std = df['STAND_MEAN'].mean(), df['STAND_MEAN'].std()

In [33]:
# Standardise every continuous feature column in place.
for feature_name in (
    'Init_longitude',
    'Init_latitude',
    'TAXI_ID_MEAN',
    'STAND_MEAN',
    'YEAR',
    'MONTH',
):
    normalize(feature_name)

In [34]:
# Preview the first five rows after standardisation.
df.head(5)

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,target,...,MONTH,YEAR,TAXI_ID_MEAN,STAND_MEAN,weekday,weekend,rush,no_rush,Init_longitude,Init_latitude
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[...",330.0,...,0.119626,-0.993774,-0.270042,0.741063,1,0,0,1,-0.011017,-0.036741
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[...",270.0,...,0.119626,-0.993774,-0.023533,-1.008637,1,0,0,1,-0.185354,0.00661
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...",960.0,...,0.119626,-0.993774,-0.176419,0.741063,1,0,0,1,0.035677,-0.03922
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[...",630.0,...,0.119626,-0.993774,0.550516,0.741063,1,0,0,1,0.350455,-0.011927
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...",420.0,...,0.119626,-0.993774,0.193207,0.741063,1,0,0,1,-0.235897,0.05526


In [35]:
# Assemble the (n_rows, 13) feature matrix: call_A first, then col_list in order.
col_list = ['call_B', 'call_C','rush', 'no_rush','weekday','weekend','Init_longitude', 'Init_latitude', 'TAXI_ID_MEAN', 'STAND_MEAN','YEAR','MONTH']
feature_columns = [torch.tensor(df['call_A'].values).reshape(-1, 1)]
for col_name in col_list:
    feature_columns.append(torch.from_numpy(df[col_name].values).reshape(-1, 1))
# One concatenation along dim 1; dtype promotion matches pairwise concatenation.
train_set = torch.cat(feature_columns, 1)

In [36]:
# Cast the feature matrix to 32-bit floats.
train_set = train_set.to(torch.float32)

In [37]:
# Targets as an (n_rows, 1) column tensor.
target_set = torch.tensor(df['target'].values).reshape(-1, 1)

In [38]:
# Inspect the first feature vector.
train_set[0, :]

tensor([ 0.0000,  0.0000,  1.0000,  0.0000,  1.0000,  1.0000,  0.0000, -0.0110,
        -0.0367, -0.2700,  0.7411, -0.9938,  0.1196])

In [39]:
# Number of features per row.
train_set.shape[1]

13

# Feature matrix for the linear baseline: call_A followed by col_list, as float32.
col_list = ['call_B', 'call_C','TAXI_ID', 'Init_longitude', 'Init_latitude']
linear_columns = [torch.tensor(df['call_A'].values).reshape(-1, 1)]
for col_name in col_list:
    linear_columns.append(torch.from_numpy(df[col_name].values).reshape(-1, 1))

train_set_linear = torch.cat(linear_columns, 1).type('torch.FloatTensor')

# Linear baseline: one affine layer mapping the 6 input features to 1 output.
model = nn.Sequential(nn.Linear(6, 1)).to(device)

# Objective: mean squared error.
loss_fn = nn.MSELoss()

# Plain SGD performs the parameter updates (torch.optim.Adam is an alternative).
lr = 5e-3
opt = torch.optim.SGD(model.parameters(), lr=lr)

train_err = []
test_err = []
parameters = []
for i in range(10):
    model.train()

    # Full-batch step: forward pass and MSE against the targets.
    inputs_on_device = train_set_linear.to(device)
    targets_on_device = target_set.to(device)
    y_pred = model(inputs_on_device)
    loss = loss_fn(y_pred, targets_on_device)

    # Clear stale gradients, back-propagate, then take one optimizer step.
    opt.zero_grad()
    loss.backward()
    opt.step()

    train_err.append(loss.item())

    model.eval()
    # No held-out evaluation is performed here; test_err stays empty.

model[0].weight

In [40]:
class MLP_Regressor(torch.nn.Module):
    """Four-layer fully connected regressor with SiLU activations.

    Layer widths: in_dim -> 2 * hidden_dim -> hidden_dim -> in_dim -> out_dim.

    Args:
        in_dim: Number of input features.
        hidden_dim: Base width of the hidden layers.
        out_dim: Number of outputs per sample (default 1).
    """

    def __init__(self, in_dim, hidden_dim, out_dim = 1):
        super().__init__()

        # out_dim is honoured as passed; it used to be overwritten with 1 here,
        # which silently ignored the caller's argument.
        self.in_dim = in_dim
        self.hidden_dim = hidden_dim
        self.out_dim = out_dim

        self.fc = torch.nn.Sequential(
            nn.Linear(self.in_dim, self.hidden_dim * 2),
            nn.SiLU(),
            nn.Linear(self.hidden_dim * 2, self.hidden_dim),
            nn.SiLU(),
            nn.Linear(self.hidden_dim, self.in_dim),
            nn.SiLU(),
            nn.Linear(self.in_dim, self.out_dim)
        )

    def forward(self, x):
        """Run the network on `x`.

        Returns a tensor of shape (batch,) when out_dim == 1; squeeze(1) only
        removes a size-1 dimension, so wider outputs stay (batch, out_dim).
        """
        x = self.fc(x)
        x = x.squeeze(1)
        return x

In [41]:
# Targets for the MLP as an (n_rows, 1) column tensor.
targets_mlp = torch.from_numpy(df['target'].values).unsqueeze(1)

In [42]:
batch_size = 32
# Pair each feature row with its target, shuffle, then split 80 / 20.
lst_all = list(zip(train_set, targets_mlp))
np.random.shuffle(lst_all)
split_at = int(len(lst_all) * 0.8)
lst_train, lst_valid = lst_all[:split_at], lst_all[split_at:]
len_train = len(lst_train)
print(len(lst_train))
print(len(lst_valid))
# Only the training loader shuffles between passes.
trainloader = torch.utils.data.DataLoader(
    lst_train, batch_size=batch_size, shuffle=True, num_workers=2
)
validloader = torch.utils.data.DataLoader(
    lst_valid, batch_size=batch_size, shuffle=False, num_workers=2
)

1339321
334831


In [43]:
# Inspect the last (features, target) pair of the training split.
lst_train[len(lst_train) - 1]

(tensor([ 0.0000,  1.0000,  0.0000,  0.0000,  1.0000,  1.0000,  0.0000,  0.1484,
         -0.0185,  0.9802, -1.2669,  1.0063, -1.6415]),
 tensor([1125.]))

In [48]:
# MLP over the 13 input features, trained with Adam on an MSE criterion.
mlp_model = MLP_Regressor(in_dim=13, hidden_dim=16, out_dim=1)
mlp_model = mlp_model.to(device)
lr_mlp = 5e-6
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(mlp_model.parameters(), lr = lr_mlp)

In [49]:
def get_loss(X, y, model, criterion):
    """Return the square root of `criterion(model(X), y)`.

    Args:
        X: Batch of inputs passed to `model`.
        y: Targets for the batch.
        model: Callable producing predictions from `X`.
        criterion: Loss callable taking (prediction, target).

    Returns:
        Tensor holding sqrt(criterion(...)); RMSE when `criterion` is an MSE.
    """
    y_pred = model(X)
    # Predictions of shape (B,) against targets of shape (B, 1) would broadcast
    # to (B, B) inside an element-wise criterion and average over every
    # prediction/target pair. Align the target to the prediction's shape when
    # both hold the same number of elements.
    if y.shape != y_pred.shape and y.numel() == y_pred.numel():
        y = y.reshape(y_pred.shape)
    loss = criterion(y_pred, y)
    loss = torch.sqrt(loss)
    return loss

In [50]:
def validate(dataloader, model, criterion):
    """Evaluate `model` on `dataloader` without tracking gradients.

    Each batch loss from `get_loss` is squared, weighted by the batch size and
    accumulated, so the result is the root of the sample-weighted mean squared
    loss rather than an unweighted mean of per-batch roots (which gives a short
    final batch the same weight as a full one and understates the RMSE).
    Assumes `get_loss` returns the square root of a per-batch mean loss, as it
    does with a mean-reduced MSE criterion.

    Args:
        dataloader: Iterable yielding (X, y) batches.
        model: Model passed through to `get_loss`.
        criterion: Loss callable passed through to `get_loss`.

    Returns:
        float: Aggregated validation loss; NaN when the loader yields no samples.
    """
    total_squared = 0.0
    n_samples = 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            batch_root = get_loss(X, y, model, criterion).item()
            batch_n = len(y)
            total_squared += (batch_root ** 2) * batch_n
            n_samples += batch_n

    if n_samples == 0:
        return float("nan")
    return float(np.sqrt(total_squared / n_samples))

In [None]:
from tqdm import tqdm
epoch_loss = []
valid_loss = np.inf  # best validation loss so far; only used by the disabled early stop below
for epoch in tqdm(range(10)):

    running_loss = 0.0
    losses = []
    # The train / validation split built earlier is reused for every epoch.
    # Re-shuffling and re-splitting lst_all per epoch would move rows the model
    # has already trained on into the validation set, so the reported
    # validation loss would no longer be a held-out estimate. trainloader
    # already reshuffles the training rows each epoch (shuffle=True).
    for X, y in trainloader:
        X, y = X.to(device), y.to(device)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # forward + backward + optimize (the loss returned by get_loss is an RMSE)
        loss = get_loss(X, y, mlp_model, criterion)
        loss.backward()
        optimizer.step()

        # .item() yields a plain Python float, detached from the graph.
        batch_loss = loss.item()
        losses.append(batch_loss)
        running_loss += batch_loss

    epoch_loss.append(running_loss)
    v_loss = validate(validloader, mlp_model, criterion)
    print(f'Train RMSE: {np.mean(losses)}')
    print(f'Valid RMSE: {v_loss}')
    # Early stopping (disabled): break when v_loss > valid_loss or the
    # improvement valid_loss - v_loss is <= 5; otherwise set valid_loss = v_loss.
print('Finished Training')

In [52]:
# Taxi-stand metadata: map each stand ID to its (longitude, latitude) as float32.
stands = pd.read_csv("../metaData_taxistandsID_name_GPSlocation.csv")
stand_dict = {
    stand_id: (np.float32(longitude), np.float32(latitude))
    for stand_id, longitude, latitude in zip(
        stands['ID'], stands['Longitude'], stands['Latitude']
    )
}

In [53]:
# Load the public test table and derive the same categorical / calendar
# features as for the training frame.
df_test = pd.read_csv("../test_public.csv")
df_test['call_A'] = df_test["CALL_TYPE"].eq('A').astype('int64')
df_test['call_B'] = df_test["CALL_TYPE"].eq('B').astype('int64')
df_test['call_C'] = df_test["CALL_TYPE"].eq('C').astype('int64')
df_test['day_A'] = df_test["DAY_TYPE"].eq('A').astype('int64')
df_test['day_B'] = df_test["DAY_TYPE"].eq('B').astype('int64')
df_test['day_C'] = df_test["DAY_TYPE"].eq('C').astype('int64')
df_test['TIME'] = df_test['TIMESTAMP'].map(parse_hour)
df_test['DAY'] = df_test['TIMESTAMP'].map(parse_day)
df_test['MONTH'] = df_test['TIMESTAMP'].map(parse_month)
df_test['YEAR'] = df_test['TIMESTAMP'].map(parse_year)

In [54]:
# Weekday / weekend and rush-hour indicators for the test rows.
df_test['weekday'] = (df_test["DAY"] <= 4).astype('int64')
df_test['weekend'] = (df_test["DAY"] >= 5).astype('int64')
df_test['rush'] = df_test['TIME'].map(apply_rush_hour)
df_test['no_rush'] = df_test['TIME'].map(apply_no_rush)

In [55]:
def apply_mean_test(taxi_id):
    """Return the training mean target for `taxi_id`, with a fallback.

    Args:
        taxi_id: Taxi identifier to look up in `taxi_id_to_mean`.

    Returns:
        The taxi's entry in `taxi_id_to_mean`, or `train_taxi_mean` (the mean
        of the un-normalised TAXI_ID_MEAN training column) when the taxi has
        no entry.
    """
    # The fallback previously referenced `train_id_mean`, a name that is never
    # assigned, so any taxi without an entry raised NameError. train_taxi_mean
    # is also the constant subtracted when this feature is standardised, so an
    # unknown taxi ends up at 0 after scaling.
    return taxi_id_to_mean.get(taxi_id, train_taxi_mean)

In [56]:
# Preview the first five test rows.
df_test.head(5)

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,call_A,call_B,...,day_B,day_C,TIME,DAY,MONTH,YEAR,weekday,weekend,rush,no_rush
0,T1,B,,15.0,20000542,1408039037,A,False,0,1,...,0,0,17,3,8,2014,1,0,1,0
1,T2,B,,57.0,20000108,1408038611,A,False,0,1,...,0,0,17,3,8,2014,1,0,1,0
2,T3,B,,15.0,20000370,1408038568,A,False,0,1,...,0,0,17,3,8,2014,1,0,1,0
3,T4,B,,53.0,20000492,1408039090,A,False,0,1,...,0,0,17,3,8,2014,1,0,1,0
4,T5,B,,18.0,20000621,1408039177,A,False,0,1,...,0,0,17,3,8,2014,1,0,1,0


In [57]:
def apply_long(x):
    """Return the longitude stored in `stand_dict` for origin stand `x`.

    A missing stand (NaN/None) yields `train_long_mean`; a stand without an
    entry in `stand_dict` raises KeyError.
    """
    if pd.isna(x):
        return train_long_mean
    longitude, _latitude = stand_dict[x]
    return longitude

In [58]:
def apply_lat(x):
    """Return the latitude stored in `stand_dict` for origin stand `x`.

    A missing stand (NaN/None) yields `train_lat_mean`; a stand without an
    entry in `stand_dict` raises KeyError.
    """
    if pd.isna(x):
        return train_lat_mean
    _longitude, latitude = stand_dict[x]
    return latitude

In [59]:
# Starting coordinates for test rows are taken from the origin stand.
df_test['Init_longitude'] = df_test['ORIGIN_STAND'].map(apply_long)
df_test['Init_latitude'] = df_test['ORIGIN_STAND'].map(apply_lat)

In [60]:
# Standardise the test coordinates with the training mean / std.
df_test['Init_longitude'] = df_test['Init_longitude'].sub(train_long_mean).div(train_long_std)
df_test['Init_latitude'] = df_test['Init_latitude'].sub(train_lat_mean).div(train_lat_std)

In [61]:
# Standardise MONTH and YEAR with the training mean / std.
df_test['MONTH'] = df_test['MONTH'].sub(train_month_mean).div(train_month_std)
df_test['YEAR'] = df_test['YEAR'].sub(train_year_mean).div(train_year_std)

In [62]:
# Per-taxi and per-stand mean features for the test rows, standardised with
# the training statistics and stored as float32.
df_test['TAXI_ID_MEAN'] = df_test['TAXI_ID'].map(apply_mean_test)
df_test['STAND_MEAN'] = df_test['ORIGIN_STAND'].map(apply_mean_stand)
taxi_scaled = (df_test['TAXI_ID_MEAN'] - train_taxi_mean) / train_taxi_std
stand_scaled = (df_test['STAND_MEAN'] - train_stand_mean) / train_stand_std
df_test['TAXI_ID_MEAN'] = np.float32(taxi_scaled)
df_test['STAND_MEAN'] = np.float32(stand_scaled)

In [63]:
# Preview the first five test rows after feature engineering.
df_test.head(5)

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,call_A,call_B,...,MONTH,YEAR,weekday,weekend,rush,no_rush,Init_longitude,Init_latitude,TAXI_ID_MEAN,STAND_MEAN
0,T1,B,,15.0,20000542,1408039037,A,False,0,1,...,0.413153,1.006265,1,0,1,0,0.258385,-0.019749,0.083604,0.776486
1,T2,B,,57.0,20000108,1408038611,A,False,0,1,...,0.413153,1.006265,1,0,1,0,0.054229,-0.026602,-0.263015,-0.789696
2,T3,B,,15.0,20000370,1408038568,A,False,0,1,...,0.413153,1.006265,1,0,1,0,0.258385,-0.019749,-0.737559,0.776486
3,T4,B,,53.0,20000492,1408039090,A,False,0,1,...,0.413153,1.006265,1,0,1,0,0.027052,-0.037217,-0.677073,-1.261319
4,T5,B,,18.0,20000621,1408039177,A,False,0,1,...,0.413153,1.006265,1,0,1,0,-0.018913,-0.020477,-0.688379,-0.097664


In [64]:
# Assemble the test feature matrix with the same column order as train_set:
# call_A first, then col_list.
col_list = ['call_B', 'call_C','rush', 'no_rush','weekday','weekend','Init_longitude', 'Init_latitude', 'TAXI_ID_MEAN', 'STAND_MEAN','YEAR','MONTH']
test_columns = [torch.tensor(df_test['call_A'].values).reshape(-1, 1)]
for col_name in col_list:
    test_columns.append(torch.from_numpy(df_test[col_name].values).reshape(-1, 1))
# One concatenation along dim 1, then cast to 32-bit floats.
test_set = torch.cat(test_columns, 1).type('torch.FloatTensor')

In [65]:
# Inspect the first five test feature vectors.
test_set[0:5]

tensor([[ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.2584,
         -0.0197,  0.0836,  0.7765,  1.0063,  0.4132],
        [ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0542,
         -0.0266, -0.2630, -0.7897,  1.0063,  0.4132],
        [ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.2584,
         -0.0197, -0.7376,  0.7765,  1.0063,  0.4132],
        [ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  0.0271,
         -0.0372, -0.6771, -1.2613,  1.0063,  0.4132],
        [ 0.0000,  1.0000,  0.0000,  1.0000,  0.0000,  1.0000,  0.0000, -0.0189,
         -0.0205, -0.6884, -0.0977,  1.0063,  0.4132]])

In [66]:
# Predict for the test rows. Inference needs no gradients, so run the forward
# pass under no_grad() to avoid building an autograd graph; the redundant
# `test_set = test_set` self-assignment is dropped.
with torch.no_grad():
    preds = mlp_model(test_set.to(device))


In [67]:
# Fill the sample submission's TRAVEL_TIME column with the model predictions
# and write it out without an index column.
output_csv = pd.read_csv("../sampleSubmission.csv")
travel_times = preds.detach().cpu().numpy()
output_csv['TRAVEL_TIME'] = travel_times
output_csv.to_csv("my_pred.csv", index=None)
