In [58]:
import pandas as pd
import torch
import numpy as np
import torch.nn as nn

In [59]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [60]:
df = pd.read_csv("train.csv")

In [61]:
df.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."


In [62]:
df["target"] = df["POLYLINE"].apply(lambda x : np.float32(max((x.count("[") - 2) *15, 0)))

In [63]:
df['call_A'] = df["CALL_TYPE"].apply(lambda x : int(x == 'A'))
df['call_B'] = df["CALL_TYPE"].apply(lambda x : int(x == 'B'))
df['call_C'] = df["CALL_TYPE"].apply(lambda x : int(x == 'C'))

In [64]:
df = df.loc[df['MISSING_DATA'] == False]
df = df.loc[df['target'] > 0]


In [65]:
def one_hot(str):
    if str == 'A':
        return [1,0,0]
    elif str == 'B':
        return [0,1,0]
    elif str == 'C':
        return [0,0,1]
    else:
        return None

In [66]:
# df['CALL_TYPE'] = df['CALL_TYPE'].apply(lambda x: one_hot(x))
# df['DAY_TYPE'] = df['DAY_TYPE'].apply(lambda x: one_hot(x))
# df = df.loc[df['CALL_TYPE'] != None]
# df = df.loc[df['DAY_TYPE'] != None]

In [67]:
from datetime import datetime
def parse_time(x):
    dt = datetime.fromtimestamp(x["TIMESTAMP"])
    return dt.year, dt.month, dt.day, dt.hour, dt.weekday()

In [68]:
train_id_mean = df['TAXI_ID'].mean()
train_id_std = df['TAXI_ID'].std()
df['TAXI_ID'] = (df['TAXI_ID'] - train_id_mean)/ train_id_std

In [69]:
taxi_ids = set()
for ind, r in df.iterrows():
    taxi_ids.add(r['TAXI_ID'])
taxi_id_to_mean = {}
for i in taxi_ids:
    taxi_id_to_mean[i] = df[df['TAXI_ID'] == i]['target'].mean()
taxi_id_to_mean

{-0.5612990694750604: 499.58502197265625,
 0.5055998999262465: 726.5753173828125,
 -0.5518155230803821: 707.8585205078125,
 1.0509038176202479: 695.443603515625,
 1.0461620444229087: 690.2947387695312,
 1.0651291372122653: 798.5908813476562,
 1.525081137354162: 719.8406982421875,
 0.47714926074221164: 640.8653564453125,
 1.496630498170127: 764.616455078125,
 1.0746126836069436: 779.3790893554688,
 1.0414202712255696: 875.1163330078125,
 2.6156889727421646: 717.3923950195312,
 -1.2583397294839143: 766.3195190429688,
 -1.485944842956193: 828.130859375,
 -0.05392933735977221: 791.2107543945312,
 0.6573366422410991: 819.96435546875,
 1.1172886423829957: 735.7216796875,
 -0.058671110557111356: 758.9268188476562,
 -0.6039750282511127: 713.2156982421875,
 1.0793544568042828: 698.7357177734375,
 1.5393064569461794: 777.439208984375,
 2.6109471995448255: 649.6943359375,
 -0.9264156056701743: 648.847412109375,
 -1.4812030697588539: 694.540771484375,
 2.995030828529296: 30.0,
 1.122030415580335: 

In [70]:
def apply_mean(taxi_id):
    return taxi_id_to_mean[taxi_id]

In [71]:
df['TAXI_ID_MEAN'] = df['TAXI_ID'].apply(apply_mean)
df.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,target,call_A,call_B,call_C,TAXI_ID_MEAN
0,1372636858620000589,C,,,1.136256,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[...",330.0,0,0,1,694.473022
1,1372637303620000596,B,,7.0,1.169448,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[...",270.0,0,1,0,728.770874
2,1372636951620000320,C,,,-0.139281,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...",960.0,0,0,1,707.499146
3,1372636854620000520,C,,,0.809073,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[...",630.0,0,0,1,808.640808
4,1372637091620000337,C,,,-0.058671,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...",420.0,0,0,1,758.926819


In [72]:
def normalize(col_name):
    df[col_name] = np.float32((df[col_name] - df[col_name].mean())/df[col_name].std())

In [73]:
def parse_num(str, index):
    comma_pos = str.find(',')
    if comma_pos == -1:
        return None
    if index == 0:
        return np.float32(str[2:comma_pos])
    else:
        return np.float32(str[comma_pos + 1:str.find(']')])

In [74]:
df['Init_longitude'] = df['POLYLINE'].apply(lambda x : parse_num(x,0))
df['Init_latitude'] = df['POLYLINE'].apply(lambda x : parse_num(x,1))
df = df.loc[df['Init_longitude'] != None]
df = df.loc[df['Init_latitude'] != None]
train_long_mean = df['Init_longitude'].mean()
train_lat_mean = df['Init_latitude'].mean()
train_long_std = df['Init_longitude'].std()
train_lat_std = df['Init_latitude'].std()
normalize('Init_longitude')
normalize('Init_latitude')

In [75]:
df.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,target,call_A,call_B,call_C,TAXI_ID_MEAN,Init_longitude,Init_latitude
0,1372636858620000589,C,,,1.136256,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[...",330.0,0,0,1,694.473022,0.96109,0.961183
1,1372637303620000596,B,,7.0,1.169448,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[...",270.0,0,1,0,728.770874,0.786753,1.004535
2,1372636951620000320,C,,,-0.139281,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-...",960.0,0,0,1,707.499146,1.007784,0.958705
3,1372636854620000520,C,,,0.809073,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[...",630.0,0,0,1,808.640808,1.322562,0.985998
4,1372637091620000337,C,,,-0.058671,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-...",420.0,0,0,1,758.926819,0.73621,1.053185


In [76]:
train_set = torch.reshape(torch.tensor(df['call_A'].values),(-1,1))
col_list = ['call_B', 'call_C','TAXI_ID', 'Init_longitude', 'Init_latitude', 'TAXI_ID_MEAN']
for col_name in col_list:
#     print(train_set.dtype)
    train_set = torch.cat((train_set,torch.reshape(torch.from_numpy(df[col_name].values),(-1,1))),1)

In [77]:
train_set = train_set.type('torch.FloatTensor')

In [78]:
target_set = torch.reshape(torch.tensor(df['target'].values),(-1,1))

In [79]:
train_set[0]

tensor([  0.0000,   0.0000,   1.0000,   1.1363,   0.9611,   0.9612, 694.4730])

In [78]:
train_set_linear = torch.reshape(torch.tensor(df['call_A'].values),(-1,1))
col_list = ['call_B', 'call_C','TAXI_ID', 'Init_longitude', 'Init_latitude']
for col_name in col_list:
#     print(train_set.dtype)
    train_set_linear = torch.cat((train_set_linear,torch.reshape(torch.from_numpy(df[col_name].values),(-1,1))),1)

In [81]:
train_set_linear = train_set_linear.type('torch.FloatTensor')

In [82]:
model = torch.nn.Sequential(
  torch.nn.Linear(6, 1),
).to(device)

# Define Loss Function / Objective Function
loss_fn = torch.nn.MSELoss()

# Define optimizer (this will perform your parameter updates use)
lr = 5e-3
opt = torch.optim.SGD(model.parameters(), lr=lr)
# torch.optim.Adam

In [83]:
train_err = []
test_err = []
parameters = []
for i in range(10):
    model.train()

    y_pred = model(train_set_linear.to(device)) # Compute model outputs
    loss = loss_fn(y_pred, target_set.to(device)) # Compute MSE
    opt.zero_grad() # Must reset the gradients every step. Otherwise, gradients from previous iterations would cause interference!!!
    loss.backward() # Compute gradients of all parameters (our model) with respect to our computed loss value (a singular value)
    opt.step() # One gradient step

    train_err.append(loss.item())

    model.eval()
#     with torch.no_grad():
#     test_err.append(loss_fn(model(X_test), y_test).item())

In [80]:
model[0].weight

NameError: name 'model' is not defined

In [121]:
class MLP_Regressor(torch.nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim = 1):
        super(MLP_Regressor, self).__init__() #Refers to the fact that this is a subclass of nn.Module and is inheriting all methods
        
        out_dim = 1
        
        self.model = torch.nn.Sequential( #an ordered container of modules
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
        )   
        self.in_dim = in_dim
        self.hidden_dim = hidden_dim
        self.out_dim = out_dim
        self.linear1 = nn.Linear(self.in_dim, self.hidden_dim)
        self.linear2 = nn.Linear(self.hidden_dim, self.out_dim)
        
                
        self.fc = torch.nn.Sequential(
            nn.Linear(self.in_dim, self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.hidden_dim * 4),
            nn.ReLU(),
            nn.Linear(self.hidden_dim * 4, self.hidden_dim * 2),
            nn.ReLU(),
            nn.Linear(self.hidden_dim * 2, self.out_dim)
        )

    
    def forward(self, x):
        x = self.fc(x)
        x = x.squeeze(1)
        return x

In [83]:
targets_mlp = torch.reshape(torch.from_numpy(df['target'].values),(-1,1))

In [109]:
len(targets_mlp)

1674152

In [122]:
batch_size = 1024
lst_train = list(zip(train_set, targets_mlp))
len_train = len(lst_train)
print(len(lst_train))
print(lst_train[0])
trainloader = torch.utils.data.DataLoader(lst_train , batch_size=batch_size, shuffle=False, num_workers=2)

1674152
(tensor([  0.0000,   0.0000,   1.0000,   1.1363,   0.9611,   0.9612, 694.4730]), tensor([330.]))


In [123]:
mlp_model = MLP_Regressor(7, 14, 1).to(device)
lr_mlp = 5e-6
optimizer = torch.optim.Adam(mlp_model.parameters(), lr = lr_mlp)
criterion = nn.MSELoss()

In [124]:
def get_loss(X, y, model, criterion):
    y_pred = model(X)
    loss = criterion(y_pred, y)
    return loss

In [125]:
losses = []

In [126]:
from tqdm import tqdm
for epoch in tqdm(range(10)):

    running_loss = 0.0

    for X, y in trainloader:
        X, y = X.to(device), y.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        loss = get_loss(X, y, mlp_model, criterion)
        #RMSE
        loss = torch.sqrt(loss)
        losses.append(loss)
        loss.backward()
        optimizer.step()

        running_loss += loss.cpu().detach().numpy()


    print(f'Avg loss: {running_loss / len_train}')
    print(f'Total loss: {running_loss}')
print('Finished Training')

 10%|█         | 1/10 [02:40<24:06, 160.71s/it]


Avg loss: 0.9565425754480384


 20%|██        | 2/10 [05:26<21:47, 163.43s/it]


Avg loss: 0.9343666862790134


 30%|███       | 3/10 [08:03<18:44, 160.65s/it]


Avg loss: 0.9107177331375381


 40%|████      | 4/10 [10:41<15:57, 159.58s/it]


Avg loss: 0.8824370352375365


 50%|█████     | 5/10 [13:20<13:16, 159.29s/it]


Avg loss: 0.8487224096253165


 60%|██████    | 6/10 [16:00<10:38, 159.73s/it]


Avg loss: 0.8097002516224912


 70%|███████   | 7/10 [18:52<08:10, 163.58s/it]


Avg loss: 0.7663164134373491


 80%|████████  | 8/10 [21:37<05:28, 164.04s/it]


Avg loss: 0.7245406024443002


 90%|█████████ | 9/10 [24:14<02:41, 161.78s/it]


Avg loss: 0.6882918853910699


100%|██████████| 10/10 [27:03<00:00, 162.31s/it]


Avg loss: 0.666327285360102
Finished Training





In [127]:
stands = pd.read_csv("metaData_taxistandsID_name_GPSlocation.csv")
stand_dict = {}
stands.head()
for r, s in stands.iterrows():
    stand_dict[s['ID']] = (np.float32(s['Longitude']), np.float32(s['Latitude']))

In [128]:
df_test = pd.read_csv("test_public.csv")
df_test['call_A'] = df_test["CALL_TYPE"].apply(lambda x : int(x == 'A'))
df_test['call_B'] = df_test["CALL_TYPE"].apply(lambda x : int(x == 'B'))
df_test['call_C'] = df_test["CALL_TYPE"].apply(lambda x : int(x == 'C'))
df_test['TAXI_ID'] = (df_test['TAXI_ID'] - train_id_mean)/train_id_std

df_test['Init_longitude'] = train_long_mean
df_test['Init_latitude'] = train_lat_mean

In [129]:
def apply_mean_test(taxi_id):
    if taxi_id in taxi_id_to_mean:
        return taxi_id_to_mean[taxi_id]
    else:
        return train_id_mean

In [130]:
df_test['TAXI_ID_MEAN'] = df_test['TAXI_ID'].apply(apply_mean_test)

In [131]:
for ind, d in df_test.iterrows():
    if not pd.isna(d['ORIGIN_STAND']):
        #print(d['ORIGIN_STAND'])
        d['Init_longitude'] = stand_dict[d['ORIGIN_STAND']][0]
        d['Init_latitude'] = stand_dict[d['ORIGIN_STAND']][1]
    d['Init_longitude'] = (d['Init_longitude'] - train_long_mean) / train_long_std
    d['Init_latitude'] = (d['Init_latitude'] - train_lat_mean) / train_lat_std

#    df[col_name] = np.float32((df[col_name] - df[col_name].mean())/df[col_name].std())
test_set = torch.reshape(torch.tensor(df_test['call_A'].values),(-1,1))
col_list = ['call_B', 'call_C','TAXI_ID', 'Init_longitude', 'Init_latitude', 'TAXI_ID_MEAN']
#col_list = ['call_B', 'call_C','TAXI_ID', 'Init_longitude', 'Init_latitude']
for col_name in col_list:
#     print(train_set.dtype)
    test_set = torch.cat((test_set,torch.reshape(torch.from_numpy(df_test[col_name].values),(-1,1))),1)
test_set = test_set.type('torch.FloatTensor')

In [132]:
test_set = test_set
preds = mlp_model(test_set.to(device))
#preds = model(test_set.to(device))


In [133]:
len(test_set)

320

In [135]:
output_csv = pd.read_csv("sampleSubmission.csv")
output_csv['TRAVEL_TIME'] = preds.cpu().detach().numpy()
#df_test["TRAVEL_TIME"] = 716.43
output_csv.tail()
output_csv.to_csv("my_pred.csv", index=None)


In [None]:
#visualizing
#lst = [(1,2), (3,4)]
#plt.plot(lst)