In [89]:
import pandas as pd
import torch
import numpy as np
import torch.nn as nn

In [90]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [91]:
df = pd.read_csv("train.csv")

In [92]:
df.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."


In [93]:
df["target"] = df["POLYLINE"].apply(lambda x : np.float32(max((x.count("[") - 2) *15, 0)))

In [94]:
df['call_A'] = df["CALL_TYPE"].apply(lambda x : int(x == 'A'))
df['call_B'] = df["CALL_TYPE"].apply(lambda x : int(x == 'B'))
df['call_C'] = df["CALL_TYPE"].apply(lambda x : int(x == 'C'))

In [95]:
df = df.loc[df['MISSING_DATA'] == False]
df = df.loc[df['target'] > 0]


In [96]:
def one_hot(str):
    if str == 'A':
        return [1,0,0]
    elif str == 'B':
        return [0,1,0]
    elif str == 'C':
        return [0,0,1]
    else:
        return None

In [97]:
# df['CALL_TYPE'] = df['CALL_TYPE'].apply(lambda x: one_hot(x))
# df['DAY_TYPE'] = df['DAY_TYPE'].apply(lambda x: one_hot(x))
# df = df.loc[df['CALL_TYPE'] != None]
# df = df.loc[df['DAY_TYPE'] != None]

In [98]:
from datetime import datetime
def parse_time(x):
    dt = datetime.fromtimestamp(x["TIMESTAMP"])
    return dt.year, dt.month, dt.day, dt.hour, dt.weekday()

In [99]:
df['TAXI_ID'] = (df['TAXI_ID'] - df['TAXI_ID'].mean())/df['TAXI_ID'].std()

In [100]:
def normalize(col_name):
    df[col_name] = np.float32((df[col_name] - df[col_name].mean())/df[col_name].std())

In [101]:
def parse_num(str, index):
    comma_pos = str.find(',')
    if comma_pos == -1:
        return None
    if index == 0:
        return np.float32(str[2:comma_pos])
    else:
        return np.float32(str[comma_pos + 1:str.find(']')])

In [102]:
df['Init_longitude'] = df['POLYLINE'].apply(lambda x : parse_num(x,0))
df['Init_latitude'] = df['POLYLINE'].apply(lambda x : parse_num(x,1))
df = df.loc[df['Init_longitude'] != None]
df = df.loc[df['Init_latitude'] != None]
normalize('Init_longitude')
normalize('Init_latitude')

In [103]:
train_set = torch.reshape(torch.tensor(df['call_A'].values),(-1,1))
col_list = ['call_B', 'call_C','TAXI_ID', 'Init_longitude', 'Init_latitude']
for col_name in col_list:
#     print(train_set.dtype)
    train_set = torch.cat((train_set,torch.reshape(torch.from_numpy(df[col_name].values),(-1,1))),1)

In [104]:
train_set = train_set.type('torch.FloatTensor')

In [105]:
target_set = torch.reshape(torch.tensor(df['target'].values),(-1,1))

In [106]:
train_set[0]

tensor([0.0000, 0.0000, 1.0000, 1.1363, 0.9611, 0.9612])

In [107]:
model = torch.nn.Sequential(
  torch.nn.Linear(6, 1),
)

# Define Loss Function / Objective Function
loss_fn = torch.nn.MSELoss()

# Define optimizer (this will perform your parameter updates use)
lr = 5e-3
opt = torch.optim.SGD(model.parameters(), lr=lr)
# torch.optim.Adam

In [108]:
train_err = []
test_err = []
parameters = []
for i in range(10):
    model.train()

    y_pred = model(train_set) # Compute model outputs
    loss = loss_fn(y_pred, target_set) # Compute MSE
    opt.zero_grad() # Must reset the gradients every step. Otherwise, gradients from previous iterations would cause interference!!!
    loss.backward() # Compute gradients of all parameters (our model) with respect to our computed loss value (a singular value)
    opt.step() # One gradient step

    train_err.append(loss.item())

    model.eval()
#     with torch.no_grad():
#     test_err.append(loss_fn(model(X_test), y_test).item())

In [156]:
model[0].weight

Parameter containing:
tensor([[14.1069, 27.8775, 21.5177,  0.5821, 60.8627, 63.4547]],
       requires_grad=True)

In [193]:
class MLP_Regressor(torch.nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim = 1):
        super(MLP_Regressor, self).__init__() #Refers to the fact that this is a subclass of nn.Module and is inheriting all methods
        
        out_dim = 1
        
        self.model = torch.nn.Sequential( #an ordered container of modules
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
        )   
        self.in_dim = in_dim
        self.hidden_dim = hidden_dim
        self.out_dim = out_dim
        self.linear1 = nn.Linear(self.in_dim, self.hidden_dim)
        self.linear2 = nn.Linear(self.hidden_dim, self.out_dim)
        
                
        self.fc = torch.nn.Sequential(
            nn.Linear(self.in_dim, self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.out_dim),
        )

    
    def forward(self, x):
        x = self.fc(x)
        x = x.squeeze(1)
        return x

In [194]:
targets_mlp = torch.reshape(torch.from_numpy(df['target'].values),(-1,1))

In [195]:
len(targets_mlp)

1674152

In [208]:
batch_size = 256
lst_train = list(zip(train_set[:200000], targets_mlp[:200000]))
print(len(lst_train))
print(lst_train[0])
trainloader = torch.utils.data.DataLoader(lst_train , batch_size=batch_size, shuffle=True, num_workers=2)

200000
(tensor([0.0000, 0.0000, 1.0000, 1.1363, 0.9611, 0.9612]), tensor([330.]))


In [209]:
mlp_model = MLP_Regressor(6, 24, 1).to(device)
lr_mlp = 0.001
optimizer = torch.optim.Adam(mlp_model.parameters(), lr = lr_mlp)
criterion = nn.MSELoss()

In [210]:
def get_loss(X, y, model, criterion):
    y_pred = model(X)
    loss = criterion(y_pred, y)
    return loss

In [211]:
from tqdm import tqdm
for epoch in tqdm(range(10)):

    running_loss = 0.0

    for X, y in trainloader:
        X, y = X.to(device), y.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        loss = get_loss(X, y, mlp_model, criterion)
        #RMSE
        loss = torch.sqrt(loss)
        loss.backward()
        optimizer.step()

        running_loss += loss.cpu().detach().numpy()


    print(f'\n\n loss: {running_loss}')

print('Finished Training')

 10%|█         | 1/10 [00:23<03:30, 23.40s/it]



 loss: 718447.8248291016


 20%|██        | 2/10 [00:46<03:06, 23.28s/it]



 loss: 611131.9354553223


 30%|███       | 3/10 [01:09<02:40, 22.91s/it]



 loss: 504440.87646484375


 40%|████      | 4/10 [01:32<02:18, 23.14s/it]



 loss: 480337.80853271484


 50%|█████     | 5/10 [01:56<01:57, 23.47s/it]



 loss: 479378.8190307617


 60%|██████    | 6/10 [02:19<01:33, 23.42s/it]



 loss: 479184.50494384766


 70%|███████   | 7/10 [02:43<01:10, 23.59s/it]



 loss: 477961.47326660156


 80%|████████  | 8/10 [03:07<00:47, 23.66s/it]



 loss: 478430.86932373047


 90%|█████████ | 9/10 [03:31<00:23, 23.81s/it]



 loss: 478173.1703796387


100%|██████████| 10/10 [03:55<00:00, 23.55s/it]



 loss: 477743.85317993164
Finished Training





In [212]:
df_test = pd.read_csv("test_public.csv")
df_test['call_A'] = df_test["CALL_TYPE"].apply(lambda x : int(x == 'A'))
df_test['call_B'] = df_test["CALL_TYPE"].apply(lambda x : int(x == 'B'))
df_test['call_C'] = df_test["CALL_TYPE"].apply(lambda x : int(x == 'C'))
df_test['TAXI_ID'] = (df_test['TAXI_ID'] - df_test['TAXI_ID'].mean())/df['TAXI_ID'].std()
long = df['Init_longitude'].mean()
lat = df['Init_latitude'].mean()
df_test['Init_longitude'] = long
df_test['Init_latitude'] = lat
test_set = torch.reshape(torch.tensor(df_test['call_A'].values),(-1,1))
col_list = ['call_B', 'call_C','TAXI_ID', 'Init_longitude', 'Init_latitude']
for col_name in col_list:
#     print(train_set.dtype)
    test_set = torch.cat((test_set,torch.reshape(torch.from_numpy(df_test[col_name].values),(-1,1))),1)
test_set = test_set.type('torch.FloatTensor')

In [213]:
test_set = test_set
preds = mlp_model(test_set.to(device))

In [214]:
len(test_set)

320

AttributeError: module 'torch.nn' has no attribute 'normalize'

In [223]:
output_csv = pd.read_csv("sampleSubmission.csv")
#output_csv['TRAVEL_TIME'] = preds.cpu().detach().numpy()
df_test["TRAVEL_TIME"] = 716.43
output_csv.tail()
output_csv.to_csv("my_pred.csv", index=None)
