In [20]:
#IMPORT LIBRARIES

import pyarrow.parquet as pq
import torch
import random
import time
import pandas as pd
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

In [46]:
#READ IN CLEANED DATA FILE, SPLIT DATA AND CONVERT TO TENSORS

temp = pq.read_table('/content/drive/MyDrive/cleaned_taxi_data.parquet')
temp = temp.to_pandas()
print(temp.shape)
print(temp.columns)


RANDOM_SEED = 123
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 64
NUM_EPOCHS = 50

torch.manual_seed(RANDOM_SEED)


# Split the data
train_data, test_data = train_test_split(temp, test_size=0.2)
train_data, val_data = train_test_split(train_data, test_size=0.2)

# Convert data to PyTorch tensors
X_train, y_train = torch.tensor(train_data.drop('DOLocationID', axis=1).values), torch.tensor(train_data['DOLocationID'].values)
X_val, y_val = torch.tensor(val_data.drop('DOLocationID', axis=1).values), torch.tensor(val_data['DOLocationID'].values)
X_test, y_test = torch.tensor(test_data.drop('DOLocationID', axis=1).values), torch.tensor(test_data['DOLocationID'].values)


train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

val_dataset = TensorDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=32)

test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=32)

(2276461, 31)
Index(['VendorID', 'passenger_count', 'trip_distance', 'RatecodeID',
       'store_and_fwd_flag', 'PULocationID', 'DOLocationID', 'payment_type',
       'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
       'improvement_surcharge', 'total_amount', 'congestion_surcharge',
       'airport_fee', 'tip_pct', 'tip_bin', 'PU_day_in_june_2021',
       'PU_time_hour', 'PU_time_min', 'DO_day_in_june_2021', 'DO_time_hour',
       'DO_time_min', 'avg_temp', 'avg_dew_pt', 'avg_humidity',
       'avg_wind_speed', 'avg_pressure', 'total_precipitation'],
      dtype='object')


In [86]:
#CREATE NEURAL NET
class MLP(torch.nn.Module):

    def __init__(self, num_features, num_hidden_1, num_classes):
        super(MLP, self).__init__()

        self.num_classes = num_classes

        self.linear_1 = torch.nn.Linear(num_features, num_hidden_1)
        self.bn1 = torch.nn.BatchNorm1d(num_hidden_1)
        self.linear_out = torch.nn.Linear(num_hidden_1, num_classes)

        self.double() #from stackoverflow to solve a bug

    def forward(self, x):
        #x = x.view(-1, 960) #i think the problem is on this line
        out = self.linear_1(x)
        out = self.bn1(out)
        out = F.relu(out)
        logits = self.linear_out(out)
        probas = F.softmax(logits, dim=1)
        return logits, probas

#################################
### Model Initialization
#################################


random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

model = MLP(num_features=171,
            num_hidden_1=100,
            num_classes=4)


model = model.to(DEVICE)


optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

In [65]:
def compute_accuracy(net, data_loader):
    correct_pred, num_examples = 0, 0
    with torch.no_grad():
        for features, targets in data_loader:
            features = features.view(-1, 960).to(DEVICE)
            targets = targets.to(DEVICE)
            a1, a2 = net.forward(features)
            predicted_labels = torch.argmax(a2, 1)
            num_examples += targets.size(0)
            correct_pred += (predicted_labels == targets).sum()
        return correct_pred.float()/num_examples * 100

In [87]:
#TRAIN MODEL


start_time = time.time()
minibatch_cost = []
epoch_cost = []
for epoch in range(NUM_EPOCHS):
    model.train()
    for batch_idx, (features, targets) in enumerate(train_loader):

        features = features.view(-1, 960).to(DEVICE)
        targets = targets.to(DEVICE)

        ### FORWARD AND BACK PROP
        print(type(features))
        logits, probas = model(features) #the line that i keep facing issues/errors with

        #y_onehot = to_onehot(targets, model.num_classes).to(DEVICE)

        cost = F.cross_entropy(torch.log(probas), targets)
        optimizer.zero_grad()

        cost.backward()
        minibatch_cost.append(cost)
        ### UPDATE MODEL PARAMETERS
        optimizer.step()

        ### LOGGING
        if not batch_idx % 50:
            print ('Epoch: %03d/%03d | Batch %03d/%03d | Cost: %.4f'
                   %(epoch+1, NUM_EPOCHS, batch_idx,
                     len(train_loader), cost))

    cost = compute_accuracy(model, train_loader)
    epoch_cost.append(cost)
    print('Epoch: %03d/%03d Train Cost: %.4f' % (
            epoch+1, NUM_EPOCHS, cost))
    print('Time elapsed: %.2f min' % ((time.time() - start_time)/60))

print('Total Training Time: %.2f min' % ((time.time() - start_time)/60))

<class 'torch.Tensor'>


ValueError: Expected more than 1 value per channel when training, got input size torch.Size([1, 100])