In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pyarrow.parquet as pq

# Load and preprocess the data
data = pq.read_table('cleaned_taxi_data.parquet')
data = data.to_pandas()
data['tips_percentage'] = (data['tip_amount'] / data['total_amount']) * 100


In [2]:
data.columns

Index(['passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax',
       'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount',
       'congestion_surcharge',
       ...
       'store_and_fwd_flag_1', 'payment_type_1', 'payment_type_2',
       'payment_type_3', 'payment_type_4', 'tip_bin_0', 'tip_bin_1',
       'tip_bin_2', 'tip_bin_3', 'tips_percentage'],
      dtype='object', length=177)

In [3]:
data.describe()

Unnamed: 0,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,...,store_and_fwd_flag_1,payment_type_1,payment_type_2,payment_type_3,payment_type_4,tip_bin_0,tip_bin_1,tip_bin_2,tip_bin_3,tips_percentage
count,2276461.0,2276461.0,2276461.0,2276461.0,2276461.0,2276461.0,2276461.0,2276461.0,2276461.0,2276461.0,...,2276461.0,2276461.0,2276461.0,2276461.0,2276461.0,2276461.0,2276461.0,2276461.0,2276461.0,2276461.0
mean,1.472616,2.017087,9.89887,1.023657,0.4994395,1.966696,0.0041969,0.2999851,15.4309,2.434673,...,0.007788844,0.7807847,0.2155697,0.002394945,0.001250625,0.3378955,0.5871385,0.07437246,0.000593465,12.04877
std,1.039033,1.671309,5.098086,1.194539,0.01673157,1.719994,0.1745916,0.002113585,6.083246,0.3988098,...,0.08791008,0.4137149,0.4112171,0.04887955,0.03534207,0.4729929,0.4923484,0.2623761,0.02435391,7.865103
min,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.3,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.02,6.5,0.0,0.5,0.8,0.0,0.3,11.3,2.5,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.795991
50%,1.0,1.61,8.5,0.5,0.5,2.0,0.0,0.3,14.16,2.5,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,16.61017
75%,2.0,2.52,12.0,2.5,0.5,2.86,0.0,0.3,17.9,2.5,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,16.66667
max,7.0,99.9,300.0,45.5,0.5,350.0,26.2,0.3,357.3,2.75,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,99.4


In [4]:
X = data.drop(['tip_bin_0', 'tip_bin_1', 'tip_bin_2', 'tip_bin_3'], axis=1)
y = data[['tip_bin_0', 'tip_bin_1', 'tip_bin_2', 'tip_bin_3']]  # One-hot encoded labels

# Convert one-hot encoded labels to class indices for PyTorch CrossEntropyLoss
y_class_indices = np.argmax(y.values, axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
X, y_class_indices, test_size=0.3, random_state=1, stratify=y_class_indices)

In [5]:
# Normalize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert the pandas dataframes to PyTorch tensors
X_train_tensor = torch.tensor(X_train.astype(np.float32))
y_train_tensor = torch.tensor(y_train.astype(np.int64))
X_test_tensor = torch.tensor(X_test.astype(np.float32))
y_test_tensor = torch.tensor(y_test.astype(np.int64))

# Create TensorDatasets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Create DataLoaders
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)




In [6]:
# Define the Softmax Regression Model
class SoftmaxRegression(nn.Module):
    def __init__(self, input_dim):
        super(SoftmaxRegression, self).__init__()
        self.linear = nn.Linear(input_dim, 4)  # Output layer for 4 classes
        
    def forward(self, x):
        return self.linear(x)  

# Initialize model with the number of input features
model = SoftmaxRegression(X_train.shape[1])

# Define the optimizer and loss function
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()



In [7]:
# Training function
def train_epoch(model, train_loader, criterion, optimizer):
    model.train()
    running_loss = 0.0
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)  # targets are class indices now
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(train_loader)

In [None]:
# Train the model
for epoch in range(10):  # train for 10 epochs
    loss = train_epoch(model, train_loader, criterion, optimizer)
    print(f'Epoch {epoch+1}, Loss: {loss:.4f}')



Epoch 1, Loss: 0.1809
Epoch 2, Loss: 0.1096
Epoch 3, Loss: 0.0940
Epoch 4, Loss: 0.0859
Epoch 5, Loss: 0.0805
Epoch 6, Loss: 0.0767
Epoch 7, Loss: 0.0738
Epoch 8, Loss: 0.0716
Epoch 9, Loss: 0.0697


In [None]:
# Evaluation function
def evaluate_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            total += targets.size(0)
            correct += (predicted == targets).sum().item()
    accuracy = 100 * correct / total
    return accuracy



In [None]:
# Evaluate the model
accuracy = evaluate_model(model, test_loader)
print(f'Accuracy: {accuracy:.2f}%')