In [6]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

In [7]:
df = pd.read_csv("../data/interim/cars.csv")
print(df.shape)
df.head(5)

(37705, 16)


Unnamed: 0,make,model,transmission,color,mileage,year,fuel,engine_capacity,body_type,has_warranty,drivetrain,price,age,miles_per_year,price_per_mile,price_per_year
0,subaru,outback,automatic,silver,190000,2010,gasoline,2.5,universal,False,all,10900.0,15,12666.666667,0.057368,726.666667
1,subaru,outback,automatic,blue,290000,2002,gasoline,3.0,universal,False,all,5000.0,23,12608.695652,0.017241,217.391304
2,subaru,forester,automatic,red,402000,2001,gasoline,2.5,suv,False,all,2800.0,24,16750.0,0.006965,116.666667
3,subaru,impreza,mechanical,blue,10000,1999,gasoline,3.0,sedan,False,all,9999.0,26,384.615385,0.9999,384.576923
4,subaru,legacy,automatic,black,280000,2001,gasoline,2.5,universal,False,all,2134.11,24,11666.666667,0.007622,88.92125


In [8]:
X = df.drop(columns=['price', 'has_warranty'])
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
numeric_features = ['mileage', 'year', 'engine_capacity', 'age', 'miles_per_year', 'price_per_mile', 'price_per_year']
numeric_transformer = StandardScaler()


categorical_features = ['make', 'model', 'transmission', 'color', 'fuel', 'body_type', 'drivetrain']
categorical_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [11]:
# Fit the transformer only on the training data and transform both training and test sets
X_train_processed = preprocessor.fit_transform(X_train)  # Fit and transform on training data
X_test_processed = preprocessor.transform(X_test)    



In [12]:
# Convert the sparse matrix to a dense NumPy array
X_train_processed_dense = X_train_processed.toarray()
X_test_processed_dense = X_test_processed.toarray()

# Now convert them to PyTorch tensors
X_train_tensor = torch.tensor(X_train_processed_dense, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_processed_dense, dtype=torch.float32)

# Convert the target variable (price) to tensors
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

# Create DataLoader for batching
from torch.utils.data import DataLoader, TensorDataset

train_data = TensorDataset(X_train_tensor, y_train_tensor)
test_data = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
test_loader = DataLoader(test_data, batch_size=16, shuffle=False)


In [13]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.metrics import mean_absolute_error

# Check if CUDA is available, if yes, use it, else fallback to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Neural Network Model (DNN/MLP)
class CarPricePredictor(nn.Module):
    def __init__(self, input_dim):
        super(CarPricePredictor, self).__init__()
        # Defining layers: More layers and neurons can be added
        self.fc1 = nn.Linear(input_dim, 128)  # Input layer -> hidden layer 1
        self.fc2 = nn.Linear(128, 64)         # Hidden layer 1 -> hidden layer 2
        self.fc3 = nn.Linear(64, 32)          # Hidden layer 2 -> hidden layer 3
        self.fc4 = nn.Linear(32, 1)           # Hidden layer 3 -> output layer

    def forward(self, x):
        # Pass data through layers with ReLU activation
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = self.fc4(x)  # Output layer, no activation (regression task)
        return x

# Initialize the model and move it to the device (GPU or CPU)
input_dim = X_train_tensor.shape[1]  # Number of features after preprocessing
model = CarPricePredictor(input_dim).to(device)

# Loss function and optimizer
criterion = nn.MSELoss()  # Mean Squared Error for regression
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Move data to GPU (if available)
X_train_tensor = X_train_tensor.to(device)
y_train_tensor = y_train_tensor.to(device)
X_test_tensor = X_test_tensor.to(device)
y_test_tensor = y_test_tensor.to(device)

# DataLoader for batching
train_data = TensorDataset(X_train_tensor, y_train_tensor)
test_data = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

# Training loop
num_epochs = 100
for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    running_loss = 0.0
    for inputs, labels in train_loader:
        # Move data to the device (GPU or CPU)
        inputs, labels = inputs.to(device), labels.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    avg_train_loss = running_loss / len(train_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_train_loss:.4f}')

# Evaluation on test data
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    test_loss = 0.0
    test_preds = []
    test_true = []
    for inputs, labels in test_loader:
        # Move data to the device (GPU or CPU)
        inputs, labels = inputs.to(device), labels.to(device)

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        test_loss += loss.item()

        # Collect predictions and true values for evaluation
        test_preds.append(outputs.cpu().numpy())
        test_true.append(labels.cpu().numpy())

    avg_test_loss = test_loss / len(test_loader)
    test_preds = np.concatenate(test_preds)
    test_true = np.concatenate(test_true)

    # Calculate Mean Absolute Error (MAE)
    mae = mean_absolute_error(test_true, test_preds)
    print(f'Test Loss: {avg_test_loss:.4f}, MAE: {mae:.4f}')


Epoch [1/100], Loss: 16952467.7336
Epoch [2/100], Loss: 1958731.8232
Epoch [3/100], Loss: 1601906.7483
Epoch [4/100], Loss: 1479940.2123
Epoch [5/100], Loss: 1415205.3867
Epoch [6/100], Loss: 1371826.7989
Epoch [7/100], Loss: 1333150.3392
Epoch [8/100], Loss: 1310921.9692
Epoch [9/100], Loss: 1278838.8410
Epoch [10/100], Loss: 1252217.8271
Epoch [11/100], Loss: 1228435.5757
Epoch [12/100], Loss: 1216807.4246
Epoch [13/100], Loss: 1193872.6348
Epoch [14/100], Loss: 1180554.0407
Epoch [15/100], Loss: 1162067.1552
Epoch [16/100], Loss: 1151313.6325
Epoch [17/100], Loss: 1139444.8581
Epoch [18/100], Loss: 1136499.0320
Epoch [19/100], Loss: 1117076.4129
Epoch [20/100], Loss: 1102211.0709
Epoch [21/100], Loss: 1097191.2551
Epoch [22/100], Loss: 1086310.9441
Epoch [23/100], Loss: 1082338.0659
Epoch [24/100], Loss: 1065698.9899
Epoch [25/100], Loss: 1058351.1552
Epoch [26/100], Loss: 1055084.0806
Epoch [27/100], Loss: 1051616.0150
Epoch [28/100], Loss: 1039457.0566
Epoch [29/100], Loss: 103288