In [19]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import os
import pandas as pd
import numpy as np

class MultimodalDataset(Dataset):
    def __init__(self, csv_file, img_dir, transform=None):
        # Handle both string paths and DataFrames
        self.data = pd.read_csv(csv_file) if isinstance(csv_file, str) else csv_file
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        tab_features = torch.tensor(row.drop(['id', 'price_log'], errors='ignore').values, dtype=torch.float32)
        label = torch.tensor(row['price_log'], dtype=torch.float32)
        
        img_path = os.path.join(self.img_dir, f"{int(row['id'])}.jpg")
        image = Image.open(img_path).convert('RGB')
        if self.transform:
            image = self.transform(image)
        return image, tab_features, label

image_transforms = transforms.Compose([
    transforms.Resize((224, 224)), # Standard size for ResNet
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.1, contrast=0.1),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

In [20]:
import torch.nn as nn
from torchvision import models

class PricePredictor(nn.Module):
    def __init__(self, num_tabular_cols):
        super(PricePredictor, self).__init__()
        self.resnet = models.resnet18(weights='DEFAULT')
        
        # UNFREEZE: Allow the CNN to learn from your images
        for param in self.resnet.parameters():
            param.requires_grad = True
            
        self.vision_features = nn.Sequential(*list(self.resnet.children())[:-1])
        self.tabular_branch = nn.Sequential(
            nn.Linear(num_tabular_cols, 32),
            nn.ReLU(),
            nn.Linear(32, 16)
        )
        self.regressor = nn.Sequential(
            nn.Linear(512 + 16, 64),
            nn.ReLU(),
            nn.Linear(64, 1) 
        )

    def forward(self, img, tab):
        v_feat = self.vision_features(img).view(img.size(0), -1)
        t_feat = self.tabular_branch(tab)
        combined = torch.cat((v_feat, t_feat), dim=1)
        return self.regressor(combined)

In [22]:
import joblib
import pandas as pd
import torch
import torch.nn as nn

# 1. Load Scaler
scaler = joblib.load('data/scaler.pkl')
num_features = len(scaler.feature_names_in_)
print(f"Scaler loaded. Using {num_features} features.")

# 2. Setup DataLoaders
train_dataset = MultimodalDataset('data/processed_train.csv', 'house_images', image_transforms)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# 3. Initialize Model (Only once!)
model = PricePredictor(num_tabular_cols=num_features)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 4. Optimizer - Using Adam with the 0.0001 rate that worked
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

print(f"Ready to train on {device}!")

Scaler loaded. Using 17 features.
Ready to train on cuda!


In [23]:
from tqdm import tqdm

epochs = 5 # Start with 5, as this gave you the 0.75 result before

for epoch in range(epochs):
    model.train()
    total_loss = 0
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")
    for images, tabular, labels in loop:
        images, tabular, labels = images.to(device), tabular.to(device), labels.to(device).view(-1, 1)
        
        optimizer.zero_grad()
        outputs = model(images, tabular)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        loop.set_postfix(loss=loss.item())
print(f"Epoch {epoch+1} finished. Avg Loss: {total_loss/len(train_loader):.4f}")

Epoch 1/5: 100%|██████████| 803/803 [04:37<00:00,  2.89it/s, loss=0.282] 
Epoch 2/5: 100%|██████████| 803/803 [01:50<00:00,  7.24it/s, loss=0.00232]
Epoch 3/5: 100%|██████████| 803/803 [03:12<00:00,  4.17it/s, loss=0.231] 
Epoch 4/5: 100%|██████████| 803/803 [03:47<00:00,  3.53it/s, loss=0.269] 
Epoch 5/5: 100%|██████████| 803/803 [01:40<00:00,  8.03it/s, loss=0.0285] 

Epoch 5 finished. Avg Loss: 0.0547





In [24]:
# 1. Load the test CSV created in the Preprocessing Notebook
test_df = pd.read_csv('data/processed_test.csv')

# 2. Create the Dataset object
test_dataset = MultimodalDataset(
    csv_file=test_df, 
    img_dir='house_images', 
    transform=image_transforms
)

# 3. Create the Loader
# shuffle=False ensures we can match predictions to the right House IDs
test_loader = DataLoader(
    test_dataset, 
    batch_size=16, 
    shuffle=False
)

print(f"Test Loader defined with {len(test_df)} houses!")

Test Loader defined with 3209 houses!


In [25]:
model.eval() # Set model to 'testing' mode
all_preds = []
all_actuals = []

with torch.no_grad(): # Saves memory by not calculating gradients
    for images, tabular, labels in tqdm(test_loader, desc="Final Evaluation"):
        images, tabular = images.to(device), tabular.to(device)
        
        # Get predictions
        outputs = model(images, tabular)
        
        # 1. Move to CPU 
        # 2. Reverse the Log (np.expm1) to get real dollar values
        all_preds.extend(np.expm1(outputs.cpu().numpy()).flatten())
        all_actuals.extend(np.expm1(labels.numpy()).flatten())

print("Done! all_preds and all_actuals are now filled.")

Final Evaluation: 100%|██████████| 201/201 [00:28<00:00,  7.03it/s]

Done! all_preds and all_actuals are now filled.





In [26]:
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# 1. Convert lists to numpy arrays
preds_array = np.array(all_preds)
actuals_array = np.array(all_actuals)

# 2. Calculate R2 Score (How much variance you explained)
r2 = r2_score(actuals_array, preds_array)

# 3. Calculate RMSE (Root Mean Squared Error)
# This shows the dollar error, penalizing big misses more heavily
rmse = np.sqrt(mean_squared_error(actuals_array, preds_array))

# 4. Calculate MAE (Mean Absolute Error) for comparison
mae = np.mean(np.abs(preds_array - actuals_array))

print(f"--- MODEL EVALUATION ---")
print(f"R2 Score: {r2:.4f}")
print(f"RMSE: ${rmse:,.2f}")
print(f"MAE:  ${mae:,.2f}")

--- MODEL EVALUATION ---
R2 Score: 0.7685
RMSE: $164,563.46
MAE:  $101,772.33


In [27]:
# Look at your biggest mistakes
errors = np.abs(np.array(all_actuals) - np.array(all_preds))
print(f"Max Error: ${np.max(errors):,.2f}")

# Calculate R2 if we ignore houses over $1.5M
mask = np.array(all_actuals) < 1500000
print(f"R2 for houses under $1.5M: {r2_score(np.array(all_actuals)[mask], np.array(all_preds)[mask]):.4f}")

Max Error: $1,718,025.25
R2 for houses under $1.5M: 0.7098


In [29]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error
import joblib

# 1. Load Data
train_df = pd.read_csv('data/processed_train.csv')
test_df = pd.read_csv('data/processed_test.csv')

# 2. Prepare Features (Exclude ID and Target)
# Only use the features your scaler knows about
scaler = joblib.load('data/scaler.pkl')
features = scaler.feature_names_in_

X_train = train_df[features]
y_train = train_df['price_log']
X_test = test_df[features]
y_test = test_df['price_log']

# 3. Train Tabular-Only Model (XGBoost)
tab_model = XGBRegressor(
    n_estimators=1000,     # Increase from 100 for deeper learning
    learning_rate=0.05,    # Lower rate prevents overshooting
    max_depth=8,           # Allows model to capture complex interactions
    subsample=0.8,         # Uses 80% of data per tree to prevent overfitting
    colsample_bytree=0.8,  # Uses 80% of features per tree
    random_state=42
)
tab_model.fit(X_train, y_train)

# 4. Predict and Convert back from Log-Scale
y_pred_log = tab_model.predict(X_test)

# Convert both back to actual dollars for the final report
y_pred_dollars = np.expm1(y_pred_log)
y_test_dollars = np.expm1(y_test)

# 5. Calculate Metrics
r2 = r2_score(y_test_dollars, y_pred_dollars)
rmse = np.sqrt(mean_squared_error(y_test_dollars, y_pred_dollars))

print(f"--- TABULAR ONLY PERFORMANCE ---")
print(f"R2 Score: {r2:.4f}")
print(f"RMSE: ${rmse:,.2f}")

--- TABULAR ONLY PERFORMANCE ---
R2 Score: 0.8961
RMSE: $110,245.46


In [None]:
import torch.nn.functional as F
import cv2

def generate_gradcam(model, img_tensor, tab_tensor):
    model.eval()
    
    # 1. Target the last convolutional layer in your vision branch 
    target_layer = model.vision_features[-1][-1].bn2 
    
    # 2. Hooks to capture gradients and activations
    gradients = []
    activations = []
    def save_gradient(grad): gradients.append(grad)
    def save_activation(module, input, output): activations.append(output)
    
    target_layer.register_forward_hook(save_activation)
    
    # 3. Forward pass
    output = model(img_tensor.unsqueeze(0), tab_tensor.unsqueeze(0))
    model.zero_grad()
    
    # 4. Backward pass to get gradients
    output.backward()
    target_layer.weight.register_hook(save_gradient)
    
    # 5. Calculate weights and heatmap
    weights = torch.mean(gradients[0], dim=(2, 3), keepdim=True)
    cam = torch.sum(weights * activations[0], dim=1).squeeze().detach().cpu().numpy()
    
    # 6. Normalize and resize
    cam = np.maximum(cam, 0)
    cam = cv2.resize(cam, (224, 224))
    cam = (cam - cam.min()) / (cam.max() - cam.min())
    return cam