# In [None]:
# Prerequisites: Google Drive must already be mounted in this Colab session, and the runtime should be a Colab T4 GPU.

# MODEL TRAINING

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.models as models
from PIL import Image
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import pickle
import os
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"device: {device}")

# Step 1: Load the preprocessed data

print("=" * 60)
print("LOADING PREPROCESSED DATA")
print("=" * 60)

PROJECT_DIR = '/content/drive/MyDrive/Real_Estate_Project/'
preprocessed_path = os.path.join(PROJECT_DIR, 'preprocessed_data.pkl')

# NOTE: unpickling can execute arbitrary code; only load a file that was
# produced by your own preprocessing step.
with open(preprocessed_path, 'rb') as f:
    data_dict = pickle.load(f)

# Tabular features and targets per split (the test split has no targets).
X_train, y_train = data_dict['X_train'], data_dict['y_train']
X_val, y_val = data_dict['X_val'], data_dict['y_val']
X_test = data_dict['X_test']

# Property identifiers per split; they are the keys used to look up images.
train_property_ids, val_property_ids, test_property_ids = (
    data_dict['train_property_ids'],
    data_dict['val_property_ids'],
    data_dict['test_property_ids'],
)

# Image lookup table, image transforms and the image size stored alongside them.
IMAGE_PATH_MAP = data_dict['IMAGE_PATH_MAP']
train_transform, val_test_transform = (
    data_dict['train_transform'],
    data_dict['val_test_transform'],
)
IMG_SIZE = data_dict['IMG_SIZE']

print(f"Data loaded successfully.")
print(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")
print(f"Image map size: {len(IMAGE_PATH_MAP)}")

# Step 2: Optimized dataset class that uses the pre-built image path map.

class FastMultimodalDataset(Dataset):
    """Dataset that pairs each property's tabular features with its image.

    Each item is ``(image, tabular, label)`` when labels were supplied, and
    ``(image, tabular, property_id)`` otherwise (e.g. for the test split).
    Properties without a usable image get a uniform grey placeholder.

    Args:
        tabular_data: Object exposing ``.values`` (e.g. a DataFrame) holding
            one row of numeric features per property.
        labels: Object exposing ``.values`` with one target per property, or
            ``None`` when no targets are available.
        property_ids: Sequence of property identifiers, in the same row
            order as ``tabular_data``.
        image_path_map: Mapping from property identifier to image file path.
        transform: Optional callable applied to the PIL image.
    """

    def __init__(self, tabular_data, labels, property_ids, image_path_map, transform=None):
        self.tabular_data = torch.FloatTensor(tabular_data.values)
        self.labels = torch.FloatTensor(labels.values) if labels is not None else None
        # Materialise the IDs as a plain list so __getitem__ indexes them by
        # position, consistent with the positional rows taken from `.values`
        # above. A pandas Series that kept a non-default index would
        # otherwise be looked up by label and return the wrong property (or
        # raise KeyError).
        self.property_ids = list(property_ids)
        self.image_path_map = image_path_map
        self.transform = transform

        # Placeholder used whenever an image is missing or unreadable.
        # IMG_SIZE is the module-level image size.
        self.blank_image = Image.new('RGB', (IMG_SIZE, IMG_SIZE), color=(128, 128, 128))

    def __len__(self):
        return len(self.property_ids)

    def _load_image(self, img_path):
        """Return the RGB image at ``img_path``, or the grey placeholder."""
        if not img_path:
            return self.blank_image
        try:
            # The context manager closes the file handle; convert() returns
            # an independent, fully loaded copy, so it stays valid afterwards.
            with Image.open(img_path) as img:
                return img.convert('RGB')
        except Exception:
            # Missing, truncated or otherwise undecodable file: deliberately
            # best-effort, but unlike a bare `except:` this no longer
            # swallows KeyboardInterrupt / SystemExit.
            return self.blank_image

    def __getitem__(self, idx):
        tabular = self.tabular_data[idx]
        property_id = self.property_ids[idx]

        image = self._load_image(self.image_path_map.get(property_id))
        if self.transform:
            image = self.transform(image)

        if self.labels is not None:
            return image, tabular, self.labels[idx]
        return image, tabular, property_id

# Build one dataset per split
print("\n" + "="*60)
print("CREATING DATASETS")
print("="*60)

# Only the training split uses the training transform.
train_dataset = FastMultimodalDataset(
    tabular_data=X_train,
    labels=y_train,
    property_ids=train_property_ids,
    image_path_map=IMAGE_PATH_MAP,
    transform=train_transform,
)
val_dataset = FastMultimodalDataset(
    tabular_data=X_val,
    labels=y_val,
    property_ids=val_property_ids,
    image_path_map=IMAGE_PATH_MAP,
    transform=val_test_transform,
)
# No labels for the test split, so its items carry the property ID instead.
test_dataset = FastMultimodalDataset(
    tabular_data=X_test,
    labels=None,
    property_ids=test_property_ids,
    image_path_map=IMAGE_PATH_MAP,
    transform=val_test_transform,
)

# Data loaders: identical settings for every split, except that only the
# training data is shuffled.

BATCH_SIZE = 32
NUM_WORKERS = 2

_common_loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    # Pinned host memory only makes sense when batches are copied to a GPU.
    pin_memory=torch.cuda.is_available(),
)

# NOTE(review): if len(train_dataset) % BATCH_SIZE == 1 the final training
# batch has a single sample, which BatchNorm1d layers reject in train mode;
# consider drop_last=True for the training loader -- confirm against the data.
train_loader = DataLoader(train_dataset, shuffle=True, **_common_loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **_common_loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **_common_loader_kwargs)

print(f"Datasets created")
print(f"Batch size: {BATCH_SIZE}")
print(f"Train batches: {len(train_loader)}")

#Step 3: Defining The Model

class MultimodalRealEstateModel(nn.Module):
    """Two-branch regressor that fuses image and tabular features.

    An ImageNet-pretrained ResNet18 (with its final layer removed) feeds a
    small MLP that produces a 128-d image embedding; a second MLP maps the
    tabular features to a 32-d embedding. The two embeddings are concatenated
    and passed through a fusion MLP that outputs one value per sample.

    Args:
        num_tabular_features: Width of the tabular input vector.
        dropout_rate: Dropout probability used in all three MLPs.
    """

    def __init__(self, num_tabular_features, dropout_rate=0.3):
        super().__init__()

        # Image branch (ResNet18)
        backbone = self._load_pretrained_resnet18()
        # Freeze everything except the last 20 parameter tensors. The slice is
        # taken on the full network, i.e. before the final layer is removed
        # below, so that layer's parameters count towards the 20.
        for param in list(backbone.parameters())[:-20]:
            param.requires_grad = False

        num_cnn_features = backbone.fc.in_features
        # Drop the last child module and keep the rest as a feature extractor.
        self.cnn = nn.Sequential(*list(backbone.children())[:-1])

        self.image_fc = nn.Sequential(
            nn.Linear(num_cnn_features, 256),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )

        # Tabular branch
        self.tabular_fc = nn.Sequential(
            nn.Linear(num_tabular_features, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(dropout_rate),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Dropout(dropout_rate),
            nn.Linear(64, 32),
            nn.ReLU()
        )

        # Fusion layer: 128 image features + 32 tabular features -> 1 output
        self.fusion = nn.Sequential(
            nn.Linear(128 + 32, 64),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(32, 1)
        )

    @staticmethod
    def _load_pretrained_resnet18():
        """Return an ImageNet-pretrained ResNet18 on old and new torchvision."""
        weights_enum = getattr(models, 'ResNet18_Weights', None)
        if weights_enum is None:
            # torchvision < 0.13 only understands the legacy flag.
            return models.resnet18(pretrained=True)
        # `pretrained=True` is deprecated since torchvision 0.13; these are
        # the weights that flag used to select.
        return models.resnet18(weights=weights_enum.IMAGENET1K_V1)

    def forward(self, image, tabular):
        """Return a ``(batch, 1)`` prediction for image and tabular batches."""
        img_features = self.cnn(image)
        # Collapse every non-batch dimension into a single feature axis.
        img_features = torch.flatten(img_features, 1)
        img_features = self.image_fc(img_features)

        tab_features = self.tabular_fc(tabular)
        combined = torch.cat([img_features, tab_features], dim=1)
        return self.fusion(combined)

# Instantiate the model with one input per tabular column and move it to the
# selected device (Module.to returns the module itself).
num_features = X_train.shape[1]
model = MultimodalRealEstateModel(num_features, dropout_rate=0.3).to(device)

trainable_param_count = sum(
    param.numel() for param in model.parameters() if param.requires_grad
)
print("\nModel initialized")
print(f"Trainable params: {trainable_param_count:,}")

# Step 4: Training setup

criterion = nn.MSELoss()

# One parameter group per sub-network, with the smallest learning rate on the
# CNN backbone and the largest on the tabular and fusion layers.
param_groups = [
    {'params': model.cnn.parameters(), 'lr': 1e-5},
    {'params': model.image_fc.parameters(), 'lr': 1e-4},
    {'params': model.tabular_fc.parameters(), 'lr': 1e-3},
    {'params': model.fusion.parameters(), 'lr': 1e-3},
]
optimizer = optim.Adam(param_groups, weight_decay=1e-5)

# Halve the learning rates when the monitored value (lower is better) stops
# improving, with a patience of 3 epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=3,
)

# Step5: Training Functions

def train_epoch(model, loader, criterion, optimizer, device):
    """Train ``model`` for one pass over ``loader`` and return the mean loss.

    Args:
        model: Module called as ``model(images, tabular)``.
        loader: Iterable yielding ``(images, tabular, labels)`` batches.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        The per-sample average of the batch losses over the samples that
        were actually processed.

    Raises:
        ZeroDivisionError: If ``loader`` yields no samples.
    """
    model.train()
    running_loss = 0.0
    samples_seen = 0

    pbar = tqdm(loader, desc="Training")
    for images, tabular, labels in pbar:
        images = images.to(device, non_blocking=True)
        tabular = tabular.to(device, non_blocking=True)
        # Labels arrive as a 1-D batch; add a trailing axis to match the
        # model's (batch, 1) output.
        labels = labels.to(device, non_blocking=True).unsqueeze(1)

        optimizer.zero_grad()
        outputs = model(images, tabular)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Read the scalar once and reuse it for the sum and the progress bar.
        batch_size = images.size(0)
        batch_loss = loss.item()
        running_loss += batch_loss * batch_size
        samples_seen += batch_size
        pbar.set_postfix({'loss': batch_loss})

    # Normalise by the samples seen rather than len(loader.dataset): the two
    # differ when the loader uses a sampler or drop_last.
    return running_loss / samples_seen

def validate(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Args:
        model: Module called as ``model(images, tabular)``.
        loader: Iterable yielding ``(images, tabular, labels)`` batches.
        criterion: Loss callable taking ``(outputs, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        A tuple ``(epoch_loss, rmse, mae, r2, predictions, actuals)`` where
        ``epoch_loss`` is the per-sample average loss over the samples that
        were processed, and ``predictions`` / ``actuals`` are flat lists with
        one entry per sample, in loader order.

    Raises:
        ZeroDivisionError: If ``loader`` yields no samples.
    """
    model.eval()
    running_loss = 0.0
    samples_seen = 0
    predictions = []
    actuals = []

    with torch.no_grad():
        for images, tabular, labels in tqdm(loader, desc="Validating"):
            images = images.to(device, non_blocking=True)
            tabular = tabular.to(device, non_blocking=True)
            # Match the model's (batch, 1) output shape.
            labels = labels.to(device, non_blocking=True).unsqueeze(1)

            outputs = model(images, tabular)
            loss = criterion(outputs, labels)

            batch_size = images.size(0)
            running_loss += loss.item() * batch_size
            samples_seen += batch_size
            predictions.extend(outputs.cpu().numpy().flatten())
            actuals.extend(labels.cpu().numpy().flatten())

    # Normalise by the samples seen rather than len(loader.dataset): the two
    # differ when the loader uses a sampler or drop_last.
    epoch_loss = running_loss / samples_seen
    rmse = np.sqrt(mean_squared_error(actuals, predictions))
    mae = mean_absolute_error(actuals, predictions)
    r2 = r2_score(actuals, predictions)

    return epoch_loss, rmse, mae, r2, predictions, actuals

# Step 6: Training loop with best-model checkpointing and early stopping

print("\n" + "="*60)
print("STARTING TRAINING")
print("="*60)

NUM_EPOCHS = 50
EARLY_STOP_PATIENCE = 10

best_val_rmse = float('inf')
# Defined up front so the final summary cannot raise NameError when no epoch
# ever improves on the initial value (e.g. the validation RMSE is NaN).
best_val_r2 = float('nan')
patience_counter = 0
train_losses = []
val_losses = []
val_rmses = []
val_r2s = []

# The checkpoint location does not change between epochs.
model_path = os.path.join(PROJECT_DIR, 'best_model.pth')

for epoch in range(NUM_EPOCHS):
    print(f"\nEpoch {epoch+1}/{NUM_EPOCHS}")
    print("-" * 60)

    train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    train_losses.append(train_loss)

    val_loss, val_rmse, val_mae, val_r2, val_preds, val_actuals = validate(
        model, val_loader, criterion, device
    )
    val_losses.append(val_loss)
    val_rmses.append(val_rmse)
    val_r2s.append(val_r2)

    print(f"\nTrain Loss: {train_loss:.4f}")
    print(f"Val Loss: {val_loss:.4f} | RMSE: ${val_rmse:,.2f} | MAE: ${val_mae:,.2f} | R²: {val_r2:.4f}")

    # The scheduler monitors the validation RMSE.
    scheduler.step(val_rmse)

    if val_rmse < best_val_rmse:
        # New best: remember the metrics and overwrite the checkpoint.
        best_val_rmse = val_rmse
        best_val_r2 = val_r2
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_rmse': val_rmse,
            'val_r2': val_r2
        }, model_path)
        print(f"Best model saved! (RMSE: ${val_rmse:,.2f}, R²: {val_r2:.4f})")
        patience_counter = 0
    else:
        patience_counter += 1

    # Stop once the RMSE has failed to improve for EARLY_STOP_PATIENCE
    # consecutive epochs.
    if patience_counter >= EARLY_STOP_PATIENCE:
        print(f"\nEarly stopping at epoch {epoch+1}")
        break

print("\n" + "="*60)
print(f"TRAINING COMPLETE!")
print(f"Best RMSE: ${best_val_rmse:,.2f}")
print(f"Best R²: {best_val_r2:.4f}")
print("="*60)
