In [1]:
# libraries

import pandas as pd
import numpy as np
import os

from sklearn.preprocessing import StandardScaler

from PIL import Image
from tqdm import tqdm

import joblib
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# copy of training images

!cp -r /content/drive/MyDrive/cdc_project/train_images/train /content/train_images


In [4]:
# copy of testing images

!cp -r /content/drive/MyDrive/cdc_project/test_images/test /content/test_images

In [5]:
# data paths :

BASE_PATH = "/content/drive/MyDrive/cdc_project"

TRAIN_CSV_PREPROCESSED = f"{BASE_PATH}/train_preprocessed.csv"
TEST_CSV_PREPROCESSED = f"{BASE_PATH}/test_preprocessed.csv"

TRAIN_IMG_PATH = "/content/train_images"
TEST_IMG_PATH = "/content/test_images"

In [6]:
df_train = pd.read_csv(TRAIN_CSV_PREPROCESSED)
df_test = pd.read_csv(TEST_CSV_PREPROCESSED)

df_train["id"] = df_train["id"].astype(str)
df_test["id"] = df_test["id"].astype(str)

In [7]:
FEATURE_COLS = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade', 'sqft_above',
    'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
    'sqft_living15', 'sqft_lot15', 'sqft_per_bed', 'bath_per_bed',
    'lot_living_ratio', 'basements_ratio', 'above_floors_ratio',
    'living_density', 'lot_density', 'year_sale', 'month_sale',
    'quarter_sale', 'days_since_start', 'renovation',
    'years_since_renovation'
]

# training input features and output prices
x = df_train[FEATURE_COLS].values
y = df_train["price"].values
ids_train = df_train["id"].values

scaler_x = StandardScaler()
x_scaled = scaler_x.fit_transform(x)

x_test_scaled = scaler_x.transform(df_test[FEATURE_COLS].values)
ids_test = df_test["id"].values

y_log = np.log1p(y)

y_mean = y_log.mean()
y_std = y_log.std()

y_scaled_train = (y_log - y_mean) / y_std


In [8]:
img_tfms = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean = [0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

In [9]:
class HouseDataset(Dataset):

  def __init__(self, x_tab, ids, image_dir, transform, y=None):
    self.x_tab = x_tab
    self.y = y
    self.ids = ids
    self.image_dir = image_dir
    self.transform = transform

  def __len__(self):
    return len(self.ids)

  def __getitem__(self, idx):

    x = torch.tensor(self.x_tab[idx], dtype = torch.float32)

    img_path = os.path.join(self.image_dir, f"{self.ids[idx]}.png")
    image = Image.open(img_path).convert("RGB")
    image = self.transform(image)

    if self.y is not None:
      y = torch.tensor(self.y[idx], dtype = torch.float32)
      return image, x, y

    else:
      return image, x, self.ids[idx]


In [10]:
train_ds = HouseDataset(x_scaled, ids_train, TRAIN_IMG_PATH, img_tfms, y_scaled_train)
test_ds = HouseDataset(x_test_scaled, ids_test, TEST_IMG_PATH, img_tfms)

train_loader = DataLoader(train_ds, batch_size = 32, shuffle = True, num_workers=2)
test_loader = DataLoader(test_ds, batch_size = 32, shuffle = False, num_workers = 2)

In [11]:
class MultiModalRegressor(nn.Module):

  def __init__(self, tab_dim):
    super().__init__()

    self.cnn = models.resnet18(pretrained = True)
    self.cnn.fc = nn.Identity()

    for p in self.cnn.parameters():
      p.requires_grad = False


    self.mlp = nn.Sequential(
        nn.Linear(tab_dim, 128),
        nn.ReLU(),
        nn.BatchNorm1d(128),
        nn.Dropout(0.3)
    )

    self.regressor = nn.Sequential(
        nn.Linear(128 + 512, 128),
        nn.ReLU(),
        nn.Linear(128,1)
    )
  def forward(self, image, tabular):
    img_feat = self.cnn(image)
    tab_feat = self.mlp(tabular)

    x = torch.cat([img_feat, tab_feat], dim = 1)

    return self.regressor(x).squeeze(1)


In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = MultiModalRegressor( tab_dim = x_scaled.shape[1]).to(device)

criterion = nn.MSELoss()

optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)




Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:00<00:00, 151MB/s]


In [17]:
EPOCH = 5
epoch_loss = 0

for epoch in range(EPOCH):
  model.train()

  for img, x_tab, y in train_loader:

    img, x_tab, y = img.to(device), x_tab.to(device), y.to(device)

    optimizer.zero_grad()
    pred = model(img, x_tab)
    loss = criterion(pred, y)
    loss.backward()
    optimizer.step()
    epoch_loss += loss.item()

  epoch_loss /= len(train_loader)
  print(f"Epoch {epoch+1} | Avg Loss: {epoch_loss:.4f}")

Epoch 1 | Avg Loss: 0.1373
Epoch 2 | Avg Loss: 0.1377
Epoch 3 | Avg Loss: 0.1326
Epoch 4 | Avg Loss: 0.1286
Epoch 5 | Avg Loss: 0.1327


In [18]:
SAVE_DIR = "/content/drive/MyDrive/cdc_project/models_current"
os.makedirs(SAVE_DIR, exist_ok = True)

torch.save(model.state_dict(), f"{SAVE_DIR}/multimodal_model.pth")
joblib.dump(scaler_x, f"{SAVE_DIR}/tabular_scaler.pkl")

np.savez(
    f"{SAVE_DIR}/target_norm.npz",
    y_mean = y_mean,
    y_std = y_std
)

In [19]:
model.eval()

all_preds = []
all_ids = []

with torch.no_grad():
  for img, x_tab_test, ids in test_loader:
    img, x_tab_test = img.to(device), x_tab_test.to(device)

    pred = model(img, x_tab_test)
    all_preds.append(pred.cpu().numpy())
    all_ids.extend(ids)

all_preds = np.concatenate(all_preds)

preds_log = all_preds * y_std + y_mean

preds_orig = np.expm1(preds_log)

In [20]:
submission_df = pd.DataFrame({
    "id": all_ids,
    "predicted_price": preds_orig
})

submission_df.to_csv("/content/drive/MyDrive/cdc_project/predictions_final/sub_df.csv", index = False)

In [21]:
df_test.shape


(5404, 33)

In [22]:
submission_df.shape

(5404, 2)

In [23]:
submission_df.describe()

Unnamed: 0,predicted_price
count,5404.0
mean,517652.4
std,336259.5
min,100393.1
25%,326905.4
50%,451266.6
75%,612326.9
max,6470630.0
