In [1]:
# Before running this cell, mount Google Drive in Google Colab.

#DATA PREPROCESSING

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from torchvision import transforms
from PIL import Image
import os
import pickle
from tqdm import tqdm

# Step 1: Data loading

print("=" * 60)
print("LOADING DATA")
print("=" * 60)

# Project root on the mounted Drive; the image folder is a sub-directory of it.
PROJECT_DIR = '/content/drive/MyDrive/Real_Estate_Project/'
IMAGE_DIR = os.path.join(PROJECT_DIR, 'images/')

# Resolve both spreadsheet locations up front, then read them into DataFrames.
train_xlsx_path = os.path.join(PROJECT_DIR, 'train(1).xlsx')
test_xlsx_path = os.path.join(PROJECT_DIR, 'test2.xlsx')
train_df = pd.read_excel(train_xlsx_path)
test_df = pd.read_excel(test_xlsx_path)

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"Image directory: {IMAGE_DIR}")

# Step 2: Image mapping

print("\n" + "=" * 60)
print("BUILDING IMAGE PATH MAPPING")
print("=" * 60)

def build_image_path_map(img_dir):
    """Map integer property ids to the image files found directly in ``img_dir``.

    A directory entry is mapped when it is a regular file and its name, minus
    the final extension, parses with ``int()``. That integer is the key and the
    value is ``os.path.join(img_dir, filename)``. Entries whose stem is not an
    integer are skipped, and so are sub-directories (a folder named ``123``
    is not an image and must not be handed to an image loader later).

    Entries are visited in sorted filename order, so when several files share
    a stem (e.g. ``12.jpg`` and ``12.png``) the alphabetically last one wins on
    every run instead of whichever one the OS happens to list last.

    Args:
        img_dir: Directory to scan. The scan is not recursive.

    Returns:
        dict mapping ``int`` property id to ``str`` file path.

    Raises:
        OSError: propagated from ``os.scandir`` when ``img_dir`` cannot be
            listed (for example, it does not exist).
    """
    image_map = {}

    # scandir exposes the entry type alongside the name, so filtering out
    # directories does not need a separate os.path.isfile() call per entry.
    with os.scandir(img_dir) as entries:
        filenames = sorted(entry.name for entry in entries if entry.is_file())

    for filename in tqdm(filenames, desc="Mapping images"):
        base_name = os.path.splitext(filename)[0]

        try:
            property_id = int(base_name)
        except ValueError:
            # Not an "<id>.<ext>" file; ignore it.
            continue

        image_map[property_id] = os.path.join(img_dir, filename)

    return image_map

# Build the id -> path lookup once; the filtering below reads from it.
IMAGE_PATH_MAP = build_image_path_map(IMAGE_DIR)
print(f"Mapped {len(IMAGE_PATH_MAP)} images")

# Step 3: Verifying images using the map

print("\n" + "=" * 60)
print("VERIFYING IMAGES")
print("=" * 60)

# Boolean masks: True for rows whose 'id' has an entry in the image map.
train_has_image = train_df['id'].isin(IMAGE_PATH_MAP.keys())
test_has_image = test_df['id'].isin(IMAGE_PATH_MAP.keys())

# Independent copies, so later column assignments do not touch the raw frames.
train_with_images = train_df.loc[train_has_image].copy()
test_with_images = test_df.loc[test_has_image].copy()

print(f"\nTrain images:")
print(f"Found: {len(train_with_images)} / {len(train_df)}")
print(f"Missing: {len(train_df) - len(train_with_images)}")

print(f"\nTest images:")
print(f"Found: {len(test_with_images)} / {len(test_df)}")
print(f"Missing: {len(test_df) - len(test_with_images)}")

# From here on only properties that have an image are kept.
train_df = train_with_images
test_df = test_with_images

print(f"\nFiltered to properties with images")
print(f"Final train size: {len(train_df)}")
print(f"Final test size: {len(test_df)}")

# Step 4: Handling missing values

print("\n" + "="*60)
print("HANDLING MISSING VALUES")
print("="*60)

# Report which train columns contain NaNs before anything is filled.
missing_train = train_df.isnull().sum()
if missing_train.sum() > 0:
    print("\nMissing values in train:")
    print(missing_train[missing_train > 0])
else:
    print("No missing values!")

# These columns are filled with 0 when missing.
# The filled Series is assigned back to the frame: calling
# `df[col].fillna(..., inplace=True)` operates on a temporary object, which
# raises a FutureWarning on pandas 2.x and leaves `df` unchanged once
# Copy-on-Write is enabled. Each frame is checked separately so a column that
# exists in only one of them does not raise KeyError.
for col in ('view', 'sqft_basement', 'waterfront'):
    for frame in (train_df, test_df):
        if col in frame.columns:
            frame[col] = frame[col].fillna(0)

# Remaining numeric gaps are filled with the TRAIN median (never a test-set
# statistic). The test frame is filled whenever it has NaNs in the column,
# even if the train column itself is complete; otherwise those NaNs would
# flow straight into the scaler.
numeric_cols = train_df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    median_val = train_df[col].median()
    if train_df[col].isnull().any():
        train_df[col] = train_df[col].fillna(median_val)
    if col in test_df.columns and test_df[col].isnull().any():
        test_df[col] = test_df[col].fillna(median_val)

print("Missing values handled successfully.")

# Step 5: Feature engineering

print("\n" + "="*60)
print("FEATURE ENGINEERING")
print("="*60)

def create_features(df, reference_year=2024, center=None):
    """Add engineered columns to ``df`` and return it.

    ``df`` is modified in place (new columns are assigned onto it) and the
    same object is returned. Every group of features is only created when the
    columns it needs are present, so the function works on frames that carry
    only a subset of the expected columns.

    Columns added (when their inputs exist):
        * ``age``, ``age_squared`` from ``yr_built``
        * ``is_renovated``, ``years_since_renovation`` from ``yr_renovated``
          (rows with ``yr_renovated`` not > 0 fall back to ``age``, or to 0
          when ``age`` is unavailable)
        * ``living_lot_ratio``, ``above_living_ratio``, ``has_basement``,
          ``basement_ratio``, ``bed_bath_ratio``, ``total_rooms``,
          ``sqft_per_bedroom``, ``living_vs_neighbors``, ``lot_vs_neighbors``
          (the small constants added to the denominators avoid division by zero)
        * ``quality_score`` = ``grade * condition``
        * ``is_premium`` (waterfront == 1 or view >= 3), ``is_luxury``
          (grade >= 11), ``is_multi_floor`` (floors > 1)
        * ``distance_from_center`` from ``lat`` / ``long``

    Args:
        df: pandas DataFrame to extend.
        reference_year: Year that ``age`` and ``years_since_renovation`` are
            measured from. Defaults to 2024, the value previously hard-coded.
        center: Optional ``(lat, long)`` pair used as the origin for
            ``distance_from_center``. When ``None`` (default) the mean
            ``lat`` / ``long`` of ``df`` itself is used. Pass the training
            set's centroid when transforming validation/test data so every
            split is measured from the same point.

    Returns:
        The same DataFrame object, with the new columns added.
    """

    def has(*names):
        # Checked against the live column index so columns added earlier in
        # this call (e.g. 'age') are visible to later steps.
        return all(name in df.columns for name in names)

    if has('yr_built'):
        df['age'] = reference_year - df['yr_built']
        df['age_squared'] = df['age'] ** 2

    if has('yr_renovated'):
        renovated = df['yr_renovated'] > 0
        df['is_renovated'] = renovated.astype(int)
        df['years_since_renovation'] = np.where(
            renovated,
            reference_year - df['yr_renovated'],
            df['age'] if has('age') else 0
        )

    if has('sqft_living', 'sqft_lot'):
        df['living_lot_ratio'] = df['sqft_living'] / (df['sqft_lot'] + 1)

    if has('sqft_above', 'sqft_living'):
        df['above_living_ratio'] = df['sqft_above'] / (df['sqft_living'] + 1)

    if has('sqft_basement', 'sqft_living'):
        df['has_basement'] = (df['sqft_basement'] > 0).astype(int)
        df['basement_ratio'] = df['sqft_basement'] / (df['sqft_living'] + 1)

    if has('bedrooms', 'bathrooms'):
        df['bed_bath_ratio'] = df['bedrooms'] / (df['bathrooms'] + 0.5)
        df['total_rooms'] = df['bedrooms'] + df['bathrooms']

    if has('bedrooms', 'sqft_living'):
        df['sqft_per_bedroom'] = df['sqft_living'] / (df['bedrooms'] + 1)

    if has('sqft_living15', 'sqft_living'):
        df['living_vs_neighbors'] = df['sqft_living'] / (df['sqft_living15'] + 1)

    if has('sqft_lot15', 'sqft_lot'):
        df['lot_vs_neighbors'] = df['sqft_lot'] / (df['sqft_lot15'] + 1)

    if has('grade', 'condition'):
        df['quality_score'] = df['grade'] * df['condition']

    if has('waterfront', 'view'):
        df['is_premium'] = ((df['waterfront'] == 1) | (df['view'] >= 3)).astype(int)

    if has('grade'):
        df['is_luxury'] = (df['grade'] >= 11).astype(int)

    if has('lat', 'long'):
        if center is None:
            center_lat = df['lat'].mean()
            center_long = df['long'].mean()
        else:
            center_lat, center_long = center
        # Straight-line distance in degree space, not a geodesic distance.
        df['distance_from_center'] = np.sqrt(
            (df['lat'] - center_lat)**2 + (df['long'] - center_long)**2
        )

    if has('floors'):
        df['is_multi_floor'] = (df['floors'] > 1).astype(int)

    return df

train_df = create_features(train_df)
test_df = create_features(test_df)

# create_features() measures 'distance_from_center' from the mean lat/long of
# whichever frame it is given, so the test set would otherwise be measured
# from its own centroid rather than the one the model is trained against.
# Recompute the test column from the training centroid so both sets share a
# single reference point (and no test-set statistic enters the features).
if all(c in train_df.columns and c in test_df.columns for c in ('lat', 'long')):
    train_center_lat = train_df['lat'].mean()
    train_center_long = train_df['long'].mean()
    test_df['distance_from_center'] = np.sqrt(
        (test_df['lat'] - train_center_lat)**2
        + (test_df['long'] - train_center_long)**2
    )

print(f"Features engineered successfully.")
print(f"New train shape: {train_df.shape}")
print(f"New test shape: {test_df.shape}")

# Step 6: Handling outliers

print("\n" + "="*60)
print("HANDLING OUTLIERS")
print("="*60)

def remove_outliers(df, column, n_std=3):
    """Keep only rows whose ``column`` value is within ``n_std`` standard deviations of the mean.

    Both bounds are inclusive. Rows where ``column`` is NaN fail the comparison
    and are therefore dropped. The original index labels are preserved.

    Args:
        df: pandas DataFrame to filter.
        column: Name of the numeric column to test.
        n_std: Half-width of the accepted band, in standard deviations.

    Returns:
        The filtered DataFrame (a selection of ``df``; ``df`` is not modified).
    """
    values = df[column]
    center = values.mean()
    half_width = n_std * values.std()
    # Series.between is inclusive on both ends by default, matching >= / <=.
    within_band = values.between(center - half_width, center + half_width)
    return df[within_band]

# Drop price outliers first and report how many rows that removed.
original_size = len(train_df)
train_df = remove_outliers(train_df, 'price', n_std=3)
print(f"Removed {original_size - len(train_df)} price outliers")

# Then drop living-area outliers from what is left.
train_df = remove_outliers(train_df, 'sqft_living', n_std=3)
print(f"Final train shape: {train_df.shape}")

# Step 7: Feature selection

print("\n" + "=" * 60)
print("FEATURE SELECTION")
print("=" * 60)

# Identifier, target and raw date/year columns are not used as model inputs.
exclude_cols = ['id', 'price', 'date', 'yr_built', 'yr_renovated', 'price_per_sqft']
all_cols = train_df.select_dtypes(include=[np.number]).columns

# Keep numeric train columns that are not excluded and also exist in test.
feature_cols = [
    col
    for col in all_cols
    if col not in exclude_cols and col in test_df.columns
]

print(f"Selected {len(feature_cols)} features")

X_train_full = train_df.loc[:, feature_cols].copy()
y_train_full = train_df.loc[:, 'price'].copy()
X_test = test_df.loc[:, feature_cols].copy()

# Property ids in row order, used later to look up each row's image.
train_full_property_ids = train_df['id'].tolist()
test_property_ids = test_df['id'].tolist()

# Step 8: Training-validation split (80:20)

print("\n" + "=" * 60)
print("TRAIN-VALIDATION SPLIT (80:20)")
print("=" * 60)

# Fixed seed so the same rows land in each split on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

print(f"Training set: {len(X_train)} ({len(X_train)/len(X_train_full)*100:.1f}%)")
print(f"Validation set: {len(X_val)} ({len(X_val)/len(X_train_full)*100:.1f}%)")

# The split keeps the original index labels, so they can be used to pull the
# matching property ids (in split order) back out of train_df.
id_column = train_df['id']
train_property_ids = id_column.loc[X_train.index].tolist()
val_property_ids = id_column.loc[X_val.index].tolist()

print(f"Sample train IDs: {train_property_ids[:5]}")
print(f"Sample val IDs: {val_property_ids[:5]}")


# Step 9: Feature scaling

print("\n" + "=" * 60)
print("FEATURE SCALING")
print("=" * 60)

scaler = StandardScaler()

# Fit on the training split only; validation and test reuse those statistics.
# Each result is wrapped back into a DataFrame so column names and the
# original row index survive the transform.
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=feature_cols,
    index=X_train.index,
)
X_val_scaled = pd.DataFrame(
    scaler.transform(X_val),
    columns=feature_cols,
    index=X_val.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=feature_cols,
    index=X_test.index,
)

print("Features scaled using StandardScaler")

# Persist the fitted scaler next to the other project artefacts.
scaler_path = os.path.join(PROJECT_DIR, 'scaler.pkl')
with open(scaler_path, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)
print(f"Scaler saved")


# Step 10: Image preprocessing

print("\n" + "=" * 60)
print("IMAGE PREPROCESSING SETUP")
print("=" * 60)

IMG_SIZE = 224

# Per-channel normalisation constants shared by both pipelines.
# NOTE(review): these match the widely used ImageNet statistics, presumably
# for a pretrained backbone -- confirm against the model notebook.
NORM_MEAN = [0.485, 0.456, 0.406]
NORM_STD = [0.229, 0.224, 0.225]

# Training pipeline: resize, light random augmentation, tensor, normalise.
train_transform = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.RandomHorizontalFlip(p=0.3),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=list(NORM_MEAN), std=list(NORM_STD)),
])

# Validation/test pipeline: same resize and normalisation, no randomness.
val_test_transform = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=list(NORM_MEAN), std=list(NORM_STD)),
])

print(f"Image size: {IMG_SIZE}x{IMG_SIZE}")
print("Transforms defined")

# Step 11: Saving preprocessed data with the image map

print("\n" + "=" * 60)
print("SAVING PREPROCESSED DATA")
print("=" * 60)

preprocessed_path = os.path.join(PROJECT_DIR, 'preprocessed_data.pkl')

# Everything a downstream notebook needs, bundled into one picklable dict.
data_dict = {
    # scaled tabular splits and targets
    'X_train': X_train_scaled,
    'y_train': y_train,
    'X_val': X_val_scaled,
    'y_val': y_val,
    'X_test': X_test_scaled,
    'feature_cols': feature_cols,
    # property ids, in the same row order as the splits above
    'train_property_ids': train_property_ids,
    'val_property_ids': val_property_ids,
    'test_property_ids': test_property_ids,
    # image lookup and transform pipelines
    'IMAGE_PATH_MAP': IMAGE_PATH_MAP,
    'train_transform': train_transform,
    'val_test_transform': val_test_transform,
    # configuration
    'IMAGE_DIR': IMAGE_DIR,
    'PROJECT_DIR': PROJECT_DIR,
    'IMG_SIZE': IMG_SIZE
}

with open(preprocessed_path, 'wb') as out_file:
    pickle.dump(data_dict, out_file)

print("Preprocessed data saved!")
print(f"Location: {preprocessed_path}")

# Step 12: Print a preprocessing summary.
# Read-only report: nothing below modifies the data produced by earlier steps.

print("\n" + "="*60)
print("PREPROCESSING SUMMARY")
print("="*60)

# Row counts per split, number of selected features, and number of mapped images.
print(f"\n Dataset Statistics:")
print(f"  - Training samples: {len(X_train_scaled)}")
print(f"  - Validation samples: {len(X_val_scaled)}")
print(f"  - Test samples: {len(X_test_scaled)}")
print(f"  - Features: {len(feature_cols)}")
print(f"  - Images mapped: {len(IMAGE_PATH_MAP)}")

# Target statistics are taken from the training split only (y_train).
print(f"\n Price Statistics:")
print(f"  - Mean: ${y_train.mean():,.2f}")
print(f"  - Median: ${y_train.median():,.2f}")

print("\n PREPROCESSING COMPLETE!")

KeyboardInterrupt: 