In [None]:
import pandas as pd

# Load the raw listings CSV (path is relative to the notebook's working directory).
df = pd.read_csv('house_prices.csv')
# Last expression of the cell: shows the first rows as a quick sanity check.
df.head()

In [None]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
# Global seaborn plot style for any figures drawn in this notebook.
sns.set(style="whitegrid")



# STEP 1: Load dataset
# The file is re-read here so this cell starts from the raw CSV contents.
df = pd.read_csv('house_prices.csv')

# STEP 2: Parse price column
def parse_amount(x):
    """Convert a raw amount string into a numeric value.

    Parameters
    ----------
    x : object
        Raw cell value. Commas are removed and the text is lower-cased
        before parsing.

    Returns
    -------
    float
        * text containing 'cr' (crore)        -> first number * 1e7
        * text containing 'lac' or 'lakh'     -> first number * 1e5
        * anything else                       -> ``float(text)``
        * ``np.nan`` when the value is missing or no number can be parsed.
    """
    if pd.isna(x):
        return np.nan

    text = str(x).lower().replace(',', '').strip()

    if 'cr' in text:
        multiplier = 1e7
    elif 'lac' in text or 'lakh' in text:
        multiplier = 1e5
    else:
        # No unit token: the whole string must be a plain number.
        try:
            return float(text)
        except ValueError:
            return np.nan

    # Match a well-formed number only, so a stray '.' (as in "rs. 2 cr")
    # or a unit with no digits yields NaN instead of raising.
    match = re.search(r'\d+(?:\.\d+)?|\.\d+', text)
    if match is None:
        return np.nan
    return float(match.group()) * multiplier

# Turn the raw amount strings into a numeric price column.
df['price'] = df['Amount(in rupees)'].apply(parse_amount)

# Remove extreme outliers: keep rows whose price lies between the 1st and
# 99th percentile (both bounds inclusive). Rows with a NaN price fail both
# comparisons and are removed as well.
q_low = df['price'].quantile(0.01)
q_high = df['price'].quantile(0.99)
# .copy() makes the filtered frame independent of the original, so the column
# assignments performed later do not raise SettingWithCopyWarning.
df = df[(df['price'] >= q_low) & (df['price'] <= q_high)].copy()




In [None]:

# STEP 3: Extract numeric features

# Area columns: drop every character that is not a digit or '.', then parse.
# The same cleaning is applied to all three area columns so that values with
# unit text are not silently coerced to NaN (and later filled with 0).
# NOTE(review): assumes 'Super Area' / 'Plot Area' use the same text format as
# 'Carpet Area' - confirm against the CSV. Already-numeric values are unchanged.
# NOTE(review): no unit conversion is done; the "_sqft" suffix is only valid if
# every row is expressed in square feet - verify.
area_sources = {
    'carpet_area_sqft': 'Carpet Area',
    'Super Area': 'Super Area',
    'Plot Area': 'Plot Area',
}
for target_col, source_col in area_sources.items():
    digits_only = df[source_col].astype(str).str.replace('[^0-9.]', '', regex=True)
    df[target_col] = pd.to_numeric(digits_only, errors='coerce')

# Bedroom count is taken from the listing title ("<n> BHK").
df['bhk'] = pd.to_numeric(df['Title'].str.extract(r'(\d+)\s*BHK', expand=False), errors='coerce')

# Floor is parsed from "<current> out of <total>". Values that do not match
# this numeric pattern become NaN and are removed by the dropna in STEP 5.
floor_split = df['Floor'].str.extract(r'(\d+)\s*out\s*of\s*(\d+)', expand=False)
df['floor_current'] = pd.to_numeric(floor_split[0], errors='coerce')
df['floor_total'] = pd.to_numeric(floor_split[1], errors='coerce')

# Non-numeric entries in these count columns are coerced to NaN.
df['Bathroom'] = pd.to_numeric(df['Bathroom'], errors='coerce')
df['Balcony'] = pd.to_numeric(df['Balcony'], errors='coerce')

# has_parking = 1 when the text mentions "covered"/"open" or is a bare number.
# Missing values stringify to 'nan', which matches neither rule, so they map to 0.
parking_text = df['Car Parking'].astype(str)
mentions_parking = parking_text.str.lower().str.contains('covered|open', regex=True)
is_plain_count = parking_text.str.strip().str.isdigit()
df['has_parking'] = (mentions_parking | is_plain_count).astype(int)


# STEP 4: Handle categorical features
cat_cols = [
    'location', 'Status', 'Transaction', 'Furnishing',
    'facing', 'Ownership', 'overlooking', 'Society',
]

# Stringify, trim whitespace, and label stringified missing values as 'Unknown'.
for c in cat_cols:
    df[c] = df[c].astype(str).str.strip().replace('nan', 'Unknown')

# Reduce rare categories to 'Other' for speed: only the 50 most frequent
# values of each high-cardinality column are kept as-is.
for c in ['location', 'Society']:
    top_values = df[c].value_counts().nlargest(50).index
    df[c] = df[c].where(df[c].isin(top_values), 'Other')

# STEP 5: Drop rows with essential missing values
df = df.dropna(
    subset=[
        'price', 'carpet_area_sqft', 'bhk',
        'floor_current', 'floor_total', 'Bathroom',
    ]
)

# STEP 6: Prepare features and target
features = ['carpet_area_sqft','bhk','floor_current','floor_total','Bathroom','Balcony','has_parking','Super Area','Plot Area'] + cat_cols
X = df[features]
y = df['price']

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The split frames are modified below; explicit copies keep those assignments
# from raising SettingWithCopyWarning and from touching X.
X_train = X_train.copy()
X_test = X_test.copy()

# STEP 7: Fill NaNs and convert categorical

# Numeric columns: missing values become 0.
num_cols = X_train.select_dtypes(include=['float64','int64']).columns
X_train[num_cols] = X_train[num_cols].fillna(0)
X_test[num_cols] = X_test[num_cols].fillna(0)

# Categorical columns: assign one column at a time so the dtype change is
# applied regardless of pandas version, and give the test split the SAME
# category set as the training split. Test values never seen in training
# become NaN (missing) instead of receiving an unrelated category code.
for c in cat_cols:
    X_train[c] = X_train[c].fillna('Unknown').astype('category')
    X_test[c] = pd.Categorical(
        X_test[c].fillna('Unknown'),
        categories=X_train[c].cat.categories,
    )

# STEP 8: Define Models
# Maps a display name to an unfitted regressor. Insertion order is the order
# in which the models are trained and reported. Every model uses
# random_state=42 for reproducibility and n_jobs=-1 to use all CPU cores.
# LGBMRegressor, RandomForestRegressor and XGBRegressor must be imported in
# the notebook's import cell for this cell to run on a fresh kernel.
models = {}

# Gradient boosting (LightGBM).
models["LightGBM"] = LGBMRegressor(
    objective='regression',
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=31,
    n_jobs=-1,
    random_state=42
)

# Bagged trees (scikit-learn).
models["RandomForest"] = RandomForestRegressor(
    n_estimators=200,
    max_depth=20,
    random_state=42,
    n_jobs=-1
)

# Gradient boosting (XGBoost) with row and column subsampling.
models["XGBoost"] = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

# STEP 9: Train, Predict & Evaluate Each Model
results = []

# One-hot encode once for the models that are not given categorical columns
# directly; the encoding is identical for each of them.
X_train_enc = pd.get_dummies(X_train, columns=cat_cols, drop_first=True)
# Encode the test split WITHOUT drop_first and align it to the training
# columns. Dropping the first level independently on the test split could
# remove a different level than on the training split, which after reindexing
# would encode a real category as all zeros.
X_test_enc = pd.get_dummies(X_test, columns=cat_cols).reindex(
    columns=X_train_enc.columns, fill_value=0
)

for name, model in models.items():
    print(f"\n Training {name}...")

    # LightGBM is given the categorical columns directly, others use the encoding
    if name == "LightGBM":
        model.fit(X_train, y_train, categorical_feature=cat_cols)
        y_pred = model.predict(X_test)
    else:
        model.fit(X_train_enc, y_train)
        y_pred = model.predict(X_test_enc)

    # Error metrics on the held-out test split.
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    results.append([name, mae, rmse, r2])
    print(f" {name} - MAE: ₹{mae:,.0f}, RMSE: ₹{rmse:,.0f}, R²: {r2:.4f}")


# STEP 10: Compare Model Results
# One row per model; printed best-first by R² with a fresh 0..n-1 index.
results_df = pd.DataFrame(data=results, columns=["Model", "MAE", "RMSE", "R²"])
print("\n Model Comparison:")
print(results_df.sort_values(by="R²", ascending=False, ignore_index=True))

# STEP 11: Feature Importances
print("\n Feature Importances Across Models:\n")

# Column names of the one-hot encoded training matrix, used to label the
# importances of the models that were fitted on encoded data.
encoded_feature_names = pd.get_dummies(X_train, columns=cat_cols, drop_first=True).columns

# LightGBM (fitted on the un-encoded feature columns)
lgb_importances = pd.Series(models["LightGBM"].feature_importances_, index=X.columns)
lgb_importances = lgb_importances.sort_values(ascending=False)

print("LightGBM Top 10 Features:")
print(lgb_importances.head(10))
print("-" * 60)

# Random Forest (trained on encoded data)
rf = models["RandomForest"]
rf_importances = pd.Series(rf.feature_importances_, index=encoded_feature_names)
rf_importances = rf_importances.sort_values(ascending=False)

print(" Random Forest Top 10 Features:")
print(rf_importances.head(10))
print("-" * 60)

# XGBoost (trained on encoded data)
xgb = models["XGBoost"]
xgb_importances = pd.Series(xgb.feature_importances_, index=encoded_feature_names)
xgb_importances = xgb_importances.sort_values(ascending=False)

print(" XGBoost Top 10 Features:")
print(xgb_importances.head(10))


In [None]:
def predict_house_price(model, house_features, X_columns, cat_cols, train_categories=None):
    """
    Predict the price of a single house.

    Parameters:
    - model: fitted estimator exposing ``predict(DataFrame)``; it must accept
      pandas ``category`` columns (e.g. the LightGBM model trained above).
    - house_features: dict mapping feature name -> value. Keys that are not in
      ``X_columns`` are ignored; missing keys get a default (see below).
    - X_columns: iterable of the model's feature columns, in training order.
    - cat_cols: list of categorical column names.
    - train_categories: optional dict mapping a categorical column to the
      category labels seen in training. When given for a column, a value not
      in that list is replaced by 'Other' (or 'Unknown') if the list contains
      it, and the column is built with exactly those categories.

    Defaults for missing / unparsable values:
    - categorical columns -> 'Unknown'
    - numeric columns     -> 0

    Returns:
    - the first element of ``model.predict`` for the one-row frame.

    Example of house_features:
    house_features = {
        'carpet_area_sqft': 750,
        'bhk': 2,
        'floor_current': 5,
        'floor_total': 10,
        'Bathroom': 2,
        'Balcony': 1,
        'has_parking': 1,
        'Super Area': 850,
        'Plot Area': 0,
        'location': 'thane',
        'Status': 'Ready to Move',
        'Transaction': 'Resale',
        'Furnishing': 'Semi-Furnished',
        'facing': 'East',
        'Ownership': 'Freehold',
        'overlooking': 'Garden/Park',
        'Society': 'Dosti Vihar'
    }
    """
    categorical = set(cat_cols)
    known_levels = train_categories or {}
    column_order = list(X_columns)

    row = {}
    for col in column_order:
        value = house_features.get(col)
        # Non-scalar values are never treated as missing; this avoids an
        # ambiguous truth value from pd.isna on list-like input.
        is_missing = pd.api.types.is_scalar(value) and pd.isna(value)

        if col in categorical:
            # Same normalisation as the training data: string form, trimmed.
            label = 'Unknown' if is_missing else str(value).strip()
            levels = known_levels.get(col)
            if levels is None:
                row[col] = pd.Categorical([label])
                continue
            levels = list(levels)
            if label not in levels:
                # Fall back to the bucket used for rare / missing values.
                for fallback in ('Other', 'Unknown'):
                    if fallback in levels:
                        label = fallback
                        break
            # A label still outside `levels` becomes NaN (missing).
            row[col] = pd.Categorical([label], categories=levels)
        else:
            number = 0 if is_missing else pd.to_numeric(value, errors='coerce')
            row[col] = [0 if pd.isna(number) else number]

    # Building from `column_order` guarantees the training column order.
    df_new = pd.DataFrame(row, columns=column_order)

    # Predict price
    predicted_price = model.predict(df_new)[0]
    return predicted_price

# Example: predict the price of one hypothetical listing with the LightGBM model.

# Keys mirror the training feature columns; numeric features first,
# categorical features after.
house_features = {
    'carpet_area_sqft': 750,
    'bhk': 2,
    'floor_current': 5,
    'floor_total': 10,
    'Bathroom': 2,
    'Balcony': 1,
    'has_parking': 1,
    'Super Area': 850,
    'Plot Area': 0,
    'location': 'thane',
    'Status': 'Ready to Move',
    'Transaction': 'Resale',
    'Furnishing': 'Semi-Furnished',
    'facing': 'East',
    'Ownership': 'Freehold',
    'overlooking': 'Garden/Park',
    'Society': 'Dosti Vihar'
}

# X_train.columns supplies the column order the model was fitted with.
predicted_price = predict_house_price(models["LightGBM"], house_features, X_train.columns, cat_cols)
# Printed with thousands separators and no decimals.
print("Predicted Price: ₹{:,.0f}".format(predicted_price))
