# House Price Prediction Model - Indian Cities

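This notebook trains a Gradient Boosting regression model to predict house prices in major Indian cities. The training data is synthetically generated within the notebook, so the reported metrics describe the simulation rather than real market behaviour.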

## Features Used
- City (integer-encoded)
- Area (sq ft)
- Number of Bedrooms
- Number of Bathrooms
- Age of property (years)
- Location Rating (1-5)

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

## Generate Synthetic Data

In [None]:
# Seed NumPy's global generator so every run produces the same dataset.
np.random.seed(42)
n_samples = 2000

# Base rate per square foot for each city. The insertion order also fixes the
# order in which cities are simulated and integer-encoded further down.
city_prices = {
    "mumbai": 25000,
    "delhi": 18000,
    "bangalore": 22000,
    "chennai": 15000,
    "hyderabad": 14000,
    "pune": 12000,
    "kolkata": 10000,
    "ahmedabad": 9000,
}
cities = list(city_prices)


def _simulate_listing(city_name, rate_per_sqft):
    """Draw one synthetic listing for ``city_name`` and return it as a row dict.

    The random draws happen in a fixed order (area, bedrooms, bathrooms, age,
    location rating, then price noise); changing that order would change the
    dataset produced under the seed set above.
    """
    sqft = np.random.uniform(500, 5000)
    n_bed = np.random.randint(1, 6)       # upper bound exclusive -> 1..5
    n_bath = np.random.randint(1, 4)      # 1..3
    years_old = np.random.randint(0, 30)  # 0..29
    rating = np.random.uniform(1, 5)

    # Multiplicative pricing: more rooms and a better rating raise the price,
    # while each year of age takes 1% off.
    room_factor = 1 + 0.1 * n_bed + 0.05 * n_bath
    age_factor = 1 - 0.01 * years_old
    rating_factor = 0.8 + 0.1 * rating
    clean_price = rate_per_sqft * sqft * room_factor * age_factor * rating_factor

    # Gaussian noise whose standard deviation is 10% of the noise-free price.
    noisy_price = clean_price + np.random.normal(0, clean_price * 0.1)

    return {
        "city": city_name,
        "area_sqft": sqft,
        "bedrooms": n_bed,
        "bathrooms": n_bath,
        "age": years_old,
        "location_rating": rating,
        "price": noisy_price,
    }


# Every city receives the same number of listings, generated city by city.
listings_per_city = n_samples // len(cities)
house_data = [
    _simulate_listing(name, city_prices[name])
    for name in cities
    for _ in range(listings_per_city)
]

df = pd.DataFrame(house_data)

# Encode each city as its position in ``cities``.
city_map = dict(zip(cities, range(len(cities))))
df["city_encoded"] = df["city"].map(city_map)
df.head()

## Train Model

In [None]:
# Model inputs (integer-encoded city plus the numeric attributes) and target.
feature_columns = [
    "city_encoded",
    "area_sqft",
    "bedrooms",
    "bathrooms",
    "age",
    "location_rating",
]
X = df[feature_columns]
y = df["price"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = GradientBoostingRegressor(
    n_estimators=200,
    max_depth=6,
    random_state=42,
)
model.fit(X_train_scaled, y_train)

# Score the fitted model on the held-out rows.
y_pred = model.predict(X_test_scaled)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print("Model Performance:")
print(f"R² Score: {r2:.4f}")
print(f"RMSE: ₹{rmse:,.0f}")
print(f"MAE: ₹{mae:,.0f}")

## Feature Importance

In [None]:
# Display labels, listed in the same order as the columns the model was fit on.
feature_names = [
    "City",
    "Area (sqft)",
    "Bedrooms",
    "Bathrooms",
    "Age",
    "Location Rating",
]
importance = model.feature_importances_

# Horizontal bar chart drawn on an explicit Axes: one bar per feature.
fig, ax = plt.subplots(figsize=(10, 6))
# NOTE(review): recent seaborn releases may warn when `palette` is passed
# without `hue` — confirm against the installed seaborn version.
sns.barplot(x=importance, y=feature_names, palette="viridis", ax=ax)
ax.set_title("Feature Importance - House Price Prediction")
ax.set_xlabel("Importance")
fig.tight_layout()
plt.show()