In [2]:
#HouseTrack: Predicting Mumbai House Prices
# Objective
# This step-by-step guide walks you through building a machine learning model to
# predict house prices in Mumbai. We'll use real-world housing data, clean and encode
# it, and build a regression model to estimate prices.
# By the end of this project, you’ll understand how to:
# ● Load and preprocess real estate data
# ● Normalize price values for consistency
# ● Encode categorical features
# ● Build and evaluate a Linear Regression model

In [None]:
# Project Overview
# In this guide, you’ll build a machine learning pipeline to predict Mumbai house prices.
# You’ll learn how to:
# ● Handle price units (Lakhs and Crores)
# ● Drop irrelevant or redundant columns
# ● Apply label encoding to categorical features
# ● Train a Linear Regression model
# ● Evaluate model performance using Mean Squared Error and R² Score

In [None]:
# ➢Step 1: Import the Required Libraries
# ➢Step 2: Load the Dataset
# ➢Step 3: Normalize House Prices
# ➢Step 4: Encode Categorical Features
# ➢Step 5: Split Dataset into Train-Test
# ➢Step 6: Train a Linear Regression Model
# ➢Step 7: Evaluate Model Performance

In [3]:
# Step 1: Importing Required Libraries
# Start by importing the necessary Python packages.
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [7]:
# Step 2: Load the Dataset
# Load the dataset containing property listings and prices in Mumbai.
# The CSV location can be overridden with the MUMBAI_HOUSE_PRICES_CSV
# environment variable so the notebook runs on machines other than the
# author's; when the variable is unset, the original path is used.
DATA_PATH = os.environ.get(
    "MUMBAI_HOUSE_PRICES_CSV",
    r"C:\Users\patha\Downloads\Mumbai+House+Prices.csv",
)
df = pd.read_csv(DATA_PATH)
# Preview the first rows to confirm the file loaded as expected.
df.head()


Unnamed: 0,bhk,type,locality,area,price,price_unit,region,status,age
0,3,Apartment,Lak And Hanware The Residency Tower,685,2.5,Cr,Andheri West,Ready to move,New
1,2,Apartment,Radheya Sai Enclave Building No 2,640,52.51,L,Naigaon East,Under Construction,New
2,2,Apartment,Romell Serene,610,1.73,Cr,Borivali West,Under Construction,New
3,2,Apartment,Soundlines Codename Urban Rainforest,876,59.98,L,Panvel,Under Construction,New
4,2,Apartment,Origin Oriana,659,94.11,L,Mira Road East,Under Construction,New


In [10]:
# Step 3: Normalize House Prices
# Convert all prices to lakhs for consistency across records (1 Cr = 100 L).
LAKHS_PER_CRORE = 100
# Vectorized comparison over the whole column instead of evaluating a Python
# lambda once per row with df.apply(axis=1).
is_crore = df['price_unit'] == 'Cr'
# Rows priced in crores are scaled up; every other row (including any row
# whose unit is not 'Cr') keeps its price unchanged, as before.
df['price_lakhs'] = df['price'].where(~is_crore, df['price'] * LAKHS_PER_CRORE)

In [11]:
# This ensures prices are comparable regardless of original units.
# Next, drop columns that won't help in prediction: 'price' and 'price_unit'
# are superseded by 'price_lakhs', and 'locality' is left out of the features.
# Reassigning (instead of inplace=True) with errors='ignore' keeps this cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns=['price', 'price_unit', 'locality'], errors='ignore')

In [12]:
# Step 4: Encode Categorical Features
# Convert non-numeric columns to integer codes using label encoding.
label_cols = ['type', 'region', 'status', 'age']
# Keep every fitted encoder so the code -> category mapping can be inspected
# (encoders[col].classes_), inverted, or applied to new listings later.
encoders = {}
for col in label_cols:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])
# This makes the data ML-friendly without creating dummy variables.
# NOTE(review): the integer codes carry no real ordering, yet a linear model
# treats them as a numeric scale. Consider one-hot encoding for nominal
# columns such as 'region' - confirm against model performance.

In [13]:
# Step 5: Split Dataset into Train-Test
# The target is the normalized price; every remaining column is a feature.
y = df['price_lakhs']
X = df.drop(columns=['price_lakhs'])
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# 80% training, 20% testing gives a fair evaluation baseline.

In [15]:
# Step 6: Train a Linear Regression Model
# Create the estimator and fit it on the training split in one expression
# (fit returns the fitted estimator itself).
model = LinearRegression().fit(X_train, y_train)
model  # echo the fitted estimator as the cell output, as model.fit(...) did
# Linear Regression models price as a weighted sum of the feature values.

In [17]:
# Step 6b: Predicting with the Linear Regression Model
# Use the fitted Linear Regression model to generate price predictions
# (in lakhs) for both the training and the test features.
# The training predictions are stored so they are not silently discarded and
# can be compared with the test predictions.
y_train_pred = model.predict(X_train)
# Last expression of the cell: the test-set predictions are displayed.
model.predict(X_test)

array([ 65.91880637, 363.92525279, 118.24455433, ..., 695.79134869,
       167.1947157 ,  86.77061565])

In [18]:
# Step 7: Evaluate Model Performance
# Score the fitted model on the held-out test split.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
# One print call; sep="\n" puts each metric on its own line.
print(
    f"Mean Squared Error: {mse:.2f}",
    f"R² Score: {r2:.2f}",
    sep="\n",
)
# A lower MSE and a higher R² both indicate a better fit.


Mean Squared Error: 18834.07
R² Score: 0.59


In [None]:
# Common Mistakes to Avoid
# ● Forgetting to normalize different price units
# ● Dropping useful features inadvertently
# ● Not checking model assumptions (e.g., linearity)
# Mini Challenge
# Try these to enhance your understanding:
# ● Visualize predicted vs. actual prices
# ● Plot feature importance (based on regression coefficients)
# ● Test with more advanced regressors like Decision Tree or Random Forest
# Let’s Summarize What We Have Learned
# In this project, you’ve built a basic yet functional house price prediction system. Here's
# a recap:
# ● Normalized prices from mixed units
# ● Cleaned and encoded relevant data
# ● Trained and evaluated a Linear Regression model
# ● Analyzed model performance with MSE and R² metrics
# You're now equipped to build regression models for real-world property pricing!