# Feature Engineering

### IMPORTS

In [1]:
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings("ignore")

print("Libraries Imported Successfully")

Libraries Imported Successfully


### LOAD DATASET

In [3]:
try:
    df = pd.read_csv("../dataset/india_housing_prices.csv")
    print(f" Dataset Loaded: {df.shape[0]} rows, {df.shape[1]} columns")
except Exception as e:
    print(" Error loading dataset:", e)

 Dataset Loaded: 250000 rows, 23 columns


### FEATURE ENGINEERING STARTS

In [4]:
print("\n============================")
print("üîß FEATURE ENGINEERING BEGIN")
print("============================\n")


üîß FEATURE ENGINEERING BEGIN



In [5]:
# ------------------------------------------------------------
#  1.PRICE NORMALIZATION
# ------------------------------------------------------------
print("Creating Normalized Price Features...")

# Price per BHK
df["Price_per_BHK"] = df["Price_in_Lakhs"] / df["BHK"]

# Price per 100 SqFt (more stable)
df["Price_per_100SqFt"] = (df["Price_in_Lakhs"] / df["Size_in_SqFt"]) * 100

print("Price Normalization Features Added")

Creating Normalized Price Features...
Price Normalization Features Added


In [6]:
# ------------------------------------------------------------
#  2.SIZE NORMALIZATION
# ------------------------------------------------------------
print(" Creating Size Normalized Features...")

# Carpet area approx = 70% of built-up area
df["Carpet_Area"] = df["Size_in_SqFt"] * 0.70

# BHK per 1000 SqFt ‚Üí indicates spaciousness
df["BHK_per_1000SqFt"] = (df["BHK"] / df["Size_in_SqFt"]) * 1000

# Size bucket segmentation
df["Size_Bucket"] = pd.cut(
    df["Size_in_SqFt"],
    bins=[0, 800, 1500, 2500, 4000, 6000],
    labels=["Small", "Medium", "Large", "XL", "XXL"]
)

print("Size Engineering Features Added")

 Creating Size Normalized Features...
Size Engineering Features Added


In [7]:
# ------------------------------------------------------------
#  3.PROPERTY AGE FEATURE ENGINEERING
# ------------------------------------------------------------
print("Creating Age-Based Features...")

df["Age_Bucket"] = pd.cut(
    df["Age_of_Property"],
    bins=[0, 5, 15, 30, 100],
    labels=["New", "Mid_Age", "Old", "Very_Old"]
)

# Is New Construction?
df["Is_New_Construction"] = np.where(df["Availability_Status"] == "Under_Construction", 1, 0)

print("Age Features Added")


Creating Age-Based Features...
Age Features Added


In [8]:
# ------------------------------------------------------------
# 4. AMENITIES ENGINEERING
# ------------------------------------------------------------
print("Processing Amenities Column...")

# Count of amenities
df["Amenity_Count"] = df["Amenities"].apply(lambda x: len(str(x).split(",")))

# Extract key amenities
key_amens = ["Pool", "Gym", "Garden", "Clubhouse", "Playground", "Security"]

def has_amenity(x, amen):
    return 1 if amen.lower() in str(x).lower() else 0

for amen in key_amens:
    col_name = f"Amen_{amen}"
    df[col_name] = df["Amenities"].apply(lambda x: has_amenity(x, amen))

print("Amenity Engineering Complete")

Processing Amenities Column...
Amenity Engineering Complete


In [9]:
# ------------------------------------------------------------
# 5Ô∏è LOCATION TIER ENGINEERING (REAL ESTATE DOMAIN KNOWLEDGE)
# ------------------------------------------------------------
print("Creating City Tier (Metro / Tier-1 / Tier-2 / Tier-3)...")

metro_cities = ["Mumbai", "Delhi", "Kolkata", "Chennai", "Bangalore", "Hyderabad", "Pune"]
tier1_cities = ["Ahmedabad", "Jaipur", "Surat", "Lucknow", "Kanpur", "Nagpur"]
tier2_cities = ["Bhopal", "Vadodara", "Indore", "Ludhiana", "Coimbatore", "Kochi"]
# remaining ‚Üí Tier-3

def classify_tier(city):
    if city in metro_cities:
        return "Metro"
    elif city in tier1_cities:
        return "Tier_1"
    elif city in tier2_cities:
        return "Tier_2"
    else:
        return "Tier_3"

df["City_Tier"] = df["City"].apply(classify_tier)

print("Location Tier Feature Added")

Creating City Tier (Metro / Tier-1 / Tier-2 / Tier-3)...
Location Tier Feature Added


In [10]:
# ------------------------------------------------------------
# 6Ô∏è ENCODING TRANSPORT ACCESSIBILITY
# ------------------------------------------------------------
print("Creating Transport Accessibility Score...")

transport_map = {"High": 3, "Medium": 2, "Low": 1}
df["Transport_Score"] = df["Public_Transport_Accessibility"].map(transport_map)

print("Transport Score Added")

Creating Transport Accessibility Score...
Transport Score Added


In [11]:
# ------------------------------------------------------------
# 7Ô∏è FURNISHED STATUS ENCODING
# ------------------------------------------------------------
print("Encoding Furnished Status...")

furnish_map = {"Unfurnished": 0, "Semi-Furnished": 1, "Furnished": 2}
df["Furnish_Score"] = df["Furnished_Status"].map(furnish_map)

print("Furnishing Score Added")

Encoding Furnished Status...
Furnishing Score Added


In [12]:
# ------------------------------------------------------------
# 8Ô∏è FACING DIRECTION FEATURE
# ------------------------------------------------------------
print("Engineering Facing Feature...")

facing_map = {"East": 4, "North": 3, "West": 2, "South": 1}
df["Facing_Score"] = df["Facing"].map(facing_map)

print("Facing Direction Added")

Engineering Facing Feature...
Facing Direction Added


In [13]:
# ------------------------------------------------------------
# 9Ô∏è OWNERSHIP FEATURE
# ------------------------------------------------------------
print("Owner Type Encoding (Builder = 2 ‚Üí Premium Projects)...")

owner_map = {"Owner": 1, "Broker": 0, "Builder": 2}
df["Owner_Score"] = df["Owner_Type"].map(owner_map)

print("Ownership Features Added")

Owner Type Encoding (Builder = 2 ‚Üí Premium Projects)...
Ownership Features Added


In [14]:
# ------------------------------------------------------------
# 10 AVAILABILITY STATUS
# ------------------------------------------------------------
print("Availability Feature...")

avail_map = {"Ready_to_Move": 1, "Under_Construction": 0}
df["Availability_Flag"] = df["Availability_Status"].map(avail_map)

print("Availability Feature Added")

Availability Feature...
Availability Feature Added


In [15]:
# ============================================================
# FINAL DATASET SUMMARY
# ============================================================

print("\n============================")
print("FINAL FEATURE ENGINEERING SUMMARY")
print("============================")

print("New Columns Added:")
for col in df.columns[-25:]:   # last few engineered features
    print("  -", col)

print("\nFinal Dataset Shape:", df.shape)


FINAL FEATURE ENGINEERING SUMMARY
New Columns Added:
  - Security
  - Amenities
  - Facing
  - Owner_Type
  - Availability_Status
  - Price_per_BHK
  - Price_per_100SqFt
  - Carpet_Area
  - BHK_per_1000SqFt
  - Size_Bucket
  - Age_Bucket
  - Is_New_Construction
  - Amenity_Count
  - Amen_Pool
  - Amen_Gym
  - Amen_Garden
  - Amen_Clubhouse
  - Amen_Playground
  - Amen_Security
  - City_Tier
  - Transport_Score
  - Furnish_Score
  - Facing_Score
  - Owner_Score
  - Availability_Flag

Final Dataset Shape: (250000, 43)


In [16]:
# Save engineered dataset
df.to_csv("../dataset/processed_housing_data.csv", index=False)
print("\n Processed dataset saved successfully!")


 Processed dataset saved successfully!


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 43 columns):
 #   Column                          Non-Null Count   Dtype   
---  ------                          --------------   -----   
 0   ID                              250000 non-null  int64   
 1   State                           250000 non-null  object  
 2   City                            250000 non-null  object  
 3   Locality                        250000 non-null  object  
 4   Property_Type                   250000 non-null  object  
 5   BHK                             250000 non-null  int64   
 6   Size_in_SqFt                    250000 non-null  int64   
 7   Price_in_Lakhs                  250000 non-null  float64 
 8   Price_per_SqFt                  250000 non-null  float64 
 9   Year_Built                      250000 non-null  int64   
 10  Furnished_Status                250000 non-null  object  
 11  Floor_No                        250000 non-null  int64   
 12  To

## Conclusion
**WHAT THIS NOTEBOOK ACHIEVES**

We now have real-estate domain enriched features:
1) **Pricing features**

- Price_per_BHK

- Price_per_100SqFt

- Carpet_Area

2) **Property characteristics**

- Size bucket

- BHK_per_1000SqFt

- Age_Bucket

- Is_New_Construction

3) **Amenities engineering**

- Pool, Gym, Garden, Clubhouse, Playground, Security

- Amenity_Count

4) **Location intelligence**

- City_Tier (Metro / Tier-1 / Tier-2 / Tier-3)

- Transport_Score

5) **Facing direction score**
6) **Ownership credibility**
7) **Availability status**

**This adds a meaningful signal for ML, even on synthetic data.**