In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
df = pd.read_csv(r'E:\Mumbai Flat Real Estate Intelligence\datasets\mumbai_properties_missing_value_imputation.csv') 

In [3]:
df.shape

(9614, 8)

In [4]:
df.head()

Unnamed: 0,price,bedrooms,bathrooms,balcony,property_age,major_location,built_up_area,furnishing_type
0,0.17,1,1,2,New_property,Nalasopara West,550.0,Furnished
1,0.18,1,1,0,New_property,others,665.0,Furnished
2,0.18,1,1,2,New_property,Palghar,630.0,Furnished
3,0.18,1,1,2,New_property,Palghar,630.0,Unfurnished
4,0.18,1,2,1,New_property,Palghar,630.0,Furnished


In [5]:
df['major_location'].unique()

array(['Nalasopara West', 'others', 'Palghar', 'Naigaon East',
       'Virar East', 'Virar West', 'Vasai East', 'Boisar',
       'Nalasopara East', 'Vasai West', 'Virar west', 'Chikhal Dongari',
       'Wadala', 'Goregaon East', 'Bolinj', 'Thane West',
       'Mira Road East', 'Parel', 'Dahisar East', 'Kandivali West',
       'Malad West', 'Mira Road', 'Chembur', 'Ghatkopar East',
       'Kandivali East', 'Kurla West', 'Bhayandar East', 'Byculla',
       'Lower Parel', 'Bhandup West', 'Shanti Park', 'Bhayandar West',
       'Govandi', 'Kashimira', 'Mulund West', 'Jogeshwari East',
       'Santacruz East', 'Vikhroli East', 'Ramdev Park',
       'Kanjurmarg East', 'Dahisar West', 'Bandra East', 'Dadar East',
       'Vikhroli West', 'Jogeshwari West', 'Kurla East', 'Ghatkopar West',
       'Andheri West', 'Goregaon West', 'Borivali West', 'Malad East',
       'Borivali East', 'Mulund East', 'Powai', 'Bhandup East', 'Colaba',
       'Vile Parle East', 'Andheri East', 'Dahisar', 'Kherwadi',

In [6]:
mumbai_locations_coordinates = {
    # Western Suburbs
    'Andheri West': [19.1412, 72.8431],
    'Andheri East': [19.1136, 72.8697],
    'Bandra West': [19.0544, 72.8225],
    'Bandra East': [19.0644, 72.8455],
    'Santacruz West': [19.0861, 72.8344],
    'Santacruz East': [19.0817, 72.8428],
    'Vile Parle West': [19.0997, 72.8428],
    'Vile Parle East': [19.0958, 72.8508],
    'Goregaon West': [19.1572, 72.8417],
    'Goregaon East': [19.1672, 72.8547],
    'Goregaon': [19.1622, 72.8483],  # General Goregaon
    'Malad West': [19.1872, 72.8422],
    'Malad East': [19.1853, 72.8556],
    'Kandivali West': [19.2100, 72.8331],
    'Kandivali East': [19.2036, 72.8447],
    'Borivali West': [19.2292, 72.8463],
    'Borivali East': [19.2277, 72.8602],
    'Dahisar West': [19.2564, 72.8447],
    'Dahisar East': [19.2514, 72.8572],
    'Dahisar': [19.2539, 72.8510],  # General Dahisar
    'Jogeshwari West': [19.1350, 72.8367],
    'Jogeshwari East': [19.1386, 72.8511],
    'Juhu': [19.1075, 72.8263],
    'Juhu Scheme': [19.1075, 72.8263],  # Same as Juhu
    'Khar West': [19.0722, 72.8294],
    'Oshiwara': [19.1472, 72.8364],
    'Yari Road': [19.1214, 72.8325],
    'Versova': [19.1258, 72.8219],  # Near Yari Road
    'Madh': [19.1333, 72.7964],
    
    # Central Mumbai
    'Lower Parel': [18.9981, 72.8269],
    'Lower Parel West': [18.9981, 72.8225],
    'Lower Parel East': [18.9981, 72.8311],
    'Parel': [18.9967, 72.8372],
    'Dadar West': [19.0166, 72.8397],
    'Dadar East': [19.0186, 72.8475],
    'Matunga West': [19.0194, 72.8483],
    'Matunga East': [19.0236, 72.8567],
    'Byculla': [18.9758, 72.8356],
    'Mazgaon': [18.9700, 72.8408],
    'Worli': [19.0178, 72.8225],
    'Prabhadevi': [19.0175, 72.8308],
    'Mahim': [19.0350, 72.8425],
    'Sion': [19.0400, 72.8600],
    'Sion East': [19.0400, 72.8650],
    'Wadala': [19.0189, 72.8603],
    'Wadala East': [19.0200, 72.8650],
    'Sewri': [19.0019, 72.8572],
    
    # South Mumbai
    'Colaba': [18.9061, 72.8135],
    'Cuffe Parade': [18.9061, 72.8135],  # Same as Colaba area
    'Churchgate': [18.9336, 72.8258],
    'Marine Lines': [18.9400, 72.8250],
    'Malabar Hill': [18.9542, 72.7958],
    'Tardeo': [18.9667, 72.8083],
    'Girgaon': [18.9500, 72.8167],
    'Agripada': [18.9767, 72.8267],
    'Kamathipura': [18.9700, 72.8200],
    'Bhoiwada': [19.0067, 72.8458],
    'Mahalakshmi': [18.9800, 72.8100],
    'Hindu Colony': [19.0264, 72.8514],  # Dadar area
    'Chakala': [19.1128, 72.8667],
    'Midc Chakala Industrial Area': [19.1128, 72.8667],  # Same as Chakala
    'Mumbai Central': [18.9750, 72.8200],
    'Marine Drive': [18.9400, 72.8250],  # Near Marine Lines
    
    # Eastern Suburbs
    'Chembur': [19.0522, 72.9005],
    'Ghatkopar West': [19.0864, 72.8981],
    'Ghatkopar East': [19.0797, 72.9086],
    'Kurla West': [19.0667, 72.8778],
    'Kurla East': [19.0667, 72.8889],
    'Vikhroli West': [19.1103, 72.9267],
    'Vikhroli East': [19.1100, 72.9408],
    'Bhandup West': [19.1453, 72.9322],
    'Bhandup East': [19.1500, 72.9417],
    'Mulund West': [19.1678, 72.9411],
    'Mulund East': [19.1722, 72.9528],
    'Powai': [19.1167, 72.9047],
    'Sakinaka': [19.1050, 72.8950],
    'Kanjurmarg West': [19.1403, 72.9200],
    'Kanjurmarg East': [19.1403, 72.9300],
    'Govandi': [19.0581, 72.9133],
    'Deonar': [19.0500, 72.9167],
    'Tilak Nagar': [19.0500, 72.8700],  # Chembur area
    
    # Thane Region
    'Thane West': [19.2186, 72.9781],
    'Kashimira': [19.4667, 72.8167],  # Near Mira Road
    'Bhayandar East': [19.3058, 72.8511],
    'Bhayandar West': [19.2956, 72.8381],
    'Mira Road East': [19.2831, 72.8753],
    'Mira Road': [19.2831, 72.8753],  # Same as Mira Road East
    'Naigaon East': [19.3667, 72.8333],  # Near Vasai
    'Shanti Park': [19.2867, 72.8700],  # Mira Road area
    'Evershine Nagar': [19.2833, 72.8783],  # Mira Road area
    
    # Vasai-Virar Region
    'Vasai East': [19.3764, 72.8225],
    'Vasai West': [19.3831, 72.8089],
    'Virar East': [19.4514, 72.8089],
    'Virar West': [19.4558, 72.7947],
    'Virar west': [19.4558, 72.7947],  # Same as Virar West (correcting typo)
    'Nalasopara East': [19.4167, 72.8667],
    'Nalasopara West': [19.4167, 72.8500],
    'Boisar': [19.8000, 72.7500],
    'Palghar': [19.7000, 72.7667],
    'Bolinj': [19.3800, 72.8100],  # Vasai area
    'Chikhal Dongari': [19.3900, 72.8200],  # Vasai area
    'Kanchpada': [19.3900, 72.8300],  # Near Virar
    
    # Navi Mumbai and Extended Areas
    'Kherwadi': [19.0642, 72.8394],  # Bandra East area
    'Vakola': [19.0825, 72.8444],  # Santacruz East area
    'Ramdev Park': [19.0631, 72.8378],  # Bandra East area
    'Sher E Punjab Colony': [19.0675, 72.8422],  # Andheri East area
    'Pirojshanagar': [19.1200, 72.9150],  # Vikhroli area
    'Siddharth Nagar': [19.0200, 72.8550],  # Wadala area
    'Madanpura': [18.9600, 72.8250],  # Byculla area
    'Gandhi Nagar': [19.0300, 72.8500],  # Bandra East area
    'Parsi Colony': [19.0250, 72.8400],  # Dadar area
    'Kala Nagar': [19.1133, 72.8667],  # Andheri East area
    'Jvpd Scheme': [19.1133, 72.8314],  # Juhu area
    'Cumballa Hill': [18.9700, 72.8050],  # Malabar Hill area
    
    # Others and approximate locations
    'others': [19.0760, 72.8777],  # Mumbai center as default
    'Hmpl Surya Nagar': [19.2000, 72.8700],  # Approximate - likely in Western suburbs
    'Dattapada': [19.2333, 72.8500],  # Borivali area
}







In [7]:
# Convert to DataFrame for easy use
coordinates_df = pd.DataFrame([
    {'location': loc, 'latitude': coords[0], 'longitude': coords[1]}
    for loc, coords in mumbai_locations_coordinates.items()
])

In [8]:
print(f"Total locations with coordinates: {len(coordinates_df)}")
print("\nFirst 10 locations:")
print(coordinates_df.head(10))

Total locations with coordinates: 117

First 10 locations:
          location  latitude  longitude
0     Andheri West   19.1412    72.8431
1     Andheri East   19.1136    72.8697
2      Bandra West   19.0544    72.8225
3      Bandra East   19.0644    72.8455
4   Santacruz West   19.0861    72.8344
5   Santacruz East   19.0817    72.8428
6  Vile Parle West   19.0997    72.8428
7  Vile Parle East   19.0958    72.8508
8    Goregaon West   19.1572    72.8417
9    Goregaon East   19.1672    72.8547


In [9]:
# Clean location names in both DataFrames (remove extra spaces, standardize case)
df['major_location'] = df['major_location'].str.strip()
coordinates_df['location'] = coordinates_df['location'].str.strip()

In [10]:
# Merge the DataFrames
df_with_coords = pd.merge(
    df,
    coordinates_df,
    left_on='major_location',
    right_on='location',
    how='left'
)

In [11]:
# Check how many records got coordinates
print(f"Total records: {len(df_with_coords)}")
print(f"Records with coordinates: {df_with_coords['latitude'].notna().sum()}")
print(f"Records without coordinates: {df_with_coords['latitude'].isna().sum()}")


Total records: 9614
Records with coordinates: 9614
Records without coordinates: 0


In [12]:
print(df_with_coords[['major_location', 'price', 'latitude', 'longitude']].head(10))

    major_location  price  latitude  longitude
0  Nalasopara West   0.17   19.4167    72.8500
1           others   0.18   19.0760    72.8777
2          Palghar   0.18   19.7000    72.7667
3          Palghar   0.18   19.7000    72.7667
4          Palghar   0.18   19.7000    72.7667
5          Palghar   0.19   19.7000    72.7667
6     Naigaon East   0.20   19.3667    72.8333
7       Virar East   0.20   19.4514    72.8089
8           others   0.20   19.0760    72.8777
9  Nalasopara West   0.21   19.4167    72.8500


In [13]:
df_with_coords['longitude'].isnull().sum()

np.int64(0)

In [14]:
df_with_coords['price_per_sqft'] = round(df_with_coords['price']*10000000 / df_with_coords['built_up_area'])

In [15]:
df_with_coords

Unnamed: 0,price,bedrooms,bathrooms,balcony,property_age,major_location,built_up_area,furnishing_type,location,latitude,longitude,price_per_sqft
0,0.17,1,1,2,New_property,Nalasopara West,550.0,Furnished,Nalasopara West,19.4167,72.8500,3091.0
1,0.18,1,1,0,New_property,others,665.0,Furnished,others,19.0760,72.8777,2707.0
2,0.18,1,1,2,New_property,Palghar,630.0,Furnished,Palghar,19.7000,72.7667,2857.0
3,0.18,1,1,2,New_property,Palghar,630.0,Unfurnished,Palghar,19.7000,72.7667,2857.0
4,0.18,1,2,1,New_property,Palghar,630.0,Furnished,Palghar,19.7000,72.7667,2857.0
...,...,...,...,...,...,...,...,...,...,...,...,...
9609,27.00,5,5,0,Old,Malabar Hill,3699.0,Unfurnished,Malabar Hill,18.9542,72.7958,72993.0
9610,29.00,5,5,0,Mid Age,Malabar Hill,3181.0,Furnished,Malabar Hill,18.9542,72.7958,91166.0
9611,29.00,5,6,1,Old,Malabar Hill,2850.0,Furnished,Malabar Hill,18.9542,72.7958,101754.0
9612,29.00,6,4,3,Mid Age,Lower Parel,5042.0,Furnished,Lower Parel,18.9981,72.8269,57517.0


In [16]:
df_with_coords.dtypes


price              float64
bedrooms             int64
bathrooms            int64
balcony             object
property_age        object
major_location      object
built_up_area      float64
furnishing_type     object
location            object
latitude           float64
longitude          float64
price_per_sqft     float64
dtype: object

In [18]:
# If you just want to fix the error and continue, use this:
group_df = df_with_coords.groupby('major_location').agg({
    'price': lambda x: pd.to_numeric(x, errors='coerce').mean(),
    'price_per_sqft': lambda x: pd.to_numeric(x, errors='coerce').mean(),
    'built_up_area': lambda x: pd.to_numeric(x, errors='coerce').mean(),
    'latitude': lambda x: pd.to_numeric(x, errors='coerce').mean(),
    'longitude': lambda x: pd.to_numeric(x, errors='coerce').mean()
}).reset_index()

print("Aggregated data created successfully!")
print(group_df.head())

Aggregated data created successfully!
  major_location     price  price_per_sqft  built_up_area  latitude  longitude
0       Agripada  4.133000    28130.500000    1250.200000   18.9767    72.8267
1   Andheri East  2.177078    24118.799087     928.150685   19.1136    72.8697
2   Andheri West  4.068563    30507.363985    1305.848659   19.1412    72.8431
3    Bandra East  6.437523    43822.467890    1440.816514   19.0644    72.8455
4    Bandra West  7.098621    44431.431034    1646.327586   19.0544    72.8225


In [43]:
# If dots are still not visible, try this version:
fig = go.Figure()

# Add scattermapbox trace
fig.add_trace(go.Scattermapbox(
    lat=group_df['latitude'],
    lon=group_df['longitude'],
    mode='markers+text',
    marker=dict(
        size=group_df['built_up_area'] / group_df['built_up_area'].max() * 50 + 10,
        color=group_df['price_per_sqft'],
        colorscale='Plasma',
        showscale=True,
        colorbar=dict(title="Price per Sq Ft (₹)", thickness=20),
        opacity=0.8
    ),
    text=group_df['major_location'],
    hovertext=group_df.apply(
        lambda row: f"<b>{row['major_location']}</b><br>"
                   f"Price/sqft: ₹{row['price_per_sqft']:,.0f}<br>"
                   f"Avg Area: {row['built_up_area']:,.0f} sq.ft<br>"
                   f"Avg Price in cr: ₹{row['price']:,.0f}",
        axis=1
    ),
    hoverinfo='text'
))

# Update layout
fig.update_layout(
    mapbox_style="open-street-map",
    mapbox=dict(
        center=dict(lat=19.0760, lon=72.8777),
        zoom=10
    ),
    title="Mumbai Real Estate: Price per Square Foot",
    height=600,
    margin={"r":0,"t":50,"l":0,"b":0}
)

fig.show()


*scattermapbox* is deprecated! Use *scattermap* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/



In [42]:
df_with_coords.to_csv('data viz.csv',index=False)

In [44]:
fig = px.scatter(df, x="built_up_area", y="price", color="bedrooms", title="Area Vs Price")

# Show the plot
fig.show()

In [45]:
fig = px.pie(df, names='bedrooms', title='Total Bill Amount by Day')

# Show the plot
fig.show()

In [47]:
temp_df = df[df['bedrooms'] <= 4]
# Create side-by-side boxplots of the total bill amounts by day
fig = px.box(temp_df, x='bedrooms', y='price', title='BHK Price Range')

# Show the plot
fig.show()