In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [35]:

pd.set_option('display.max_columns', None)

In [36]:

df = pd.read_csv(r'E:\Mumbai Flat Real Estate Intelligence\datasets\mumbai_prperties_outlier_treated.csv')

In [37]:
df['major_location'].value_counts().shape

(115,)

### most missing value is already handle in feature engg

In [38]:
df.isnull().sum()

price               1
bedrooms            0
bathrooms           0
balcony             0
property_age       17
major_location      0
built_up_area       0
furnishing_type     0
dtype: int64

In [39]:
df.sample()

Unnamed: 0,price,bedrooms,bathrooms,balcony,property_age,major_location,built_up_area,furnishing_type
8898,7.7,3,3,2,recent construction,Bandra East,1850.0,Furnished


In [40]:
df[df.duplicated(keep=False)]

Unnamed: 0,price,bedrooms,bathrooms,balcony,property_age,major_location,built_up_area,furnishing_type


In [41]:
df.drop_duplicates(inplace=True)

In [42]:
df.isnull().sum()

price               1
bedrooms            0
bathrooms           0
balcony             0
property_age       17
major_location      0
built_up_area       0
furnishing_type     0
dtype: int64

In [43]:
df['property_age'].value_counts()

property_age
recent construction    3731
new construction       1983
modern property        1316
Relatively New          965
New Property            492
Old Property            356
Moderately Old          335
mid age property        310
Undefined               138
old propert               6
Name: count, dtype: int64

In [44]:
df['property_age'].fillna("Undefined", inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['property_age'].fillna("Undefined", inplace=True)


In [45]:
def mode_based_imputation(row):
    if row['property_age'] == 'Undefined':
        mode_value = df[(df['major_location'] == row['major_location'])]['property_age'].mode()
        # If mode_value is empty (no mode found), return NaN, otherwise return the mode
        if not mode_value.empty:
            return mode_value.iloc[0] 
        else:
            return np.nan
    else:
        return row['property_age']

In [46]:
df['property_age'] = df.apply(mode_based_imputation,axis=1)


In [47]:
df['property_age'].value_counts()


property_age
recent construction    3748
new construction       1986
modern property        1316
Relatively New          989
New Property            496
Old Property            363
Moderately Old          349
mid age property        310
Undefined                86
old propert               6
Name: count, dtype: int64

In [48]:
def mode_based_imputation3(row):
    if row['property_age'] == 'Undefined':
        mode_value = df['property_age'].mode()
        # If mode_value is empty (no mode found), return NaN, otherwise return the mode
        if not mode_value.empty:
            return mode_value.iloc[0] 
        else:
            return np.nan
    else:
        return row['property_age']

In [49]:
df['property_age'] = df.apply(mode_based_imputation3,axis=1)

In [50]:
df['property_age'].value_counts()

property_age
recent construction    3834
new construction       1986
modern property        1316
Relatively New          989
New Property            496
Old Property            363
Moderately Old          349
mid age property        310
old propert               6
Name: count, dtype: int64

In [51]:
df['property_age'] = df['property_age'].replace({
    'old propert': 'Old Property',
    'mid age property': 'Mid Age Property'
})


In [52]:
df['property_age'] = df['property_age'].str.lower().str.strip()


In [53]:
mapping = {
    'recent construction': 'New_property',
    'new construction': 'New_property',
    'new property': 'New_property',

    'relatively new': 'Relatively New',
    'modern property': 'Relatively New',

    'moderately old': 'Mid Age',
    'mid age property': 'Mid Age',

    'old property': 'Old',

    'undefined': 'Undefined'
}


In [54]:
df['property_age'] = df['property_age'].map(mapping)


In [55]:
df['property_age'].value_counts()

property_age
New_property      6316
Relatively New    2305
Mid Age            659
Old                369
Name: count, dtype: int64

In [56]:
df.isnull().sum()

price              1
bedrooms           0
bathrooms          0
balcony            0
property_age       0
major_location     0
built_up_area      0
furnishing_type    0
dtype: int64

In [57]:
df[df['price'].isnull()]

Unnamed: 0,price,bedrooms,bathrooms,balcony,property_age,major_location,built_up_area,furnishing_type
9648,,3,4,2,Relatively New,Malabar Hill,2375.0,Unfurnished


In [59]:
df.drop(9648, inplace=True)
df.reset_index(drop=True, inplace=True)


In [60]:
df.shape


(9648, 8)

In [61]:
df.drop_duplicates(inplace=True)

In [62]:
df.shape

(9615, 8)

In [63]:
df[df['price']<0.2]

Unnamed: 0,price,bedrooms,bathrooms,balcony,property_age,major_location,built_up_area,furnishing_type
0,0.08,1,1,0,New_property,Palghar,540.0,Furnished
1,0.17,1,1,2,New_property,Nalasopara West,550.0,Furnished
2,0.18,1,1,0,New_property,others,665.0,Furnished
3,0.18,1,1,2,New_property,Palghar,630.0,Furnished
4,0.18,1,1,2,New_property,Palghar,630.0,Unfurnished
5,0.18,1,2,1,New_property,Palghar,630.0,Furnished
6,0.19,1,1,2,New_property,Palghar,630.0,Unfurnished


In [64]:
df.drop(0, inplace=True)
df.reset_index(drop=True, inplace=True)


In [65]:
df['furnishing_type'].value_counts()

furnishing_type
Furnished      5453
Unfurnished    4161
Name: count, dtype: int64

In [66]:
df.to_csv('mumbai_properties_missing_value_imputation.csv',index=False)
