# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Loading Data

In [2]:
df = pd.read_csv("house_prices.csv")

# Data Analysis

In [3]:
print(df.shape)

(187531, 21)


In [4]:
features = df.columns
print(features)

Index(['Index', 'Title', 'Description', 'Amount(in rupees)',
       'Price (in rupees)', 'location', 'Carpet Area', 'Status', 'Floor',
       'Transaction', 'Furnishing', 'facing', 'overlooking', 'Society',
       'Bathroom', 'Balcony', 'Car Parking', 'Ownership', 'Super Area',
       'Dimensions', 'Plot Area'],
      dtype='object')


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187531 entries, 0 to 187530
Data columns (total 21 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Index              187531 non-null  int64  
 1   Title              187531 non-null  object 
 2   Description        184508 non-null  object 
 3   Amount(in rupees)  187531 non-null  object 
 4   Price (in rupees)  169866 non-null  float64
 5   location           187531 non-null  object 
 6   Carpet Area        106858 non-null  object 
 7   Status             186916 non-null  object 
 8   Floor              180454 non-null  object 
 9   Transaction        187448 non-null  object 
 10  Furnishing         184634 non-null  object 
 11  facing             117298 non-null  object 
 12  overlooking        106095 non-null  object 
 13  Society            77853 non-null   object 
 14  Bathroom           186703 non-null  object 
 15  Balcony            138596 non-null  object 
 16  Ca

Conclusion:
    - Dimensions and Plot Area are always empty in this dataset

In [6]:
df.head()

Unnamed: 0,Index,Title,Description,Amount(in rupees),Price (in rupees),location,Carpet Area,Status,Floor,Transaction,...,facing,overlooking,Society,Bathroom,Balcony,Car Parking,Ownership,Super Area,Dimensions,Plot Area
0,0,1 BHK Ready to Occupy Flat for sale in Srushti...,"Bhiwandi, Thane has an attractive 1 BHK Flat f...",42 Lac,6000.0,thane,500 sqft,Ready to Move,10 out of 11,Resale,...,,,Srushti Siddhi Mangal Murti Complex,1,2.0,,,,,
1,1,2 BHK Ready to Occupy Flat for sale in Dosti V...,One can find this stunning 2 BHK flat for sale...,98 Lac,13799.0,thane,473 sqft,Ready to Move,3 out of 22,Resale,...,East,Garden/Park,Dosti Vihar,2,,1 Open,Freehold,,,
2,2,2 BHK Ready to Occupy Flat for sale in Sunrise...,Up for immediate sale is a 2 BHK apartment in ...,1.40 Cr,17500.0,thane,779 sqft,Ready to Move,10 out of 29,Resale,...,East,Garden/Park,Sunrise by Kalpataru,2,,1 Covered,Freehold,,,
3,3,1 BHK Ready to Occupy Flat for sale Kasheli,This beautiful 1 BHK Flat is available for sal...,25 Lac,,thane,530 sqft,Ready to Move,1 out of 3,Resale,...,,,,1,1.0,,,,,
4,4,2 BHK Ready to Occupy Flat for sale in TenX Ha...,"This lovely 2 BHK Flat in Pokhran Road, Thane ...",1.60 Cr,18824.0,thane,635 sqft,Ready to Move,20 out of 42,Resale,...,West,"Garden/Park, Main Road",TenX Habitat Raymond Realty,2,,1 Covered,Co-operative Society,,,


In [7]:
df.isnull().sum()

Index                     0
Title                     0
Description            3023
Amount(in rupees)         0
Price (in rupees)     17665
location                  0
Carpet Area           80673
Status                  615
Floor                  7077
Transaction              83
Furnishing             2897
facing                70233
overlooking           81436
Society              109678
Bathroom                828
Balcony               48935
Car Parking          103357
Ownership             65517
Super Area           107685
Dimensions           187531
Plot Area            187531
dtype: int64

In [8]:
def null_avg():
    totals_rows = df.shape[0]
    avge_nulls = (df.isnull().sum() / totals_rows) * 100
    print(avge_nulls)
null_avg()

Index                  0.000000
Title                  0.000000
Description            1.612000
Amount(in rupees)      0.000000
Price (in rupees)      9.419776
location               0.000000
Carpet Area           43.018488
Status                 0.327946
Floor                  3.773776
Transaction            0.044259
Furnishing             1.544811
facing                37.451408
overlooking           43.425354
Society               58.485264
Bathroom               0.441527
Balcony               26.094352
Car Parking           55.114621
Ownership             34.936624
Super Area            57.422506
Dimensions           100.000000
Plot Area            100.000000
dtype: float64


Conclusion
    - Index, Title, Description are irrelevant

In [9]:
# df.value_counts() # result: Series([], Name: count, dtype: int64)

f = np.array(['Amount(in rupees)','Price (in rupees)', 'location', 'Carpet Area', 
    'Status', 'Floor', 'Transaction', 'Furnishing', 'facing', 'overlooking', 
    'Society', 'Bathroom', 'Balcony', 'Car Parking', 'Ownership', 'Super Area',
])

for feature in f:
    print(df[feature].value_counts())

Amount(in rupees)
Call for Price    9684
85 Lac            5264
65 Lac            4229
60 Lac            3869
70 Lac            3801
                  ... 
21.60 Cr             1
21.45 Cr             1
9.45 Cr              1
13.6 Lac             1
9.90 Cr              1
Name: count, Length: 1561, dtype: int64
Price (in rupees)
4000.0     2463
5000.0     2286
10000.0    2273
3200.0     1479
18000.0    1420
           ... 
8808.0        1
7864.0        1
10707.0       1
10680.0       1
2508.0        1
Name: count, Length: 10958, dtype: int64
location
new-delhi      27599
bangalore      24030
kolkata        22380
gurgaon        20070
ahmedabad      12750
               ...  
nellore           30
solapur           30
madurai           30
palakkad          30
pondicherry       30
Name: count, Length: 81, dtype: int64
Carpet Area
1000 sqft    5285
900 sqft     4649
1300 sqft    3457
1600 sqft    2752
600 sqft     2225
             ... 
1867 sqft       1
2415 sqft       1
1712 sqft       1
24

#### Categorical Features
- *Location*
- *Transaction*
- *Furnishing*
- *Facing*
- *Overlooking*
- *Bathroom*
- *Balcony*
- *Ownership*

#### Continuous Features
- *Amount (in rupees)*
- *Price (in rupees)*
- *Carpet Area*
- *Floor*
- *Society*
- *Car Parking*
- *Super Area*



# Feature Selection

In [10]:
df.drop(columns=['Title', 'Description', 'Index', 'Dimensions', 'Plot Area', 'Status', 'Society', 'Car Parking', 'Super Area','overlooking', 'Carpet Area'], axis=1, inplace=True) 

# df.drop(columns=['Society', 'Car Parking', 'Super Area', 'Dimensions', 'Plot Area','Index',
#                  'overlooking','Carpet Area','Title','Description'], inplace=True)

continuous_features = np.array([
    "Amount(in rupees)",
    "Price (in rupees)",
    "Floor",
])

categorical_features = np.array([
    "location",
    "Transaction",
    "Furnishing",
    "facing",
    "Ownership",
    "Bathroom",
    "Balcony",
])

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187531 entries, 0 to 187530
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Amount(in rupees)  187531 non-null  object 
 1   Price (in rupees)  169866 non-null  float64
 2   location           187531 non-null  object 
 3   Floor              180454 non-null  object 
 4   Transaction        187448 non-null  object 
 5   Furnishing         184634 non-null  object 
 6   facing             117298 non-null  object 
 7   Bathroom           186703 non-null  object 
 8   Balcony            138596 non-null  object 
 9   Ownership          122014 non-null  object 
dtypes: float64(1), object(9)
memory usage: 14.3+ MB


In [12]:
print(df.shape[0])
df.isnull().sum()
df.isnull().sum() / df.shape[0]

cols_to_fill_unknown = ['facing', 'Ownership','Floor','Transaction','Furnishing']
df[cols_to_fill_unknown] = df[cols_to_fill_unknown].fillna('Unknown')

187531


In [13]:
for feature in continuous_features:
    print(df[feature].value_counts())
    print("-----------------------------------")

Amount(in rupees)
Call for Price    9684
85 Lac            5264
65 Lac            4229
60 Lac            3869
70 Lac            3801
                  ... 
21.60 Cr             1
21.45 Cr             1
9.45 Cr              1
13.6 Lac             1
9.90 Cr              1
Name: count, Length: 1561, dtype: int64
-----------------------------------
Price (in rupees)
4000.0     2463
5000.0     2286
10000.0    2273
3200.0     1479
18000.0    1420
           ... 
8808.0        1
7864.0        1
10707.0       1
10680.0       1
2508.0        1
Name: count, Length: 10958, dtype: int64
-----------------------------------
Floor
2 out of 4      12433
1 out of 4      11985
3 out of 4       8792
1 out of 3       7093
Unknown          7077
                ...  
30 out of 51        1
29 out of 33        1
19 out of 52        1
14 out of 42        1
7 out of 70         1
Name: count, Length: 948, dtype: int64
-----------------------------------


### Amounts

In [14]:
# print(df["Amount(in rupees)"].head())
# print(df["Amount(in rupees)"][2].find('Cr'))

def convert_to_int(value):
    
    # if (type(value) == float): 
    #     return value
    
    if value.find('Lac') != -1:
        num = float(value[:value.find('Lac')]) * 100000 # 1 lac = 100000
    elif value.find('Cr') != -1:
        num = float(value[:value.find('Cr')]) * 10000000 # 1 crore = 10000000
    elif value == "Call for Price":
        num = None
    else:
        num = float(value)  # Convert other numerical values directly

    return num
    
        

df['Amount(in rupees)'] = df['Amount(in rupees)'].apply(convert_to_int).astype(float)


In [15]:
feature = "Amount(in rupees)"
# print(df[feature].head())
# print(type(df[feature]))


df.describe()
df.dropna(subset=[feature], inplace=True)
print(df.shape)
df.info()

(177847, 10)
<class 'pandas.core.frame.DataFrame'>
Index: 177847 entries, 0 to 187530
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Amount(in rupees)  177847 non-null  float64
 1   Price (in rupees)  169866 non-null  float64
 2   location           177847 non-null  object 
 3   Floor              177847 non-null  object 
 4   Transaction        177847 non-null  object 
 5   Furnishing         177847 non-null  object 
 6   facing             177847 non-null  object 
 7   Bathroom           177087 non-null  object 
 8   Balcony            130688 non-null  object 
 9   Ownership          177847 non-null  object 
dtypes: float64(2), object(8)
memory usage: 14.9+ MB


### Price

In [16]:
feature = "Price (in rupees)"
# Calculate the average of the numerical values in the column
# average_numeric = np.nanmean(df[feature])

# Fill NaN values in the column with the calculated average
# df[feature].fillna(average_numeric, inplace=True)
df = df.dropna(subset=['Price (in rupees)'])


# df.info()

### Floor

In [17]:
feature = "Floor"

df[['Floor', 'Total Floors']] = df['Floor'].str.extract(r'(\d+) out of (\d+)').astype(float)
df[feature].info()
df[feature].head(30)

<class 'pandas.core.series.Series'>
Index: 169866 entries, 0 to 187530
Series name: Floor
Non-Null Count   Dtype  
--------------   -----  
153200 non-null  float64
dtypes: float64(1)
memory usage: 2.6 MB


0     10.0
1      3.0
2     10.0
4     20.0
5      2.0
6      4.0
7      NaN
8      NaN
9      3.0
10     6.0
11    16.0
12     8.0
13    18.0
14     2.0
15    10.0
16     5.0
17     4.0
18    20.0
19     3.0
20     3.0
21    15.0
22     2.0
23    27.0
25     5.0
26     3.0
27    11.0
28     8.0
29     9.0
30    16.0
31    14.0
Name: Floor, dtype: float64

In [18]:
for feature in categorical_features:
    print(df[feature].value_counts())
    print("-----------------------------------")

location
new-delhi      24849
bangalore      23168
kolkata        19562
gurgaon        18238
ahmedabad      12178
               ...  
nellore           28
pondicherry       27
palakkad          27
navsari           26
madurai           21
Name: count, Length: 81, dtype: int64
-----------------------------------
Transaction
Resale          129333
New Property     40521
Unknown              6
Other                4
Rent/Lease           2
Name: count, dtype: int64
-----------------------------------
Furnishing
Semi-Furnished    79574
Unfurnished       70303
Furnished         17929
Unknown            2060
Name: count, dtype: int64
-----------------------------------
facing
Unknown         62052
East            49486
North - East    23172
North           15094
West             7745
South            3995
North - West     3769
South - East     2529
South -West      2024
Name: count, dtype: int64
-----------------------------------
Ownership
Freehold                102415
Unknown             

## Binary Encoding

In [19]:
oridinal_categorical_features = np.array([
    "Bathroom",
    "Balcony",
])

ordinal_mapping = {
    '1': 1,
    '2': 2,
    '3': 3,
    '4': 4,
    '5': 5,
    '6': 6,
    '7': 7,
    '8': 8,
    '9': 9,
    '10': 10,
    '> 10': 11
}

# Apply mapping to convert 'Category' column to integers
for feature in oridinal_categorical_features:
    df[feature] = df[feature].map(ordinal_mapping)

In [20]:
df.dropna(subset=['Bathroom'], inplace=True)
df.Bathroom.isnull().sum()
df['Balcony'].fillna(0,inplace=True)
df.Balcony.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Balcony'].fillna(0,inplace=True)


0

## One-hot Encoding

In [21]:
df.isnull().sum() / df.shape[0]

Amount(in rupees)    0.000000
Price (in rupees)    0.000000
location             0.000000
Floor                0.094399
Transaction          0.000000
Furnishing           0.000000
facing               0.000000
Bathroom             0.000000
Balcony              0.000000
Ownership            0.000000
Total Floors         0.094399
dtype: float64

In [22]:
# df['Floor'].interpolate(method='linear', inplace=True)

In [23]:
df.dropna(subset=["Floor"], inplace=True)
df.isnull().sum() / df.shape[0]

Amount(in rupees)    0.0
Price (in rupees)    0.0
location             0.0
Floor                0.0
Transaction          0.0
Furnishing           0.0
facing               0.0
Bathroom             0.0
Balcony              0.0
Ownership            0.0
Total Floors         0.0
dtype: float64

In [24]:
nominal_categorical_features = np.array([
    "location",
    "Transaction",
    "Furnishing",
    "facing",
    "Ownership"
])

for feature in nominal_categorical_features:
    one_hot = pd.get_dummies(df[feature], prefix=f"{feature}")
    df = df.join(one_hot)
    df.drop(f'{feature}', axis=1, inplace=True)

df.columns
# df.info()
    

Index(['Amount(in rupees)', 'Price (in rupees)', 'Floor', 'Bathroom',
       'Balcony', 'Total Floors', 'location_agra', 'location_ahmadnagar',
       'location_ahmedabad', 'location_allahabad',
       ...
       'facing_South', 'facing_South - East', 'facing_South -West',
       'facing_Unknown', 'facing_West', 'Ownership_Co-operative Society',
       'Ownership_Freehold', 'Ownership_Leasehold',
       'Ownership_Power Of Attorney', 'Ownership_Unknown'],
      dtype='object', length=110)

In [25]:
df.info()

df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
Index: 153148 entries, 0 to 187530
Columns: 110 entries, Amount(in rupees) to Ownership_Unknown
dtypes: bool(104), float64(6)
memory usage: 23.4 MB


Amount(in rupees)                 0
Price (in rupees)                 0
Floor                             0
Bathroom                          0
Balcony                           0
                                 ..
Ownership_Co-operative Society    0
Ownership_Freehold                0
Ownership_Leasehold               0
Ownership_Power Of Attorney       0
Ownership_Unknown                 0
Length: 110, dtype: int64

In [26]:
df.head(40)

Unnamed: 0,Amount(in rupees),Price (in rupees),Floor,Bathroom,Balcony,Total Floors,location_agra,location_ahmadnagar,location_ahmedabad,location_allahabad,...,facing_South,facing_South - East,facing_South -West,facing_Unknown,facing_West,Ownership_Co-operative Society,Ownership_Freehold,Ownership_Leasehold,Ownership_Power Of Attorney,Ownership_Unknown
0,4200000.0,6000.0,10.0,1.0,2.0,11.0,False,False,False,False,...,False,False,False,True,False,False,False,False,False,True
1,9800000.0,13799.0,3.0,2.0,0.0,22.0,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2,14000000.0,17500.0,10.0,2.0,0.0,29.0,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
4,16000000.0,18824.0,20.0,2.0,0.0,42.0,False,False,False,False,...,False,False,False,False,True,True,False,False,False,False
5,4500000.0,6618.0,2.0,1.0,1.0,7.0,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
6,1650000.0,2538.0,4.0,1.0,0.0,5.0,False,False,False,False,...,False,False,False,True,False,False,False,False,False,True
9,16000000.0,11150.0,3.0,3.0,1.0,27.0,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
10,14000000.0,12174.0,6.0,2.0,0.0,20.0,False,False,False,False,...,False,False,False,True,False,True,False,False,False,False
11,13600000.0,11674.0,16.0,2.0,0.0,24.0,False,False,False,False,...,False,False,False,True,False,False,False,False,False,True
12,13500000.0,15995.0,8.0,2.0,0.0,20.0,False,False,False,False,...,False,False,False,True,False,False,False,False,False,True


In [27]:
df.to_csv("house_prices_preprocessed", index=False)

NameError: name 'file_path' is not defined