In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the property listings from a CSV file located in the working directory.
df = pd.read_csv("properties_data.csv")

In [None]:
# Row and column counts of the freshly loaded frame.
df.shape

(1905, 38)

In [None]:
# List every column name to see which fields are available.
df.columns

Index(['id', 'neighborhood', 'latitude', 'longitude', 'price', 'size_in_sqft',
       'price_per_sqft', 'no_of_bedrooms', 'no_of_bathrooms', 'quality',
       'maid_room', 'unfurnished', 'balcony', 'barbecue_area',
       'built_in_wardrobes', 'central_ac', 'childrens_play_area',
       'childrens_pool', 'concierge', 'covered_parking', 'kitchen_appliances',
       'lobby_in_building', 'maid_service', 'networked', 'pets_allowed',
       'private_garden', 'private_gym', 'private_jacuzzi', 'private_pool',
       'security', 'shared_gym', 'shared_pool', 'shared_spa', 'study',
       'vastu_compliant', 'view_of_landmark', 'view_of_water',
       'walk_in_closet'],
      dtype='object')

In [None]:
# Discard the row identifier column.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns="id", errors="ignore")

In [None]:
# Preview the first five rows.
df.head()

Unnamed: 0,neighborhood,latitude,longitude,price,size_in_sqft,price_per_sqft,no_of_bedrooms,no_of_bathrooms,quality,maid_room,...,private_pool,security,shared_gym,shared_pool,shared_spa,study,vastu_compliant,view_of_landmark,view_of_water,walk_in_closet
0,Palm Jumeirah,25.113208,55.138932,2700000,1079,2502.32,1,2,Medium,False,...,False,False,True,False,False,False,False,False,True,False
1,Palm Jumeirah,25.106809,55.151201,2850000,1582,1801.52,2,2,Medium,False,...,False,False,True,True,False,False,False,False,True,False
2,Jumeirah Lake Towers,25.063302,55.137728,1150000,1951,589.44,3,5,Medium,True,...,False,True,True,True,False,False,False,True,True,True
3,Culture Village,25.227295,55.341761,2850000,2020,1410.89,2,3,Low,False,...,False,False,False,False,False,False,False,False,False,False
4,Palm Jumeirah,25.114275,55.139764,1729200,507,3410.65,0,1,Medium,False,...,False,True,True,True,True,False,False,True,True,False


In [None]:
# Per-column count of missing values.
df.isna().sum()

Unnamed: 0,0
neighborhood,0
latitude,0
longitude,0
price,0
size_in_sqft,0
price_per_sqft,0
no_of_bedrooms,0
no_of_bathrooms,0
quality,0
maid_room,0


In [None]:
# Count rows that are exact repeats of an earlier row.
df.duplicated().sum()

12

In [None]:
# Remove the duplicate rows in place, keeping the first occurrence of each.
# The original row labels are retained, so the index is left with gaps.
df.drop_duplicates(inplace=True)

In [None]:
# Summarise column dtypes and non-null counts after de-duplication.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1893 entries, 0 to 1904
Data columns (total 37 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   neighborhood         1893 non-null   object 
 1   latitude             1893 non-null   float64
 2   longitude            1893 non-null   float64
 3   price                1893 non-null   int64  
 4   size_in_sqft         1893 non-null   int64  
 5   price_per_sqft       1893 non-null   float64
 6   no_of_bedrooms       1893 non-null   int64  
 7   no_of_bathrooms      1893 non-null   int64  
 8   quality              1893 non-null   object 
 9   maid_room            1893 non-null   bool   
 10  unfurnished          1893 non-null   bool   
 11  balcony              1893 non-null   bool   
 12  barbecue_area        1893 non-null   bool   
 13  built_in_wardrobes   1893 non-null   bool   
 14  central_ac           1893 non-null   bool   
 15  childrens_play_area  1893 non-null   bool  

In [None]:
# Distinct neighborhood names present in the data.
df['neighborhood'].unique()

array(['Palm Jumeirah', 'Jumeirah Lake Towers', 'Culture Village',
       'Downtown Dubai', 'Dubai Marina', 'Business Bay', 'Old Town',
       'Al Kifaf', 'Meydan', 'Arjan', 'Jumeirah Beach Residence',
       'Dubai Creek Harbour (The Lagoons)', 'Greens', 'City Walk',
       'Al Furjan', 'DAMAC Hills', 'Jumeirah Golf Estates', 'Jumeirah',
       'Dubai Hills Estate', 'Umm Suqeim', 'Motor City', 'DIFC',
       'Jumeirah Village Circle', 'Barsha Heights (Tecom)', 'Al Barari',
       'Dubai Production City (IMPZ)', 'The Hills', 'The Views',
       'Dubai Sports City', 'Dubai Silicon Oasis',
       'Jumeirah Village Triangle', 'Mohammed Bin Rashid City',
       'Dubai Harbour', 'Bluewaters', 'International City',
       'Falcon City of Wonders', 'Mina Rashid', 'Town Square',
       'Green Community', 'Al Barsha', 'Al Sufouh', 'Dubai Festival City',
       'Jebel Ali', 'Dubai Land', 'World Trade Center', 'Mudon',
       'Discovery Gardens', 'Remraam', 'Mirdif',
       'Dubai South (Dubai Wo

In [None]:
# Distinct quality grades present in the data.
df['quality'].unique()

array(['Medium', 'Low', 'High', 'Ultra'], dtype=object)

In [None]:
# Gather the names of all boolean-typed columns so they can be converted
# to integers in a later step, then display the list.
bool_col = [name for name, dtype in df.dtypes.items() if dtype == 'bool']
bool_col

['maid_room',
 'unfurnished',
 'balcony',
 'barbecue_area',
 'built_in_wardrobes',
 'central_ac',
 'childrens_play_area',
 'childrens_pool',
 'concierge',
 'covered_parking',
 'kitchen_appliances',
 'lobby_in_building',
 'maid_service',
 'networked',
 'pets_allowed',
 'private_garden',
 'private_gym',
 'private_jacuzzi',
 'private_pool',
 'security',
 'shared_gym',
 'shared_pool',
 'shared_spa',
 'study',
 'vastu_compliant',
 'view_of_landmark',
 'view_of_water',
 'walk_in_closet']

In [None]:
from sklearn.preprocessing import LabelEncoder

# `le` is still created here because the neighborhood-encoding cell further
# down reuses this encoder instance.
le = LabelEncoder()

# Convert the boolean flags to 0/1 with a plain cast instead of LabelEncoder.
# LabelEncoder numbers the *observed* classes, so a column that only ever holds
# True would be encoded as 0; a cast always maps False -> 0 and True -> 1.
# The cast is also a no-op on columns that are already int64, so the cell can
# be re-run safely.
df = df.astype({col: "int64" for col in bool_col})

In [None]:
# Re-check dtypes now that the boolean flags have been converted to integers.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1893 entries, 0 to 1904
Data columns (total 37 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   neighborhood         1893 non-null   object 
 1   latitude             1893 non-null   float64
 2   longitude            1893 non-null   float64
 3   price                1893 non-null   int64  
 4   size_in_sqft         1893 non-null   int64  
 5   price_per_sqft       1893 non-null   float64
 6   no_of_bedrooms       1893 non-null   int64  
 7   no_of_bathrooms      1893 non-null   int64  
 8   quality              1893 non-null   object 
 9   maid_room            1893 non-null   int64  
 10  unfurnished          1893 non-null   int64  
 11  balcony              1893 non-null   int64  
 12  barbecue_area        1893 non-null   int64  
 13  built_in_wardrobes   1893 non-null   int64  
 14  central_ac           1893 non-null   int64  
 15  childrens_play_area  1893 non-null   int64 

In [None]:
# Replace each neighborhood name with an integer code, reusing the `le`
# encoder created earlier (this refits it on the neighborhood values).
# NOTE(review): the codes are arbitrary labels with no meaningful order, and
# the encoder is fit before the train/test split — confirm this is intended.
df['neighborhood'] = le.fit_transform(df['neighborhood'])

In [None]:
# Preview the frame with neighborhood replaced by integer codes.
df.head()

Unnamed: 0,neighborhood,latitude,longitude,price,size_in_sqft,price_per_sqft,no_of_bedrooms,no_of_bathrooms,quality,maid_room,...,private_pool,security,shared_gym,shared_pool,shared_spa,study,vastu_compliant,view_of_landmark,view_of_water,walk_in_closet
0,46,25.113208,55.138932,2700000,1079,2502.32,1,2,Medium,0,...,0,0,1,0,0,0,0,0,1,0
1,46,25.106809,55.151201,2850000,1582,1801.52,2,2,Medium,0,...,0,0,1,1,0,0,0,0,1,0
2,36,25.063302,55.137728,1150000,1951,589.44,3,5,Medium,1,...,0,1,1,1,0,0,0,1,1,1
3,11,25.227295,55.341761,2850000,2020,1410.89,2,3,Low,0,...,0,0,0,0,0,0,0,0,0,0
4,46,25.114275,55.139764,1729200,507,3410.65,0,1,Medium,0,...,0,1,1,1,1,0,0,1,1,0


In [None]:
from sklearn.preprocessing import OrdinalEncoder
# The explicit category list fixes the code order: Low=0, Medium=1, High=2, Ultra=3.
oe = OrdinalEncoder(categories=[['Low', 'Medium','High', 'Ultra']])
# The encoder expects 2-D input, hence the double brackets; the result is float-valued.
# NOTE(review): re-running this cell after quality has already been encoded
# presumably fails, since the numeric codes are not in the category list.
df['quality'] = oe.fit_transform(df[['quality']])

In [None]:
# Confirm that every remaining column is now numeric.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1893 entries, 0 to 1904
Data columns (total 37 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   neighborhood         1893 non-null   int64  
 1   latitude             1893 non-null   float64
 2   longitude            1893 non-null   float64
 3   price                1893 non-null   int64  
 4   size_in_sqft         1893 non-null   int64  
 5   price_per_sqft       1893 non-null   float64
 6   no_of_bedrooms       1893 non-null   int64  
 7   no_of_bathrooms      1893 non-null   int64  
 8   quality              1893 non-null   float64
 9   maid_room            1893 non-null   int64  
 10  unfurnished          1893 non-null   int64  
 11  balcony              1893 non-null   int64  
 12  barbecue_area        1893 non-null   int64  
 13  built_in_wardrobes   1893 non-null   int64  
 14  central_ac           1893 non-null   int64  
 15  childrens_play_area  1893 non-null   int64 

In [None]:
# Remove the total price column before modelling price_per_sqft.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns="price", errors="ignore")

In [None]:
# Target is the price per square foot; every other column is a feature.
y = df['price_per_sqft']
X = df.drop(columns='price_per_sqft')

In [None]:
# Display the feature matrix.
X

Unnamed: 0,neighborhood,latitude,longitude,size_in_sqft,no_of_bedrooms,no_of_bathrooms,quality,maid_room,unfurnished,balcony,...,private_pool,security,shared_gym,shared_pool,shared_spa,study,vastu_compliant,view_of_landmark,view_of_water,walk_in_closet
0,46,25.113208,55.138932,1079,1,2,1.0,0,0,1,...,0,0,1,0,0,0,0,0,1,0
1,46,25.106809,55.151201,1582,2,2,1.0,0,0,1,...,0,0,1,1,0,0,0,0,1,0
2,36,25.063302,55.137728,1951,3,5,1.0,1,1,1,...,0,1,1,1,0,0,0,1,1,1
3,11,25.227295,55.341761,2020,2,3,0.0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,46,25.114275,55.139764,507,0,1,1.0,0,0,0,...,0,1,1,1,1,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1900,42,25.176892,55.310712,1087,2,2,3.0,0,1,1,...,0,1,1,1,1,1,1,1,1,1
1901,42,25.166145,55.276684,760,1,2,1.0,0,0,1,...,0,0,1,1,0,0,0,0,1,1
1902,16,25.206500,55.345056,1930,3,5,1.0,1,1,1,...,0,0,0,1,0,0,0,0,0,0
1903,37,25.073858,55.229844,740,1,2,1.0,0,1,1,...,0,1,1,1,0,0,0,0,1,1


In [None]:
# Display the target series (price_per_sqft).
y

Unnamed: 0,price_per_sqft
0,2502.32
1,1801.52
2,589.44
3,1410.89
4,3410.65
...,...
1900,1379.94
1901,1618.42
1902,1502.59
1903,912.16


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=45,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply
# the same transformation to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Baseline random forest with default hyper-parameters.
# The forest is stochastic (bootstrap samples and feature sub-sampling), so a
# fixed seed is needed for the reported scores to be reproducible; 45 matches
# the seed already used for the train/test split.
model = RandomForestRegressor(random_state=45)
model.fit(X_train, y_train)

In [None]:
# Predict on the held-out test features with the fitted baseline model.
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import r2_score

# R² of the baseline forest on the held-out test set.
print(r2_score(y_true=y_test, y_pred=y_pred))

0.6750777018909053


In [None]:
# Candidate hyper-parameter values for the grid search:
# 4 forest sizes x 5 depth limits = 20 combinations.
param_grid = dict(
    n_estimators=[100, 200, 300, 500],
    max_depth=[None, 10, 20, 30, 50],
)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Exhaustive 5-fold cross-validated search over param_grid, run on all
# available cores with progress logging.
grid = GridSearchCV(
    model,
    param_grid,
    cv=5,
    verbose=2,
    n_jobs=-1,
)

In [None]:
# Run the cross-validated search on the training data.
grid.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [None]:
# Parameter combination with the best mean cross-validated score.
grid.best_params_

{'max_depth': 30, 'n_estimators': 300}

In [None]:
# Mean cross-validated score of the best combination (no `scoring` was passed,
# so this is the estimator's default score).
grid.best_score_

0.6933081125414302

In [None]:
# Predict on the held-out test features using the fitted grid search object.
y_pred_grid = grid.predict(X_test)

In [None]:
# R² of the tuned model on the held-out test set, for comparison with the baseline.
print(r2_score(y_true=y_test, y_pred=y_pred_grid))

0.6699110437602156
