In [43]:
import pandas as pd
import numpy as np

In [44]:
data = pd.read_csv("hyderabad_realestate.csv")

In [45]:
data.head(2)

Unnamed: 0,price,locality,property_type,bedrooms,brand_new,furnished_status,builder,project_name,user_type,floor,date_posted,move_in,area_sqft,price_persqft
0,4927000.0,Adibatla,Apartment,2,New Property,Unfurnished,ELV Projects Private Limited,ELV Cosmopolis,Builder,,2021-08-11,,1285.0,3834.241245
1,7988400.0,Manikonda,Apartment,2,New Property,Unfurnished,AR Infra and Developers,AR The Nest,Builder,,2021-08-17,2021-08-01,1268.0,6300.0


In [46]:
#clearing unwanted rows
coloumns_to_drop = ['builder','project_name','floor','date_posted','move_in','price_persqft']

In [47]:
data = data.drop(columns=coloumns_to_drop)

In [48]:
data.head(1)

Unnamed: 0,price,locality,property_type,bedrooms,brand_new,furnished_status,user_type,area_sqft
0,4927000.0,Adibatla,Apartment,2,New Property,Unfurnished,Builder,1285.0


In [49]:
#checking for null values
data.isnull().sum().sort_values(ascending=False)

locality            746
price               494
furnished_status    485
area_sqft           239
bedrooms            145
brand_new            11
property_type         0
user_type             0
dtype: int64

In [50]:
#cleaning price and area_sqft
data[['price','area_sqft']].isnull().sum

<bound method DataFrame.sum of        price  area_sqft
0      False      False
1      False      False
2      False      False
3      False      False
4      False      False
...      ...        ...
10902  False      False
10903  False      False
10904   True      False
10905   True      False
10906   True       True

[10907 rows x 2 columns]>

In [51]:
data = data.dropna(subset=['price','area_sqft'])

In [52]:
data[['price','area_sqft']].isnull().sum

<bound method DataFrame.sum of        price  area_sqft
0      False      False
1      False      False
2      False      False
3      False      False
4      False      False
...      ...        ...
10899  False      False
10900  False      False
10901  False      False
10902  False      False
10903  False      False

[10374 rows x 2 columns]>

In [53]:
#cleaning bedrooms
data['bedrooms'].mode()[0]

'2'

In [54]:
#filling missing bedrooms values with the most common value(mode = 2)

data['bedrooms'] = data['bedrooms'].fillna("2")

#covert bedrooms column to int
data['bedrooms'] = data['bedrooms'].astype(int)

In [55]:
data.bedrooms.isnull().value_counts()

bedrooms
False    10374
Name: count, dtype: int64

In [56]:
#cleaning furnishing_status

data['furnished_status'].value_counts()

furnished_status
Unfurnished       5490
Semi-Furnished    3157
Furnished         1672
Name: count, dtype: int64

In [57]:
#checking most frequent value
data['furnished_status'].mode()[0]

'Unfurnished'

In [58]:
#filling missing 'furnished_status' with the most frequent value: 'Unfurnished'
data['furnished_status'] = data['furnished_status'].fillna("Unfurnished")

In [59]:
#checking the column nullvalues
data['furnished_status'].isnull().sum()

np.int64(0)

In [60]:
#cleaning locality
data['locality'].isnull().sum()

np.int64(680)

In [61]:
#filling missing 'locality' values with 'Other
data['locality'] = data['locality'].fillna("Other")

In [62]:
data['locality'].isnull().sum()

np.int64(0)

In [63]:
#cleaning brand new column
data['brand_new'].unique()

array(['New Property', 'Resale', 'Rent'], dtype=object)

In [64]:
data['brand_new'].value_counts()

brand_new
Resale          9093
New Property    1280
Rent               1
Name: count, dtype: int64

In [65]:
#filling missing column with most comman value
data['brand_new'] = data['brand_new'].fillna('Resale')

In [66]:
data['brand_new'].isnull().sum()

np.int64(0)

In [67]:
#add price_per_sqft feature
data = data[data['area_sqft'] > 0]  # remove rows with area = 0
data['price_per_sqft'] = data['price'] / data['area_sqft']

In [68]:

#Remove outliers in price_per_sqft (1st and 99th percentiles)
q1 = data['price_per_sqft'].quantile(0.01)
q99 = data['price_per_sqft'].quantile(0.99)
data = data[(data['price_per_sqft'] >= q1) & (data['price_per_sqft'] <= q99)]

In [69]:
data.rename(columns={"bedrooms": "bhk"}, inplace=True)
data

Unnamed: 0,price,locality,property_type,bhk,brand_new,furnished_status,user_type,area_sqft,price_per_sqft
0,4927000.0,Adibatla,Apartment,2,New Property,Unfurnished,Builder,1285.0,3834.241245
1,7988400.0,Manikonda,Apartment,2,New Property,Unfurnished,Builder,1268.0,6300.000000
2,8625000.0,Hitech City,Apartment,2,New Property,Unfurnished,Builder,1150.0,7500.000000
3,4085000.0,Aminpur,Apartment,2,New Property,Unfurnished,Builder,918.0,4449.891068
4,13549250.0,Shaikpet,Apartment,3,New Property,Unfurnished,Builder,1895.0,7150.000000
...,...,...,...,...,...,...,...,...,...
10899,2200000.0,New Nallakunta,Studio Apartment,2,Resale,Furnished,Owner,300.0,7333.333333
10900,1800000.0,Shameerpet,Studio Apartment,2,Resale,Semi-Furnished,Owner,715.0,2517.482517
10901,1800000.0,Appa junction,Studio Apartment,2,Resale,Furnished,Owner,262.0,6870.229008
10902,2000000.0,Kukatpally,Studio Apartment,2,Resale,Unfurnished,Owner,276.0,7246.376812


In [70]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score

In [71]:
#finding unique
data['locality'].nunique()

1048

In [72]:
#find top 50 places
top_50_localities = data['locality'].value_counts().nlargest(50).index

#replace other localities with other
data['locality'] = data['locality'].apply(lambda x: x if x in top_50_localities else "Other")

In [73]:
data['locality'].value_counts().nunique()

42

In [74]:
data = data[data['price_per_sqft'] >= 8000]

In [75]:

#Add locality tier feature
premium_areas = [
    "Banjara Hills", "Jubilee Hills", "Gachibowli", "Kondapur", 
    "Madhapur", "HiTech City", "Financial District", "Kokapet"
]
data["locality_tier"] = data["locality"].apply(
    lambda x: "A" if x in premium_areas else ("B" if x != "Other" else "C")
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["locality_tier"] = data["locality"].apply(


In [76]:

#Add locality average price and comparison
locality_avg = data.groupby("locality")["price_per_sqft"].mean().to_dict()
data["locality_avg_pps"] = data["locality"].map(locality_avg)
data["pps_above_locality_avg"] = (data["price_per_sqft"] > data["locality_avg_pps"]).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["locality_avg_pps"] = data["locality"].map(locality_avg)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["pps_above_locality_avg"] = (data["price_per_sqft"] > data["locality_avg_pps"]).astype(int)


In [77]:
data['price_segment'] = pd.cut(
    data['price_per_sqft'],
    bins=[0, 8000, 15000, 25000, 50000],
    labels=['budget', 'mid', 'luxury', 'ultra']
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['price_segment'] = pd.cut(


In [78]:

#upsample luxury and ultra properties
luxury_data = data[data['price_segment'].isin(['luxury', 'ultra'])]
data = pd.concat([data, luxury_data]*3, ignore_index=True)

In [79]:
#One-Hot Encoding of Categorical Columns

categorical_cols = ['locality', 'property_type', 'furnished_status', 'brand_new', 'user_type','price_segment','locality_tier']

data_encoded = pd.get_dummies(data,columns=categorical_cols,drop_first=True)

In [80]:
#seperate features and target variable
X = data_encoded.drop(['price', 'price_per_sqft'], axis=1)
y = data_encoded['price']
X['room_density'] = X['area_sqft'] / (X['bhk'] + 1)
X['bhk_to_area'] = X['area_sqft'] / (X['bhk'] + 1)
X['is_luxury'] = ((X['bhk'] >= 4) & (X['area_sqft'] >= 1800)).astype(int)
X['is_compact'] = (X['area_sqft'] <= 800).astype(int)



#save column names used for the model
import pickle

columns =X.columns.to_list()
with open("columns.pkl",'wb') as f:
    pickle.dump(X.columns.to_list(),f)

#spliting into 80% train and 20% test
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [81]:
#train the model
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    min_samples_leaf=2,
    random_state=42
)

model.fit(X_train,y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [82]:
#predict the dataset
y_predict = model.predict(X_test)

#calculate R² Score and RMSE
r2 = r2_score(y_test,y_predict)

#MSE (squared=True by default)
mse = mean_squared_error(y_test, y_predict)

#RMSE = sqrt(MSE)
rmse = np.sqrt(mse)


In [83]:
print("R² Score:", r2)
print("Test RMSE:", rmse)

R² Score: 0.9545487100278532
Test RMSE: 768355.538416822


In [84]:
#save the trained Random Forest model
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f,protocol=4)