In [None]:
#import library
import pandas as pd
import numpy as np
import sklearn

In [None]:
# Report the library versions this notebook was executed with.
for label, module in (("numpy", np), ("pandas", pd), ("sklearn", sklearn)):
    print(label, module.__version__)

numpy 2.0.2
pandas 2.2.2
sklearn 1.6.1


In [None]:
# Load the raw Bengaluru housing CSV and display its (rows, columns) shape.
# NOTE(review): the path is absolute ("/content/..."); it will need adjusting
# when the notebook is run in a different environment.
df = pd.read_csv("/content/Bengaluru_House_Data.csv")
df.shape

(13320, 9)

In [None]:
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


**Data Preprocessing**

In [None]:
# Column dtypes and non-null counts: shows which columns contain missing values
# and that total_sqft is stored as object rather than as a number.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [None]:
# Print the frequency table of every column, with a separator line after each one.
separator = "*" * 25
for name in df.columns:
    counts = df[name].value_counts()
    print(counts)
    print(separator)

area_type
Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: count, dtype: int64
*************************
availability
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
16-Oct               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: count, Length: 81, dtype: int64
*************************
location
Whitefield                         540
Sarjapur  Road                     399
Electronic City                    302
Kanakpura Road                     273
Thanisandra                        234
                                  ... 
3rd Stage Raja Rajeshwari Nagar      1
Chuchangatta Colony                  1
Electronic City Phase 1,             1
Chikbasavanapura                     1
Abshot Layout                        1
Name: count, Length: 1305, dtype: int64
**************

In [None]:
# Check null values
df.isnull().sum()

Unnamed: 0,0
area_type,0
availability,0
location,1
size,16
society,5502
total_sqft,0
bath,73
balcony,609
price,0


In [None]:
# Remove the columns that are not used in the rest of the notebook.
df.drop(
    labels=["area_type", "balcony", "society", "availability"],
    axis="columns",
    inplace=True,
)

In [None]:
# Understand numerical Column
df.describe()

Unnamed: 0,bath,price
count,13247.0,13320.0
mean,2.69261,112.565627
std,1.341458,148.971674
min,1.0,8.0
25%,2.0,50.0
50%,2.0,72.0
75%,3.0,120.0
max,40.0,3600.0


In [None]:
# Fill the missing values column by column: a fixed location and size label
# for the text columns, and the median for the bathroom count.
df = df.fillna(
    value={
        "location": "Whitefield",
        "size": "2 BHK",
        "bath": df["bath"].median(),
    }
)

In [None]:
# The bedroom count is the leading token of "size" (e.g. "2 BHK", "4 Bedroom").
df["bhk"] = (
    df["size"]
    .str.split()
    .str[0]
    .astype(int)
)

In [None]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,51.0,2


In [None]:
# Inspect the raw total_sqft strings; some entries are ranges such as
# '1133 - 1384' rather than a single number.
df["total_sqft"].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [None]:
# Convert a raw total_sqft entry to a float (square feet).
def convert(x):
    """Parse one raw ``total_sqft`` value into a float.

    Parameters
    ----------
    x : str or number
        Either a plain number (``"1056"``) or a range written as
        ``"low - high"`` (``"1133 - 1384"``).

    Returns
    -------
    float or None
        The number itself, the midpoint of a range, or ``None`` when the
        value cannot be parsed as either.
    """
    # Try the plain-number case first so that negative values and exponents
    # such as "1e-5" are not mistaken for ranges by the "-" split below.
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        pass

    parts = str(x).split("-")
    if len(parts) != 2:
        return None
    try:
        low, high = float(parts[0]), float(parts[1])
    except ValueError:
        # Range-like text whose ends are not numeric.
        return None
    return (low + high) / 2


# Replace the raw total_sqft strings with their numeric value (unparsable entries become missing).
df["total_sqft"] = df["total_sqft"].map(convert)


In [None]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Kothanur,2 BHK,1200.0,2.0,51.0,2


In [None]:
# Derive the price per square foot (stored in the column named "price_sqrt").
# NOTE(review): the 100000 factor presumably converts price from lakhs to
# rupees — confirm the unit of the price column.
df["price_sqrt"] = df["price"] * 100000 / df["total_sqft"]

In [None]:
df["location"]

Unnamed: 0,location
0,Electronic City Phase II
1,Chikka Tirupathi
2,Uttarahalli
3,Lingadheeranahalli
4,Kothanur
...,...
13315,Whitefield
13316,Richards Town
13317,Raja Rajeshwari Nagar
13318,Padmanabhanagar


In [None]:
# "location" has well over a thousand distinct values, and one-hot encoding
# would create one column for each of them, so the rare ones are merged in the
# following cells. First remove leading/trailing whitespace so that identical
# names are counted together, then count how often each location occurs.
df["location"] = df["location"].str.strip()
location_count = df["location"].value_counts()
location_count

Unnamed: 0_level_0,count
location,Unnamed: 1_level_1
Whitefield,542
Sarjapur Road,399
Electronic City,304
Kanakpura Road,273
Thanisandra,237
...,...
Duddanahalli,1
Doddanakunte,1
Jogupalya,1
Subhash Nagar,1


In [None]:
# Locations that occur 10 times or fewer (the bound is inclusive, despite the "less_ten" name).
location_count_less_ten = location_count.loc[location_count.le(10)]
location_count_less_ten

Unnamed: 0_level_0,count
location,Unnamed: 1_level_1
Basapura,10
Dairy Circle,10
Nagappa Reddy Layout,10
Naganathapura,10
Sector 1 HSR Layout,10
...,...
Duddanahalli,1
Doddanakunte,1
Jogupalya,1
Subhash Nagar,1


In [None]:
# Collapse every rare location into the single label 'other'.
# The rare names are the index of location_count_less_ten, so test membership against that index.
df['location'] = df['location'].mask(
    df['location'].isin(location_count_less_ten.index),
    'other',
)

In [None]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_sqrt
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0


In [None]:
# "size" has been replaced by the numeric "bhk" column, so drop it.
df.drop(labels=["size"], axis="columns", inplace=True)

In [None]:
df.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_sqrt
count,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1559.626694,2.688814,112.565627,2.802778,7907.501
std,1238.405258,1.338754,148.971674,1.294496,106429.6
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4266.865
50%,1276.0,2.0,72.0,3.0,5434.306
75%,1680.0,3.0,120.0,3.0,7311.746
max,52272.0,40.0,3600.0,43.0,12000000.0


**Outlier Detection**

In [None]:
# Keep only the rows with at least 300 sqft per bedroom.
# Rows whose total_sqft is missing fail the comparison and are dropped as well.
df = df.loc[df['total_sqft'].div(df['bhk']).ge(300)]
df.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_sqrt
count,12530.0,12530.0,12530.0,12530.0,12530.0
mean,1594.564544,2.559537,111.382401,2.650838,6303.979357
std,1261.271296,1.077938,152.077329,0.976678,4162.237981
min,300.0,1.0,8.44,1.0,267.829813
25%,1116.0,2.0,49.0,2.0,4210.526316
50%,1300.0,2.0,70.0,3.0,5294.117647
75%,1700.0,3.0,115.0,3.0,6916.666667
max,52272.0,16.0,3600.0,16.0,176470.588235


In [None]:
df.shape

(12530, 6)

In [None]:
# price_sqrt has outliers: within each location, keep only the rows that lie
# strictly inside mean +/- n_std standard deviations.
def remove_outlier_sqrt(data, n_std=1.0):
    """Drop rows whose ``price_sqrt`` is far from the mean of their location.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``"location"`` and ``"price_sqrt"``.
    n_std : float, default 1.0
        Half-width of the accepted band, in standard deviations.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, ordered by location group, with a fresh
        ``0..n-1`` index. The columns of ``data`` are preserved even when
        no row survives or ``data`` is empty.

    Notes
    -----
    The bounds are exclusive, so a location whose prices are all identical
    (including a location with a single row) is removed entirely.
    """
    kept = []
    for _, group in data.groupby("location"):
        prices = group["price_sqrt"]
        center = prices.mean()
        # Population standard deviation (ddof=0), which is what np.std yields.
        spread = prices.std(ddof=0)
        in_band = (prices > center - n_std * spread) & (prices < center + n_std * spread)
        kept.append(group[in_band])
    if not kept:
        # No groups at all: return an empty frame that still has the columns.
        return data.iloc[0:0].reset_index(drop=True)
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(kept, ignore_index=True)
# Apply the per-location price_sqrt filter and summarise what is left.
df = remove_outlier_sqrt(df)
df.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_sqrt
count,10301.0,10301.0,10301.0,10301.0,10301.0
mean,1508.440608,2.471702,91.286372,2.574896,5659.062876
std,880.694214,0.979449,86.342786,0.897649,2265.774749
min,300.0,1.0,10.0,1.0,1250.0
25%,1110.0,2.0,49.0,2.0,4244.897959
50%,1286.0,2.0,67.0,2.0,5175.600739
75%,1650.0,3.0,100.0,3.0,6428.571429
max,30400.0,16.0,2200.0,16.0,24509.803922


In [None]:
df.shape

(10301, 6)

In [None]:
# price_sqrt was only needed for the outlier filter; drop it before modelling.
df.drop(labels=["price_sqrt"], axis="columns", inplace=True)

In [None]:
df.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,1st Block Jayanagar,1875.0,2.0,235.0,3
3,1st Block Jayanagar,1200.0,2.0,130.0,3
4,1st Block Jayanagar,1235.0,2.0,148.0,2


In [None]:
# Save the cleaned data as a CSV file.
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra first column — confirm that this is wanted.
df.to_csv("clean_data.csv")

In [None]:
# Target is the price column; the features are every other column.
y = df["price"]
x = df.drop(labels=["price"], axis="columns")

In [None]:
x

Unnamed: 0,location,total_sqft,bath,bhk
0,1st Block Jayanagar,2850.0,4.0,4
1,1st Block Jayanagar,1630.0,3.0,3
2,1st Block Jayanagar,1875.0,2.0,3
3,1st Block Jayanagar,1200.0,2.0,3
4,1st Block Jayanagar,1235.0,2.0,2
...,...,...,...,...
10296,other,1353.0,2.0,2
10297,other,812.0,1.0,1
10298,other,1440.0,2.0,3
10299,other,1075.0,2.0,2


In [None]:
# import necessary library to run model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error,r2_score

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size = 0.2, random_state = 0)

In [None]:
# One-hot encode the categorical "location" column; all other columns pass
# through unchanged. handle_unknown="ignore" encodes a location that was not
# seen during fit as all zeros instead of raising at transform/predict time.
column_trans = make_column_transformer(
    (OneHotEncoder(handle_unknown="ignore"), ["location"]),
    remainder="passthrough",
)

In [None]:
# Linear regression: encode "location", scale without centring, then fit ordinary least squares.
pipe = make_pipeline(
    column_trans,
    StandardScaler(with_mean=False),
    LinearRegression(),
)
pipe.fit(X=x_train, y=y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [None]:
# Score the linear-regression pipeline on the held-out test split.
y_pred_lr = pipe.predict(x_test)
print("MSE is:", mean_squared_error(y_true=y_test, y_pred=y_pred_lr))
print("R2-score is:", r2_score(y_true=y_test, y_pred=y_pred_lr))

MSE is: 2116.8108911263084
R2-score is: 0.8294581240335491


**Apply Lasso**

In [None]:
# Same preprocessing as before, with a Lasso regressor at its default settings.
pipe = make_pipeline(
    column_trans,
    StandardScaler(with_mean=False),
    Lasso(),
)
pipe.fit(X=x_train, y=y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [None]:
# Score the Lasso pipeline on the held-out test split.
y_pred_lasso = pipe.predict(x_test)
print("MSE is:", mean_squared_error(y_true=y_test, y_pred=y_pred_lasso))
print("R2-score is:", r2_score(y_true=y_test, y_pred=y_pred_lasso))


MSE is: 2206.752082468484
R2-score is: 0.8222119691869105


**Apply Ridge Regression**

In [None]:
# Same preprocessing as before, with a Ridge regressor at its default settings.
pipe = make_pipeline(
    column_trans,
    StandardScaler(with_mean=False),
    Ridge(),
)
pipe.fit(X=x_train, y=y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [None]:
# Score the Ridge pipeline on the held-out test split.
y_pred_ridge = pipe.predict(x_test)
print("MSE is:", mean_squared_error(y_true=y_test, y_pred=y_pred_ridge))
print("R2-score is:", r2_score(y_true=y_test, y_pred=y_pred_ridge))

MSE is: 2116.8374942208043
R2-score is: 0.8294559807425915


**Apply Neural Network**

In [None]:
import tensorflow
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

In [None]:
# Build the network inputs with the same two preprocessing steps used in the
# pipelines: one-hot encode "location", then scale without centring.
# Both transformers are fitted on the training split and only applied to the test split.
# NOTE(review): fit_transform is called on the same `column_trans` object that
# was passed to the pipelines above — confirm that refitting it here is intended.
x_train_processed = column_trans.fit_transform(x_train)
x_test_processed = column_trans.transform(x_test)
scaler = StandardScaler(with_mean=False)
x_train_scaled = scaler.fit_transform(x_train_processed)
x_test_scaled = scaler.transform(x_test_processed)

In [None]:
# Seed Python, NumPy and the backend so that weight initialisation and batch
# shuffling are repeatable from run to run.
keras.utils.set_random_seed(0)

# Fully connected regression network: 128 -> 64 -> 32 ReLU units, one linear output.
model = Sequential()
# Declare the input width with an explicit Input object; passing `input_dim`
# to the first Dense layer is deprecated and makes Keras emit a warning.
model.add(keras.Input(shape=(x_train_scaled.shape[1],)))
model.add(Dense(128, activation="relu"))
model.add(Dense(64, activation="relu"))
model.add(Dense(32, activation="relu"))
model.add(Dense(1, activation="linear"))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
model.summary()

In [None]:
# Train with the Adam optimizer on mean squared error; the tracked metric is the same quantity as the loss.
model.compile(optimizer = "adam", loss = "mean_squared_error", metrics = ["mean_squared_error"])

In [None]:
# Fit for 25 epochs on the scaled training features (no validation data is passed).
model.fit(x_train_scaled, y_train, epochs=25)

Epoch 1/25
[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 8212.7568 - mean_squared_error: 8212.7568
Epoch 2/25
[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 2209.6460 - mean_squared_error: 2209.6460
Epoch 3/25
[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1261.7194 - mean_squared_error: 1261.7194
Epoch 4/25
[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1171.6971 - mean_squared_error: 1171.6971
Epoch 5/25
[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1115.5668 - mean_squared_error: 1115.5668
Epoch 6/25
[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 1192.6885 - mean_squared_error: 1192.6885
Epoch 7/25
[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1291.5103 - mean_squared_error: 1291.5103
Epoch 8/25
[1m258/258[0m [32m━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7acc62c3f910>

In [None]:
# Predict prices for the scaled test features with the trained network.
y_pred_nn = model.predict(x_test_scaled)

[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


In [None]:
# Score the neural network on the held-out test split.
print("MSE is:", mean_squared_error(y_true=y_test, y_pred=y_pred_nn))
print("R2-score is:", r2_score(y_true=y_test, y_pred=y_pred_nn))

MSE is: 1883.1218970236057
R2-score is: 0.8482853889602627


In [None]:
import pickle

# Serialise the fitted scikit-learn pipeline to model.pkl.
# `pipe` was last assigned to the Ridge pipeline, so that is the object written
# here; the Keras `model` is not saved by this cell.
with open("model.pkl","wb") as file:
  pickle.dump(pipe, file)