In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv('bengaluru_house_prices.csv')

In [4]:
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [5]:
df.shape

(13320, 9)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [7]:
df.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [8]:

df = df.drop(['area_type', 'availability', 'society'], axis=1, errors='ignore')
df.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,3.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0
4,Kothanur,2 BHK,1200,2.0,1.0,51.0


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13319 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   balcony     12711 non-null  float64
 5   price       13320 non-null  float64
dtypes: float64(3), object(3)
memory usage: 624.5+ KB


In [10]:
df = df.dropna()
df.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
balcony       0
price         0
dtype: int64

In [11]:
df.shape

(12710, 6)

In [12]:
df['bhk'] = df['size'].apply(lambda x: int(x.split()[0]))
df.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,3.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,1.0,51.0,2


In [13]:
def convert_sqft(x):
    try:
        x = str(x)
        if "-" in x:
            a, b = x.split("-")
            return (float(a) + float(b)) / 2
        return float(x)
    except:
        return None

df['total_sqft'] = df['total_sqft'].apply(convert_sqft)
df = df.dropna(subset=['total_sqft'])


In [14]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,1.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,3.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,3.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,1.0,95.0,3
4,Kothanur,2 BHK,1200.0,2.0,1.0,51.0,2


In [15]:
df['price_per_sqft'] = (df['price']*100000) / df['total_sqft']

In [16]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,1.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,3.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,3.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,1.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,1.0,51.0,2,4250.0


In [17]:
df['location'] = df['location'].str.strip()
location_counts = df['location'].value_counts()
rare_locations = location_counts[location_counts <= 10].index
df['location'] = df['location'].apply(lambda x: 'other' if x in rare_locations else x)


In [18]:
df['location'].unique

<bound method Series.unique of 0        Electronic City Phase II
1                Chikka Tirupathi
2                     Uttarahalli
3              Lingadheeranahalli
4                        Kothanur
                   ...           
13314           Green Glen Layout
13315                  Whitefield
13317       Raja Rajeshwari Nagar
13318             Padmanabhanagar
13319                Doddathoguru
Name: location, Length: 12668, dtype: object>

In [19]:
#removing outliers
df = df[df['total_sqft'] / df['bhk'] >= 300]
df.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,1.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,3.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,3.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,1.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,1.0,51.0,2,4250.0


In [20]:
def remove_pps_outliers(df):
    df_out = pd.DataFrame()
    for key, subdf in df.groupby('location'):
        q1 = subdf['price_per_sqft'].quantile(0.25)
        q3 = subdf['price_per_sqft'].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        filtered = subdf[(subdf['price_per_sqft'] >= lower) & (subdf['price_per_sqft'] <= upper)]
        df_out = pd.concat([df_out, filtered], ignore_index=True)
    return df_out

df = remove_pps_outliers(df)

In [24]:
X = df[['location', 'total_sqft', 'bath', 'balcony', 'bhk']]
y = df['price']


In [25]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(X_train.shape, X_test.shape)


(8940, 5) (2235, 5)


In [26]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# Categorical and numeric columns
categorical_cols = ['location']
numeric_cols = ['total_sqft', 'bath', 'balcony', 'bhk']

# Preprocessor: OneHotEncode for location
preprocessor = ColumnTransformer(
    transformers=[
        ('location_encoder', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num_features', 'passthrough', numeric_cols)
    ]
)

# Model
model = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('linear_model', LinearRegression())
])

# Train the model
model.fit(X_train, y_train)


0,1,2
,steps,"[('preprocessing', ...), ('linear_model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('location_encoder', ...), ('num_features', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [27]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

y_pred = model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("RMSE:", rmse)
print("R² Score:", r2)


RMSE: 60.45996941132958
R² Score: 0.38359330516665535


In [28]:
# cross validation
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(model, X, y, cv=5, scoring='r2')
print("Cross-validation scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())


Cross-validation scores: [ 0.39090171  0.58623445  0.40236077  0.56995485 -0.0278499 ]
Mean CV Score: 0.3843203744931834


In [29]:
#save the model
import pickle

with open("bangalore_house_price_model.pkl", "wb") as f:
    pickle.dump(model, f)


In [36]:
sample = pd.DataFrame({
    'location': ['Whitefield'],
    'total_sqft': [1200],
    'bath': [2],
    'balcony': [1],
    'bhk': [2]
})

final_price_pred_inlakhs = model.predict(sample)
print('final_price_pred_inlakhs =',final_price_pred_inlakhs)


final_price_pred_inlakhs = [64.32380993]
