In [18]:
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score
import kagglehub
import os
import pandas as pd
import numpy as np
import pickle

In [19]:
def convert_range(x):
    """Convert a raw square-footage cell to a float.

    Parameters
    ----------
    x : str or number
        Either a plain number (``"1200"``, ``1200``) or a ``"low - high"``
        range, which is replaced by the midpoint of its two bounds.

    Returns
    -------
    float or None
        The parsed value, or ``None`` when the cell cannot be interpreted as
        a number or a two-sided numeric range (e.g. ``"34.46Sq. Meter"``).
    """
    if not isinstance(x, str):
        # Non-string cells (NaN floats, ints, None) have no .split(); convert
        # them directly instead of crashing with AttributeError.
        try:
            return float(x)
        except (TypeError, ValueError):
            return None

    parts = x.split('-')
    if len(parts) == 2:
        try:
            return (float(parts[0]) + float(parts[1])) / 2
        except ValueError:
            # Not a numeric "low - high" range; fall through and try the
            # whole string as a single number.
            pass
    try:
        return float(x)
    except ValueError:
        return None

In [20]:
def remove_outliers_sqft(df):
    """Drop rows whose price_per_sqft is far from their location's mean.

    For every ``location`` group, only rows with ``price_per_sqft`` in the
    half-open interval ``(mean - std, mean + std]`` are kept. Because the
    lower bound is exclusive, a group with zero spread (e.g. a single row)
    is removed entirely.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location`` and ``price_per_sqft`` columns.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, grouped by location, with a fresh 0..n-1 index.
        If ``df`` yields no groups, an empty frame with the same columns is
        returned.
    """
    kept_frames = []
    for _, subf in df.groupby('location'):
        pps = subf.price_per_sqft
        m = np.mean(pps)
        st = np.std(pps)  # NumPy's default ddof=0, i.e. population std
        kept_frames.append(subf[(pps > (m - st)) & (pps <= (m + st))])

    if not kept_frames:
        # pd.concat([]) raises; keep the schema so later steps still work.
        return df.iloc[0:0].reset_index(drop=True)

    # Concatenate once instead of re-copying the accumulated frame per group.
    return pd.concat(kept_frames, ignore_index=True)

In [21]:
def bhk_outlier_remover(df):
    """Drop rows priced below the mean of the next-smaller bhk in the same location.

    Within each ``location``, a row with ``bhk == n`` is removed when its
    ``price_per_sqft`` is lower than the mean ``price_per_sqft`` of the
    ``bhk == n - 1`` rows of that location, provided that the ``n - 1``
    group has more than 5 rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location``, ``bhk`` and ``price_per_sqft`` columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` without the excluded rows; the original index
        labels of the remaining rows are preserved.
    """
    # Collect the original index labels as-is; routing them through a float
    # NumPy array (np.array([]) + np.append) would coerce them to float64.
    exclude_labels = []
    for _, location_df in df.groupby('location'):
        # Materialise the per-bhk groups once and reuse them for both passes.
        bhk_groups = list(location_df.groupby('bhk'))
        bhk_stats = {
            bhk: {
                'mean': np.mean(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0],
            }
            for bhk, bhk_df in bhk_groups
        }
        for bhk, bhk_df in bhk_groups:
            stats = bhk_stats.get(bhk - 1)
            if stats and stats['count'] > 5:
                below_smaller_mean = bhk_df.price_per_sqft < stats['mean']
                exclude_labels.extend(bhk_df.index[below_smaller_mean])
    return df.drop(index=exclude_labels)

In [22]:
# Download the Kaggle dataset and locate its CSV.
path = kagglehub.dataset_download("amitabhajoy/bengaluru-house-price-data")
files = os.listdir(path)
print("Files in the dataset directory:", files)
# os.listdir() order is arbitrary and the directory may hold non-CSV files,
# so select the CSV explicitly instead of trusting files[0].
csv_files = sorted(f for f in files if f.lower().endswith('.csv'))
if not csv_files:
    raise FileNotFoundError(f"No CSV file found in dataset directory: {path}")
file_path = os.path.join(path, csv_files[0])
df = pd.read_csv(file_path)

# Drop columns that are not used as features.
df = df.drop(columns=['area_type', 'availability', 'society', 'balcony'])

# Fill missing values.
# NOTE(review): 'Sarjapur Road' is presumably the most frequent location - confirm.
df['location'] = df['location'].fillna('Sarjapur Road')
df['bath'] = df['bath'].fillna(df['bath'].median())

# 'size' is split on whitespace and its first token is the bedroom count;
# missing sizes become bhk == 0.
df['bhk'] = df['size'].fillna('0').str.split().str.get(0).astype(int)
df['total_sqft'] = df['total_sqft'].apply(convert_range)
# NOTE(review): the 100000 factor presumably converts price from lakhs - confirm.
df['price_per_sqft'] = df['price'] * 100000 / df['total_sqft']

# Normalise location names, then bucket locations with <= 10 listings as 'other'.
df['location'] = df['location'].apply(lambda x: x.strip())
location_count = df['location'].value_counts()
location_count_less_10 = location_count[location_count <= 10]
rare_locations = set(location_count_less_10.index)
df['location'] = df['location'].apply(lambda x: 'other' if x in rare_locations else x)

# Require at least 300 sqft per bedroom. Rows with unparseable total_sqft
# (NaN) fail this comparison and are dropped.
# NOTE(review): rows with bhk == 0 divide by zero and may pass this filter as
# inf - confirm whether they should be removed.
df = df[((df['total_sqft'] / df['bhk']) >= 300)]
df = remove_outliers_sqft(df)
df = bhk_outlier_remover(df)

# The helper columns are no longer needed once outliers are removed.
df = df.drop(columns=['size', 'price_per_sqft'])
# Default to_csv also writes the index as a leading unnamed column.
df.to_csv('Cleaned_data.csv')

Files in the dataset directory: ['Bengaluru_House_Data.csv']


In [23]:
# Features are every remaining column except the target 'price'.
X = df.drop(columns=['price'])
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


def fit_and_score(model):
    """Fit a fresh preprocessing + ``model`` pipeline and score it on the test split.

    A new encoder and scaler are built on every call so that the returned
    pipelines do not share (and silently re-fit) each other's transformers.

    Returns
    -------
    tuple
        ``(fitted_pipeline, test_predictions, test_r2)``.
    """
    preprocess = make_column_transformer(
        # handle_unknown='ignore': a location that only appears in the test
        # split is encoded as all zeros instead of raising at predict time.
        (OneHotEncoder(sparse_output=False, handle_unknown='ignore'), ['location']),
        remainder='passthrough')
    pipeline = make_pipeline(preprocess, StandardScaler(), model)
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    return pipeline, predictions, r2_score(y_test, predictions)


# Plain linear regression. The pipeline already standardises the features,
# so the estimator is not wrapped in a second StandardScaler.
lr = LinearRegression()
_, y_pred_lr, Linear_regression_r2_score = fit_and_score(lr)

# applying lasso
lasso = Lasso()
_, y_pred_lasso, lasso_r2_score = fit_and_score(lasso)

# applying Ridge Regression
ridge = Ridge()
pipe, y_pred_ridge, ridge_r2_score = fit_and_score(ridge)

# Keep the names the original cell exposed, bound to the ridge pipeline's
# fitted steps (`pipe` is the last pipeline fitted, as before).
column_trans = pipe.steps[0][1]
scaler = pipe.steps[1][1]

print(ridge_r2_score)
print(lasso_r2_score)
print(Linear_regression_r2_score)

0.8024032565605995
0.7927494848228667
0.8023814311022412
