In [87]:
import numpy as np
import pandas as pd
import os
from pathlib import Path
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_validate, ShuffleSplit
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.utils import shuffle
from sklearn.dummy import DummyRegressor
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix,roc_curve, roc_auc_score,log_loss, classification_report,r2_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

In [4]:
dataset= pd.read_csv('/content/bengaluru_house_prices.csv')
dataset.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [5]:
print("Rows, Columns:", dataset.shape)
print("\nColumns:", dataset.columns.tolist())
print("\nMissing Values:\n", dataset.isnull().sum())

Rows, Columns: (13320, 9)

Columns: ['area_type', 'availability', 'location', 'size', 'society', 'total_sqft', 'bath', 'balcony', 'price']

Missing Values:
 area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64


In [10]:
dataset['bath'] = dataset['bath'].fillna(dataset['bath'].mode()[0])
dataset['balcony'] = dataset['balcony'].fillna(dataset['balcony'].mode()[0])
dataset['size'] = dataset['size'].fillna(dataset['size'].mode()[0])
dataset.dropna(subset=['location'], inplace=True)
dataset['BHK'] = dataset['size'].apply(lambda x: int(str(x).split(' ')[0]))

def convert_sqft(x):
    if '-' in str(x):
        nums = list(map(float, x.split('-')))
        return (nums[0] + nums[1]) / 2
    try:
        return float(x)
    except:
        return None

dataset['total_sqft'] = dataset['total_sqft'].apply(convert_sqft)
dataset.dropna(subset=['total_sqft'], inplace=True)

location_counts = dataset['location'].value_counts()
rare_locations = location_counts[location_counts <= 10].index
dataset['location'] = dataset['location'].apply(lambda x: 'other' if x in rare_locations else x)


X = dataset.drop(['price','size'], axis=1)
y = dataset['price']
train_features, test_features,train_labels,test_labels = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

dataset = dataset.copy()

dataset.columns = dataset.columns.str.strip().str.lower().str.replace(" ", "_")

print("Duplicates before:", dataset.duplicated().sum())
dataset = dataset.drop_duplicates()
print("Duplicates after:", dataset.duplicated().sum())

dataset['bath'] = dataset['bath'].fillna(dataset['bath'].median())
dataset['balcony'] = dataset['balcony'].fillna(dataset['balcony'].median())
dataset['size'] = dataset['size'].fillna(dataset['size'].mode()[0])
dataset['location'] = dataset['location'].fillna(dataset['location'].mode()[0])

if 'society' in dataset.columns:
    dataset = dataset.drop(columns=['society'])

scaler = MinMaxScaler()
dataset[['total_sqft', 'price']] = scaler.fit_transform(dataset[['total_sqft', 'price']])

Q1 = dataset['total_sqft'].quantile(0.25)
Q3 = dataset['total_sqft'].quantile(0.75)
IQR = Q3 - Q1
dataset = dataset[(dataset['total_sqft'] >= Q1 - 1.5 * IQR) & (dataset['total_sqft'] <= Q3 + 1.5 * IQR)]

print("Final shape:", dataset.shape)


Duplicates before: 0
Duplicates after: 0
Final shape: (10784, 11)


In [25]:
correlation_matrix = dataset.corr()
price_correlations = correlation_matrix['price'].sort_values(ascending=False)

print(price_correlations)

price                                1.000000
bath                                 0.521794
bhk                                  0.515053
total_sqft                           0.467405
area_type_plot__area                 0.388046
                                       ...   
location_Kengeri                    -0.046974
location_Electronic City Phase II   -0.071921
location_Electronic City            -0.075926
location_Chandapura                 -0.076386
area_type_super_built-up__area      -0.232569
Name: price, Length: 327, dtype: float64


In [28]:
dataset.shape

(10784, 327)

In [37]:
x = dataset[['total_sqft']]
y = dataset['price']
x,y= shuffle(x,y,random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [53]:
print(dataset.columns)

Index(['total_sqft', 'bath', 'balcony', 'price', 'bhk',
       'area_type_carpet__area', 'area_type_plot__area',
       'area_type_super_built-up__area', 'availability_14-Nov',
       'availability_15-Aug',
       ...
       'location_Vishveshwarya Layout', 'location_Vishwapriya Layout',
       'location_Vittasandra', 'location_Whitefield',
       'location_Yelachenahalli', 'location_Yelahanka',
       'location_Yelahanka New Town', 'location_Yelenahalli',
       'location_Yeshwanthpur', 'location_other'],
      dtype='object', length=327)


In [54]:
feature_cols = [col for col in dataset.columns if col not in ['price', 'size']]

X = dataset[feature_cols]
y = dataset['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R²): {r2:.2f}")

Mean Squared Error (MSE): 0.00
R-squared (R²): 0.55


In [55]:
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R²): {r2:.2f}")

print(f"Model Intercept: {model.intercept_:.2f}")
print(f"Model Coefficient (total_sqft): {model.coef_[0]:.2f}")

Mean Squared Error (MSE): 0.00
R-squared (R²): 0.55
Model Intercept: -0.07
Model Coefficient (total_sqft): 0.10


In [64]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
X = dataset[['total_sqft']]
y = dataset['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

mae = mean_absolute_error(y_test, y_pred_test)
mse = mean_squared_error(y_test, y_pred_test)
rmse = np.sqrt(mse)

print("Simple Linear Regression (Single Variable) Results")
print(f"Train R-squared (R²): {r2_train:.2f}")
print(f"Test R-squared (R²): {r2_test:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

Simple Linear Regression (Single Variable) Results
Train R-squared (R²): 0.22
Test R-squared (R²): 0.23
Mean Absolute Error (MAE): 0.0259
Mean Squared Error (MSE): 0.0016
Root Mean Squared Error (RMSE): 0.0405


In [65]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

feature_cols = [col for col in dataset.columns if col not in ['price', 'size']]

X = dataset[feature_cols]
y = dataset['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

mae = mean_absolute_error(y_test, y_pred_test)
mse = mean_squared_error(y_test, y_pred_test)
rmse = np.sqrt(mse)

print("Multiple Linear Regression Results")
print(f"Train R-squared (R²): {r2_train:.2f}")
print(f"Test R-squared (R²): {r2_test:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

Multiple Linear Regression Results
Train R-squared (R²): 0.56
Test R-squared (R²): 0.55
Mean Absolute Error (MAE): 0.0197
Mean Squared Error (MSE): 0.0010
Root Mean Squared Error (RMSE): 0.0310


In [67]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

X = dataset[['total_sqft']]
y = dataset['price']

poly = PolynomialFeatures(degree=2)
X_poly_features = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_poly_features, y, test_size=0.2, random_state=42)

poly_model = LinearRegression()
poly_model.fit(X_train, y_train)

y_pred_train = poly_model.predict(X_train)
y_pred_test = poly_model.predict(X_test)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

mae = mean_absolute_error(y_test, y_pred_test)
mse = mean_squared_error(y_test, y_pred_test)
rmse = np.sqrt(mse)

print("Polynomial Regression Results (Degree 2)")
print(f"Train R-squared (R²): {r2_train:.2f}")
print(f"Test R-squared (R²): {r2_test:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

Polynomial Regression Results (Degree 2)
Train R-squared (R²): 0.23
Test R-squared (R²): 0.25
Mean Absolute Error (MAE): 0.0253
Mean Squared Error (MSE): 0.0016
Root Mean Squared Error (RMSE): 0.0401


In [70]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

feature_cols = [col for col in dataset.columns if col not in ['price', 'size']]

X = dataset[feature_cols]
y = dataset['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

coefficients = model.coef_
feature_names = X.columns

coeff_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

coeff_df['Magnitude'] = coeff_df['Coefficient'].abs()
coeff_df = coeff_df.sort_values(by='Magnitude', ascending=False)

print("Top 10 Coefficients by Magnitude:")
print(coeff_df.head(10))

print("\nModel Intercept:", model.intercept_)

Top 10 Coefficients by Magnitude:
                            Feature  Coefficient  Magnitude
175          location_HAL 2nd Stage     0.146316   0.146316
0                        total_sqft     0.104218   0.104218
117  location_Banashankari Stage II     0.100680   0.100680
15              availability_16-Jul     0.092692   0.092692
280           location_Rajaji Nagar     0.092689   0.092689
125            location_Basavangudi     0.086331   0.086331
131            location_Benson Town     0.083078   0.083078
151             location_Cooke Town     0.078602   0.078602
90    location_2nd Stage Nagarbhavi     0.075026   0.075026
198           location_Indira Nagar     0.074141   0.074141

Model Intercept: -0.06651263551820079


In [88]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.stattools import durbin_watson
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from scipy import stats


feature_cols = [col for col in dataset.columns if col not in ['price', 'size', 'bhk']]

X = dataset[feature_cols]
y = dataset['price']

y_log = np.log1p(y)

X_train, X_test, y_log_train, y_log_test = train_test_split(X, y_log, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_log_train)

X_numeric = X.select_dtypes(include=np.number)

vif = pd.DataFrame()
vif['Features'] = X_numeric.columns
vif['VIF'] = [variance_inflation_factor(X_numeric.values, i) for i in range(X_numeric.shape[1])]
vif = vif.sort_values(by='VIF', ascending=False)
print("VIF Results:")
print(vif.head(5))

model_fit = sm.OLS(y_log, sm.add_constant(X_numeric)).fit()
residuals = model_fit.resid

dw_stat = durbin_watson(residuals)
print(f"\nDurbin-Watson Statistic: {dw_stat:.2f}")

shapiro_test = stats.shapiro(residuals)
print("Shapiro-Wilk Test (Normality of Residuals):")
print(f"p-value: {shapiro_test.pvalue:.4f}")

bp_test = sm.stats.diagnostic.het_breuschpagan(residuals, model_fit.model.exog)
print("Breusch-Pagan Test (Constant Variance):")
print(f"p-value: {bp_test[1]:.4f}")

VIF Results:
     Features       VIF
0  total_sqft  6.765297
1        bath  5.741831
2     balcony  4.948702

Durbin-Watson Statistic: 1.99
Shapiro-Wilk Test (Normality of Residuals):
p-value: 0.0000
Breusch-Pagan Test (Constant Variance):
p-value: 0.0000


  res = hypotest_fun_out(*samples, **kwds)


In [84]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

X = dataset[['total_sqft']]
y = dataset['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

degrees = range(1, 5)
cv_scores = []
best_r2 = -np.inf
best_degree = 0

for degree in degrees:
    poly_features = PolynomialFeatures(degree=degree)
    X_poly_train = poly_features.fit_transform(X_train)

    model = LinearRegression()

    scores = cross_val_score(model, X_poly_train, y_train, cv=5, scoring='r2')

    mean_score = np.mean(scores)
    cv_scores.append(mean_score)

    if mean_score > best_r2:
        best_r2 = mean_score
        best_degree = degree

print(f"Best Degree: {best_degree}")
print(f"Best CV R²: {best_r2:.2f}")

Best Degree: 2
Best CV R²: 0.23


In [85]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import r2_score

ridge_model = Ridge()
ridge_params = {'alpha': np.logspace(-4, 2, 100)}
ridge_grid_search = GridSearchCV(ridge_model, ridge_params, cv=5, scoring='r2')
ridge_grid_search.fit(X_train, y_train)

best_ridge_alpha = ridge_grid_search.best_params_['alpha']
ridge_best_r2 = ridge_grid_search.best_score_
print(f"Best Ridge Alpha: {best_ridge_alpha:.4f}")
print(f"Best Ridge R² (CV): {ridge_best_r2:.4f}")

ridge_test_r2 = r2_score(y_test, ridge_grid_search.best_estimator_.predict(X_test))
print(f"Ridge R² (Test): {ridge_test_r2:.4f}")

lasso_model = Lasso(max_iter=10000)
lasso_params = {'alpha': np.logspace(-4, 2, 100)}
lasso_grid_search = GridSearchCV(lasso_model, lasso_params, cv=5, scoring='r2')
lasso_grid_search.fit(X_train, y_train)

best_lasso_alpha = lasso_grid_search.best_params_['alpha']
lasso_best_r2 = lasso_grid_search.best_score_
lasso_coefs = np.sum(lasso_grid_search.best_estimator_.coef_ != 0)

print(f"Best Lasso Alpha: {best_lasso_alpha:.4f}")
print(f"Best Lasso R² (CV): {lasso_best_r2:.4f}")
print(f"Number of non-zero coefficients: {lasso_coefs}")

lasso_test_r2 = r2_score(y_test, lasso_grid_search.best_estimator_.predict(X_test))
print(f"Lasso R² (Test): {lasso_test_r2:.4f}")

Best Ridge Alpha: 2.3101
Best Ridge R² (CV): 0.2176
Ridge R² (Test): 0.2323
Best Lasso Alpha: 0.0001
Best Lasso R² (CV): 0.2176
Number of non-zero coefficients: 1
Lasso R² (Test): 0.2325


In [93]:
x = dataset[['total_sqft', 'bath', 'balcony']]
y = dataset['price']

poly_features = PolynomialFeatures(degree=2, include_bias=False)
lin_reg = LinearRegression()

pipeline = Pipeline([
    ('poly_features', poly_features),
    ('std_scaler', StandardScaler()),
    ('lin_reg', lin_reg)
])

cv_folds = 5

r2_scores = cross_val_score(pipeline, X, y, scoring='r2', cv=cv_folds)
neg_mse_scores = cross_val_score(pipeline, X, y, scoring='neg_mean_squared_error', cv=cv_folds)

r2_mean = np.mean(r2_scores)
r2_std = np.std(r2_scores)

rmse_scores = np.sqrt(-neg_mse_scores)
rmse_mean = np.mean(rmse_scores)
rmse_std = np.std(rmse_scores)

print(f'R² (CV mean): {r2_mean}')
print(f'R² (CV std): {r2_std}')
print(f'RMSE (CV mean): {rmse_mean}')
print(f'RMSE (CV std): {rmse_std}')

R² (CV mean): 0.39471887824318197
R² (CV std): 0.03450025235704536
RMSE (CV mean): 0.03869569664653476
RMSE (CV std): 0.0034725253515515975


In [96]:
X = dataset.drop('price', axis=1)
y = dataset['price']

print("Shape of the new encoded feature set:", X.shape)
print("The first 5 rows of the encoded data:")
print(X.head())

Shape of the new encoded feature set: (10784, 326)
The first 5 rows of the encoded data:
   total_sqft  bath  balcony  bhk  area_type_carpet__area  \
0    0.365340   2.0      1.0    2                   False   
2    0.590164   2.0      3.0    3                   False   
3    0.637588   3.0      1.0    3                   False   
4    0.449649   2.0      1.0    2                   False   
5    0.432084   2.0      1.0    2                   False   

   area_type_plot__area  area_type_super_built-up__area  availability_14-Nov  \
0                 False                            True                False   
2                 False                           False                False   
3                 False                            True                False   
4                 False                            True                False   
5                 False                            True                False   

   availability_15-Aug  availability_15-Dec  ...  \
0          

In [97]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model_full = LinearRegression()

model_full.fit(X_train, y_train)

y_pred_train = model_full.predict(X_train)
y_pred_test = model_full.predict(X_test)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

mae = mean_squared_error(y_test, y_pred_test)
mse = mean_squared_error(y_test, y_pred_test)
rmse = np.sqrt(mse)

print("Full Model Results")
print(f"Train R-squared (R²): {r2_train:.2f}")
print(f"Test R-squared (R²): {r2_test:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

Full Model Results
Train R-squared (R²): 0.56
Test R-squared (R²): 0.55
Mean Absolute Error (MAE): 0.0010
Mean Squared Error (MSE): 0.0010
Root Mean Squared Error (RMSE): 0.0310
