Objective: Predict house prices based on the house size (in square feet) using linear regression.

In [1]:
import pandas as pd

# Load the single-feature house-price dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists - consider a configurable data directory.
df = pd.read_csv(
    "C:/Users/Prasad/Downloads/house_price_single_feature.csv"
)

# A bare name as the last expression makes the notebook render the frame.
df

Unnamed: 0,Size_sqft,Price
0,1360,192433
1,1794,260116
2,1630,248397
3,1595,221862
4,2138,324183
5,2669,397509
6,966,139441
7,1738,276620
8,830,131031
9,1982,278885


In [7]:
# Data cleaning
# Columns that actually describe a house. The frame also carries the CSV's
# "Unnamed: 0" column, which in the displayed rows is a running row counter;
# comparing on it as well would make every row unique and hide real duplicates.
DEDUP_COLUMNS = ['Size_sqft', 'Price']

# Only the last expression of a cell is displayed automatically, so the checks
# are printed explicitly instead of being left as bare expressions mid-cell.
print("Missing values per column:")
print(df.isnull().sum())
print("Duplicate rows:", df.duplicated(subset=DEDUP_COLUMNS).sum())

# If any duplicates are present, drop them (the first occurrence is kept).
df = df.drop_duplicates(subset=DEDUP_COLUMNS)

In [8]:
# Splitting the dataset into training and test subsets
from sklearn.model_selection import train_test_split

# Feature matrix: the list selector keeps it a one-column DataFrame.
X = df.loc[:, ['Size_sqft']]
# Target: the price column as a Series.
y = df.loc[:, 'Price']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,     # fraction of rows held out for testing
    random_state=42,   # fixed seed so the split is repeatable
)


In [None]:
# Approach 1: linear regression without feature scaling
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Model: fit() returns the estimator itself, so create and train in one step.
lr = LinearRegression().fit(X_train, y_train)

# Predictions on the held-out rows
y_pred = lr.predict(X_test)

# Evaluation: RMSE is derived from MSE so both describe the same errors.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Report every metric as "<label> <value>", one per line.
for label, value in (
    ("MAE:", mae),
    ("MSE:", mse),
    ("RMSE:", rmse),
    ("R2 Score:", r2),
):
    print(label, value)


MAE: 11514.675611422183
MSE: 149229257.46278942
RMSE: 12215.942757838604
R2 Score: 0.9887913482434763


In [None]:
# Approach 2
# Use a pipeline with grid search to check which polynomial degree gives the best model.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
# PolynomialFeatures was used below without ever being imported, which raises
# NameError when the notebook is run from a fresh kernel.
from sklearn.preprocessing import PolynomialFeatures

# Pipeline for polynomial regression: expand the feature, then fit a linear model.
pipeline = Pipeline([
    ('poly', PolynomialFeatures()),
    ('lr', LinearRegression()),
])

# "<step name>__<parameter>" addresses a parameter of a pipeline step.
params = {
    'poly__degree': [1, 2, 3, 4]
}

# Cross-validated search over the candidate degrees, using the training split only.
grid = GridSearchCV(pipeline, params)
grid.fit(X_train, y_train)

print('best degree', grid.best_params_)

# Score the winning pipeline on the held-out test split.
best_model = grid.best_estimator_
y_pred_best = best_model.predict(X_test)

print('r2 score after tuning', r2_score(y_test, y_pred_best))

best degree {'poly__degree': 1}
r2 score after tuning 0.9887913482434763


In [None]:
# Approach 3: using feature scaling
from sklearn.preprocessing import StandardScaler

# Standardise the size column.
# NOTE(review): the scaler is fitted on every row of df, i.e. before the
# train/test split performed afterwards, so its statistics include the
# rows that end up in the test set.
size_feature = df[['Size_sqft']]
scaler = StandardScaler()
X_scaled = scaler.fit(size_feature).transform(size_feature)


In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

y = df['Price']

# Split the raw feature first, then learn the scaling statistics from the
# training rows only, so that nothing about the test rows leaks into the
# preprocessing step. The row count and random_state are unchanged, so the
# rows assigned to each subset are the same as when splitting a pre-scaled array.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df[['Size_sqft']], y, test_size=0.2, random_state=42
)

scaler = StandardScaler().fit(X_train_raw)
# transform() returns NumPy arrays, the same type the pre-scaled split produced.
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)


In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Model: same estimator as before, now trained on the standardised feature.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = lr.predict(X_test)

# Evaluation: absolute error, squared error, its square root, and R^2.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Print each (label, value) pair on its own line.
metric_report = [
    ("MAE:", mae),
    ("MSE:", mse),
    ("RMSE:", rmse),
    ("R2 Score:", r2),
]
for pair in metric_report:
    print(*pair)


MAE: 11514.675611422188
MSE: 149229257.4627895
RMSE: 12215.942757838608
R2 Score: 0.9887913482434763
