In [1]:
import pandas as pd

In [3]:
# Read the housing dataset (expected next to the notebook) and preview
# the first five rows as plain text.
df = pd.read_csv(filepath_or_buffer='Housing.csv')
print(df.head(n=5))

      price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no       no   
1  12250000  8960         4          4        4      yes        no       no   
2  12250000  9960         3          2        2      yes        no      yes   
3  12215000  7500         4          2        2      yes        no      yes   
4  11410000  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        2      yes        furnished  
1              no             yes        3       no        furnished  
2              no              no        2      yes   semi-furnished  
3              no             yes        3      yes        furnished  
4              no             yes        2       no        furnished  


In [4]:
from sklearn.preprocessing import LabelEncoder

# String-valued columns that have to become numeric before model fitting.
categorical_cols = [
    'mainroad',
    'guestroom',
    'basement',
    'hotwaterheating',
    'airconditioning',
    'prefarea',
    'furnishingstatus',
]

# Encode on a copy so the raw frame `df` keeps its original string labels.
df_encoded = df.copy()

# One independent encoder per column, kept by column name so the integer
# codes can later be mapped back to their labels via inverse_transform.
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, le in label_encoders.items():
    df_encoded[col] = le.fit_transform(df_encoded[col])

print(df_encoded.head())

      price  area  bedrooms  bathrooms  stories  mainroad  guestroom  \
0  13300000  7420         4          2        3         1          0   
1  12250000  8960         4          4        4         1          0   
2  12250000  9960         3          2        2         1          0   
3  12215000  7500         4          2        2         1          0   
4  11410000  7420         4          1        2         1          1   

   basement  hotwaterheating  airconditioning  parking  prefarea  \
0         0                0                1        2         1   
1         0                0                1        3         0   
2         1                0                0        2         1   
3         1                0                1        3         1   
4         1                0                1        2         0   

   furnishingstatus  
0                 0  
1                 0  
2                 1  
3                 0  
4                 0  


In [5]:
# Target is the sale price; every remaining column is used as a feature.
y = df_encoded['price']
x = df_encoded.drop('price', axis=1)

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore every metric below) reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Reference model: fit scikit-learn's LinearRegression on the training split
# (fit() returns the estimator itself) and predict on the held-out rows.
lin_reg = LinearRegression().fit(x_train, y_train)
y_pred_sklearn = lin_reg.predict(x_test)

# Hold-out metrics. SSE is the un-averaged sum of squared residuals, so it
# equals MSE multiplied by the number of test rows.
residuals_sklearn = y_test - y_pred_sklearn
sse_sklearn = (residuals_sklearn ** 2).sum()
mse_sklearn = mean_squared_error(y_test, y_pred_sklearn)
r2_sklearn = r2_score(y_test, y_pred_sklearn)

print(f"Sklearn Linear Regression:\nMSE: {mse_sklearn}\nSSE: {sse_sklearn}\nR²: {r2_sklearn}")

Sklearn Linear Regression:
MSE: 1771751116594.0344
SSE: 193120871708749.75
R²: 0.6494754192267804


 ## Implement Linear Regression Manually (Normal Equation)

In [10]:
import numpy as np

# Prepend a column of ones to each design matrix so that the first entry of
# the coefficient vector acts as the intercept term.
x_train_bias = np.hstack((np.ones((len(x_train), 1)), x_train))
x_test_bias = np.hstack((np.ones((len(x_test), 1)), x_test))

In [12]:
# Normal equation: theta satisfies (X^T X) theta = X^T y.
# Solve that linear system directly instead of forming the explicit inverse
# of X^T X: np.linalg.solve uses a factorization, which is cheaper and loses
# less precision when the Gram matrix is poorly conditioned (the features
# here mix very different scales, e.g. area vs. 0/1 flags).
# Like np.linalg.inv, this raises LinAlgError if X^T X is singular.
gram_matrix = x_train_bias.T @ x_train_bias    # X^T X
moment_vector = x_train_bias.T @ y_train       # X^T y
theta = np.linalg.solve(gram_matrix, moment_vector)

In [13]:
# Predictions of the hand-rolled model: bias-augmented test matrix times the
# fitted coefficient vector (intercept first).
y_pred_manual = np.matmul(x_test_bias, theta)

In [14]:
# Same hold-out metrics as for the scikit-learn model, so the two fits can be
# compared side by side; they should agree up to floating-point round-off.
residuals_manual = y_test - y_pred_manual
sse_manual = (residuals_manual ** 2).sum()
mse_manual = mean_squared_error(y_test, y_pred_manual)
r2_manual = r2_score(y_test, y_pred_manual)

print(f"Manual Linear Regression:\nMSE: {mse_manual}\nSSE: {sse_manual}\nR²: {r2_manual}")

Manual Linear Regression:
MSE: 1771751116594.0317
SSE: 193120871708749.47
R²: 0.649475419226781
