In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Load the housing dataset. The relative path means Housing.csv is looked up
# in the notebook's current working directory.
df = pd.read_csv("Housing.csv")

print("--- DataFrame Head ---")
# to_string(index=False) renders the preview as plain text without the
# row-index column.
print(df.head().to_string(index=False))

print("\n--- DataFrame Info ---")
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it must not be wrapped in print() -- doing so appends a stray "None"
# line after the summary.
df.info()

--- DataFrame Head ---
   price  area  bedrooms  bathrooms  stories mainroad guestroom basement hotwaterheating airconditioning  parking prefarea furnishingstatus
13300000  7420         4          2        3      yes        no       no              no             yes        2      yes        furnished
12250000  8960         4          4        4      yes        no       no              no             yes        3       no        furnished
12250000  9960         3          2        2      yes        no      yes              no              no        2      yes   semi-furnished
12215000  7500         4          2        2      yes        no      yes              no             yes        3      yes        furnished
11410000  7420         4          1        2      yes       yes      yes              no             yes        2       no        furnished

--- DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Col

In [4]:
# Step 1 - Preprocessing.
# get_dummies one-hot encodes the non-numeric columns; drop_first=True omits
# the first level of each encoded column so the resulting dummy columns are
# not perfectly collinear with one another.
df_processed = pd.get_dummies(data=df, drop_first=True)

# Step 2 - Target / feature split: 'price' is the value to predict and every
# remaining column is used as a predictor.
y = df_processed['price']
X = df_processed.drop(columns=['price'])

# Report the dimensions of the encoded feature matrix.
print(
    f"Shape of feature matrix X after preprocessing: {X.shape}",
    f"Number of features used: {X.shape[1]}",
    sep="\n",
)

Shape of feature matrix X after preprocessing: (545, 13)
Number of features used: 13


In [5]:
# Step 3 - Hold out 20% of the rows for testing. The fixed random_state makes
# the shuffle, and therefore the split, repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 4 - Fit a linear regression on the training rows. fit() returns the
# estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)

# Summarise the split sizes and show the first few learned coefficients.
print(
    "--- Train/Test Split ---",
    f"Model trained successfully on {X_train.shape[0]} samples.",
    f"Test set size: {X_test.shape[0]} samples.",
    "\n--- Model Training ---",
    "Linear Regression model (model) is now trained.",
    f"Model Coefficients (first 5): {model.coef_[:5]}",
    sep="\n",
)

--- Train/Test Split ---
Model trained successfully on 436 samples.
Test set size: 109 samples.

--- Model Training ---
Linear Regression model (model) is now trained.
Model Coefficients (first 5): [2.35968805e+02 7.67787016e+04 1.09444479e+06 4.07476595e+05
 2.24841913e+05]


In [6]:
# Step 5 - Evaluation: predict prices for the held-out rows.
y_pred = model.predict(X_test)

# Error metrics comparing the held-out targets with the predictions.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# Taking the square root puts the squared error back on the scale of price.
rmse = np.sqrt(mse)
# For a regressor, score() returns the coefficient of determination (R^2).
r_squared = model.score(X_test, y_test)

# Emit the report as a single newline-joined block.
report_lines = [
    "--- Model Performance Metrics ---",
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"Mean Absolute Error (MAE): {mae:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
    f"R-squared (R^2) Score: {r_squared:.4f}",
]
print("\n".join(report_lines))

--- Model Performance Metrics ---
Mean Squared Error (MSE): 1754318687330.66
Mean Absolute Error (MAE): 970043.40
Root Mean Squared Error (RMSE): 1324506.96
R-squared (R^2) Score: 0.6529
