In [14]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from google.colab import drive


In [15]:

# Load the dataset
data = pd.read_csv("Housing.csv")

In [16]:
# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(data.head())

First few rows of the dataset:
      price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no       no   
1  12250000  8960         4          4        4      yes        no       no   
2  12250000  9960         3          2        2      yes        no      yes   
3  12215000  7500         4          2        2      yes        no      yes   
4  11410000  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        2      yes        furnished  
1              no             yes        3       no        furnished  
2              no              no        2      yes   semi-furnished  
3              no             yes        3      yes        furnished  
4              no             yes        2       no        furnished  


In [17]:

# Step 1: Null Value Check and Handling
print("\nChecking for missing values:")
print(data.isnull().sum())  # Display number of missing values in each column


Checking for missing values:
price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64


In [18]:

# Impute missing values (median for numerical, mode for categorical)
data.fillna(data.median(numeric_only=True), inplace=True)  # Fill numerical columns with median
data.fillna(data.mode().iloc[0], inplace=True)  # Fill categorical columns with mode

In [19]:

print("\nAfter handling missing values:")
print(data.isnull().sum())  # Confirm there are no missing values


After handling missing values:
price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64


In [20]:
# Step 2: Transform Categorical Variables to Numerical
categorical_cols = data.select_dtypes(include=['object']).columns  # Identify categorical columns
print(f"\nCategorical Columns: {categorical_cols}")

le = LabelEncoder()
for col in categorical_cols:
    data[col] = le.fit_transform(data[col])  # Encode categorical variables

print("\nData after encoding categorical values:")
print(data.head())


Categorical Columns: Index(['mainroad', 'guestroom', 'basement', 'hotwaterheating',
       'airconditioning', 'prefarea', 'furnishingstatus'],
      dtype='object')

Data after encoding categorical values:
      price  area  bedrooms  bathrooms  stories  mainroad  guestroom  \
0  13300000  7420         4          2        3         1          0   
1  12250000  8960         4          4        4         1          0   
2  12250000  9960         3          2        2         1          0   
3  12215000  7500         4          2        2         1          0   
4  11410000  7420         4          1        2         1          1   

   basement  hotwaterheating  airconditioning  parking  prefarea  \
0         0                0                1        2         1   
1         0                0                1        3         0   
2         1                0                0        2         1   
3         1                0                1        3         1   
4         1         

In [21]:
# Step 3: Normalize Numerical Features using Min-Max Normalization
scaler = MinMaxScaler()
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

print("\nData after Min-Max Normalization:")
print(data.head())



Data after Min-Max Normalization:
      price      area  bedrooms  bathrooms   stories  mainroad  guestroom  \
0  1.000000  0.396564       0.6   0.333333  0.666667       1.0        0.0   
1  0.909091  0.502405       0.6   1.000000  1.000000       1.0        0.0   
2  0.909091  0.571134       0.4   0.333333  0.333333       1.0        0.0   
3  0.906061  0.402062       0.6   0.333333  0.333333       1.0        0.0   
4  0.836364  0.396564       0.6   0.000000  0.333333       1.0        1.0   

   basement  hotwaterheating  airconditioning   parking  prefarea  \
0       0.0              0.0              1.0  0.666667       1.0   
1       0.0              0.0              1.0  1.000000       0.0   
2       1.0              0.0              0.0  0.666667       1.0   
3       1.0              0.0              1.0  1.000000       1.0   
4       1.0              0.0              1.0  0.666667       0.0   

   furnishingstatus  
0               0.0  
1               0.0  
2               0.5  

In [22]:
# Step 4: Split Data into Training and Testing Sets
X = data.drop(columns=['price'])  # Features
y = data['price']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("\nTraining and Testing Data Shapes:")
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")



Training and Testing Data Shapes:
X_train: (436, 12), X_test: (109, 12), y_train: (436,), y_test: (109,)


In [23]:
# Step 5: Apply Regression Algorithms
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Support Vector Regressor (SVR)": SVR(kernel='rbf'),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "K-Nearest Neighbors (KNN)": KNeighborsRegressor(n_neighbors=5)
}
# Train and evaluate each model
best_model = None
best_r2 = -np.inf

print("\nModel Evaluation Results:")
for name, model in models.items():
    model.fit(X_train, y_train)  # Train the model
    y_pred = model.predict(X_test)  # Predict on the test set
    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"\n{name}:")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"R-squared (R2): {r2:.4f}")

    # Track the best model based on R2 score
    if r2 > best_r2:
        best_r2 = r2
        best_model = name

print(f"\nBest Model: {best_model} with R2 Score: {best_r2:.4f}")



Model Evaluation Results:

Linear Regression:
Mean Squared Error (MSE): 0.0133
R-squared (R2): 0.6495

Random Forest:
Mean Squared Error (MSE): 0.0147
R-squared (R2): 0.6133

Support Vector Regressor (SVR):
Mean Squared Error (MSE): 0.0132
R-squared (R2): 0.6524

Decision Tree:
Mean Squared Error (MSE): 0.0199
R-squared (R2): 0.4758

K-Nearest Neighbors (KNN):
Mean Squared Error (MSE): 0.0190
R-squared (R2): 0.4991

Best Model: Support Vector Regressor (SVR) with R2 Score: 0.6524


In [24]:
total_price = data['price'].sum()
print(f"\nThe total price from the dataset is: {total_price}")



The total price from the dataset is: 142.3478303030303
