**OPEN ENDED LAB**

PROJECT: House Price & Category Prediction

Build a model that predicts both the price (continuous) and the price category (discrete) of houses
based on features like area, number of rooms, and location.

Dataset: House Prices: Advanced Regression Techniques

- Integration:
  - Use Linear Regression for price prediction.
  - Use Random Forest or SVM to classify the house as Low, Medium, or High value.
  - Include preprocessing (feature scaling, outlier removal) and performance
    comparison (R², accuracy).

In [57]:
import io

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, classification_report


In [58]:
# Colab-specific upload helper; this cell only works inside a Google Colab runtime.
from google.colab import files
# Keep the return value of the upload call. The "Saving ... to ..." line in the
# cell output shows the name the file was actually stored under.
uploaded = files.upload()


Saving train.csv to train (2).csv


In [59]:
# Load the dataset from the bytes returned by the upload cell instead of the
# hard-coded '/content/train.csv'. When a file with that name already exists,
# Colab stores the new upload under a different name (e.g. 'train (2).csv'),
# so the fixed path would silently read the older copy.
if not uploaded:
    raise ValueError("No file was uploaded; re-run the upload cell and select train.csv.")

# files.upload() maps each file name to its raw bytes; use the first uploaded file.
csv_bytes = next(iter(uploaded.values()))
df = pd.read_csv(io.BytesIO(csv_bytes))
df.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


Using train.csv because it contains both the features and the target variable (SalePrice).


In [60]:
# Per-column overview (non-null counts and dtypes) to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [61]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [62]:
# 'Id' is only a row identifier, so it is removed before modelling.
# Reassigning (rather than inplace=True) with errors='ignore' keeps this cell
# idempotent: re-running it after 'Id' is already gone no longer raises KeyError.
df = df.drop(columns='Id', errors='ignore')


In [63]:
# Impute missing values in two passes. Order matters: the numeric pass runs
# first, and the per-column modes are computed on the already-filled frame.
# NOTE(review): both statistics come from the full dataset, i.e. before the
# train/test split that happens in a later cell.

# Pass 1 - numeric columns: fill gaps with the column median.
numeric_medians = df.median(numeric_only=True)
df = df.fillna(numeric_medians)

# Pass 2 - remaining (categorical) columns: fill gaps with the most frequent value.
most_frequent = df.mode().iloc[0]
df = df.fillna(most_frequent)


In [64]:
# One-hot encode the categorical columns. drop_first=True omits the first level
# of every categorical so the indicator columns are not perfectly collinear.
df = pd.get_dummies(data=df, drop_first=True)


In [65]:
# Regression target: the sale price itself.
y = df['SalePrice']

# Features: every other column. 'PriceCategory' is derived from the price in a
# later cell; errors='ignore' lets this cell run whether or not it exists yet.
X = df.drop(columns=['SalePrice', 'PriceCategory'], errors='ignore')

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)



Separating the features used to make predictions (X) from the value being predicted (y), then splitting both into training and test sets.

In [66]:
# Learn the scaling parameters from the training rows only, then apply the same
# transformation to both splits so no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


Scaling puts all features on the same scale (zero mean, unit variance), so that no feature dominates just because of its units.

# LINEAR REGRESSION MODEL



In [67]:
# Baseline model: ordinary least squares on the standardized features.
lr = LinearRegression().fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)

# Evaluate on the held-out rows: R² and RMSE (in the same units as SalePrice).
r2_lr = r2_score(y_test, y_pred_lr)
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))

print("R² score:", r2_lr)
print("RMSE:", rmse_lr)


R² score: 0.6556593157981869
RMSE: 51392.65875817199


In [68]:
# Price bands (right-inclusive, pd.cut's default):
#   (0, 130000] -> Low, (130000, 250000] -> Medium, (250000, inf) -> High
labels = ['Low', 'Medium', 'High']
bins = [0, 130000, 250000, np.inf]

# Derive the classification target from the continuous sale price.
price_category = pd.cut(df['SalePrice'], bins=bins, labels=labels)
df['PriceCategory'] = price_category

# Show how many houses fall into each band.
df['PriceCategory'].value_counts()


Unnamed: 0_level_0,count
PriceCategory,Unnamed: 1_level_1
Medium,867
Low,376
High,217


Converting the continuous sale prices into three categories (Low, Medium, High) for classification.

In [69]:
# Ridge (L2-regularized) regression to reduce the overfitting seen with plain
# linear regression. Ridge, r2_score and mean_squared_error come from the
# notebook's import cell rather than being re-imported here.
ridge = Ridge(alpha=10)
ridge.fit(X_train_scaled, y_train)

# Predict on both splits so the train and test fit can be compared side by side.
y_train_pred_ridge = ridge.predict(X_train_scaled)
y_test_pred_ridge = ridge.predict(X_test_scaled)

train_r2_ridge = r2_score(y_train, y_train_pred_ridge)
test_r2_ridge = r2_score(y_test, y_test_pred_ridge)
rmse_ridge = np.sqrt(mean_squared_error(y_test, y_test_pred_ridge))

# Plain string: this line has no placeholders, so it does not need to be an f-string.
print("Ridge Regression Results:")
print(f"Train R²: {train_r2_ridge:.3f}")
print(f"Test  R²: {test_r2_ridge:.3f}")
print(f"RMSE: {rmse_ridge:.2f}")


Ridge Regression Results:
Train R²: 0.930
Test  R²: 0.828
RMSE: 36339.91


In [70]:
# Sweep the regularization strength and report train/test R² for each value.
# After the loop, `ridge` refers to the model fitted with the last alpha.
# NOTE(review): the alphas are compared on the test split, so the best test R²
# seen here is an optimistic estimate; a validation split or cross-validation
# would give an unbiased choice.
for alpha in (0.1, 1, 5, 10, 20, 50):
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train_scaled, y_train)
    r2_train = ridge.score(X_train_scaled, y_train)
    r2_test = ridge.score(X_test_scaled, y_test)
    print(f"alpha={alpha} → Train R²={r2_train:.3f}, Test R²={r2_test:.3f}")


alpha=0.1 → Train R²=0.936, Test R²=0.661
alpha=1 → Train R²=0.936, Test R²=0.701
alpha=5 → Train R²=0.933, Test R²=0.788
alpha=10 → Train R²=0.930, Test R²=0.828
alpha=20 → Train R²=0.925, Test R²=0.855
alpha=50 → Train R²=0.918, Test R²=0.871


# RANDOM FOREST CLASSIFIER

In [71]:
# Classification target: the price band derived from SalePrice.
y_cls = df['PriceCategory']

# Features: everything except the price and the band computed from it, so the
# classifier cannot read the answer directly from its inputs.
X_cls = df.drop(columns=['SalePrice', 'PriceCategory'], errors='ignore')

In [72]:
# 80/20 split for the classifier, with the same seed as the regression split.
# NOTE(review): this rebinds X_train / X_test / y_train / y_test, so from here on
# those names hold the classification data, not the regression split.
X_train, X_test, y_train, y_test = train_test_split(
    X_cls,
    y_cls,
    random_state=42,
    test_size=0.2,
)

In [73]:
# Hyperparameters for the forest: depth and leaf-size limits constrain each tree,
# and the fixed seed makes the fit reproducible.
rf_params = {
    "n_estimators": 200,
    "max_depth": 10,
    "min_samples_split": 5,
    "min_samples_leaf": 2,
    "random_state": 42,
}

# Tree-based models are fitted on the unscaled features.
rf_tuned = RandomForestClassifier(**rf_params)
rf_tuned.fit(X_train, y_train)

# Predicted price band for every held-out house.
y_pred_rf = rf_tuned.predict(X_test)

In [74]:
# Accuracy on both splits; a large gap between them indicates overfitting.
y_pred_train_rf = rf_tuned.predict(X_train)
train_acc = accuracy_score(y_train, y_pred_train_rf)
test_acc = accuracy_score(y_test, y_pred_rf)

# Per-class precision / recall / F1 on the held-out rows.
report = classification_report(y_test, y_pred_rf)

print(f"Train Accuracy: {train_acc:.3f}")
print(f"Test Accuracy:  {test_acc:.3f}")
print("\nClassification Report:\n", report)

Train Accuracy: 0.963
Test Accuracy:  0.856

Classification Report:
               precision    recall  f1-score   support

        High       1.00      0.71      0.83        48
         Low       0.84      0.82      0.83        83
      Medium       0.84      0.92      0.88       161

    accuracy                           0.86       292
   macro avg       0.89      0.82      0.84       292
weighted avg       0.86      0.86      0.85       292

