#  House Price Prediction - Optimized

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings("ignore")


In [3]:

# Load dataset
# Raw string literal: the Windows path contains backslash sequences such as
# "\A" and "\d" that are not valid escapes. In a normal string they raise a
# DeprecationWarning/SyntaxWarning, and a folder starting with "n" or "t"
# would silently turn into a newline/tab.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable DATA_DIR relative to the notebook.
df = pd.read_csv(r"C:\AAA\datamites projects\PRCP-1020-HousePricePred\Data\data.csv")




In [5]:
# Preview the first rows to sanity-check the load (column names, obvious NaNs).
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:

# Keep only the columns that have at least 70% non-missing entries.
min_non_null = len(df) * 0.7
df = df.dropna(axis="columns", thresh=min_non_null)




In [7]:
# Fill missing values: median for numeric columns, mode for categorical columns
def handle_missing_values(df):
    """Impute missing values in ``df`` and return it.

    Numeric columns are filled with their column median; object-dtype
    (categorical) columns are filled with their most frequent value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place, so callers that ignore the
        return value still see the filled data.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after imputation.

    Notes
    -----
    A column group that is absent (e.g. a purely numeric frame) is skipped.
    A column with no observed values at all keeps its NaNs, because it has
    no median/mode to fill with.
    """
    numeric_cols = df.select_dtypes(include=['number']).columns
    if len(numeric_cols) > 0:
        df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

    categorical_cols = df.select_dtypes(include=['object']).columns
    if len(categorical_cols) > 0:
        modes = df[categorical_cols].mode()
        # mode() has zero rows when every categorical column is entirely NaN;
        # indexing .iloc[0] on it would raise IndexError.
        if not modes.empty:
            df[categorical_cols] = df[categorical_cols].fillna(modes.iloc[0])

    return df

In [8]:
# Convert categorical variables to numerical using OneHotEncoding
# Impute first: handle_missing_values is defined above but was never called,
# so NaNs would otherwise reach the encoder and the correlation step as-is.
df = handle_missing_values(df)
categorical_cols = df.select_dtypes(include=['object']).columns
encoder = OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False)
categorical_encoded = encoder.fit_transform(df[categorical_cols])
# Reuse df's index so the axis=1 concat in the next cell aligns row-for-row
# even if df no longer has a default RangeIndex (e.g. after filtering rows).
categorical_encoded_df = pd.DataFrame(
    categorical_encoded,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=df.index,
)

In [9]:
# Swap the raw categorical columns for their one-hot encoded counterparts.
numeric_part = df.drop(columns=categorical_cols)
df = pd.concat([numeric_part, categorical_encoded_df], axis="columns")


In [10]:
# Select top 10 features based on correlation with SalePrice
# NOTE(review): the ranking uses every row, including rows that later land in
# the test split, so the selection sees test data. Consider ranking on the
# training rows only.
TOP_N_FEATURES = 10
correlation = df.corr()
# Drop the target by name instead of assuming it sorts first: the previous
# index[1:11] slice would keep SalePrice if another column tied at |r| = 1.
top_features = (
    correlation["SalePrice"]
    .drop("SalePrice")
    .abs()
    .sort_values(ascending=False)
    .index[:TOP_N_FEATURES]
)
df = df[top_features.to_list() + ["SalePrice"]]

In [11]:
# Show which columns were selected.
top_features

Index(['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF',
       '1stFlrSF', 'ExterQual_TA', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt'],
      dtype='object')

In [12]:
# Split the frame into the feature matrix and the prediction target.
target_col = "SalePrice"
X = df.drop(columns=[target_col])
y = df[target_col]


In [13]:
# Identify numerical columns (categorical already encoded)
# Every column of X is taken, so the scaler below is applied to all features.
numeric_cols = X.columns

In [14]:
# Preprocessing: a single StandardScaler step applied to every feature column.
scaling_step = ('num', StandardScaler(), numeric_cols)
preprocessor = ColumnTransformer(transformers=[scaling_step])

In [15]:
# Apply preprocessing
# Split BEFORE fitting the scaler: fitting on all rows lets test-set statistics
# leak into the training features. The same rows end up in each split as
# before, because the split depends only on the row count and random_state.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)
# Kept for any later cell that still reads X_processed; it is now scaled with
# the training-set statistics only.
X_processed = preprocessor.transform(X)


In [16]:
# Model Evaluation Function
def evaluate_model(y_true, y_pred):
    """Score regression predictions against the true targets.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same row order as ``y_true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2, accuracy)``. ``accuracy`` is ``100 - MAPE`` in
        percent (100 minus the mean absolute percentage error). It is not a
        classification accuracy, and it is inf/NaN when ``y_true`` contains
        zeros.
    """
    # Compare positionally: subtracting one pandas Series from another aligns
    # on index labels, which yields NaNs when the indexes differ (e.g. after
    # a shuffled split).
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    accuracy = 100 - (np.mean(np.abs(y_true - y_pred) / y_true) * 100)
    return mae, rmse, r2, accuracy

# TRAIN BASE MODELS

In [17]:
# RANDOM FOREST
# Baseline model: only random_state is set, every other hyperparameter is the
# library default. The trailing fit() call is also what the cell displays.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)


In [18]:
# GRADIENT BOOSTING
# Baseline model: only random_state is set, every other hyperparameter is the
# library default. The trailing fit() call is also what the cell displays.
gb = GradientBoostingRegressor(random_state=42)
gb.fit(X_train, y_train)


In [19]:
# Predict on the test features with each baseline model (forest first, then boosting).
y_pred_rf, y_pred_gb = (model.predict(X_test) for model in (rf, gb))

# MODEL EVALUATION

In [20]:
# Score both baseline models on the test split and print one row per model.
models = {"Random Forest": y_pred_rf, "Gradient Boosting": y_pred_gb}
metrics = dict(
    (label, evaluate_model(y_test, preds)) for label, preds in models.items()
)
baseline_table = pd.DataFrame(metrics, index=["MAE", "RMSE", "R2", "Accuracy (%)"])
print(baseline_table.T)

                            MAE          RMSE        R2  Accuracy (%)
Random Forest      18945.806017  29356.303789  0.887646     87.846642
Gradient Boosting  19236.307690  29555.076239  0.886119     87.925524


# Hyperparameter Tuning

In [21]:

# Randomized search for the random forest: 10 sampled settings, 3-fold CV, scored by R^2.
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
}
grid_rf = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    rf_params,
    n_iter=10,
    cv=3,
    scoring='r2',
    n_jobs=-1,
    random_state=42,
)
grid_rf.fit(X_train, y_train)
rf_best = grid_rf.best_estimator_


In [22]:
# Randomized search for gradient boosting: 10 sampled settings, 3-fold CV, scored by R^2.
gb_params = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
}
grid_gb = RandomizedSearchCV(
    GradientBoostingRegressor(random_state=42),
    gb_params,
    n_iter=10,
    cv=3,
    scoring='r2',
    n_jobs=-1,
    random_state=42,
)
grid_gb.fit(X_train, y_train)
gb_best = grid_gb.best_estimator_


In [23]:
# Retrain Best Models
# NOTE(review): the RandomizedSearchCV objects are built without refit=..., and
# the documented default is refit=True, so best_estimator_ should already be
# fitted on X_train. These two fits look redundant; confirm before removing.
rf_best.fit(X_train, y_train)
gb_best.fit(X_train, y_train)


# Final Predictions & Evaluation

In [24]:

# Evaluate the tuned models on the test split and print one row per model.
predictions = {
    "Random Forest": rf_best.predict(X_test),
    "Gradient Boosting": gb_best.predict(X_test),
}
metrics = dict(
    (label, evaluate_model(y_test, preds)) for label, preds in predictions.items()
)
metrics_df = pd.DataFrame(metrics, index=["MAE", "RMSE", "R2", "Accuracy (%)"]).T
print("\nFinal Model Performance:")
print(metrics_df)



Final Model Performance:
                            MAE          RMSE        R2  Accuracy (%)
Random Forest      19066.533947  30500.850584  0.878714     87.922595
Gradient Boosting  19642.205687  30085.302062  0.881996     87.601902
