# Enhanced House Price Prediction Analysis

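This notebook predicts house prices from a housing dataset and compares the following regression models (plus stacking, voting, and weighted-average ensembles built from them):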
1. Linear Regression
2. Random Forest
3. XGBoost
4. CatBoost
5. LightGBM

## 1. Import Required Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import (RandomForestRegressor, 
                             StackingRegressor, 
                             VotingRegressor)
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

%matplotlib inline

## 2. Load and Explore Data

In [None]:
# Read the raw housing data from the CSV next to the notebook
df = pd.read_csv('Housing.csv')

# Quick look at the size of the table and its first records
print(f"Dataset Shape: {df.shape}")
print("\nFirst few rows:")
display(df.head())

# Column groups reused by the plotting and preprocessing cells below
numerical_columns = [
    'price', 'area', 'bedrooms',
    'bathrooms', 'stories', 'parking',
]
categorical_columns = [
    'mainroad', 'guestroom', 'basement', 'hotwaterheating',
    'airconditioning', 'prefarea', 'furnishingstatus',
]

print(f"\nNumerical columns: {numerical_columns}")
print(f"Categorical columns: {categorical_columns}")

## 3. Data Visualization

In [None]:
# One histogram per numerical column, laid out on a 2x3 grid
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
axes = axes.ravel()

for col, ax in zip(numerical_columns, axes):
    sns.histplot(data=df, x=col, ax=ax)
    ax.set_title(f'Distribution of {col}')

fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the numerical columns, drawn as an
# annotated heatmap with the colour scale centred on zero
corr_matrix = df[numerical_columns].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix of Numerical Features')
plt.tight_layout()
plt.show()

In [None]:
# Price distribution per category level: one box plot per categorical column
fig, axes = plt.subplots(3, 3, figsize=(15, 15))
axes = axes.ravel()

for idx, col in enumerate(categorical_columns):
    sns.boxplot(data=df, x=col, y='price', ax=axes[idx])
    axes[idx].set_title(f'Price by {col}')
    axes[idx].tick_params(axis='x', rotation=45)

# The 3x3 grid has more panels than there are categorical columns; hide the
# leftover axes so the figure does not end with empty frames.
for unused_ax in axes[len(categorical_columns):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

## 4. Data Preprocessing

In [None]:
# One-hot encode the categorical variables; drop_first=True drops the first
# level of each feature so it is represented by one column fewer.
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Separate features and target
X = df_encoded.drop(['price'], axis=1)
y = df_encoded['price']

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# DataFrame versions of the scaled arrays. The original row index is carried
# over so the frames stay aligned with y_train / y_test for any index-based
# operation (a fresh 0..n-1 index would no longer match the targets).
X_train_scaled_df = pd.DataFrame(
    X_train_scaled, columns=X_train.columns, index=X_train.index
)
X_test_scaled_df = pd.DataFrame(
    X_test_scaled, columns=X_test.columns, index=X_test.index
)
print("Training set shape:", X_train_scaled.shape)
print("Testing set shape:", X_test_scaled.shape)

## 5. Model Training and Evaluation

In [None]:
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit ``model``, score it on the test split and plot actual vs. predicted.

    The model is fitted in place on ``(X_train, y_train)``. RMSE and R2 on
    ``(X_test, y_test)`` are printed under a ``model_name`` heading, and a
    scatter plot of actual against predicted prices is shown with a dashed
    reference diagonal.

    Returns:
        tuple: ``(model, rmse, r2)`` -- the fitted model and its test metrics.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Test-set metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    report_lines = [
        f"\n{model_name} Results:",
        f"RMSE: {rmse:,.2f}",
        f"R2 Score: {r2:.4f}",
    ]
    print("\n".join(report_lines))

    # Actual vs. predicted; points on the red diagonal are perfect predictions
    lowest, highest = y_test.min(), y_test.max()
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, y_pred, alpha=0.5)
    plt.plot([lowest, highest], [lowest, highest], 'r--', lw=2)
    plt.xlabel('Actual Prices')
    plt.ylabel('Predicted Prices')
    plt.title(f'{model_name} - Actual vs Predicted Prices')
    plt.show()

    return model, rmse, r2

In [None]:
# Baseline: linear regression on the standardized feature arrays
lr_model, lr_rmse, lr_r2 = evaluate_model(
    model=LinearRegression(),
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
    model_name="Linear Regression",
)

In [None]:
# Random forest (100 trees, fixed seed) on the same standardized arrays
rf_model, rf_rmse, rf_r2 = evaluate_model(
    model=RandomForestRegressor(n_estimators=100, random_state=42),
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
    model_name="Random Forest",
)

## 6. Feature Importance Analysis

In [None]:
# Rank the encoded features by the fitted random forest's importances
feature_importance = (
    pd.DataFrame({
        'feature': X.columns,
        'importance': rf_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)

# Horizontal bar chart of the 15 highest-ranked features
top_features = feature_importance.head(15)
plt.figure(figsize=(12, 6))
sns.barplot(data=top_features, x='importance', y='feature')
plt.title('Top 15 Most Important Features (Random Forest)')
plt.tight_layout()
plt.show()

## 5. Enhanced Model Training and Evaluation

In [None]:
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit ``model``, evaluate it on the test split and cross-validate it.

    The model is fitted in place on ``(X_train, y_train)``; RMSE, MAE and R2
    are computed on ``(X_test, y_test)``; a 5-fold cross-validated R2 is
    computed on the training split. The test R2 and the CV scores are
    printed, and an actual-vs-predicted scatter plot is shown.

    Note: this definition replaces the earlier ``evaluate_model`` and returns
    a dict instead of a tuple.

    Returns:
        dict: keys ``model_name``, ``model``, ``rmse``, ``mae``, ``r2`` and
        ``cv_r2`` (mean of the cross-validation R2 scores).
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Test-set metrics, collected in the order they appear in the result
    test_scores = {
        'rmse': np.sqrt(mean_squared_error(y_test, y_pred)),
        'mae': mean_absolute_error(y_test, y_pred),
        'r2': r2_score(y_test, y_pred),
    }

    # 5-fold cross-validation on the training data only
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    cv_mean = cv_scores.mean()

    print(f"\n{model_name} Results:")
    print(f"R2 Score: {test_scores['r2']:.4f}")
    print(f"Cross-validation R2 Scores: {cv_scores}")
    print(f"Average CV R2 Score: {cv_mean:.4f} (+/- {cv_scores.std() * 2:.4f})")

    # Actual vs. predicted; points on the red diagonal are perfect predictions
    lowest, highest = y_test.min(), y_test.max()
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, y_pred, alpha=0.5)
    plt.plot([lowest, highest], [lowest, highest], 'r--', lw=2)
    plt.xlabel('Actual Prices')
    plt.ylabel('Predicted Prices')
    plt.title(f'{model_name} - Actual vs Predicted Prices')
    plt.show()

    return {
        'model_name': model_name,
        'model': model,
        **test_scores,
        'cv_r2': cv_mean,
    }


In [None]:
# Base learners, keyed by the display name used in the reports
base_models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBRegressor(
        n_estimators=100, learning_rate=0.1, random_state=42
    ),
    'LightGBM': lgb.LGBMRegressor(
        n_estimators=100, learning_rate=0.1, force_col_wise=True, random_state=42
    ),
    'CatBoost': CatBoostRegressor(n_estimators=100, random_state=42, verbose=False),
}

# Evaluate each base learner on the scaled DataFrames and keep its metrics
results = {}
for name, model in base_models.items():
    results[name] = evaluate_model(
        model=model,
        X_train=X_train_scaled_df,
        X_test=X_test_scaled_df,
        y_train=y_train,
        y_test=y_test,
        model_name=name,
    )

In [None]:
# Stack Random Forest, XGBoost and LightGBM under a Ridge meta-learner
stacking_members = [
    (alias, base_models[key])
    for alias, key in [
        ('rf', 'Random Forest'),
        ('xgb', 'XGBoost'),
        ('lgbm', 'LightGBM'),
    ]
]
stacking = StackingRegressor(
    estimators=stacking_members,
    final_estimator=Ridge(alpha=1.0),
    cv=5,
    n_jobs=-1,
)

results['Stacking'] = evaluate_model(
    model=stacking,
    X_train=X_train_scaled_df,
    X_test=X_test_scaled_df,
    y_train=y_train,
    y_test=y_test,
    model_name="Stacking (RF+XGB+LGBM)",
)

In [None]:
# Voting ensemble over Random Forest, XGBoost and LightGBM
voting_members = [
    (alias, base_models[key])
    for alias, key in [
        ('rf', 'Random Forest'),
        ('xgb', 'XGBoost'),
        ('lgbm', 'LightGBM'),
    ]
]
voting = VotingRegressor(voting_members)

results['Voting'] = evaluate_model(
    model=voting,
    X_train=X_train_scaled_df,
    X_test=X_test_scaled_df,
    y_train=y_train,
    y_test=y_test,
    model_name="Voting (RF+XGB+LGBM)",
)


In [None]:
class WeightedAverageEnsemble(BaseEstimator, RegressorMixin):
    """Regressor whose prediction is a weighted sum of its members' predictions.

    Parameters
    ----------
    models : dict
        Mapping of name -> regressor. The regressors are fitted in place by
        ``fit``.
    weights : sequence of float
        One weight per entry of ``models``, in the dict's iteration order.
        The weights are applied as given (they are not normalized), so they
        should sum to 1 for the result to be a true weighted average.

    Attributes
    ----------
    feature_names_ : list
        Column names seen during ``fit``; reused when ``predict`` receives a
        plain array so every member sees the names it was trained with.

    Notes
    -----
    ``get_params`` / ``set_params`` are inherited from ``BaseEstimator``,
    which derives them from the ``__init__`` signature.
    """

    def __init__(self, models, weights):
        self.models = models
        self.weights = weights

    def _check_weights(self):
        """Raise ValueError unless there is exactly one weight per model."""
        # zip() would otherwise silently drop the unmatched models or weights.
        if len(self.weights) != len(self.models):
            raise ValueError(
                f"Expected {len(self.models)} weights (one per model), "
                f"got {len(self.weights)}"
            )

    @staticmethod
    def _as_frame(X, columns=None):
        """Return ``X`` as a DataFrame (needed for LightGBM compatibility).

        DataFrames pass through untouched. Arrays are wrapped using
        ``columns`` when it matches the array width, otherwise with
        generated ``feature_<i>`` names.
        """
        if isinstance(X, pd.DataFrame):
            return X
        n_features = X.shape[1]
        if columns is None or len(columns) != n_features:
            columns = [f"feature_{i}" for i in range(n_features)]
        return pd.DataFrame(X, columns=columns)

    def fit(self, X, y):
        """Fit every member model on ``(X, y)`` and return ``self``."""
        self._check_weights()
        X = self._as_frame(X)
        # Remember the training column names for array inputs at predict time.
        self.feature_names_ = list(X.columns)

        for model in self.models.values():
            model.fit(X, y)
        return self

    def predict(self, X):
        """Return the weighted sum of the member models' predictions for ``X``."""
        self._check_weights()
        X = self._as_frame(X, getattr(self, "feature_names_", None))

        predictions = np.zeros(X.shape[0])
        for model, weight in zip(self.models.values(), self.weights):
            predictions += weight * model.predict(X)
        return predictions

In [None]:
# Fresh (unfitted) members for the weighted ensemble; weights follow the
# same order as the dict entries
ensemble_members = {
    'xgb': xgb.XGBRegressor(n_estimators=100, random_state=42),
    'lgbm': lgb.LGBMRegressor(n_estimators=100, random_state=42, verbose=-1),
    'rf': RandomForestRegressor(n_estimators=100, random_state=42),
}
weighted_ensemble = WeightedAverageEnsemble(
    models=ensemble_members,
    weights=[0.4, 0.3, 0.3],
)

# The ensemble accepts numpy arrays as well as DataFrames; the scaled arrays
# are used here
results['WeightedAvg'] = evaluate_model(
    model=weighted_ensemble,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
    model_name="Weighted Average Ensemble",
)

In [None]:
# Degree-2 interaction-only expansion of the unscaled features; the
# transformer is fitted on the training split and reused for the test split
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Stack Random Forest, XGBoost and CatBoost under a Ridge meta-learner
poly_members = [
    (alias, base_models[key])
    for alias, key in [
        ('rf', 'Random Forest'),
        ('xgb', 'XGBoost'),
        ('ca', 'CatBoost'),
    ]
]
stacking_poly = StackingRegressor(
    estimators=poly_members,
    final_estimator=Ridge(alpha=1.0),
    cv=5,
    n_jobs=-1,
)

results['Stacking_Poly'] = evaluate_model(
    model=stacking_poly,
    X_train=X_train_poly,
    X_test=X_test_poly,
    y_train=y_train,
    y_test=y_test,
    model_name="Stacking with Polynomial Features",
)

## 6. Model Comparison

In [None]:
# Table column -> key in each evaluate_model() result dict
column_to_key = {
    'Model': 'model_name',
    'RMSE': 'rmse',
    'MAE': 'mae',
    'R2 Score': 'r2',
    'CV R2 Score': 'cv_r2',
}
comparison_df = pd.DataFrame({
    column: [result[key] for result in results.values()]
    for column, key in column_to_key.items()
})

print("Model Performance Comparison:")
display(comparison_df.sort_values('R2 Score', ascending=False))

# One bar chart per metric on a 2x2 grid
metrics = ['RMSE', 'MAE', 'R2 Score', 'CV R2 Score']
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.ravel()

for metric, ax in zip(metrics, axes):
    sns.barplot(data=comparison_df, x='Model', y=metric, ax=ax)
    ax.set_title(f'Model Comparison - {metric}')
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 7. Feature Importance Comparison

In [None]:
def plot_feature_importance(model, model_name, feature_names=None):
    """Plot the ten largest feature importances of a fitted model.

    Importances come from ``model.feature_importances_`` when present,
    otherwise from the absolute values of ``model.coef_``. Models exposing
    neither attribute are skipped.

    Parameters
    ----------
    model : fitted estimator
    model_name : str
        Used in the plot title.
    feature_names : sequence of str, optional
        Names matching the importances. When omitted, the model's
        ``feature_names_in_`` is used if it has one, otherwise the columns
        of the notebook-level ``X``.

    Returns
    -------
    None
        Also returned, without plotting, when the model has no importances or
        when the number of names does not match the number of importances
        (e.g. a model trained on a different feature matrix).
    """
    if hasattr(model, 'feature_importances_'):
        importance = np.ravel(model.feature_importances_)
    elif hasattr(model, 'coef_'):
        # ravel() flattens 2-D coefficient arrays so they fit one column.
        importance = np.ravel(np.abs(model.coef_))
    else:
        return None

    if feature_names is None:
        feature_names = getattr(model, 'feature_names_in_', None)
    if feature_names is None:
        feature_names = X.columns
    feature_names = list(feature_names)

    # A mismatch would make the DataFrame constructor raise and abort the
    # whole plotting loop; skip this model with a message instead.
    if len(feature_names) != len(importance):
        print(
            f"Skipping feature importance for {model_name}: "
            f"{len(importance)} importances but {len(feature_names)} feature names"
        )
        return None

    feature_imp = pd.DataFrame({
        'feature': feature_names,
        'importance': importance
    })
    feature_imp = feature_imp.sort_values('importance', ascending=False)

    plt.figure(figsize=(10, 6))
    sns.barplot(data=feature_imp.head(10), x='importance', y='feature')
    plt.title(f'Top 10 Important Features - {model_name}')
    plt.tight_layout()
    plt.show()

# Draw the importance chart for every evaluated model; models without
# importances or coefficients are skipped by plot_feature_importance
for name in results:
    plot_feature_importance(results[name]['model'], name)


## 8. Best Model Analysis

In [None]:
# Pick the entry with the highest test R2 directly from `results`. The
# dictionary keys (e.g. 'Stacking') differ from the display names stored in
# 'model_name' (e.g. 'Stacking (RF+XGB+LGBM)'), so looking a display name up
# in `results` would raise KeyError whenever an ensemble wins.
best_key = max(results, key=lambda key: results[key]['r2'])
best_model = results[best_key]['model']
best_model_name = results[best_key]['model_name']

print(f"Best performing model: {best_model_name}")

# Every model must be fed the feature representation it was trained on:
# the scaled DataFrames by default, with the two exceptions listed here.
train_inputs = {'WeightedAvg': X_train_scaled, 'Stacking_Poly': X_train_poly}
test_inputs = {'WeightedAvg': X_test_scaled, 'Stacking_Poly': X_test_poly}
best_X_train = train_inputs.get(best_key, X_train_scaled_df)
best_X_test = test_inputs.get(best_key, X_test_scaled_df)


def _take_rows(data, positions):
    """Select rows by position from either a DataFrame or a numpy array."""
    return data.iloc[positions] if hasattr(data, 'iloc') else data[positions]


# Make predictions with the best model
y_pred_best = best_model.predict(best_X_test)

# Calculate prediction intervals for the best model (if it's not Linear Regression)
if isinstance(best_model, LinearRegression):
    print("\nPrediction intervals not available for Linear Regression")
else:
    # Changing `random_state` on an already fitted model does not change what
    # predict() returns, so the spread is estimated by refitting clones of the
    # best model on bootstrap resamples of the training data instead.
    N_BOOTSTRAP = 30  # refits; raise for smoother estimates, lower for speed
    rng = np.random.default_rng(42)  # seeded so the cell is reproducible

    sample_indices = rng.choice(len(y_test), 5, replace=False)
    sample_X = _take_rows(best_X_test, sample_indices)
    sample_y = y_test.iloc[sample_indices]

    predictions = []
    for _ in range(N_BOOTSTRAP):
        boot_idx = rng.integers(0, len(y_train), size=len(y_train))
        boot_model = clone(best_model)  # unfitted copy; best_model is untouched
        boot_model.fit(_take_rows(best_X_train, boot_idx), y_train.iloc[boot_idx])
        predictions.append(boot_model.predict(sample_X))

    predictions = np.array(predictions)
    point_pred = y_pred_best[sample_indices]  # prediction of the fitted best model
    std_pred = predictions.std(axis=0)        # spread across bootstrap refits

    print("\nSample Predictions with Confidence Intervals:")
    for i in range(len(sample_y)):
        print(f"Actual: {sample_y.iloc[i]:,.0f}")
        print(f"Predicted: {point_pred[i]:,.0f} +/- {2*std_pred[i]:,.0f}\n")