# House Price Prediction - Model Development
## PART A: Model Development

**Features Used:**
- OverallQual: Overall material and finish quality
- GrLivArea: Above grade living area (square feet)
- TotalBsmtSF: Total square feet of basement area
- GarageCars: Size of garage in car capacity
- YearBuilt: Original construction date
- Neighborhood: Physical location (categorical)

**Algorithm:** Random Forest Regressor

**Model Persistence:** Joblib

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import warnings
# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides deprecation warnings from pandas/scikit-learn;
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
# (seeds NumPy's global random number generator)
np.random.seed(42)

## 1. Load Dataset

In [None]:
# Load the training data
import os

# Detect Google Colab by attempting the Colab-only import. Only the import
# itself sits inside the try block, so an ImportError raised later (for
# example while mounting the drive) is not mistaken for "running locally".
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
    print("Running locally - using current directory")
else:
    IN_COLAB = True

    # Mount Google Drive
    drive.mount('/content/drive')

    # Work from the Drive data folder so that 'train.csv' and every other
    # relative path used afterwards resolves inside it
    data_dir = '/content/drive/My Drive/csc331_data/house_pred'
    os.chdir(data_dir)
    print(f"✓ Changed to: {os.getcwd()}")

# Load the dataset (path is relative to the current working directory)
df = pd.read_csv('train.csv')

print(f"\nDataset shape: {df.shape}")
print("\nFirst few rows:")
df.head()

Running in Google Colab
Please upload train.csv file:


KeyboardInterrupt: 

## 2. Feature Selection

Selecting 6 of the 9 recommended features; the target column `SalePrice` is kept alongside them.

In [None]:
# Keep the six predictor columns together with the target column (SalePrice)
selected_features = [
    'OverallQual',
    'GrLivArea',
    'TotalBsmtSF',
    'GarageCars',
    'YearBuilt',
    'Neighborhood',
    'SalePrice',
]

# Work on an independent copy so later edits never touch the raw frame
df_selected = df.loc[:, selected_features].copy()

print(f"Selected features dataset shape: {df_selected.shape}")
print("\nData types:")
print(df_selected.dtypes)
print("\nBasic statistics:")
df_selected.describe()

## 3. Data Preprocessing

### 3a. Handling Missing Values

In [None]:
# Check for missing values
print("Missing values per feature:")
missing_values = df_selected.isnull().sum()
print(missing_values[missing_values > 0])

# Handle missing values.
# The filled column is assigned back to the frame instead of calling
# fillna(..., inplace=True) on a column selection: the in-place form acts on
# an intermediate object, is deprecated in pandas 2.x, and leaves the frame
# unchanged when Copy-on-Write is enabled.

# For numerical features: fill with median
numerical_features = ['OverallQual', 'GrLivArea', 'TotalBsmtSF', 'GarageCars', 'YearBuilt']
for feature in numerical_features:
    column = df_selected[feature]
    if column.isnull().any():
        df_selected[feature] = column.fillna(column.median())

# For categorical features: fill with mode (first one if several tie)
neighborhood = df_selected['Neighborhood']
if neighborhood.isnull().any():
    df_selected['Neighborhood'] = neighborhood.fillna(neighborhood.mode()[0])

# Check if missing values are handled
print(f"\nMissing values after handling: {df_selected.isnull().sum().sum()}")

### 3b. Exploratory Data Analysis

In [None]:
# Plot the target variable twice, side by side: raw values and log1p values
sale_price = df_selected['SalePrice']

fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: raw sale prices
ax_raw.hist(sale_price, bins=50, edgecolor='black')
ax_raw.set_xlabel('Sale Price')
ax_raw.set_ylabel('Frequency')
ax_raw.set_title('Distribution of Sale Prices')

# Right panel: log(1 + price)
ax_log.hist(np.log1p(sale_price), bins=50, edgecolor='black', color='green')
ax_log.set_xlabel('Log(Sale Price)')
ax_log.set_ylabel('Frequency')
ax_log.set_title('Distribution of Log-Transformed Sale Prices')

fig.tight_layout()
plt.show()

# Summary statistics of the target
print(f"SalePrice - Mean: ${sale_price.mean():,.2f}")
print(f"SalePrice - Median: ${sale_price.median():,.2f}")
print(f"SalePrice - Std: ${sale_price.std():,.2f}")

In [None]:
# Pairwise correlations between the numerical predictors and the target
corr_columns = numerical_features + ['SalePrice']
correlation_matrix = df_selected[corr_columns].corr()

# Heatmap of the correlation matrix, colour scale centred on zero
plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    fmt='.2f',
)
plt.title('Correlation Matrix of Features')
plt.tight_layout()
plt.show()

# Rank every column by its correlation with the target, strongest first
saleprice_corr = correlation_matrix['SalePrice'].sort_values(ascending=False)
print("\nCorrelation with SalePrice:")
print(saleprice_corr)

### 3c. Encoding Categorical Variables

In [None]:
# Map each neighborhood name to an integer code with a LabelEncoder
label_encoder = LabelEncoder()
neighborhood_names = df_selected['Neighborhood']
df_selected['Neighborhood_Encoded'] = label_encoder.fit_transform(neighborhood_names)

print(f"Number of unique neighborhoods: {neighborhood_names.nunique()}")

# Show up to ten distinct (name, code) pairs
encoding_sample = (
    df_selected[['Neighborhood', 'Neighborhood_Encoded']]
    .drop_duplicates()
    .head(10)
)
print("\nNeighborhood encoding sample:")
print(encoding_sample)

# Persist the fitted encoder so the same mapping can be reused later
joblib.dump(label_encoder, 'neighborhood_encoder.pkl')
print("\n✓ Label encoder saved as 'neighborhood_encoder.pkl'")

### 3d. Feature Scaling

In [None]:
# Prepare features and target
X = df_selected[['OverallQual', 'GrLivArea', 'TotalBsmtSF', 'GarageCars', 'YearBuilt', 'Neighborhood_Encoded']]
y = df_selected['SalePrice']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Fit the scaler on the training rows only, so that the mean/std stored in the
# saved scaler are not influenced by the held-out test rows (data leakage).
# The split indices depend only on the number of rows, test_size and
# random_state, so this reproduces the row assignment of the train/test split
# performed on X_scaled in the next step.
# NOTE: keep test_size and random_state here in sync with that split.
train_rows, _ = train_test_split(np.arange(len(X)), test_size=0.2, random_state=42)

scaler = StandardScaler()
scaler.fit(X.iloc[train_rows])

# Scale every row with the training-set statistics
X_scaled = scaler.transform(X)

# Convert back to DataFrame for better visualization
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

print("\nFeatures after scaling (first 5 rows):")
print(X_scaled_df.head())

# Save the scaler for later use
joblib.dump(scaler, 'feature_scaler.pkl')
print("\n✓ Feature scaler saved as 'feature_scaler.pkl'")

## 4. Train-Test Split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
split_parts = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# Report how many rows landed in each partition
n_train = X_train.shape[0]
n_test = X_test.shape[0]
print(f"Training set size: {n_train} samples")
print(f"Testing set size: {n_test} samples")

# Report the target range covered by each partition
print(f"\nTraining set price range: ${y_train.min():,.2f} - ${y_train.max():,.2f}")
print(f"Testing set price range: ${y_test.min():,.2f} - ${y_test.max():,.2f}")

## 5. Model Training

Using a Random Forest Regressor for prediction.

In [None]:
# Hyperparameters for the Random Forest Regressor, collected in one place
rf_params = {
    'n_estimators': 200,
    'max_depth': 15,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': 42,
    'n_jobs': -1,
}

# Build the estimator and fit it on the training partition
rf_model = RandomForestRegressor(**rf_params)

print("Training Random Forest Regressor...")
rf_model.fit(X_train, y_train)
print("✓ Model training completed!")

## 6. Model Evaluation

In [None]:
# Make predictions on both training and testing sets
y_train_pred = rf_model.predict(X_train)
y_test_pred = rf_model.predict(X_test)


def _regression_metrics(y_true, y_pred):
    """Return (MAE, MSE, RMSE, R²) for one set of predictions."""
    mse = mean_squared_error(y_true, y_pred)
    return (
        mean_absolute_error(y_true, y_pred),
        mse,
        np.sqrt(mse),
        r2_score(y_true, y_pred),
    )


def _print_metrics(title, mae, mse, rmse, r2):
    """Print one titled section of the performance report."""
    print(f"\n{title}:")
    print(f"  Mean Absolute Error (MAE):  ${mae:,.2f}")
    # MSE is expressed in dollars squared, so it is printed without the
    # "$" prefix used for MAE and RMSE.
    print(f"  Mean Squared Error (MSE):   {mse:,.2f} (dollars squared)")
    print(f"  Root Mean Squared Error (RMSE): ${rmse:,.2f}")
    print(f"  R² Score:                   {r2:.4f}")


# Calculate evaluation metrics for training set
train_mae, train_mse, train_rmse, train_r2 = _regression_metrics(y_train, y_train_pred)

# Calculate evaluation metrics for testing set
test_mae, test_mse, test_rmse, test_r2 = _regression_metrics(y_test, y_test_pred)

print("="*60)
print("MODEL PERFORMANCE METRICS")
print("="*60)
_print_metrics("TRAINING SET", train_mae, train_mse, train_rmse, train_r2)
_print_metrics("TESTING SET", test_mae, test_mse, test_rmse, test_r2)
print("="*60)

In [None]:
# Actual-vs-predicted scatter plots, one panel per data partition.
# Each entry: (actual values, predicted values, extra scatter options, title)
panels = [
    (y_train, y_train_pred, {},
     f'Training Set: Actual vs Predicted\nR² = {train_r2:.4f}'),
    (y_test, y_test_pred, {'color': 'green'},
     f'Testing Set: Actual vs Predicted\nR² = {test_r2:.4f}'),
]

plt.figure(figsize=(14, 5))

for position, (actual, predicted, scatter_opts, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.scatter(actual, predicted, alpha=0.5, s=10, **scatter_opts)

    # Dashed red diagonal marks where predicted equals actual
    low, high = actual.min(), actual.max()
    plt.plot([low, high], [low, high], 'r--', lw=2)

    plt.xlabel('Actual Price')
    plt.ylabel('Predicted Price')
    plt.title(panel_title)
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Rank the model's features by importance.
# Display names are listed in the same order as the columns used for training.
feature_names = [
    'OverallQual',
    'GrLivArea',
    'TotalBsmtSF',
    'GarageCars',
    'YearBuilt',
    'Neighborhood',
]
importances = rf_model.feature_importances_

importance_table = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importance = importance_table.sort_values('Importance', ascending=False)

print("\nFeature Importance Ranking:")
print(feature_importance.to_string(index=False))

# Horizontal bar chart with the most important feature at the top
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_importance['Feature'], feature_importance['Importance'])
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Feature Importance in Random Forest Model')
ax.invert_yaxis()
ax.grid(True, alpha=0.3, axis='x')
fig.tight_layout()
plt.show()

In [None]:
# Residuals on the test set: actual price minus predicted price
residuals = y_test - y_test_pred

fig, (ax_scatter, ax_hist) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: residuals against the predicted price, with a zero reference line
ax_scatter.scatter(y_test_pred, residuals, alpha=0.5, s=10)
ax_scatter.axhline(y=0, color='r', linestyle='--', lw=2)
ax_scatter.set_xlabel('Predicted Price')
ax_scatter.set_ylabel('Residuals')
ax_scatter.set_title('Residual Plot')
ax_scatter.grid(True, alpha=0.3)

# Right panel: histogram of the residuals, with a zero reference line
ax_hist.hist(residuals, bins=50, edgecolor='black')
ax_hist.set_xlabel('Residuals')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('Distribution of Residuals')
ax_hist.axvline(x=0, color='r', linestyle='--', lw=2)
ax_hist.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

## 7. Save the Trained Model

Using Joblib for model persistence

In [None]:
# Save the trained model
# `os` is imported here directly: the file size is read through the standard
# library instead of `joblib.os`, which only exists because joblib happens to
# import `os` internally and is not part of its public API.
import os

model_filename = 'house_price_model.pkl'
joblib.dump(rf_model, model_filename)

model_size_kb = os.path.getsize(model_filename) / 1024

print(f"✓ Model saved as '{model_filename}'")
print(f"\nModel file size: {model_size_kb:.2f} KB")

## 8. Test Model Reloading

In [None]:
# Reload the model to verify it can be used without retraining
loaded_model = joblib.load(model_filename)
loaded_scaler = joblib.load('feature_scaler.pkl')
loaded_encoder = joblib.load('neighborhood_encoder.pkl')

print("✓ Model, scaler, and encoder successfully reloaded!")

# Check that each reloaded artifact reproduces its in-memory counterpart,
# so a broken or stale file is reported here rather than at prediction time.
reload_checks = {
    'model predictions': np.allclose(loaded_model.predict(X_test), rf_model.predict(X_test)),
    'scaler output': np.allclose(loaded_scaler.transform(X), X_scaled),
    'encoder classes': np.array_equal(loaded_encoder.classes_, label_encoder.classes_),
}
failed_checks = [name for name, passed in reload_checks.items() if not passed]
if failed_checks:
    raise RuntimeError(
        "Reloaded artifacts differ from the in-memory objects: " + ", ".join(failed_checks)
    )
print("✓ Reloaded artifacts reproduce the in-memory results")

# Test prediction with the loaded model
sample_prediction = loaded_model.predict(X_test[:5])
print("\nSample predictions from reloaded model:")
for i, (actual, predicted) in enumerate(zip(y_test.values[:5], sample_prediction), 1):
    print(f"  House {i}: Actual = ${actual:,.2f}, Predicted = ${predicted:,.2f}, Difference = ${abs(actual - predicted):,.2f}")

## 9. Summary

**Model Development Completed Successfully!**

- **Dataset:** House Prices - Advanced Regression Techniques
- **Features Used:** 6 features (OverallQual, GrLivArea, TotalBsmtSF, GarageCars, YearBuilt, Neighborhood)
- **Algorithm:** Random Forest Regressor
- **Model Persistence:** Joblib
- **Files Generated:**
  - `house_price_model.pkl` - Trained model
  - `feature_scaler.pkl` - StandardScaler for feature scaling
  - `neighborhood_encoder.pkl` - LabelEncoder for Neighborhood encoding

The model is ready to be integrated into a web application!