In [3]:
# House Price Prediction - Model Building
# Save this as model_building.ipynb (convert to notebook format)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import os

# Get the script directory and project root
def resolve_project_root(script_dir):
    """Return the project root for a given script/notebook directory.

    If the last path component of ``script_dir`` is exactly ``model``, the
    parent directory is returned; otherwise ``script_dir`` is returned as-is.

    Only the final component is compared. A substring test such as
    ``'model' in script_dir`` also matches unrelated folders (``models``,
    ``remodel_data``, a user name containing "model", ...) anywhere in the
    path and would wrongly step one level up.

    Args:
        script_dir: Directory the script or notebook is running from.

    Returns:
        The directory to treat as the project root.
    """
    # normpath drops any trailing separator so basename sees the real last component
    normalized = os.path.normpath(script_dir)
    if os.path.basename(normalized) == 'model':
        return os.path.dirname(normalized)
    return script_dir


# Fall back to the current working directory when __file__ is not defined
# (e.g. when the code runs in an interactive kernel rather than as a script).
script_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in globals() else os.getcwd()
project_root = resolve_project_root(script_dir)

# Create model directory if it doesn't exist
model_dir = os.path.join(project_root, 'model')
os.makedirs(model_dir, exist_ok=True)

print("=" * 60)
print("HOUSE PRICE PREDICTION - MODEL DEVELOPMENT")
print("=" * 60)
print(f"Working directory: {os.getcwd()}")
print(f"Project root: {project_root}")

# Load the dataset
print("\n1. Loading dataset...")
# Candidate locations, checked in order; the first one that exists is used
dataset_paths = [
    os.path.join(project_root, 'train.csv'),
    'train.csv',
    os.path.join('..', 'train.csv')
]

df = None
# filter() is lazy, so probing stops at the first existing candidate
path = next(filter(os.path.exists, dataset_paths), None)
if path is not None:
    print(f"Found dataset at: {path}")
    df = pd.read_csv(path)

# None of the candidates existed: stop here with an actionable message
if df is None:
    raise FileNotFoundError("train.csv not found. Please place it in the project root directory.")
print(f"Dataset shape: {df.shape}")

# Select the required features
# Using 6 features: OverallQual, GrLivArea, TotalBsmtSF, GarageCars, YearBuilt, Neighborhood
selected_features = ['OverallQual', 'GrLivArea', 'TotalBsmtSF', 'GarageCars', 'YearBuilt', 'Neighborhood', 'SalePrice']
# .copy() gives an independent frame, so the column assignments that follow
# modify df itself and do not raise SettingWithCopyWarning.
df = df[selected_features].copy()

print(f"\nSelected features: {selected_features[:-1]}")
print(f"Target variable: SalePrice")

# Data Preprocessing
print("\n2. Data Preprocessing...")

# a. Handling missing values
print("\nMissing values before handling:")
print(df.isnull().sum())

# Fill numerical missing values with median.
# The filled column is assigned back instead of calling
# df[col].fillna(..., inplace=True): that chained in-place form is deprecated
# and leaves df unchanged when pandas Copy-on-Write is active.
numerical_cols = ['TotalBsmtSF', 'GarageCars']
for col in numerical_cols:
    if df[col].isnull().sum() > 0:
        df[col] = df[col].fillna(df[col].median())

# Fill categorical missing values with mode (first mode if there are ties)
if df['Neighborhood'].isnull().sum() > 0:
    df['Neighborhood'] = df['Neighborhood'].fillna(df['Neighborhood'].mode()[0])

print("\nMissing values after handling:")
print(df.isnull().sum())

# b. Feature selection (already done above)

# c. Encoding categorical variables
print("\n3. Encoding categorical variable (Neighborhood)...")
le = LabelEncoder()
neighborhood_codes = le.fit_transform(df['Neighborhood'])

# Save the label encoder for later use
# NOTE(review): LabelEncoder.transform rejects labels it did not see during
# fit, so whatever loads this file must handle unknown neighborhoods - confirm
# against the consumer of label_encoder.pkl.
encoder_path = os.path.join(model_dir, 'label_encoder.pkl')
joblib.dump(le, encoder_path)
print(f"Unique neighborhoods: {len(le.classes_)}")

# Append the encoded column and drop the original Neighborhood column in one
# step. assign() returns a new frame, so nothing is written into a frame that
# was derived from another one (no SettingWithCopyWarning), and the resulting
# column order is the same as adding the column and then dropping the original.
df = df.assign(Neighborhood_Encoded=neighborhood_codes).drop('Neighborhood', axis=1)

# Prepare features and target
X = df.drop('SalePrice', axis=1)
y = df['SalePrice']

print(f"\nFeatures shape: {X.shape}")
print(f"Target shape: {y.shape}")

# d. Feature scaling - skipped on purpose:
# Random Forest is tree-based and doesn't require feature scaling

# Split the data (20% held out for testing, fixed seed for repeatability)
print("\n4. Splitting data into train and test sets...")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

# Model Training
print("\n5. Training Random Forest Regressor...")
# Hyperparameters collected in one place so they are easy to read and tweak
rf_params = {
    'n_estimators': 100,
    'max_depth': 20,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': 42,
    'n_jobs': -1,
}
model = RandomForestRegressor(**rf_params)

model.fit(X_train, y_train)
print("Model training completed!")

# Model Evaluation
def _regression_metrics(y_true, y_pred):
    """Return ``(mae, mse, rmse, r2)`` for the given targets and predictions.

    RMSE is the square root of the MSE computed here.
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    return mae, mse, rmse, r2


print("\n6. Evaluating the model...")

# Training metrics (predictions on the data the model was fitted on)
y_pred_train = model.predict(X_train)
train_mae, train_mse, train_rmse, train_r2 = _regression_metrics(y_train, y_pred_train)

# Test metrics (predictions on the held-out split)
y_pred_test = model.predict(X_test)
test_mae, test_mse, test_rmse, test_r2 = _regression_metrics(y_test, y_pred_test)

separator = "=" * 60
print("\n" + separator)
print("MODEL EVALUATION METRICS")
print(separator)
print("\nTraining Set Performance:")
print(f"  MAE:  ${train_mae:,.2f}")
print(f"  MSE:  ${train_mse:,.2f}")
print(f"  RMSE: ${train_rmse:,.2f}")
print(f"  R²:   {train_r2:.4f}")

print("\nTest Set Performance:")
print(f"  MAE:  ${test_mae:,.2f}")
print(f"  MSE:  ${test_mse:,.2f}")
print(f"  RMSE: ${test_rmse:,.2f}")
print(f"  R²:   {test_r2:.4f}")

# Feature Importance
print("\n7. Feature Importance:")
# One row per feature column of X, paired with the fitted model's importance
importance_table = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_
})
# Most important feature first
feature_importance = importance_table.sort_values(by='Importance', ascending=False)
print(feature_importance)

# Save the trained model
print("\n8. Saving the model...")
model_path = os.path.join(model_dir, 'house_price_model.pkl')
joblib.dump(model, model_path)
print(f"Model saved as '{model_path}'")

# Test model loading: read the file back and predict on the first 5 test rows
print("\n9. Testing model reload...")
loaded_model = joblib.load(model_path)
sample_rows = X_test.iloc[:5]
test_prediction = loaded_model.predict(sample_rows)
print("Model successfully reloaded!")
print(f"\nSample predictions: {test_prediction}")
print(f"Actual values: {y_test[:5].values}")

closing_rule = "=" * 60
print("\n" + closing_rule)
print("MODEL DEVELOPMENT COMPLETED SUCCESSFULLY!")
print(closing_rule)

HOUSE PRICE PREDICTION - MODEL DEVELOPMENT
Working directory: c:\Users\Pentho Sesi Segla\Documents\CU Documents\415files\HousePrice_project_Segla_Pentho_22CG031952\model
Project root: c:\Users\Pentho Sesi Segla\Documents\CU Documents\415files\HousePrice_project_Segla_Pentho_22CG031952

1. Loading dataset...
Found dataset at: c:\Users\Pentho Sesi Segla\Documents\CU Documents\415files\HousePrice_project_Segla_Pentho_22CG031952\train.csv
Dataset shape: (1460, 81)

Selected features: ['OverallQual', 'GrLivArea', 'TotalBsmtSF', 'GarageCars', 'YearBuilt', 'Neighborhood']
Target variable: SalePrice

2. Data Preprocessing...

Missing values before handling:
OverallQual     0
GrLivArea       0
TotalBsmtSF     0
GarageCars      0
YearBuilt       0
Neighborhood    0
SalePrice       0
dtype: int64

Missing values after handling:
OverallQual     0
GrLivArea       0
TotalBsmtSF     0
GarageCars      0
YearBuilt       0
Neighborhood    0
SalePrice       0
dtype: int64

3. Encoding categorical variabl