# 03 - Baseline Linear Models

**Objective:** Train and evaluate baseline linear regression models that predict `ClosePrice`, reporting R², RMSE, MAE, and MdAPE on the held-out test set.

**Models:**
1. Linear Regression (OLS)
2. Ridge Regression (L2 regularization)
3. Lasso Regression (L1 regularization)
4. Elastic Net (L1 + L2)

**Input:** `data/X_train.csv`, `data/X_test.csv`, `data/y_train.csv`, `data/y_test.csv`

In [10]:
# NOTE(review): filterwarnings('ignore') with no category silences every
# warning raised in this kernel process. That presumably includes solver
# convergence warnings from the Lasso / Elastic Net fits further down, so a
# non-converged model would go unnoticed - consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from pathlib import Path
# NOTE(review): json is not referenced in the cells visible here - confirm
# whether it is still needed.
import json
import joblib
from datetime import datetime

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform

# Timestamp the run so the captured outputs can be matched to an execution.
print(f"Notebook run: {datetime.now().isoformat()}")

Notebook run: 2025-10-24T09:49:05.542930


In [None]:
# Load processed data (works on Windows and Linux/Amarel)
# Paths are resolved against the current working directory, so the notebook
# must be launched from the directory that contains `data/`.
ROOT = Path.cwd()
DATA_DIR = ROOT / 'data'
MODELS_DIR = ROOT / 'models'
# parents=True so a missing intermediate directory does not abort the run.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

X_train = pd.read_csv(DATA_DIR / 'X_train.csv')
X_test = pd.read_csv(DATA_DIR / 'X_test.csv')
# Targets are stored as single-column CSVs; select the column to get a Series.
y_train = pd.read_csv(DATA_DIR / 'y_train.csv')['ClosePrice']
y_test = pd.read_csv(DATA_DIR / 'y_test.csv')['ClosePrice']

# Fail fast on mismatched splits instead of erroring deep inside a model fit.
assert len(X_train) == len(y_train), "X_train / y_train row counts differ"
assert len(X_test) == len(y_test), "X_test / y_test row counts differ"
assert list(X_train.columns) == list(X_test.columns), \
    "train and test feature columns differ"

print(f"Training: {X_train.shape}, Test: {X_test.shape}")

Training: (150311, 1022), Test: (22759, 1022)


In [13]:
# Standardize features for the linear models: the scaler learns its statistics
# from the training split only and the same transform is applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(features) for features in (X_train, X_test)
)

print("Features scaled (StandardScaler)")

Features scaled (StandardScaler)


In [14]:
# Evaluation function
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit ``model`` on the training split and report train/test metrics.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place; an estimator that is already fitted is refit.
    X_train, X_test : array-like
        Feature matrices for the training and test splits.
    y_train, y_test : array-like
        Targets matching the rows of ``X_train`` / ``X_test``.
    model_name : str
        Label stored under ``'model'`` in the results and used in the printout.

    Returns
    -------
    tuple
        ``(results, model)`` where ``results`` is a dict with the keys
        ``model``, ``train_r2``, ``test_r2``, ``test_rmse``, ``test_mae`` and
        ``test_mdape`` (median absolute percentage error, in percent), and
        ``model`` is the same estimator object after fitting.

    Notes
    -----
    Rows whose true test target is exactly 0 are left out of the MdAPE, since
    a percentage error is undefined there (a single 0/0 row would otherwise
    turn the median into NaN). If every test target is 0 the MdAPE is NaN.
    """
    model.fit(X_train, y_train)

    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Work on plain float arrays so pandas index alignment cannot interfere
    # and the zero-target mask applies positionally.
    y_true = np.asarray(y_test, dtype=float)
    y_hat = np.asarray(y_pred_test, dtype=float)
    nonzero = y_true != 0
    if nonzero.any():
        ape = np.abs((y_true[nonzero] - y_hat[nonzero]) / y_true[nonzero])
        mdape = np.median(ape) * 100
    else:
        mdape = np.nan

    results = {
        'model': model_name,
        'train_r2': r2_score(y_train, y_pred_train),
        'test_r2': r2_score(y_test, y_pred_test),
        'test_rmse': np.sqrt(mean_squared_error(y_test, y_pred_test)),
        'test_mae': mean_absolute_error(y_test, y_pred_test),
        'test_mdape': mdape
    }

    print(f"\n{model_name}:")
    print(f"  Train R²: {results['train_r2']:.4f}")
    print(f"  Test R²:  {results['test_r2']:.4f}")
    print(f"  RMSE:     ${results['test_rmse']:,.0f}")
    print(f"  MAE:      ${results['test_mae']:,.0f}")
    print(f"  MdAPE:    {results['test_mdape']:.2f}%")

    return results, model

# Accumulates one results dict per model; the model cells below append to it.
baseline_results = []

## 1. Linear Regression (OLS)

In [15]:
# Ordinary least squares has no hyperparameters to tune, so it is evaluated
# directly on the scaled splits and its metrics are added to the running list.
lr = LinearRegression()
lr_results, lr_model = evaluate_model(
    model=lr,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
    model_name="Linear Regression",
)
baseline_results.append(lr_results)


Linear Regression:
  Train R²: 0.6527
  Test R²:  0.6169
  RMSE:     $524,354
  MAE:      $319,024
  MdAPE:    38.31%


## 2. Ridge Regression (Tuned)

In [8]:
# Randomized search over the L2 penalty strength, sampled log-uniformly.
# n_iter / cv / n_jobs are kept small to limit memory use on the large
# training set (previously 20 iterations and 5 folds).
ridge_search = RandomizedSearchCV(
    estimator=Ridge(random_state=42),
    param_distributions=dict(alpha=loguniform(0.01, 100)),
    n_iter=10,
    scoring='r2',
    n_jobs=4,
    cv=3,
    random_state=42,
).fit(X_train_scaled, y_train)

best_ridge = ridge_search.best_estimator_

print(f"Best Ridge alpha: {ridge_search.best_params_['alpha']:.4f}")

# evaluate_model fits the estimator again on the full training split before
# scoring it; the returned model is the same object as best_ridge.
ridge_results, ridge_model = evaluate_model(
    model=best_ridge,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
    model_name="Ridge (tuned)",
)
baseline_results.append(ridge_results)

Best Ridge alpha: 63.5122

Ridge (tuned):
  Train R²: 0.6527
  Test R²:  0.6170
  RMSE:     $524,259
  MAE:      $318,933
  MdAPE:    38.32%

Ridge (tuned):
  Train R²: 0.6527
  Test R²:  0.6170
  RMSE:     $524,259
  MAE:      $318,933
  MdAPE:    38.32%


## 3. Lasso Regression (Tuned)

In [None]:
# Randomized search over the L1 penalty strength, sampled log-uniformly.
#
# precompute=True makes coordinate descent work on the Gram matrix of the
# features instead of the raw data. With far more rows than columns this
# should make each pass much cheaper; the objective being minimized is
# unchanged, so results are expected to agree up to solver tolerance.
# The previous run of this cell was interrupted before it completed.
lasso_search = RandomizedSearchCV(
    Lasso(random_state=42, max_iter=20000, tol=1e-4, precompute=True),
    param_distributions={'alpha': loguniform(0.01, 100)},
    n_iter=10,  # Reduced from 20 to save memory
    cv=3,  # Reduced from 5 to save memory with large dataset
    scoring='r2',
    random_state=42,
    n_jobs=4  # Limit parallel jobs to reduce memory usage
)

lasso_search.fit(X_train_scaled, y_train)
best_lasso = lasso_search.best_estimator_

print(f"Best Lasso alpha: {lasso_search.best_params_['alpha']:.4f}")

# evaluate_model fits best_lasso once more on the full training split before
# scoring; the returned model is the same object as best_lasso.
lasso_results, lasso_model = evaluate_model(best_lasso, X_train_scaled, X_test_scaled,
                                           y_train, y_test, "Lasso (tuned)")
baseline_results.append(lasso_results)

KeyboardInterrupt: 

## 4. Elastic Net (Tuned)

In [None]:
# Randomized search over the combined L1/L2 penalty: alpha is sampled
# log-uniformly and l1_ratio is drawn from a fixed grid. n_iter / cv / n_jobs
# are kept small to limit memory use (previously 30 iterations and 5 folds).
# NOTE(review): the recorded best alpha sits close to the 0.01 lower bound of
# the sampled range - it may be worth checking a wider range.
elastic_search = RandomizedSearchCV(
    estimator=ElasticNet(random_state=42, max_iter=20000, tol=1e-4),
    param_distributions=dict(
        alpha=loguniform(0.01, 100),
        l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.9],
    ),
    n_iter=15,
    scoring='r2',
    n_jobs=4,
    cv=3,
    random_state=42,
).fit(X_train_scaled, y_train)

best_elastic = elastic_search.best_estimator_

print(f"Best Elastic Net params: {elastic_search.best_params_}")

# evaluate_model fits the estimator again on the full training split before
# scoring it; the returned model is the same object as best_elastic.
elastic_results, elastic_model = evaluate_model(
    model=best_elastic,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
    model_name="Elastic Net (tuned)",
)
baseline_results.append(elastic_results)

Best Elastic Net params: {'alpha': np.float64(0.012366582530130834), 'l1_ratio': 0.5}

Elastic Net (tuned):
  Train R²: 0.9007
  Test R²:  0.6573
  RMSE:     $552,159
  MAE:      $303,438
  MdAPE:    19.15%

Elastic Net (tuned):
  Train R²: 0.9007
  Test R²:  0.6573
  RMSE:     $552,159
  MAE:      $303,438
  MdAPE:    19.15%


## Results Summary

In [None]:
# Summarize every baseline, persist the metrics table, and save the scaler
# together with the best-scoring model.
if not baseline_results:
    raise RuntimeError("baseline_results is empty - run the model cells first")

# Re-running a model cell appends a second row for the same model; keep only
# the most recent row per model so the table and the ranking are not skewed.
results_df = (
    pd.DataFrame(baseline_results)
    .drop_duplicates(subset='model', keep='last')
    .sort_values('test_r2', ascending=False)
)

print("\n" + "="*60)
print("BASELINE LINEAR MODELS SUMMARY")
print("="*60)
print(results_df.to_string(index=False))

# Save results
results_df.to_csv(MODELS_DIR / 'baseline_linear_results.csv', index=False)

# Save best model
# NOTE(review): the winner is chosen by test-set R², so the reported test
# score of the selected model is optimistically biased.
best_model_name = results_df.iloc[0]['model']
best_r2 = results_df.iloc[0]['test_r2']

print(f"\nBest baseline model: {best_model_name} (R² = {best_r2:.4f})")

# Map each reported model name to the notebook variable holding its fitted
# estimator. Only the winner is looked up, so a model cell that was skipped
# does not break this cell. Previously any non-Ridge winner (including
# Linear Regression or Lasso) caused the Elastic Net model to be saved.
fitted_model_vars = {
    'Linear Regression': 'lr_model',
    'Ridge (tuned)': 'best_ridge',
    'Lasso (tuned)': 'best_lasso',
    'Elastic Net (tuned)': 'best_elastic',
}
if best_model_name not in fitted_model_vars:
    raise KeyError(f"No fitted estimator registered for model {best_model_name!r}")
best_model = globals()[fitted_model_vars[best_model_name]]

joblib.dump(scaler, MODELS_DIR / 'scaler_baseline.joblib')
joblib.dump(best_model, MODELS_DIR / 'best_baseline_model.joblib')

print(f"\nModels saved to {MODELS_DIR}")
print("\n✅ Baseline linear models complete!")


BASELINE LINEAR MODELS SUMMARY
              model  train_r2  test_r2     test_rmse      test_mae  test_mdape
Elastic Net (tuned)  0.900662 0.657344 552159.461583 303437.925519   19.152770
      Ridge (tuned)  0.900719 0.655736 553453.567028 304351.106297   19.205854
  Linear Regression  0.900730 0.655412 553713.869116 304549.464485   19.196588
      Lasso (tuned)  0.900722 0.655377 553742.464251 304522.856574   19.204307

Best baseline model: Elastic Net (tuned) (R² = 0.6573)

Models saved to c:\Users\lpnhu\Downloads\home-price-prediction\models

✅ Baseline linear models complete!
              model  train_r2  test_r2     test_rmse      test_mae  test_mdape
Elastic Net (tuned)  0.900662 0.657344 552159.461583 303437.925519   19.152770
      Ridge (tuned)  0.900719 0.655736 553453.567028 304351.106297   19.205854
  Linear Regression  0.900730 0.655412 553713.869116 304549.464485   19.196588
      Lasso (tuned)  0.900722 0.655377 553742.464251 304522.856574   19.204307

Best baseline 