# 📈 Linear Regression — EDA & Modeling Notebook

This notebook walks through:
1. Exploratory Data Analysis
2. From-Scratch OLS vs Gradient Descent
3. Scikit-learn with Regularization
4. Full Diagnostic Suite
5. Model Comparison

In [None]:
import sys
sys.path.insert(0, '..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing

%matplotlib inline
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['figure.dpi'] = 120

print('Setup complete âœ“')

## 1. Load & Explore Data

In [None]:
# Fetch the California housing data as pandas objects; `df` is the frame the
# EDA cells below read from (it includes the 'MedHouseVal' target column).
housing = fetch_california_housing(as_frame=True)
df = housing.frame

print(
    f'Shape: {df.shape}',
    f'\nTarget: MedHouseVal (median house value in $100K)',
    sep='\n',
)

# Bare last expression so the notebook renders the first rows as a table.
df.head()

In [None]:
# Summary statistics from DataFrame.describe(), rounded to three decimals for display.
summary_stats = df.describe()
summary_stats.round(3)

In [None]:
# Side-by-side overview of the target variable: its histogram (left) and each
# other column's correlation with it (right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

# Left panel: histogram of the target.
axes[0].hist(df['MedHouseVal'], bins=50, color='#2563eb', alpha=0.8, edgecolor='white')
axes[0].set_title('Target Distribution')
axes[0].set_xlabel('Median House Value ($100K)')

# Right panel: correlations sorted ascending; negative values are drawn in red,
# everything else in blue.
corr = df.corr()['MedHouseVal'].drop('MedHouseVal').sort_values()
bar_colors = ['#dc2626' if is_negative else '#2563eb' for is_negative in corr.lt(0)]
corr.plot.barh(ax=axes[1], color=bar_colors)
axes[1].set_xlabel('Pearson Correlation')
axes[1].set_title('Feature Correlations with Target')

plt.tight_layout()
plt.show()

In [None]:
# Heatmap of the pairwise correlations between every column of `df`.
corr_matrix = df.corr()
heatmap_style = dict(annot=True, fmt='.2f', cmap='RdBu_r', center=0, square=True)

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, **heatmap_style)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

## 2. Preprocessing & Modeling

In [None]:
import yaml
from src.data.loader import create_dataset
from src.data.preprocessor import Preprocessor
from src.models.linear_regression_scratch import LinearRegressionScratch
from src.models.linear_regression_sklearn import LinearRegressionSklearn
from src.evaluation.metrics import compute_all_metrics
from src.evaluation.diagnostics import run_diagnostics
from src.evaluation import visualizations as viz

# Load the experiment configuration. The encoding is pinned to UTF-8 so that
# parsing does not depend on the platform's locale default.
with open('../configs/config.yaml', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Build the dataset from the config, then run it through the preprocessor.
# `dataset` is rebound to fit_transform's return value; later cells read
# X_train / X_test / y_train / y_test, feature_names and n_features from it.
dataset = create_dataset(config)
preprocessor = Preprocessor(config)
dataset = preprocessor.fit_transform(dataset)

print(dataset.summary())

### 2a. OLS (Normal Equation)

In [None]:
# Fit the from-scratch model with method='ols' on the training split.
ols_model = LinearRegressionScratch(method='ols')
ols_model.fit(dataset.X_train, dataset.y_train)

# Score the test-split predictions; the metrics are reused in the comparison table.
y_pred_ols = ols_model.predict(dataset.X_test)
ols_metrics = compute_all_metrics(dataset.y_test, y_pred_ols, dataset.n_features)

# Print one aligned row per feature weight, followed by the intercept.
coef_rows = list(zip(dataset.feature_names, ols_model.weights))
coef_rows.append(('intercept', ols_model.bias))

print('\nOLS Coefficients:')
for label, value in coef_rows:
    print(f'  {label:>15s}: {value:+.4f}')

### 2b. Gradient Descent

In [None]:
# Fit the from-scratch model with gradient descent on the training split.
gd_model = LinearRegressionScratch(
    method='gradient_descent',
    learning_rate=0.01,
    max_iterations=10000,
)
gd_model.fit(dataset.X_train, dataset.y_train)

# Score the test-split predictions; the metrics are reused in the comparison table.
y_pred_gd = gd_model.predict(dataset.X_test)
gd_metrics = compute_all_metrics(dataset.y_test, y_pred_gd, dataset.n_features)

# Convergence plot of the recorded cost history on a log-scaled y axis.
# NOTE(review): the x label implies cost_history holds one entry per 100
# iterations — confirm against LinearRegressionScratch.
plt.figure(figsize=(10, 4))
plt.plot(gd_model.cost_history, color='#2563eb', linewidth=2)
# The multiplication sign was previously mis-encoded (mojibake) in this label.
plt.xlabel('Iteration (×100)')
plt.ylabel('MSE Loss')
plt.title('Gradient Descent Convergence')
plt.yscale('log')
plt.grid(True, alpha=0.3)
plt.show()

### 2c. Ridge Regression (sklearn)

In [None]:
# Configuration handed to the sklearn wrapper: ridge method, the alpha values
# listed under 'alpha_search', and the number of CV folds.
ridge_config = dict(
    model=dict(
        method='ridge',
        regularization=dict(
            alpha_search=[0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
            cv_folds=5,
        ),
    ),
)

ridge_model = LinearRegressionSklearn(ridge_config)
ridge_model.fit(dataset.X_train, dataset.y_train)

# Score the test-split predictions; the metrics are reused in the comparison table.
y_pred_ridge = ridge_model.predict(dataset.X_test)
ridge_metrics = compute_all_metrics(dataset.y_test, y_pred_ridge, dataset.n_features)

print(f'Best alpha: {ridge_model.best_params}')
print('\nFeature Importance:')
# get_feature_importance yields (name, coefficient) pairs, printed one per row.
importance = ridge_model.get_feature_importance(dataset.feature_names)
for feature, weight in importance:
    print(f'  {feature:>15s}: {weight:+.4f}')

## 3. Model Comparison

In [None]:
# One column per model; the rows come from the keys of each metrics mapping.
comparison = pd.DataFrame({
    'OLS (scratch)': ols_metrics,
    'Gradient Descent': gd_metrics,
    'Ridge (sklearn)': ridge_metrics,
}).round(4)

# The metric names are ROW labels here. A plain list passed as `subset` is
# interpreted by Styler as COLUMN labels (which are the model names) and raises
# KeyError at render time, so the rows are selected explicitly with IndexSlice.
lower_is_better = pd.IndexSlice[['mse', 'rmse', 'mae'], :]
higher_is_better = pd.IndexSlice[['r2', 'adj_r2'], :]

# axis=1 compares across models within each metric row and highlights the best one.
(
    comparison.style
    .highlight_min(axis=1, subset=lower_is_better, props='background-color: #d4edda')
    .highlight_max(axis=1, subset=higher_is_better, props='background-color: #d4edda')
)

## 4. Diagnostics

In [None]:
# Run the diagnostics on the OLS test-split predictions.
diagnostic_inputs = (dataset.X_test, dataset.y_test, y_pred_ols, dataset.feature_names)
report = run_diagnostics(*diagnostic_inputs)

# Diagnostic dashboard built from the same targets and predictions plus the
# report's cooks_distance attribute.
fig = viz.create_diagnostic_dashboard(dataset.y_test, y_pred_ols, report.cooks_distance)
plt.show()

In [None]:
# Actual vs predicted values for the OLS model on the test split.
y_true_test = dataset.y_test
fig = viz.plot_actual_vs_predicted(y_true_test, y_pred_ols)
plt.show()

In [None]:
# Plot the fitted OLS weights against their feature names.
ols_weights = ols_model.weights
fig = viz.plot_coefficient_bar(dataset.feature_names, ols_weights)
plt.show()