# Notebook 04: Linear Models with Parameter Simulation

Deep dive into linear models with comprehensive parameter exploration.

## Learning Objectives
- Understand linear regression fundamentals
- Explore regularization (Ridge, Lasso, ElasticNet)
- Simulate effects of noise, outliers, and multicollinearity
- Visualize decision boundaries for classification

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_regression, make_classification
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import (
    LinearRegression, Ridge, Lasso, ElasticNet,
    LogisticRegression, RidgeClassifier
)
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

# Seed NumPy's global RNG so the np.random draws in later cells are repeatable
np.random.seed(42)
# Apply the 'seaborn-v0_8-whitegrid' Matplotlib style sheet to subsequent figures
plt.style.use('seaborn-v0_8-whitegrid')

## Part 1: Linear Regression Fundamentals

In [None]:
# Sample 200 points from the line y = 2x + 1 and perturb them with Gaussian noise
n_samples = 200
X = np.random.uniform(-3, 3, n_samples).reshape(-1, 1)
y_true = 2 * X.ravel() + 1  # noise-free targets
y = y_true + np.random.randn(n_samples) * 0.5  # noise standard deviation = 0.5

# Ordinary least-squares fit
model = LinearRegression()
model.fit(X, y)
slope = model.coef_[0]
intercept = model.intercept_

# Evenly spaced grid used to draw the fitted and the true line
X_line = np.linspace(-3, 3, 100).reshape(-1, 1)
y_pred = model.predict(X_line)

# Scatter of the data with the learned line (red) and the generating line (green)
fitted_label = f'y = {slope:.2f}x + {intercept:.2f}'
plt.figure(figsize=(10, 6))
plt.scatter(X, y, alpha=0.6, label='Data')
plt.plot(X_line, y_pred, 'r-', linewidth=2, label=fitted_label)
plt.plot(X_line, 2 * X_line + 1, 'g--', linewidth=1, label='True: y = 2x + 1')
plt.xlabel('X')
plt.ylabel('y')
plt.title('Linear Regression')
plt.legend()
plt.show()

# Compare the recovered parameters with the ones used to generate the data
summary_lines = (
    f"Learned coefficient: {slope:.4f} (true: 2.0)",
    f"Learned intercept: {intercept:.4f} (true: 1.0)",
    f"R² score: {model.score(X, y):.4f}",
)
for line in summary_lines:
    print(line)

## Part 2: Effect of Noise Level

In [None]:
# Refit the same y = 2x + 1 model under increasing amounts of Gaussian noise
noise_levels = [0.1, 0.5, 1.0, 2.0]

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.ravel()

results = []

# One subplot per noise level; n_samples and X_line come from the previous cell
for ax, noise in zip(axes, noise_levels):
    # Draw a fresh sample for this noise level
    X = np.random.uniform(-3, 3, n_samples).reshape(-1, 1)
    y = 2 * X.ravel() + 1 + np.random.randn(n_samples) * noise

    model = LinearRegression()
    model.fit(X, y)

    y_pred = model.predict(X_line)
    r2 = model.score(X, y)  # in-sample R²

    results.append(dict(
        noise=noise,
        coef=model.coef_[0],
        intercept=model.intercept_,
        r2=r2,
    ))

    # Data, fitted line (red) and generating line (green, dashed)
    ax.scatter(X, y, alpha=0.6)
    ax.plot(X_line, y_pred, 'r-', linewidth=2)
    ax.plot(X_line, 2 * X_line + 1, 'g--', linewidth=1)
    ax.set_title(f'Noise = {noise}, R² = {r2:.3f}')
    ax.set_xlabel('X')
    ax.set_ylabel('y')

plt.tight_layout()
plt.show()

# Tabulate the learned parameters and R² for each noise level
df_results = pd.DataFrame(results)
print("\nEffect of Noise on Model Performance:")
print(df_results.to_string(index=False))

## Part 3: Regularization - Ridge, Lasso, ElasticNet

In [None]:
# Synthetic regression problem: 50 features, of which only 10 are informative
X, y, coef_true = make_regression(
    n_samples=200,
    n_features=50,
    n_informative=10,
    noise=10,
    coef=True,
    random_state=42,
)

# Hold out 20% of the rows; the scaler is fitted on the training rows only
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

n_train_rows, n_columns = X_train.shape
print(f"True non-zero coefficients: {np.count_nonzero(coef_true)}")
print(f"Training samples: {n_train_rows}")
print(f"Features: {n_columns}")

In [None]:
# Fit an unregularized model and three regularized ones on the same scaled data
models = {
    'Linear': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=1.0),
    'ElasticNet': ElasticNet(alpha=1.0, l1_ratio=0.5),
}

results = []
coefs = {}

for name, estimator in models.items():
    estimator.fit(X_train_scaled, y_train)
    fitted_coef = estimator.coef_

    # Keep the coefficient vector for the bar plots in the next cell
    coefs[name] = fitted_coef

    results.append({
        'Model': name,
        'Train R²': estimator.score(X_train_scaled, y_train),
        'Test R²': estimator.score(X_test_scaled, y_test),
        # Coefficients with |value| <= 0.01 are counted as zero
        'Non-zero coefs': np.count_nonzero(np.abs(fitted_coef) > 0.01),
    })

df_results = pd.DataFrame(results)
print("Model Comparison:")
print(df_results.to_string(index=False))

In [None]:
# One bar chart of fitted coefficients per model stored in `coefs`
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.ravel()

for ax, (name, coef) in zip(axes, coefs.items()):
    feature_positions = range(len(coef))
    ax.bar(feature_positions, coef, alpha=0.7)
    # Thin zero line makes the sign of each coefficient easy to read
    ax.axhline(y=0, color='k', linestyle='-', linewidth=0.5)
    ax.set_title(f'{name} Coefficients')
    ax.set_xlabel('Feature Index')
    ax.set_ylabel('Coefficient')

plt.tight_layout()
plt.show()

## Part 4: Alpha Parameter Sweep

In [None]:
# Sweep alpha values for Ridge.
# For every alpha we record train R², 5-fold cross-validated R² (training data
# only), held-out test R², and the L2 norm of the coefficient vector.
alphas = np.logspace(-4, 4, 50)

ridge_train_scores = []
ridge_cv_scores = []
ridge_test_scores = []
ridge_coef_norms = []

for alpha in alphas:
    # Cross-validate on the training split only, so the test split plays no
    # part in choosing alpha. cross_val_score fits clones of the estimator.
    cv_r2 = cross_val_score(
        Ridge(alpha=alpha), X_train_scaled, y_train, cv=5, scoring='r2'
    )
    ridge_cv_scores.append(cv_r2.mean())

    model = Ridge(alpha=alpha)
    model.fit(X_train_scaled, y_train)

    ridge_train_scores.append(model.score(X_train_scaled, y_train))
    ridge_test_scores.append(model.score(X_test_scaled, y_test))
    ridge_coef_norms.append(np.linalg.norm(model.coef_))

# Plot
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Scores vs alpha
axes[0].semilogx(alphas, ridge_train_scores, 'b-', label='Train')
axes[0].semilogx(alphas, ridge_cv_scores, 'g--', label='CV (5-fold, train only)')
axes[0].semilogx(alphas, ridge_test_scores, 'r-', label='Test')
axes[0].set_xlabel('Alpha')
axes[0].set_ylabel('R² Score')
axes[0].set_title('Ridge: Score vs Alpha')
axes[0].legend()
axes[0].grid(True)

# Coefficient norm vs alpha
axes[1].loglog(alphas, ridge_coef_norms, 'g-')
axes[1].set_xlabel('Alpha')
axes[1].set_ylabel('Coefficient L2 Norm')
axes[1].set_title('Ridge: Coefficient Shrinkage')
axes[1].grid(True)

plt.tight_layout()
plt.show()

# Pick alpha by cross-validation; picking it by the test score would make the
# reported test R² an optimistically biased estimate.
optimal_idx = int(np.argmax(ridge_cv_scores))
print(f"Optimal alpha (5-fold CV): {alphas[optimal_idx]:.4f}")
print(f"CV R² at optimal alpha: {ridge_cv_scores[optimal_idx]:.4f}")
print(f"Test R² at optimal alpha: {ridge_test_scores[optimal_idx]:.4f}")

In [None]:
# Lasso over the alpha grid defined in the Ridge sweep: track test R² and sparsity
lasso_n_nonzero = []
lasso_test_scores = []

for alpha in alphas:
    model = Lasso(alpha=alpha, max_iter=10000)
    model.fit(X_train_scaled, y_train)

    lasso_test_scores.append(model.score(X_test_scaled, y_test))
    # Coefficients with magnitude at or below 1e-6 are counted as zero
    surviving = np.abs(model.coef_) > 1e-6
    lasso_n_nonzero.append(np.sum(surviving))

# Left panel: test score; right panel: number of surviving coefficients
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
score_ax, sparsity_ax = axes

score_ax.semilogx(alphas, lasso_test_scores, 'r-')
score_ax.set_title('Lasso: Test Score vs Alpha')
score_ax.set_xlabel('Alpha')
score_ax.set_ylabel('Test R² Score')
score_ax.grid(True)

sparsity_ax.semilogx(alphas, lasso_n_nonzero, 'b-')
sparsity_ax.set_title('Lasso: Feature Selection')
sparsity_ax.set_xlabel('Alpha')
sparsity_ax.set_ylabel('Number of Non-zero Coefficients')
sparsity_ax.grid(True)

plt.tight_layout()
plt.show()

## Part 5: Effect of Multicollinearity

In [None]:
# Build a 3-feature design where the third column nearly duplicates the first
n_samples = 200

# The four random draws below happen in this order: X1, X2, X3 noise, target noise
X1 = np.random.randn(n_samples)
X2 = np.random.randn(n_samples)
X3 = X1 + np.random.randn(n_samples) * 0.1  # X1 plus small noise (std 0.1)

# The target depends on X1 and X2 only
y = 3 * X1 + 2 * X2 + np.random.randn(n_samples) * 0.5

# Two design matrices: without and with the near-duplicate column
X_no_corr = np.column_stack([X1, X2])
X_with_corr = np.column_stack([X1, X2, X3])

corr_x1_x3 = np.corrcoef(X1, X3)[0, 1]
print(f"Correlation between X1 and X3: {corr_x1_x3:.4f}")

In [None]:
# Compare Linear Regression vs Ridge with multicollinearity
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Each entry pairs an estimator with the design matrix it is evaluated on
models_mc = {
    'Linear (no corr)': (LinearRegression(), X_no_corr),
    'Linear (with corr)': (LinearRegression(), X_with_corr),
    'Ridge (with corr)': (Ridge(alpha=1.0), X_with_corr)
}

print("Effect of Multicollinearity:")
print("="*60)

for name, (model, X_data) in models_mc.items():
    # Coefficients are reported for a fit on the full, standardized data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_data)
    model.fit(X_scaled, y)

    # For cross-validation the scaler lives inside the pipeline, so every fold
    # is standardized with statistics from its own training part only.
    # Scaling all rows first would leak validation-fold statistics.
    # cross_val_score fits clones, so `model` keeps the full-data fit above.
    cv_pipeline = make_pipeline(StandardScaler(), model)
    cv_scores = cross_val_score(cv_pipeline, X_data, y, cv=5, scoring='r2')

    print(f"\n{name}:")
    print(f"  Coefficients: {model.coef_}")
    print(f"  CV R² Score: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")

## Part 6: Effect of Outliers

In [None]:
# Compare ordinary least squares with Huber regression on clean and
# outlier-contaminated versions of the same sample.
# Imported once at the top of the cell rather than on every loop iteration.
from sklearn.linear_model import HuberRegressor

# Generate data with outliers
n_samples = 100
X = np.random.uniform(-3, 3, n_samples).reshape(-1, 1)
y = 2 * X.ravel() + 1 + np.random.randn(n_samples) * 0.3

# Add outliers: shift 5 randomly chosen targets by 5 to 10 units, up or down
n_outliers = 5
outlier_idx = np.random.choice(n_samples, n_outliers, replace=False)
y_outliers = y.copy()
y_outliers[outlier_idx] += np.random.uniform(5, 10, n_outliers) * np.sign(np.random.randn(n_outliers))

# Compare models
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

X_line = np.linspace(-3, 3, 100).reshape(-1, 1)

for idx, (y_data, title) in enumerate([(y, 'Without Outliers'), (y_outliers, 'With Outliers')]):
    # Fit both models on the same targets
    lr = LinearRegression().fit(X, y_data)
    huber = HuberRegressor().fit(X, y_data)  # Huber loss for robust regression

    # Plot
    axes[idx].scatter(X, y_data, alpha=0.6)
    if idx == 1:
        # Mark the perturbed points on the contaminated panel only
        axes[idx].scatter(X[outlier_idx], y_data[outlier_idx], c='red', s=100, marker='x', label='Outliers')

    axes[idx].plot(X_line, lr.predict(X_line), 'b-', lw=2, label=f'Linear (coef={lr.coef_[0]:.2f})')
    axes[idx].plot(X_line, huber.predict(X_line), 'g--', lw=2, label=f'Huber (coef={huber.coef_[0]:.2f})')
    axes[idx].plot(X_line, 2 * X_line + 1, 'k:', lw=1, label='True')

    axes[idx].set_xlabel('X')
    axes[idx].set_ylabel('y')
    axes[idx].set_title(title)
    axes[idx].legend()

plt.tight_layout()
plt.show()

## Part 7: Polynomial Regression

In [None]:
# Cubic ground truth, y = 0.5x³ - 2x² + x, observed with Gaussian noise (std 2)
n_samples = 100
X = np.sort(np.random.uniform(-3, 3, n_samples)).reshape(-1, 1)
x_flat = X.ravel()
y = 0.5 * x_flat**3 - 2 * x_flat**2 + x_flat + np.random.randn(n_samples) * 2

# Polynomial degrees to compare, from underfit (1) to very flexible (10)
degrees = [1, 2, 3, 5, 10]

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

# Grid on which each fitted curve is drawn
X_plot = np.linspace(-3, 3, 100).reshape(-1, 1)

for ax, degree in zip(axes, degrees):
    # Expand x into polynomial terms up to `degree`
    poly = PolynomialFeatures(degree=degree)
    X_poly = poly.fit_transform(X)
    X_plot_poly = poly.transform(X_plot)

    # Plain least squares on the expanded features
    model = LinearRegression()
    model.fit(X_poly, y)
    y_pred = model.predict(X_plot_poly)

    # R² on the data the model was fitted to
    train_score = model.score(X_poly, y)

    ax.scatter(X, y, alpha=0.6)
    ax.plot(X_plot, y_pred, 'r-', lw=2)
    ax.set_title(f'Degree {degree} (R² = {train_score:.3f})')
    ax.set_xlabel('X')
    ax.set_ylabel('y')
    ax.set_ylim(-30, 30)

# Five degrees on a 2x3 grid: hide the unused last panel
axes[-1].axis('off')
plt.tight_layout()
plt.show()

In [None]:
# Degree-10 polynomial fitted with no penalty and with two Ridge penalties.
# X, y and X_plot come from the previous cell.
degree = 10
poly = PolynomialFeatures(degree=degree)
X_poly = poly.fit_transform(X)
X_plot_poly = poly.transform(X_plot)

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

alphas_poly = [0, 0.01, 1.0]

for ax, alpha in zip(axes, alphas_poly):
    # alpha == 0 means "no regularization": use plain least squares
    model = LinearRegression() if alpha == 0 else Ridge(alpha=alpha)
    model.fit(X_poly, y)
    y_pred = model.predict(X_plot_poly)

    ax.scatter(X, y, alpha=0.6)
    ax.plot(X_plot, y_pred, 'r-', lw=2)
    ax.set_title(f'Degree {degree}, Alpha = {alpha}')
    ax.set_xlabel('X')
    ax.set_ylabel('y')
    ax.set_ylim(-30, 30)

plt.tight_layout()
plt.show()

## Part 8: Logistic Regression for Classification

In [None]:
# Two interleaving half-moons: a binary problem that is not linearly separable
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=300, noise=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize using statistics from the training rows only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Scatter of the raw (unscaled) points, coloured by class
feature_1, feature_2 = X[:, 0], X[:, 1]
plt.figure(figsize=(8, 6))
plt.scatter(feature_1, feature_2, c=y, cmap='RdYlBu', edgecolors='black')
plt.title('Classification Data')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

In [None]:
# Decision boundary function
def plot_decision_boundary(model, X, y, ax, title, h=0.02):
    """Draw a classifier's predicted regions with the data points on top.

    A regular grid covering the range of the first two columns of ``X``
    (padded by 0.5 on every side) is passed to ``model.predict``; the
    predictions are drawn as filled contours and the points in ``X`` are
    scattered over them, coloured by ``y``.

    Parameters
    ----------
    model : object
        Anything with a ``predict`` method that accepts an ``(n, 2)`` array
        and returns ``n`` values.
    X : ndarray
        2-D array; column 0 is used for the x-axis and column 1 for the y-axis.
    y : array-like
        Values used to colour the scattered points.
    ax : object
        Axes to draw on; must provide ``contourf``, ``scatter`` and
        ``set_title``.
    title : str
        Title set on ``ax``.
    h : float, default 0.02
        Step of the prediction grid along both axes. Smaller values give a
        smoother boundary at the cost of more predictions.

    Raises
    ------
    ValueError
        If ``h`` is not positive.
    """
    if h <= 0:
        raise ValueError(f"h must be positive, got {h!r}")

    # Pad the data range so boundary points are not drawn on the plot edge
    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # Predict every grid node at once, then restore the grid shape for contourf
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    # The colormap is given by its registered name, so the function does not
    # depend on a global pyplot import.
    ax.contourf(xx, yy, Z, alpha=0.8, cmap='RdYlBu')
    ax.scatter(X[:, 0], X[:, 1], c=y, cmap='RdYlBu', edgecolors='black', s=30)
    ax.set_title(title)

# Decision boundaries of logistic regression for weak, default and strong C
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

C_values = [0.01, 1.0, 100.0]  # C = 1/alpha (inverse regularization)

for ax, C in zip(axes, C_values):
    model = LogisticRegression(C=C, max_iter=1000)
    model.fit(X_train_scaled, y_train)

    # Accuracy on the held-out split goes into the panel title
    acc = model.score(X_test_scaled, y_test)
    panel_title = f'C = {C} (Acc = {acc:.3f})'
    plot_decision_boundary(model, X_train_scaled, y_train, ax, panel_title)

plt.tight_layout()
plt.show()

In [None]:
# C parameter sweep.
# For every C we record train accuracy, 5-fold cross-validated accuracy
# (training data only) and held-out test accuracy.
C_range = np.logspace(-3, 3, 30)

train_scores = []
cv_scores_by_C = []
test_scores = []

for C in C_range:
    # Cross-validate on the training split only, so the test split plays no
    # part in choosing C. cross_val_score fits clones of the estimator.
    fold_acc = cross_val_score(
        LogisticRegression(C=C, max_iter=1000), X_train_scaled, y_train, cv=5
    )
    cv_scores_by_C.append(fold_acc.mean())

    model = LogisticRegression(C=C, max_iter=1000)
    model.fit(X_train_scaled, y_train)

    train_scores.append(model.score(X_train_scaled, y_train))
    test_scores.append(model.score(X_test_scaled, y_test))

plt.figure(figsize=(10, 6))
plt.semilogx(C_range, train_scores, 'b-', label='Train')
plt.semilogx(C_range, cv_scores_by_C, 'g--', label='CV (5-fold, train only)')
plt.semilogx(C_range, test_scores, 'r-', label='Test')
plt.xlabel('C (Inverse Regularization)')
plt.ylabel('Accuracy')
plt.title('Logistic Regression: Effect of Regularization')
plt.legend()
plt.grid(True)
plt.show()

# Pick C by cross-validation; picking it by the test score would make the
# reported test accuracy an optimistically biased estimate.
optimal_idx_C = int(np.argmax(cv_scores_by_C))
optimal_C = C_range[optimal_idx_C]
print(f"Optimal C (5-fold CV): {optimal_C:.4f}")
print(f"Test accuracy at optimal C: {test_scores[optimal_idx_C]:.4f}")

## Summary

In this notebook, you learned:

### Linear Regression
- Basic linear regression fitting
- Effect of noise on model performance

### Regularization
- **Ridge**: L2 penalty, shrinks coefficients
- **Lasso**: L1 penalty, feature selection (sparse)
- **ElasticNet**: Combination of L1 and L2

### Parameter Effects
- Higher alpha = more regularization = simpler model (for `LogisticRegression`, `C` is the inverse: smaller C = more regularization)
- Regularization helps with multicollinearity
- Combining polynomial features with regularization helps prevent overfitting

### Key Takeaways
- Use Ridge when all features might be relevant
- Use Lasso for automatic feature selection
- Regularization is crucial for high-dimensional data
- Always tune the regularization parameter (alpha/C)

### Next Steps
Continue to **Notebook 05** for tree and ensemble models.