### Part a : Ordinary Least Square (OLS) for the Runge function

*Write your own code* (using for example the  pseudoinverse function **pinv** from  **Numpy** ) and perform a standard **ordinary least square regression**
analysis using polynomials in $x$ up to  order $15$ or higher. Explore the dependence on the number of data points and the polynomial degree.

Evaluate the mean Squared error (MSE)

---
answer


In [10]:
import sys
sys.path.append('../code')

from data_processing import RungeData, create_polynomial_features
from regression_models import OLSRegression
from evaluation_metrics import mse, r2_score

runge_data = RungeData(n_points=100)
runge_data.add_noise()
runge_data.scale()

X_train, X_test, y_train, y_test = runge_data.split()

degree = 5
X_train_poly = create_polynomial_features(X_train, degree)
X_test_poly = create_polynomial_features(X_test, degree)

model = OLSRegression()
model.fit(X_train_poly, y_train, method='pinv')

y_pred = model.predict(X_test_poly)

print(f"Degree {degree}:")
print(f"MSE: {mse(y_test, y_pred):.6f}")
print(f"R²: {r2_score(y_test, y_pred):.6f}")
print(f"Parameters: {model.theta[:3]}...")

Degree 5:
MSE: 0.447822
R²: -0.056579
Parameters: [ 0.14532937 -0.32074898 -0.35453549]...


### Comments

Todo:

Appropriate explaination on OLS scaling invariance from
https://github.com/viktorbgulbrandsen/fysstk3155/blob/main/uke38/exercises.ipynb



### Part b : Ridge regression for the Runge function

Ridge regression with different regularization methods for comparison:

In [11]:
from data_processing import create_polynomial_features
from regression_models import RidgeRegression
import numpy as np

ridge_data = RungeData(n_points=100)
ridge_data.add_noise()
ridge_data.scale()

X_train, X_test, y_train, y_test = ridge_data.split()

degree = 5
X_train_poly = create_polynomial_features(X_train, degree)
X_test_poly = create_polynomial_features(X_test, degree)

lambdas = [0.001, 0.1, 1.0, 10.0, 100.0]

print("Ridge Regression Results:")
print("Lambda\t\tMSE\t\tR²")
print("-" * 35)

for lmbda in lambdas:
    ridge_model = RidgeRegression()
    ridge_model.fit(X_train_poly, y_train, lmbda=lmbda)
    
    y_pred = ridge_model.predict(X_test_poly)
    
    mse_val = mse(y_test, y_pred)
    r2_val = r2_score(y_test, y_pred)
    
    print(f"{lmbda:.3f}\t\t{mse_val:.6f}\t{r2_val:.6f}")

print("\nCompare with OLS:")
ols_model = OLSRegression()
ols_model.fit(X_train_poly, y_train, method='pinv')
y_pred_ols = ols_model.predict(X_test_poly)

print(f"OLS\t\t{mse(y_test, y_pred_ols):.6f}\t{r2_score(y_test, y_pred_ols):.6f}")

ridge_small = RidgeRegression()
ridge_small.fit(X_train_poly, y_train, lmbda=0.1)

ridge_large = RidgeRegression()
ridge_large.fit(X_train_poly, y_train, lmbda=10.0)

print(f"\nParameter magnitude comparison:")
print(f"OLS max |θ|: {np.max(np.abs(ols_model.theta)):.4f}")
print(f"Ridge (λ=0.1) max |θ|: {np.max(np.abs(ridge_small.theta)):.4f}")
print(f"Ridge (λ=10.0) max |θ|: {np.max(np.abs(ridge_large.theta)):.4f}")

Ridge Regression Results:
Lambda		MSE		R²
-----------------------------------
0.001		0.447816	-0.056565
0.100		0.447356	-0.055479
1.000		0.447420	-0.055629
10.000		0.450419	-0.062707
100.000		0.438751	-0.035176

Compare with OLS:
OLS		0.447822	-0.056579

Parameter magnitude comparison:
OLS max |θ|: 0.7787
Ridge (λ=0.1) max |θ|: 0.7294
Ridge (λ=10.0) max |θ|: 0.1463


In [12]:
print("\nRidge method comparison (λ=1.0):")
print("Method\t\tMSE\t\tR²")
print("-" * 35)

methods = ['analytical', 'cholesky']

for method in methods:
    ridge_method = RidgeRegression()
    ridge_method.fit(X_train_poly, y_train, lmbda=1.0, method=method)
    
    y_pred_method = ridge_method.predict(X_test_poly)
    
    mse_val = mse(y_test, y_pred_method)
    r2_val = r2_score(y_test, y_pred_method)
    
    print(f"{method}\t{mse_val:.6f}\t{r2_val:.6f}")
    
print("\nNote: Both methods should give identical results")
print("Cholesky method using np.linalg.solve() is more numerically stable")


Ridge method comparison (λ=1.0):
Method		MSE		R²
-----------------------------------
analytical	0.447420	-0.055629
cholesky	0.447420	-0.055629

Note: Both methods should give identical results
Cholesky method using np.linalg.solve() is more numerically stable


In [13]:
from data_processing import create_polynomial_features
from regression_models import LassoRegression, RidgeRegression
from gradient_descent import GradientDescent
import numpy as np

lasso_data = RungeData(n_points=100)
lasso_data.add_noise()
lasso_data.scale()

X_train, X_test, y_train, y_test = lasso_data.split()

degree = 5
X_train_poly = create_polynomial_features(X_train, degree)
X_test_poly = create_polynomial_features(X_test, degree)

lambdas = [0.001, 0.01, 0.1, 1.0, 10.0]

print("LASSO Regression Results:")
print("Lambda\t\tMSE\t\tR2\t\tNon-zero params")
print("-" * 50)

for lmbda in lambdas:
    lasso = LassoRegression(lmbda=lmbda)
    optimizer = GradientDescent()
    
    optimizer.optimize(lasso, X_train_poly, y_train, learning_rate=0.01, max_iter=1000)
    
    y_pred = lasso.predict(X_test_poly)
    
    mse_val = mse(y_test, y_pred)
    r2_val = r2_score(y_test, y_pred)
    non_zero_params = np.sum(np.abs(lasso.theta) > 1e-4)
    
    print(f"{lmbda:.3f}\t\t{mse_val:.6f}\t{r2_val:.6f}\t{non_zero_params}")

print(f"\nParameter sparsity comparison (λ=0.1):")

lasso_sparse = LassoRegression(lmbda=0.1)
optimizer.optimize(lasso_sparse, X_train_poly, y_train)

ridge_compare = RidgeRegression()
ridge_compare.fit(X_train_poly, y_train, lmbda=0.1)

print(f"LASSO max |θ|: {np.max(np.abs(lasso_sparse.theta)):.4f}")
print(f"LASSO non-zero params: {np.sum(np.abs(lasso_sparse.theta) > 1e-4)}")
print(f"Ridge max |θ|: {np.max(np.abs(ridge_compare.theta)):.4f}") 
print(f"Ridge non-zero params: {np.sum(np.abs(ridge_compare.theta) > 1e-4)}")

print(f"\nNote: LASSO sets many parameters to exactly zero (feature selection)")
print(f"Ridge shrinks parameters but keeps them non-zero")

LASSO Regression Results:
Lambda		MSE		R2		Non-zero params
--------------------------------------------------
0.001		0.454026	-0.071216	6
0.010		0.459765	-0.084757	6
0.100		0.438133	-0.033719	6
1.000		0.436964	-0.030961	6
10.000		0.468867	-0.106232	6

Parameter sparsity comparison (λ=0.1):
LASSO max |θ|: 0.0198
LASSO non-zero params: 6
Ridge max |θ|: 0.7294
Ridge non-zero params: 6

Note: LASSO sets many parameters to exactly zero (feature selection)
Ridge shrinks parameters but keeps them non-zero
