In [20]:
# Import packages and data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the Iris dataset and combine the four measurements with the class
# label into a single DataFrame (one row per flower).
from sklearn.datasets import load_iris

iris = load_iris()
df = (
    pd.DataFrame(data=iris.data, columns=iris.feature_names)
    .assign(target=iris.target)   # append the integer class label as the last column
)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [21]:
# Train/test split: petal length is the regression target, the other three
# measurements are the predictors.
from sklearn.model_selection import train_test_split

X = df.loc[:, ["sepal length (cm)", "sepal width (cm)", "petal width (cm)"]].to_numpy()
y = df["petal length (cm)"].to_numpy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=412
)


# Baseline: 5-fold cross-validated R^2 of a plain linear regression,
# evaluated on the training split only.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score # Scores a single estimator across CV folds

reg = LinearRegression()
cv_scores = cross_val_score(
    estimator=reg,
    X=X_train,
    y=y_train,
    scoring="r2",
    cv=5,
)
print("5-fold CV R^2 scores:", cv_scores)
print("Average R^2:", cv_scores.mean())

5-fold CV R^2 scores: [0.94771545 0.96705002 0.98205548 0.96174885 0.97299906]
Average R^2: 0.9663137725431324


In [22]:
# Find the best alpha and degree for different models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Candidate regressors. Ridge and Lasso additionally get their penalty
# strength (alpha) tuned; LinearRegression has no alpha.
models = {
    "Linear": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(max_iter=10000)
}
alpha_range = np.logspace(-3, 2, 10)    # 10 log-spaced alphas from 0.001 to 100
degree_range = [1, 2, 3, 4]             # Polynomial degrees 1 through 4

best_params = {}    # model name -> best hyperparameters, kept as real numbers
best_scores = {}    # model name -> mean cross-validated R^2 of the best setting
best_mse = {}       # model name -> MSE of the best pipeline on the test split


for name, model in models.items():
    pipeline = Pipeline([
        ("imp_mean", SimpleImputer(strategy="mean")),   # Replace NaNs with the column mean
        ("poly", PolynomialFeatures()), # Polynomial expansion; its degree is tuned by the grid
        ("scaler", StandardScaler()),   # Standardize features after poly
        ("regressor", model)
    ])

    # Every model searches over the polynomial degree; only the penalized
    # models (Ridge, Lasso) also search over alpha.
    param_grid = {"poly__degree": degree_range}
    if name != "Linear":
        param_grid["regressor__alpha"] = alpha_range

    grid = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring="r2")
    grid.fit(X_train, y_train)

    # Keep the winning hyperparameters as plain Python numbers (NumPy scalars
    # are unwrapped) so they can be passed straight back to an estimator.
    # Rounding/formatting is a display concern and happens only when printing.
    best_params[name] = {k: v.item() if isinstance(v, np.generic) else v
                         for k, v in grid.best_params_.items()}
    best_scores[name] = grid.best_score_

    # grid.predict delegates to the best pipeline found by the search
    y_pred_test = grid.predict(X_test)
    best_mse[name] = mean_squared_error(y_test, y_pred_test)


# Show the results
for name in models:
    # Format floats to 4 decimals for display only; stored values stay exact
    shown_params = {k: f"{v:.4f}" if isinstance(v, float) else v
                    for k, v in best_params[name].items()}
    print(f"{name} best params: {shown_params}, "
          f"Best CV R2: {best_scores[name]:.4f}, "
          f"Test MSE: {best_mse[name]:.4f}")

Linear best params: {'poly__degree': 2}, Best CV R2: 0.9707, Test MSE: 0.1186
Ridge best params: {'poly__degree': 4, 'regressor__alpha': '0.0464'}, Best CV R2: 0.9780, Test MSE: 0.1274
Lasso best params: {'poly__degree': 4, 'regressor__alpha': '0.0010'}, Best CV R2: 0.9767, Test MSE: 0.1168
