### Lab 5 - PRAMOD KUNJUKUNJU SAJI - 8856432

In [26]:
import pandas as pd
import numpy as np
from typing import List
from sklearn import datasets
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

##### 1. Utilize the diabetes dataset from lab 4. Perform cross-validation on nine polynomial models, ranging from degree 0 to 8.

In [27]:
X, y = datasets.load_diabetes(as_frame=True, scaled=False, return_X_y=True)

In [28]:
def create_polynomial_models(X: np.ndarray, y: np.ndarray, degrees: List[int]) -> dict[int, np.poly1d]:
    """
    Creates polynomial models for the given degrees and fits them to the given data.

    Args
        X: The x values of the data.
        y: The y values of the data.
        degrees: A list of polynomial degrees to try.

    Returns:
        A dictionary of polynomial models, with the polynomial degree as the key.
    """

    models = {}
    for degree in degrees:
        model = Pipeline([('polynomial', PolynomialFeatures(degree=degree)),
                          ('linear', LinearRegression())])
        model.fit(X, y)
        models[degree] = model
    return models

In [29]:
# Create models with degrees ranging from 0 to 9

degrees = list(range(0, 9))

models = create_polynomial_models(X, y, degrees)
print(models)

{0: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=0)),
                ('linear', LinearRegression())]), 1: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=1)),
                ('linear', LinearRegression())]), 2: Pipeline(steps=[('polynomial', PolynomialFeatures()),
                ('linear', LinearRegression())]), 3: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=3)),
                ('linear', LinearRegression())]), 4: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=4)),
                ('linear', LinearRegression())]), 5: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=5)),
                ('linear', LinearRegression())]), 6: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=6)),
                ('linear', LinearRegression())]), 7: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=7)),
                ('linear', LinearRegression())]), 8: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=8)),
                ('line

##### 2. Construct a table summarizing the cross-validation results. Each model should have a separate row in the table. Include the R-Squared and Mean Absolute Error (MAE) metrics for each model. Calculate the mean value and standard deviation of these metrics from the cross-validation. Include both values.

In [39]:
output = pd.DataFrame(columns=['Degree', 'Mean of R-Squared', 'Std of R-Squared', 'Mean of MAE', 'Std of MAE'])

for degree, model in models.items():
    # Performing cross-validation
    crossValidation = cross_validate(model, X, y, cv=4, scoring=['r2', 'neg_mean_absolute_error'])
    
    # Calculating mean and standard deviation
    r2Mean = np.mean(crossValidation['test_r2'])
    r2Std = np.std(crossValidation['test_r2'])
    maeMean = -np.mean(crossValidation['test_neg_mean_absolute_error'])
    maeStd = np.std(crossValidation['test_neg_mean_absolute_error'])
    
    # Calcualting combined score
    combinedScore = r2Mean - r2Std + (1 / maeMean) - (1 / maeStd)

    # Add the results to the DataFrame
    newRow = pd.DataFrame({
        'Degree': [degree],
        'Mean of R-Squared': [f"{r2Mean:.3f}"],
        'Std of R-Squared': [f"{r2Std:.3f}"],
        'Mean of MAE':[f"{maeMean:.3f}"],
        'Std of MAE':[f"{maeStd:.3f}"],
        'R-Squared': [crossValidation['test_r2']],
        'Combined Score': [combinedScore],
        'Mean Absolute Error': [crossValidation['test_neg_mean_absolute_error']]
    })
    output = pd.concat([output, newRow], ignore_index=True)
print(output)
    

  Degree Mean of R-Squared Std of R-Squared Mean of MAE Std of MAE   
0      0            -0.038            0.046      66.263      3.061  \
1      1             0.485            0.067      44.065      1.992   
2      2            -0.720            1.127      71.944     22.104   
3      3           -77.851           49.807     326.948    107.153   
4      4          -355.535          200.887     564.076    102.768   
5      5          -412.204          376.396     527.764    123.777   
6      6         -1254.029         1653.251     676.256    212.917   
7      7         -3575.659         5365.974     868.799    349.160   
8      8        -10242.787        13968.363    1275.591    493.197   

                                           R-Squared  Combined Score   
0  [-0.1165025099707262, -0.024043276932081747, -...       -0.395661  \
1  [0.37459248069314743, 0.4967831198867003, 0.50...       -0.062258   
2  [0.28668970901515944, -0.029160959055428703, -...       -1.878047   
3  [-35.818

##### 3. Identification of the Best Model: Identify the model that exhibits the highest performance based on the R-Squared and MAE metrics. Provide an explanation for choosing this specific model. 

In [40]:
# Sorting the output with 'Combined Score' descending order
sortedOutputs = output.sort_values(by=['Combined Score'], ascending=[False])

# Print the best model
print(sortedOutputs.iloc[0])

Degree                                                                 1
Mean of R-Squared                                                  0.485
Std of R-Squared                                                   0.067
Mean of MAE                                                       44.065
Std of MAE                                                         1.992
R-Squared              [0.37459248069314743, 0.4967831198867003, 0.50...
Combined Score                                                 -0.062258
Mean Absolute Error    [-44.11158757171865, -46.4793906689517, -44.70...
Name: 1, dtype: object


The model with degree 1 a it has the highest combined score.

**Combined Score = Mean(R-Squared) - Std(R-Squared) + (1 / Mean(MAE)) - (1 / Std(MAE))**

Also compared to other models, it has high R-Squared value and a lower MAE value.