# **Practical Lab-5 Dwarakanath Chandra (8856840)**

### **Question-1**

#### **Utilize the diabetes dataset from lab 4. Perform cross-validation on nine polynomial models, ranging from degree 0 to 8.**

In [10]:
# Importing the required libraries and packages

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
import sklearn
from typing import List
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt

In [11]:
# Load the diabetes features (X) and target (y), requesting pandas output
# and the unscaled feature values

X, y = datasets.load_diabetes(
    return_X_y=True,
    as_frame=True,
    scaled=False,
)

In [12]:
# Defining a polynomial model creation function with varying degrees

def create_polynomial_models(X: np.ndarray, y: np.ndarray, degrees: List[int]) -> "dict[int, Pipeline]":
    """
    Build and fit one polynomial-regression pipeline per requested degree.

    Each pipeline first expands the inputs with ``PolynomialFeatures(degree=d)``
    and then fits a ``LinearRegression`` on the expanded features.

    Args:
        X: The feature values of the data (handed unchanged to ``Pipeline.fit``).
        y: The target values of the data.
        degrees: A list of polynomial degrees to try.

    Returns:
        A dictionary mapping each polynomial degree to its fitted ``Pipeline``.
        If a degree appears more than once in ``degrees``, the pipeline from
        its last occurrence is the one kept.
    """
    models = {}
    for degree in degrees:
        pipeline = Pipeline(steps=[
            ('polynomial', PolynomialFeatures(degree=degree)),
            ('linear', LinearRegression()),
        ])
        pipeline.fit(X, y)
        models[degree] = pipeline
    return models

In [13]:
# Fit one polynomial model for every degree from 0 through 8

degrees = list(range(9))

models = create_polynomial_models(X, y, degrees)

# Last expression of the cell: display the fitted pipelines keyed by degree
models

{0: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=0)),
                 ('linear', LinearRegression())]),
 1: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=1)),
                 ('linear', LinearRegression())]),
 2: Pipeline(steps=[('polynomial', PolynomialFeatures()),
                 ('linear', LinearRegression())]),
 3: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=3)),
                 ('linear', LinearRegression())]),
 4: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=4)),
                 ('linear', LinearRegression())]),
 5: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=5)),
                 ('linear', LinearRegression())]),
 6: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=6)),
                 ('linear', LinearRegression())]),
 7: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=7)),
                 ('linear', LinearRegression())]),
 8: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=8)),
      

In [14]:
def _is_sequence(value):
    """Return True for lists and for ndarrays that have at least one dimension."""
    return isinstance(value, list) or (isinstance(value, np.ndarray) and value.ndim > 0)


def _term_labels(model, n_terms):
    """Return one label per non-bias coefficient for the printed equation.

    Uses the names reported by the first pipeline step's
    ``get_feature_names_out`` when that method exists and yields exactly one
    name per coefficient plus one for the skipped leading column. Otherwise
    falls back to positional ``x^k`` labels.
    """
    get_names = getattr(model[0], 'get_feature_names_out', None)
    if callable(get_names):
        try:
            names = list(get_names())
        except (AttributeError, ValueError):
            # The step could not report names; use the positional fallback.
            names = []
        if len(names) == n_terms + 1:
            return [f"*({name})" for name in names[1:]]
    return [f"x^{position}" for position in range(1, n_terms + 1)]


def print_pipeline_model_stats(model):
    """Print the final estimator's coefficients, intercept and fitted equation.

    Args:
        model: An indexable pipeline whose last step exposes ``coef_`` and
            ``intercept_``. The first coefficient is skipped everywhere,
            as it is treated as belonging to the bias column.

    Prints:
        The model, its coefficients (first one omitted), its intercept, an
        ``Equation: y = ...`` line and a ``***`` separator. Equation terms are
        labelled with the first step's output feature names when available,
        otherwise with ``x^1, x^2, ...``. A missing (empty) intercept is
        reported and left out of the equation instead of raising.
    """
    print(f'Model: {model}')
    estimator = model[-1]

    # Coefficients: take the first row of a 2-D array and drop the leading entry.
    raw_coef = estimator.coef_
    coefficients = []
    if _is_sequence(raw_coef):
        if len(raw_coef) > 0:
            row = raw_coef[0] if _is_sequence(raw_coef[0]) else raw_coef
            coefficients = row[1:]
            print(f'Coefficients: {coefficients}')
        else:
            print('No coefficients found.')
    else:
        print(f'Coefficients: {raw_coef}')

    # Intercept: take the first element of an array, or the scalar itself.
    raw_intercept = estimator.intercept_
    if _is_sequence(raw_intercept) and len(raw_intercept) == 0:
        intercept = None
        print('No intercept found.')
    else:
        intercept = raw_intercept[0] if _is_sequence(raw_intercept) else raw_intercept
        print(f'Intercept: {intercept}')

    # generate equation string:
    # numpy scalar types are listed explicitly because not all of them
    # subclass the built-in int/float.
    if isinstance(intercept, (int, float, np.integer, np.floating)):
        equation = f"y = {intercept:.2f}"
    else:
        equation = "y = "

    labels = _term_labels(model, len(coefficients))
    equation += "".join(
        f" + {coeff:.2f}{label}" for coeff, label in zip(coefficients, labels)
    )
    print(f'Equation: {equation}')
    print('***')

In [15]:
# print models and their coefficients

#for degree, model in models.items():
    #print_pipeline_model_stats(model)


### **Question-2**

#### **Construct a table summarizing the cross-validation results. Each model should have a separate row in the table. Include the R-Squared and Mean Absolute Error (MAE) metrics for each model. Calculate the mean value and standard deviation of these metrics from the cross-validation. Include both values.**

In [16]:
# Cross-validate every model and collect one summary record per degree.
# The records are gathered in a list and turned into a DataFrame once, instead
# of concatenating onto the DataFrame on every loop iteration.
summary_columns = ['Degree', 'R-Squared', 'MAE',
                   'R-Squared_mean', 'R-Squared_std', 'MAE_mean', 'MAE_std']
records = []

for degree, model in models.items():
    # Perform 5-fold cross-validation, scored with R-squared and negated MAE
    cv_results = cross_validate(model, X, y, cv=5, scoring=['r2', 'neg_mean_absolute_error'])

    r2_scores = cv_results['test_r2']
    # The scorer reports the negated MAE; flip the sign to recover the MAE
    mae_scores = -cv_results['test_neg_mean_absolute_error']

    # Calculate mean and standard deviation of the metrics across the folds
    r2_mean, r2_std = np.mean(r2_scores), np.std(r2_scores)
    mae_mean, mae_std = np.mean(mae_scores), np.std(mae_scores)

    records.append({
        'Degree': degree,
        # Display columns: "mean ± std" text for the summary table
        'R-Squared': f"{r2_mean:.3f} ± {r2_std:.3f}",
        'MAE': f"{mae_mean:.3f} ± {mae_std:.3f}",
        # Numeric columns: kept as floats so they can be sorted and compared
        'R-Squared_mean': r2_mean,
        'R-Squared_std': r2_std,
        'MAE_mean': mae_mean,
        'MAE_std': mae_std,
    })

results = pd.DataFrame(records, columns=summary_columns)

# Converting results dataframe to a table (floats shown with 3 decimals)
results_table = results.to_string(index=False, float_format='{:.3f}'.format)

# Print the table
print(results_table)

Degree              R-Squared                MAE R-Squared_mean R-Squared_std MAE_mean MAE_std
     0         -0.028 ± 0.037     66.046 ± 3.475         -0.028         0.037   66.046   3.475
     1          0.482 ± 0.049     44.276 ± 2.100          0.482         0.049   44.276   2.100
     2         -1.485 ± 1.618    80.595 ± 24.657         -1.485         1.618   80.595  24.657
     3     -203.419 ± 225.878  342.052 ± 142.438       -203.419       225.878  342.052 142.438
     4     -571.083 ± 369.892  657.260 ± 159.476       -571.083       369.892  657.260 159.476
     5     -436.857 ± 379.100   562.994 ± 59.917       -436.857       379.100  562.994  59.917
     6   -1694.550 ± 2629.990  742.717 ± 190.709      -1694.550      2629.990  742.717 190.709
     7   -5530.894 ± 9518.587 1032.682 ± 393.440      -5530.894      9518.587 1032.682 393.440
     8 -16076.255 ± 28049.953 1475.659 ± 706.280     -16076.255     28049.953 1475.659 706.280


### **Question-3**

#### **Identification of the Best Model: Identify the model that exhibits the highest performance based on the R-Squared and MAE metrics. Provide an explanation for choosing this specific model.**

##### **Currently, there are 9 models of varying polynomial degrees ranging from 0 to 8. To identify the best model among them, we have already performed cross-validation scored with R_Squared and negative Mean Absolute Error (MAE), and calculated the mean and standard deviation of these performance metrics as shown above.**

##### **In order to select the best model among the 9 polynomial models, we need to identify the degree whose model has a higher R_Squared value and a lower MAE value than the other models. The higher the R_Squared value, the greater the share of the variation in the data that the model explains. The lower the MAE value, the smaller the average deviation between the model's predicted values and the actual values.**

##### **Hence, the objective of finding the best model is to explore a model of certain degree with higher R_Squared and lower MAE.**

In [17]:
# Rank the models numerically: highest mean R-Squared first, ties broken by lowest mean MAE.
# The 'R-Squared' and 'MAE' columns hold "mean ± std" display text, so sorting on them
# would compare characters rather than values. The *_mean columns are used instead and
# passed through pd.to_numeric in case they are stored as formatted text.
sorted_results = results.sort_values(
    by=['R-Squared_mean', 'MAE_mean'],
    ascending=[False, True],
    key=pd.to_numeric,
)

# Get the best model (first row in the sorted DataFrame)
best_model = sorted_results.iloc[0]

# Print the best model
print(best_model)

Degree                         1
R-Squared          0.482 ± 0.049
MAE               44.276 ± 2.100
R-Squared_mean             0.482
R-Squared_std              0.049
MAE_mean                  44.276
MAE_std                    2.100
Name: 1, dtype: object


In [18]:
# Replace the "mean ± std" display text in 'R-Squared' and 'MAE' with the numeric mean
# so the columns can be compared. Values that are already numeric (for example when this
# cell is re-run) are left unchanged. NOTE: this overwrites the two columns in place.
for metric in ['R-Squared', 'MAE']:
    results[metric] = results[metric].apply(
        lambda x: float(x.split(' ±')[0]) if isinstance(x, str) else x
    )

# Find the index of the model with the highest R-squared
best_r2_index = results['R-Squared'].idxmax()

# Find the index of the model with the lowest MAE
best_mae_index = results['MAE'].idxmin()

# Get the degree of the model with the highest R-squared
best_r2_degree = results.loc[best_r2_index, 'Degree']

# Get the degree of the model with the lowest MAE
best_mae_degree = results.loc[best_mae_index, 'Degree']

# Print the best models based on R-squared and MAE
print("Best model based on R-squared: Degree", best_r2_degree)
print("Best model based on MAE: Degree", best_mae_degree)
# Show the row(s) for the degree(s) selected above instead of a hard-coded degree
print(results[results["Degree"].isin([best_r2_degree, best_mae_degree])])


Best model based on R-squared: Degree 1
Best model based on MAE: Degree 1
  Degree  R-Squared     MAE R-Squared_mean R-Squared_std MAE_mean MAE_std
1      1      0.482  44.276          0.482         0.049   44.276   2.100


##### **Hence, the selected best model is the linear model of degree 1 with 10 features, because it has the highest R_Squared value (0.482) and the lowest MAE value (44.276) of all the polynomial models.**

##### **Although an R_Squared value of 0.482 indicates only average prediction performance, it is the best among all the polynomial models. As the degree of the polynomial increases, model performance drops significantly due to overfitting and the noise introduced by the interaction terms.**