# Henry Ezeanowi - 8900446
# Lab 5

1) Utilize the diabetes dataset from lab 4. Perform cross-validation on nine polynomial models, ranging from degree 0 to 8.

In [39]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.calibration import cross_val_predict
from sklearn.metrics import mean_absolute_error,r2_score

In [40]:
X, y = datasets.load_diabetes(as_frame=True, scaled=False, return_X_y=True)
degrees = range(9)
# Perform cross-validation for polynomial models of degrees 0 to 8
for degree in degrees:
    # Create a polynomial feature transformer
    polynomial_features = PolynomialFeatures(degree=degree)
    
    # Create a pipeline with polynomial feature transformation and linear regression
    model = make_pipeline(polynomial_features, LinearRegression())
    
    # Perform cross-validation
    scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_error')
    
    # Calculate mean absolute error
    mae = -scores.mean()
    
    # Print the degree and mean squared error
    # print(f"Degree {degree}: MAE = {mae}")

2) Construct a table summarizing the cross-validation results. Each model should have a separate row in the table. Include the R-Squared and Mean Absolute Error (MAE) metrics for each model. Calculate the mean value and standard deviation of these metrics from the cross-validation. Include both values.

In [41]:
# Create a table to summarize the cross-validation results
results= pd.DataFrame(columns=['Degree', 'R-Squared', 'MAE'])

for degree in degrees:
    # Create a pipeline with PolynomialFeatures and LinearRegression
    model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
    
    # Calculate the R-Squared and MAE metrics
    y_pred = cross_val_predict(model, X, y, cv=5)
    r2 = r2_score(y, y_pred)
    mae = mean_absolute_error(y, y_pred)
    
    # Create a new DataFrame row with the results
    row = pd.DataFrame([[degree, r2, mae]], columns=['Degree', 'R-Squared', 'MAE'])
    
    # Concatenate the row to the results DataFrame
    results = pd.concat([results, row], ignore_index=True)


# Print the results table
print(results)

  Degree     R-Squared          MAE
0      0     -0.008824    66.039250
1      1      0.495322    44.274856
2      2     -1.140607    74.753707
3      3   -195.306514   341.371399
4      4   -572.361896   656.629284
5      5   -446.482486   563.019974
6      6  -1792.074363   743.204652
7      7  -5894.999471  1033.741782
8      8 -17172.387614  1477.455477


In [42]:
# Calculate mean and standard deviation of R-Squared and MAE
mean_r2 = results['R-Squared'].mean()
std_r2 = results['R-Squared'].std()
mean_mae = results['MAE'].mean()
std_mae = results['MAE'].std()


# Print mean and standard deviation of R-Squared and MAE
print(f"Mean of R-Squared: {mean_r2:.2f}")
print(f"Mean of MAE: {mean_mae:.2f}")
print(f"Standard Deviation of R-Squared: {std_r2:.2f}")
print(f"Standard Deviation of MAE: {std_mae:.2f}")

Mean of R-Squared: -2897.14
Mean of MAE: 555.61
Standard Deviation of R-Squared: 5677.56
Standard Deviation of MAE: 487.61


3) Identification of the Best Model: Identify the model that exhibits the highest performance based on the R-Squared and MAE metrics. Provide an explanation for choosing this specific model.

In [43]:
# Identify the model with the highest R-Squared value
best_r2_model = results.loc[results['R-Squared'].idxmax()]

# Print the model with the highest R-Squared value
print(f"The model with the highest R-Squared value:\n{best_r2_model}")

The model with the highest R-Squared value:
Degree               1
R-Squared     0.495322
MAE          44.274856
Name: 1, dtype: object


In [44]:

# Identify the model with the lowest MAE value
best_mae_model = results.loc[results['MAE'].idxmin()]

# Print the model with the lowest MAE value
print(f"The model with the lowest MAE value:\n{best_mae_model}")

The model with the lowest MAE value:
Degree               1
R-Squared     0.495322
MAE          44.274856
Name: 1, dtype: object


When we're choosing the best model, we give priority to the one that shows a strong relationship between the predictors and the target variable, which is indicated by a high R-Squared value. We also consider the model that makes more accurate predictions of the target variable, as shown by the lowest MAE value. These metrics give us important information about how well the model is performing and help us make decisions.