Importing the necessary packages


In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import KFold, cross_val_score, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

 Loading the diabetes data

In [2]:
# Load the diabetes regression data as a feature matrix and a target vector.
# `datasets` is imported in the import cell at the top of the notebook.
diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)

Converting the data into one single dataframe

In [3]:
# Label the feature columns positionally, then append the target as the
# last column so features and response live in a single frame.
diabetes = pd.DataFrame(
    diabetes_X,
    columns=["age", "sex", "bmi", "bp", "s1", "s2", "s3", "s4", "s5", "s6"],
)
# Assign the 1-D target array directly; wrapping it in a DataFrame first is
# unnecessary and makes the assignment depend on index alignment.
diabetes["dis_prog"] = diabetes_y

Splitting into independent and dependent variables

In [4]:
# Separate predictors from the response: every column except the trailing
# "dis_prog" goes into X, and "dis_prog" itself becomes y.
X = diabetes.drop(columns="dis_prog")
y = diabetes["dis_prog"]

Fitting the model

In [5]:
# Evaluate polynomial regressions of increasing degree with 10-fold
# cross-validation and collect the fold-averaged R-squared and MAE per degree.
degrees = []
list_of_r2_scores = []
list_of_mae_scores = []

# range(9) yields the nine degrees 0..8; degree 0 is the intercept-only baseline.
for degree in range(9):
    # Expand the features once per degree. PolynomialFeatures learns nothing
    # from the data values, so transforming before the CV split does not leak.
    poly_features = PolynomialFeatures(degree=degree)
    X_poly = poly_features.fit_transform(X)

    # A single cross_validate call scores both metrics from the same 10 fits,
    # instead of refitting every fold once per metric.
    model = LinearRegression()
    cv_results = cross_validate(
        model,
        X_poly,
        y,
        cv=10,
        scoring=("r2", "neg_mean_absolute_error"),
    )
    r_squared = cv_results["test_r2"]
    # scikit-learn reports MAE negated (so that higher is better); flip it back.
    # The name `mae_scores` deliberately avoids overwriting the imported
    # `mean_absolute_error` metric function.
    mae_scores = -cv_results["test_neg_mean_absolute_error"]

    # Store the fold-averaged results for this degree
    degrees.append(degree)
    list_of_r2_scores.append(np.mean(r_squared))
    list_of_mae_scores.append(np.mean(mae_scores))

# One row per degree: the degree, its mean R-squared and its mean MAE
results_df = pd.DataFrame({'Degree': degrees,
                           'R-Squared': list_of_r2_scores,
                           'MAE': list_of_mae_scores})

# Mean and standard deviation of each metric column.
# NOTE: these aggregate across the nine degrees (i.e. across different models);
# they are not the fold-to-fold spread of any single model.
mean_r_squared = results_df['R-Squared'].mean()
standard_dev_r_squared = results_df['R-Squared'].std()
mean_mean_absolute_error = results_df['MAE'].mean()
standard_deviation_mean_absolute_error = results_df['MAE'].std()

# Print the table of cross-validation results
print(results_df)

# Print the mean and standard deviation of the metrics
print(f"\nMean R-Squared: {mean_r_squared:.4f}")
print(f"Standard Deviation of R-Squared: {standard_dev_r_squared:.4f}")
print(f"Mean MAE: {mean_mean_absolute_error:.4f}")
print(f"Standard Deviation of MAE: {standard_deviation_mean_absolute_error:.4f}")

# Identify the best model with idxmax / idxmin.
# A higher R-squared means the model explains more of the variation in the target.
# MAE is the average absolute difference between predicted and actual values,
# so the lower the MAE, the better the model.
r_squared_best_model = results_df['R-Squared'].idxmax()
mean_absolute_error_best_model = results_df['MAE'].idxmin()

print("\nBest Model based on R-Squared:")
print(results_df.loc[r_squared_best_model])

print("\nBest Model based on MAE:")
print(results_df.loc[mean_absolute_error_best_model])

   Degree  R-Squared         MAE
0       0  -0.039767   65.948459
1       1   0.461960   44.223084
2       2   0.379861   45.873376
3       3 -38.703138  216.637639
4       4 -72.819234  329.077072
5       5 -65.606865  312.433209
6       6 -65.480725  312.239540
7       7 -65.479136  312.236473
8       8 -65.478698  312.235972

Mean R-Squared: -41.4184
Standard Deviation of R-Squared: 32.6507
Mean MAE: 216.7672
Standard Deviation of MAE: 127.8826

Best Model based on R-Squared:
Degree        1.000000
R-Squared     0.461960
MAE          44.223084
Name: 1, dtype: float64

Best Model based on MAE:
Degree        1.000000
R-Squared     0.461960
MAE          44.223084
Name: 1, dtype: float64


Inferences

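R-squared indicates how well the regression model fits the data. Its maximum value is 1 (a perfect fit);
on held-out data it can be negative when the model predicts worse than simply using the mean of the target, as happens above for degree 0 and for degrees 3 to 8. Higher R-squared values indicate a better fit of the model to the data.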

Mean absolute error (MAE) measures the average absolute difference between the actual and the predicted values. A lower MAE indicates that the model's predictions are closer to the actual values and that the model is better at predicting new (unseen) data.

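From the above results it is clear that the model with degree = 1 is the best model, as
it has the highest R2 score (0.462) and the lowest MAE (44.22). Degree 2 is only slightly worse, but from degree 3 upward the cross-validated scores collapse, which points to severe overfitting.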