In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures


Utilize the diabetes dataset from lab 4

In [2]:
# Diabetes features and target as pandas objects, loaded unscaled (scaled=False).
X, y = datasets.load_diabetes(return_X_y=True, as_frame=True, scaled=False)

# Polynomial degrees to evaluate: 0 through 8 inclusive.
degrees = range(0, 9)

# Empty results table with one column per reported statistic.
resultDF = pd.DataFrame(
    columns=[
        'Degree',
        'R2', 'R2 Mean', 'R2 SD',
        'MAE', 'MAE Mean', 'MAE SD',
    ]
)

In [3]:
def evaluateModel(model, cv=3):
    """Cross-validate ``model`` on the notebook-level ``X`` and ``y``.

    Both metrics come from a single ``cross_validate`` run, so each fold is
    fitted once instead of once per metric.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator or pipeline.
    cv : int or cross-validation splitter, default=3
        Forwarded unchanged to ``cross_validate``.

    Returns
    -------
    r2Score : ndarray
        R^2 score of each fold.
    maeScore : ndarray
        Per-fold score of the 'neg_mean_absolute_error' scorer, i.e. the
        negated mean absolute error (values closer to zero are better).
    """
    scores = cross_validate(model, X, y, cv=cv,
                            scoring=['r2', 'neg_mean_absolute_error'])
    # cross_validate stores each metric under the key 'test_<scorer name>'.
    return scores['test_r2'], scores['test_neg_mean_absolute_error']

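Perform cross-validation on nine polynomial models, ranging from degree 0 to degree 8, and construct a table summarizing the cross-validation results. The MAE columns hold scikit-learn's `neg_mean_absolute_error` scores, i.e. the negated mean absolute error, so values closer to zero are better.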

In [4]:
# Cross-validate one polynomial pipeline per degree and collect a summary
# record for each; the table is then built in a single step instead of being
# enlarged one row at a time.
resultColumns = ['Degree', 'R2', 'R2 Mean', 'R2 SD', 'MAE', 'MAE Mean', 'MAE SD']
records = []
for degree in degrees:
    model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
    r2Score, maeScore = evaluateModel(model)
    records.append({'Degree': degree,
                    'R2': np.round(r2Score, decimals=3),    # per-fold scores, rounded for display
                    'R2 Mean': r2Score.mean(),
                    'R2 SD': r2Score.std(),
                    'MAE': np.round(maeScore, decimals=3),  # negated MAE per fold
                    'MAE Mean': maeScore.mean(),
                    'MAE SD': maeScore.std()})

# Index the rows by degree so a row can be looked up with resultDF.loc[degree].
resultDF = pd.DataFrame(records, columns=resultColumns, index=list(degrees))
resultDF

Unnamed: 0,Degree,R2,R2 Mean,R2 SD,MAE,MAE Mean,MAE SD
0,0,"[-0.007, -0.0, -0.006]",-0.004217,0.002981,"[-63.324, -68.759, -65.485]",-65.855977,2.234298
1,1,"[0.469, 0.487, 0.51]",0.488702,0.016462,"[-45.412, -46.796, -41.358]",-44.521891,2.307506
2,2,"[0.402, 0.034, -0.511]",-0.024996,0.375276,"[-45.791, -63.991, -70.724]",-60.168526,10.531528
3,3,"[-113.412, -155.192, -265.096]",-177.900059,63.9726,"[-437.538, -536.786, -562.402]",-512.242061,53.848611
4,4,"[-140.415, -175.52, -219.524]",-178.486711,32.364115,"[-476.272, -481.032, -522.811]",-493.371673,20.907164
5,5,"[-180.199, -102.973, -651.312]",-311.494718,242.346724,"[-502.091, -460.142, -649.764]",-537.332382,81.32445
6,6,"[-307.207, -621.361, -1362.142]",-763.56985,442.259158,"[-621.263, -671.224, -818.827]",-703.771626,83.874451
7,7,"[-468.981, -3411.251, -2573.184]",-2151.138439,1237.694181,"[-745.478, -1006.357, -1066.479]",-939.437743,139.32916
8,8,"[-979.719, -13764.009, -4940.148]",-6561.292117,5343.569269,"[-973.933, -1608.446, -1425.975]",-1336.11802,266.717508


Identification of the best model(s). This outputs a DataFrame with 1 or 2 rows. If there is only one row, that model has both the highest mean $R^2$ and the best mean MAE (the MAE scores are negated, so the highest value corresponds to the smallest error).<br>
If there are 2 rows, the first row is the model with the best mean $R^2$ value and the second row is the model with the best mean MAE value.

In [5]:
# idxmax returns the index label of the winning row. 'MAE Mean' holds negated
# errors, so its maximum is the smallest mean absolute error.
bestR2 = resultDF['R2 Mean'].idxmax()
bestMAE = resultDF['MAE Mean'].idxmax()

# Select by index label rather than comparing labels with the 'Degree' column.
# dict.fromkeys keeps the R2 winner first and drops the duplicate label when
# the same model wins both metrics.
bestLabels = list(dict.fromkeys([bestR2, bestMAE]))
bestDF = resultDF.loc[bestLabels]
bestDF

Unnamed: 0,Degree,R2,R2 Mean,R2 SD,MAE,MAE Mean,MAE SD
1,1,"[0.469, 0.487, 0.51]",0.488702,0.016462,"[-45.412, -46.796, -41.358]",-44.521891,2.307506
