# Use the diabetes dataset (`sklearn.datasets.load_diabetes`) and apply:

- Ridge
- Lasso
- Polynomial

In [None]:
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd


In [None]:
# Load the diabetes dataset bundle, then pull out the feature matrix (x)
# and the regression target (y) in one step.
data = load_diabetes()
x, y = data.data, data.target


In [None]:
# Show the description text that ships with the loaded dataset.
print(data.DESCR)


.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

:Number of Instances: 442

:Number of Attributes: First 10 columns are numeric predictive values

:Target: Column 11 is a quantitative measure of disease progression one year after baseline

:Attribute Information:
    - age     age in years
    - sex
    - bmi     body mass index
    - bp      average blood pressure
    - s1      tc, total serum cholesterol
    - s2      ldl, low-density lipoproteins
    - s3      hdl, high-density lipoproteins
    - s4      tch, total cholesterol / HDL
    - s5      ltg, possibly log of serum triglycerides level
    - s6      glu, blood sugar level

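Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times the square root of `n_samples` (i.e. the sum of squares of each column totals 1).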

In [None]:
# List the dataset's feature names.
data.feature_names


['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [None]:
# List the keys available on the loaded dataset object.
list(data.keys())


['data',
 'target',
 'frame',
 'DESCR',
 'feature_names',
 'data_filename',
 'target_filename',
 'data_module']

In [None]:
# Shapes of the feature matrix and of the target vector.
x.shape, y.shape


((442, 10), (442,))

---

In [None]:
# Hold out 20% of the samples for testing. The data is shuffled before
# splitting, and the fixed seed keeps that shuffle the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, shuffle=True, random_state=42
)


In [None]:
# Ridge pipeline: standardize the features, then fit a ridge regression.
ridge_steps = [
    ("scaler", StandardScaler()),
    ("ridge", Ridge()),
]
pipeline = Pipeline(steps=ridge_steps)


In [None]:
# Regularization strengths to try for the "ridge" step of the pipeline.
ridge_alphas = [0.01, 0.1, 1.0, 10.0, 100.0]
params = {"ridge__alpha": ridge_alphas}


In [None]:
# Cross-validated grid search over the ridge pipeline, scored by R^2.
# With refit=True the best parameter combination is refit afterwards,
# so the fitted search exposes a ready-to-use best estimator.
search = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring="r2",
    cv=5,  # number of folds; 5 is the usual go-to choice
    refit=True,
    n_jobs=-1,  # run the candidate fits in parallel
)


In [None]:
# Run the ridge grid search on the training split only.
search.fit(x_train, y_train)


In [None]:
# Keep the best pipeline found by the ridge search and show its settings.
best_pipe = search.best_estimator_
print(best_pipe)


Pipeline(steps=[('scaler', StandardScaler()), ('ridge', Ridge(alpha=10.0))])


In [None]:
# Predict the held-out test targets with the tuned ridge pipeline.
y_pred = best_pipe.predict(x_test)


In [None]:
# Hold-out error of the tuned ridge pipeline.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# R^2 on the test and train splits, to compare generalization vs. fit.
r2_test = best_pipe.score(x_test, y_test)
r2_train = best_pipe.score(x_train, y_train)

# One print call, one metric per line (same output as four prints).
print(
    f"Test MSE: {mse:.3f}",
    f"Test RMSE: {rmse:.3f}",
    f"R2 Score test: {r2_test:.3f}",
    f"R2 Score train: {r2_train:.3f}",
    sep="\n",
)


Test MSE: 2875.779
Test RMSE: 53.626
R2 Score test: 0.457
R2 Score train: 0.525


## Lasso


In [None]:
# Lasso pipeline: standardize the features, then fit a lasso regression.
# max_iter is raised to 5000 to give coordinate descent room to converge.
# random_state is pinned because the parameter grid for this pipeline also
# tries selection="random"; without a seed those fits (and therefore the
# reported scores) can differ from run to run.
pipeline1 = Pipeline([
    ("scaler", StandardScaler()),
    ("lasso", Lasso(max_iter=5000, random_state=42)),
])


In [None]:
# Search space for the "lasso" step: penalty strength plus two solver
# settings (convergence tolerance and coefficient selection mode).
lasso_alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10]
lasso_tolerances = [1e-3, 1e-4, 1e-5]
lasso_selections = ["cyclic", "random"]

params1 = {
    "lasso__alpha": lasso_alphas,
    "lasso__tol": lasso_tolerances,
    "lasso__selection": lasso_selections,
}


In [None]:
# Cross-validated grid search over the lasso pipeline: 5 folds, scored by
# R^2, candidate fits run in parallel, best combination refit at the end.
search1 = GridSearchCV(
    estimator=pipeline1,
    param_grid=params1,
    scoring="r2",
    cv=5,
    refit=True,
    n_jobs=-1,
)


In [None]:
# Run the lasso grid search on the training split only.
search1.fit(x_train, y_train)


In [None]:
# Keep the best pipeline found by the lasso search and show its settings.
best_pipe1 = search1.best_estimator_
print(best_pipe1)


Pipeline(steps=[('scaler', StandardScaler()),
                ('lasso',
                 Lasso(alpha=1, max_iter=5000, selection='random', tol=0.001))])


In [None]:
# Predict the held-out test targets with the tuned lasso pipeline.
y_pred1 = best_pipe1.predict(x_test)


In [None]:
# Hold-out error of the tuned lasso pipeline.
mse1 = mean_squared_error(y_test, y_pred1)
rmse1 = np.sqrt(mse1)

# R^2 on the test and train splits.
score_test = best_pipe1.score(x_test, y_test)
score_train = best_pipe1.score(x_train, y_train)

# One print call, one line per item (same output as separate prints).
print(
    "--LASSO--",
    f"MSE: {mse1:.3f}",
    f"RMSE: {rmse1:.3f}",
    f"R2 test: {score_test:.3f}",
    f"R2 train: {score_train:.3f}",
    sep="\n",
)


--LASSO--
MSE: 2824.285
RMSE: 53.144
R2 test: 0.467
R2 train: 0.522


## Polynomial

In [None]:
# Polynomial pipeline: standardize the features, expand them into
# polynomial terms, then fit an ordinary (unregularized) linear regression.
poly_steps = [
    ("scaler", StandardScaler()),
    ("poly", PolynomialFeatures()),
    ("linreg", LinearRegression()),
]
pipeline2 = Pipeline(steps=poly_steps)

# Candidate polynomial degrees for the "poly" step: 0 through 5.
params2 = {"poly__degree": list(range(6))}

# Same search protocol as for the other models: 5-fold CV scored by R^2,
# candidate fits run in parallel, best degree refit at the end.
search2 = GridSearchCV(
    estimator=pipeline2,
    param_grid=params2,
    scoring="r2",
    cv=5,
    refit=True,
    n_jobs=-1,
)


In [None]:
# Run the polynomial-degree grid search on the training split only.
search2.fit(x_train, y_train)


In [None]:
# Keep the best pipeline found by the polynomial search and show its settings.
best_pipe2 = search2.best_estimator_
print(best_pipe2)


Pipeline(steps=[('scaler', StandardScaler()),
                ('poly', PolynomialFeatures(degree=1)),
                ('linreg', LinearRegression())])


In [None]:
# Predict the held-out test targets with the tuned polynomial pipeline.
y_pred2 = best_pipe2.predict(x_test)


In [None]:
# Hold-out error and R^2 of the tuned polynomial pipeline.
# The MSE is computed once and reused for the RMSE.
mse2 = mean_squared_error(y_test, y_pred2)

print("--POLYNOMIAL--")
print("MSE: ", round(mse2, 3))
print("RMSE: ", round(np.sqrt(mse2), 3))

print("R2 test: ", round(best_pipe2.score(x_test, y_test), 3))
# Round to 3 decimals like the other metrics. A bare round() collapses the
# score to the nearest integer (any value >= 0.5 prints as 1), which makes
# an ordinary train R^2 look like a perfect fit.
print("R2 train score:", round(best_pipe2.score(x_train, y_train), 3))


--POLYNOMIAL--
MSE:  2900.194
RMSE:  53.853
R2 test:  0.453
R2 train score: 1


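Overfit? ✌🏻 Not really: the train R² shown above as 1 was rounded to a whole number, so any value ≥ 0.5 is displayed as 1. The selected model is degree 1 (plain linear regression), so its train R² should be close to the Ridge/Lasso train scores (about 0.52), and the test R² of 0.453 is in line with them too.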