In [8]:
import pandas as pd

# Load the healthcare dataset from a CSV path given relative to the current working directory.
df = pd.read_csv('data/healthcare_dataset.csv')

In [9]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import numpy as np

# Optional sanity check (disabled): print df.shape and df.head() to inspect the loaded frame.

# 'sysBP' is the regression target; every remaining column is used as a feature.
# Both are converted to NumPy arrays before splitting.
y = df['sysBP'].to_numpy()
X = df.drop(columns='sysBP').to_numpy()

# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=5,
)

def rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    The value is the square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Baseline model: plain linear regression on standardized features.
# The scaler is fitted on the training split only and then reused for the validation split.
scaler1 = StandardScaler().fit(X_train)
X_train_std = scaler1.transform(X_train)
X_val_std = scaler1.transform(X_val)

lin1 = LinearRegression().fit(X_train_std, y_train)

# Predict on both splits so train and validation error can be compared.
y_training_preds_1 = lin1.predict(X_train_std)
y_validation_preds_1 = lin1.predict(X_val_std)

print("Linear RMSE train:", rmse(y_train, y_training_preds_1))
print("Linear RMSE valid:", rmse(y_val, y_validation_preds_1))

# Degree-5 polynomial expansion (no bias column) of the raw features,
# learned on the training split and applied unchanged to the validation split.
poly = PolynomialFeatures(degree=5, include_bias=False).fit(X_train)
X_train_poly = poly.transform(X_train)
X_val_poly = poly.transform(X_val)

# Standardize the expanded features with statistics from the training split only.
scaler2 = StandardScaler().fit(X_train_poly)
X_train_poly_std = scaler2.transform(X_train_poly)
X_val_poly_std = scaler2.transform(X_val_poly)

# Unregularized linear fit on the expanded features.
# NOTE: the recorded output shows a near-zero train RMSE next to a very large
# validation RMSE for this model, i.e. it overfits heavily.
lin2 = LinearRegression().fit(X_train_poly_std, y_train)

y_training_preds_2 = lin2.predict(X_train_poly_std)
y_validation_preds_2 = lin2.predict(X_val_poly_std)

print("Poly d=5 + Linear RMSE train:", rmse(y_train, y_training_preds_2))
print("Poly d=5 + Linear RMSE valid:", rmse(y_val, y_validation_preds_2))

# Candidate ridge penalties: powers of ten from 1e-2 up to and including 1e10.
alphas = [10.0**exponent for exponent in range(-2, 11)]

# Fit one ridge model per penalty on the standardized polynomial features
# and report train/validation RMSE for each.
for a in alphas:
    ridge = Ridge(alpha=a, fit_intercept=True, max_iter=100000).fit(
        X_train_poly_std, y_train
    )

    y_training_preds_ridge = ridge.predict(X_train_poly_std)
    y_validation_preds_ridge = ridge.predict(X_val_poly_std)

    print(f"Ridge alpha={a:.0e}")
    print("RMSE train:", rmse(y_train, y_training_preds_ridge))
    print("RMSE valid:", rmse(y_val, y_validation_preds_ridge))
    

Linear RMSE train: 11.242641619317617
Linear RMSE valid: 11.779985042529283
Poly d=5 + Linear RMSE train: 4.1787307567455945e-10
Poly d=5 + Linear RMSE valid: 4715.300130841895
Ridge alpha=1e-02
RMSE train: 4.865723180005279
RMSE valid: 80.6207070367336
Ridge alpha=1e-01
RMSE train: 5.991875447697568
RMSE valid: 62.54015234559069
Ridge alpha=1e+00
RMSE train: 7.098352339575401
RMSE valid: 27.937437946703813
Ridge alpha=1e+01
RMSE train: 8.087666799083845
RMSE valid: 16.907193418624992
Ridge alpha=1e+02
RMSE train: 8.94684643062267
RMSE valid: 14.196148475359383
Ridge alpha=1e+03
RMSE train: 9.675838723999725
RMSE valid: 14.523976230366204
Ridge alpha=1e+04
RMSE train: 10.45173942908914
RMSE valid: 13.072742005153927
Ridge alpha=1e+05
RMSE train: 11.70854637905468
RMSE valid: 12.78631186860536
Ridge alpha=1e+06
RMSE train: 14.597573500704904
RMSE valid: 15.740787030859412
Ridge alpha=1e+07
RMSE train: 19.054428062817028
RMSE valid: 20.296412040631143
Ridge alpha=1e+08
RMSE train: 21.397