In [48]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

from sklearn.datasets import load_diabetes

import ssl

ssl._create_default_https_context = ssl._create_unverified_context

import warnings
warnings.filterwarnings("ignore")

In [34]:
data = load_diabetes()

In [35]:
print(f'X Shape : {data.data.shape}')
print(f'Y Shape : {data.target.shape}')

X Shape : (442, 10)
Y Shape : (442,)


In [36]:
print(data['DESCR'])

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, total serum cholesterol
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, total cholesterol / HDL
      - s5      ltg, possibly log of serum triglycerides level
      - s6      glu, blood sugar level

Note: Each of these 1

In [37]:
pd.DataFrame(data['data'], columns=data['feature_names']).describe().map(lambda x: f"{x:0.2f}")

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,-0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0
std,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
min,-0.11,-0.04,-0.09,-0.11,-0.13,-0.12,-0.1,-0.08,-0.13,-0.14
25%,-0.04,-0.04,-0.03,-0.04,-0.03,-0.03,-0.04,-0.04,-0.03,-0.03
50%,0.01,-0.04,-0.01,-0.01,-0.0,-0.0,-0.01,-0.0,-0.0,-0.0
75%,0.04,0.05,0.03,0.04,0.03,0.03,0.03,0.03,0.03,0.03
max,0.11,0.05,0.17,0.13,0.15,0.2,0.18,0.19,0.13,0.14


In [38]:
Y = pd.DataFrame(data['target'], columns=['Y'])
X = pd.DataFrame(data['data'], columns=data['feature_names'])
# X = pd.get_dummies(X, columns=['sex'])

In [51]:
data = pd.read_csv('https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt', sep='\t')

In [53]:
Y = data['Y']
X = data.drop(columns=['Y']) 
X = pd.get_dummies(X, columns=['SEX'])

In [55]:
idx = list(range(X.shape[0]))
train_idx, test_idx = train_test_split(idx, test_size=0.2, random_state=0)

In [61]:
results = LinearRegression().fit(X.iloc[train_idx], Y.iloc[train_idx])
results

In [147]:
import scipy
from sklearn import metrics

def sse(clf, X, y):
    """
    standard squared error(residual)
    
    Params
    ----
    clf : linear model
    X : training data
    y : target value
    
    return
    ----
    float
    """
    y_hat = clf.predict(X)
    sse = np.sum((y_hat - y)**2)
    return sse/X.shape[0]

def adj_r2_score(clf, X, y):
    """
    adjusted r2 score
    
    Params
    ----
    clf : linear model
    X : training data
    y : target value
    
    return
    ----
    float
    """
    n = X.shape[0]
    p = X.shape[1]
    y_hat = clf.predict(X)
    r_squared = metrics.r2_score(y, y_hat)
    return 1-(1-r_squared)*(n-1)/(n-p-1)

def coef_se(clf, X, y):
    """
    standard error of beta coefficients
    
    Params
    ----
    clf : linear model
    X : training data
    y : target value
    
    return
    ----
    float
    """
    n = X.shape[0]
    X = X.map(lambda x: float(x))
    X1 = np.hstack((np.ones((n, 1)), np.matrix(X)))
    print(X1)
    se_matrix = scipy.linalg.sqrtm(
        metrics.mean_squared_error(y, clf.predict(X))* np.linalg.inv(X1.T * X1)
    )
    return np.diagonal(se_matrix)
    

In [148]:
coef_se(results, X, Y)

[[ 1.  59.  32.1 ... 87.   0.   1. ]
 [ 1.  48.  21.6 ... 69.   1.   0. ]
 [ 1.  72.  30.5 ... 85.   0.   1. ]
 ...
 [ 1.  60.  24.9 ... 95.   0.   1. ]
 [ 1.  36.  30.  ... 85.   1.   0. ]
 [ 1.  36.  19.6 ... 92.   1.   0. ]]


LinAlgError: Singular matrix