In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns 
# Apply seaborn's "poster" plotting context and "whitegrid" style globally,
# so any figure drawn later in the notebook picks them up.
sns.set_context("poster")
sns.set_style("whitegrid")

import sklearn
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics

In [2]:
from sklearn.datasets import load_diabetes
# Load the diabetes dataset that ships with scikit-learn.
diabetes = load_diabetes()
# List the attributes of the returned object to see which fields
# (data, target, feature names, description, ...) are available.
dir(diabetes)

['DESCR',
 'data',
 'data_filename',
 'feature_names',
 'target',
 'target_filename']

In [3]:
# Assemble the feature matrix and the target into one frame: feature columns
# are labelled with the dataset's own names and the target is appended as the
# last column, 'baseline'.
# NOTE(review): the scikit-learn dataset description says the target is
# measured one year *after* baseline, so this column name looks like a
# misnomer; it is kept because later cells select it by this name.
data = pd.DataFrame(
    diabetes.data,
    columns=diabetes.feature_names,
).assign(baseline=diabetes.target)

# Preview the first two rows.
data.head(2)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,baseline
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,75.0


In [4]:
# Feature matrix: every column except the target, which is stored last
# under the name 'baseline'.
X = data.drop(columns="baseline")

# Preview the first two rows.
X.head(2)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204


In [5]:
# Target vector: the 'baseline' column of the assembled frame.
Y = data.loc[:, "baseline"]

# Preview the first two values.
Y.head(2)

0    151.0
1     75.0
Name: baseline, dtype: float64

In [6]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split identical across re-runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=5,
)

# Sanity check: report the shape of each partition.
for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("Y_train", Y_train),
    ("Y_test", Y_test),
):
    print(label, part.shape)

X_train (309, 10)
X_test (133, 10)
Y_train (309,)
Y_test (133,)


In [7]:
# Fit a Lasso regression (alpha=0.1) on the training split. fit() returns the
# estimator itself, so construction and fitting can be chained.
reg = linear_model.Lasso(alpha=0.1).fit(X_train, Y_train)

# Display the fitted estimator and its hyper-parameters.
reg

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [8]:
# Predict the target for the held-out test features with the fitted model.
y_pred = reg.predict(X_test)

In [9]:
# Regression metrics
# Regression metrics, all computed on the held-out test split.

# Goodness-of-fit scores: the best possible explained-variance score is 1.0,
# lower values are worse; r2 is the coefficient of determination.
explained_variance = metrics.explained_variance_score(Y_test, y_pred)
r2 = metrics.r2_score(Y_test, y_pred)

# Error (risk) metrics: the expected value of the absolute error loss and of
# the squared (quadratic) error loss, plus the median of the absolute errors.
mean_absolute_error = metrics.mean_absolute_error(Y_test, y_pred)
mse = metrics.mean_squared_error(Y_test, y_pred)
median_absolute_error = metrics.median_absolute_error(Y_test, y_pred)

In [10]:
# Report every score rounded to four decimals. RMSE is derived here as the
# square root of the MSE.
for label, value in (
    ('explained_variance: ', explained_variance),
    ('r2: ', r2),
    ('MAE: ', mean_absolute_error),
    ('MSE: ', mse),
    ('RMSE: ', np.sqrt(mse)),
):
    print(label, round(value, 4))

explained_variance:  0.5056
r2:  0.5039
MAE:  47.2981
MSE:  3233.0042
RMSE:  56.8595
