In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt 
%matplotlib inline 

import seaborn as sns
sns.set_context("poster")
sns.set_style("whitegrid")


import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

## Data

In [3]:
from sklearn.datasets import load_diabetes
diabetes = load_diabetes()
dir(diabetes)

['DESCR',
 'data',
 'data_filename',
 'feature_names',
 'target',
 'target_filename']

In [16]:
print(diabetes.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - Age
      - Sex
      - Body mass index
      - Average blood pressure
      - S1
      - S2
      - S3
      - S4
      - S5
      - S6

Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1).

Source URL:
http://www4.stat.ncsu.edu/~boos/var.select/diabetes.html

For more information see:
Brad

In [4]:
X = pd.DataFrame(diabetes.data)
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641


In [5]:
X.columns = diabetes.feature_names

In [6]:
Y = diabetes.target

## Split Data

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = .30, random_state = 5)

In [8]:
print('X_train', X_train.shape)
print('X_test', X_test.shape)
print('Y_train', Y_train.shape)
print('Y_train', Y_test.shape)

X_train (309, 10)
X_test (133, 10)
Y_train (309,)
Y_train (133,)


# Fit the Model

In [10]:
lm = LinearRegression()

In [11]:
lm.fit(X_train, Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [12]:
y_pred = lm.predict(X_test)

# Metrics

In [17]:
lm.score(X,Y)

0.5140694502468626

In [13]:
# Regression metrics
explained_variance=metrics.explained_variance_score(Y_test, y_pred)
mean_absolute_error=metrics.mean_absolute_error(Y_test, y_pred) 
mse=metrics.mean_squared_error(Y_test, y_pred) 
median_absolute_error=metrics.median_absolute_error(Y_test, y_pred)
r2=metrics.r2_score(Y_test, y_pred)

In [14]:
print('explained_variance: ', round(explained_variance,4))    
print('r2: ', round(r2,4))
print('MAE: ', round(mean_absolute_error,4))
print('MSE: ', round(mse,4))
print('RMSE: ', round(np.sqrt(mse),4))

explained_variance:  0.5234
r2:  0.5209
MAE:  45.4797
MSE:  3121.9699
RMSE:  55.8746


# New Prediction

In [40]:
new_obs = pd.DataFrame(np.array([[0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14]]),columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD','TAX', 'PTRATIO', 'B', 'LSTAT'])

In [42]:
new_pred = lm.predict(new_obs)

In [43]:
print(new_pred)

[24.94240982]
