In [12]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [4]:
# Fetch the diabetes dataset that ships with scikit-learn.
#   diabetes.data          -> feature matrix
#   diabetes.target        -> labels to predict
#   diabetes.feature_names -> column names of the feature matrix
diabetes = datasets.load_diabetes()

# Quick sanity check of what was loaded.
print('Dataset shape:', diabetes.data.shape)
print('Diabetes labels shape:', diabetes.target.shape)
print('Diabetes feature names:', diabetes.feature_names)

Dataset shape: (442, 10)
Diabetes labels shape: (442,)
Diabetes feature names: ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']


In [9]:
# Hold out a test split (test_size=0.2) for evaluation; the fixed
# random_state keeps the shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    diabetes.data,
    diabetes.target,
    test_size=0.2,
    random_state=1,
)

In [10]:
# Create the linear regression model and train it on the training split.
# fit() returns the estimator itself, so construction and training are chained.
linReg = LinearRegression().fit(X_train, y_train)

# Show the fitted estimator as the cell's output.
linReg

LinearRegression()

In [11]:
# Use the trained model to predict on the held-out test split, i.e. data
# that was not passed to fit().
y_preds = linReg.predict(X_test)  # predicted labels, one per test row

# Evaluate the model with the mean squared error (MSE) between the true and
# the predicted labels. mean_squared_error comes from the notebook's import
# cell so that all dependencies are declared in one place.
mse = mean_squared_error(y_test, y_preds)
print(f'Mean squared error: {mse:.2f}')

Mean squared error: 2992.56


In [13]:
# Put the true test labels next to the model's predictions so they can be
# compared row by row.
df_result = pd.DataFrame(
    {
        'Actual': y_test,
        'Predicted': y_preds,
    }
)
df_result

Unnamed: 0,Actual,Predicted
0,78.0,119.138000
1,152.0,110.773913
2,200.0,185.037535
3,59.0,68.010043
4,311.0,171.055675
...,...,...
84,64.0,113.351181
85,107.0,111.447774
86,49.0,98.357841
87,60.0,73.104863


In [15]:
# Tabulate the fitted model's coefficients, one row per feature name.
cdf = pd.DataFrame(
    data=linReg.coef_,
    index=diabetes.feature_names,
    columns=['Coefficients'],
)
print(cdf)

     Coefficients
age    -30.621682
sex   -272.254517
bmi    528.844443
bp     327.702690
s1    -581.014130
s2     332.962863
s3     -27.976062
s4     139.284490
s5     665.075210
s6      61.905964
