## Example 2: Linear Regression with Diabetes Data

In [8]:
# Import the required libraries
import numpy as np
from sklearn import datasets

#Import the learning algorithm
from sklearn.linear_model import LinearRegression

# Load the diabetes dataset bundled with scikit-learn.
# return_X_y=True makes the loader hand back a pair, which is unpacked here
# into the feature matrix (diabetes_X) and the target vector (diabetes_y).
diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)


### Step 1,2: Data Preprocessing, Feature Engineering

In [2]:
# No preprocessing or feature selection is needed for this dataset

### Step 3: Train/Test Data Splitting

In [3]:
# Split the diabetes data into training and test sets.
# This is a fixed (non-random) hold-out: the last 20 samples form the test
# set and everything before them is used for training.
diabetes_X_train, diabetes_X_test = diabetes_X[:-20], diabetes_X[-20:]
diabetes_y_train, diabetes_y_test = diabetes_y[:-20], diabetes_y[-20:]

### Step 4: Model Creation and Training

In [4]:
# Build a linear regression model and train it on the training split.
# fit() returns the fitted estimator itself, so construction and training
# can be chained into a single assignment.
regr = LinearRegression().fit(diabetes_X_train, diabetes_y_train)


# Predicted target values for the held-out test samples
# (displayed as the cell's output).
regr.predict(diabetes_X_test)



array([197.61846908, 155.43979328, 172.88665147, 111.53537279,
       164.80054784, 131.06954875, 259.12237761, 100.47935157,
       117.0601052 , 124.30503555, 218.36632793,  61.19831284,
       132.25046751, 120.3332925 ,  52.54458691, 194.03798088,
       102.57139702, 123.56604987, 211.0346317 ,  52.60335674])

### Step 5: Model Evaluation

In [5]:
# Mean squared error of the predictions on the test split.
# Printed explicitly: a notebook only displays the value of a cell's LAST
# expression, so a bare expression here would be computed and then discarded.
print('Mean squared error:',
      np.mean((regr.predict(diabetes_X_test) - diabetes_y_test)**2))

# Fitted coefficients, one per input feature.
print(regr.coef_)

# Coefficient of determination (R^2) on the test split:
# 1 is a perfect prediction, 0 is what a constant model that always predicts
# the mean of y would obtain, and the value can be negative for worse models.
regr.score(diabetes_X_test, diabetes_y_test)

[ 3.03499549e-01 -2.37639315e+02  5.10530605e+02  3.27736980e+02
 -8.14131709e+02  4.92814588e+02  1.02848452e+02  1.84606489e+02
  7.43519617e+02  7.60951722e+01]


0.5850753022690574

### Advanced Analysis for Model Selection and Evaluation 

#### Cross Validation 

In [10]:
# Import CV from model selection section
from sklearn.model_selection import cross_val_score


# A fresh, unfitted linear regression model dedicated to cross-validation
regr_CV = LinearRegression()


# Evaluate the model with 10-fold cross-validation on the full dataset
cv_scores = cross_val_score(estimator=regr_CV, X=diabetes_X, y=diabetes_y, cv=10)


# Print the score of each fold (R^2, the default score of a regressor)
# followed by the average over all folds
print(cv_scores)
print('cv_scores mean:{}'.format(np.mean(cv_scores)))

[0.55614411 0.23056092 0.35357777 0.62190498 0.26587602 0.61819338
 0.41815916 0.43515232 0.43436983 0.68568514]
cv_scores mean:0.4619623619583371


#### Tuning model Parameters using GridSearch

In [13]:
from sklearn.model_selection import GridSearchCV


# Create a new linear regression model for the grid search

diab_LR = LinearRegression()

# Candidate values for each LinearRegression constructor parameter.
# The `normalize` option was deprecated in scikit-learn 1.0 and removed in
# 1.2; passing it to a newer release makes GridSearchCV.fit raise an
# "invalid parameter" error. It is therefore only searched over when the
# installed estimator still exposes it, which keeps the original behaviour
# on older releases and lets the cell run on current ones.
param_LR = {'fit_intercept': [True, False]}
if 'normalize' in diab_LR.get_params():
    param_LR['normalize'] = [True, False]
param_LR['copy_X'] = [True, False]


# Exhaustive search over the grid with 10-fold cross-validation.
# NOTE: the name `kNN_GS` is historical (this is a linear regression search,
# not KNN); it is kept unchanged so cells that reference it keep working.
kNN_GS = GridSearchCV(diab_LR, param_LR, cv=10)

kNN_GS.fit(diabetes_X,diabetes_y)

## Best parameter combination found for the linear regression model
print(kNN_GS.best_params_)

## Mean cross-validated score (R^2) obtained with the best parameters
kNN_GS.best_score_

{'copy_X': True, 'fit_intercept': True, 'normalize': True}




0.46165191016428014