# Diabetes Progression Prediction using DecisionTree Regressor

#### Load the Diabetes prediction dataset. Divide the data into training and testing sets. Print the training and testing data. Create a Decision Tree Regression model for the dataset and run the model for 500 iterations. Apply Linear Regression to the dataset and print the results. Print the evaluation results and learned model weights. Compare the results with those of Linear Regression in terms of speed and accuracy.

### Load Imports

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score 
from sklearn import metrics
import timeit

#### Load input features and target

In [2]:
diabetes = datasets.load_diabetes()

In [3]:
print(diabetes.feature_names)

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']


#### Display values of 1st sample

In [4]:
print(diabetes.data[:1])  # values are small and centred near zero, so the features appear to be pre-scaled

[[ 0.03807591  0.05068012  0.06169621  0.02187235 -0.0442235  -0.03482076
  -0.04340085 -0.00259226  0.01990842 -0.01764613]]


In [5]:
# Show the first 10 target values
print(diabetes.target[:10])

[151.  75. 141. 206. 135.  97. 138.  63. 110. 310.]


In [6]:
print(diabetes.data.shape)

(442, 10)


In [7]:
print(diabetes.target.shape)

(442,)


#### Split dataset into training set and test set

In [8]:
# Hold out 20% of the samples as the test set; the fixed random_state makes
# the shuffle (and therefore the split) repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    diabetes.data,
    diabetes.target,
    test_size=0.2,
    random_state=0,
)

#### Print shapes

In [9]:
print(X_train.shape)

(353, 10)


In [10]:
print(y_train.shape)

(353,)


In [11]:
print(X_test.shape)

(89, 10)


In [12]:
print(y_test.shape)

(89,)


#### Display some values

In [13]:
print(X_train[:1])

[[ 0.01264814  0.05068012  0.00241654  0.05630106  0.02732605  0.01716188
   0.04127682 -0.03949338  0.00371174  0.07348023]]


In [14]:
print(y_train[:5])

[ 85. 137.  53.  51. 197.]


#### Create DecisionTreeRegressor model

In [21]:
# Deliberately small tree: depth is capped at 4, a node needs at least 5
# samples before it may be split, and the tree may grow at most 10 leaves.
# NOTE: random_state is not set, so ties between equally good splits may be
# broken differently from run to run.
model = DecisionTreeRegressor(
    max_depth=4,
    min_samples_split=5,
    max_leaf_nodes=10,
)

#### Train the model and predict

In [22]:
start = timeit.timeit()

In [23]:
model.fit(X_train, y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=4,
                      max_features=None, max_leaf_nodes=10,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=5,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [24]:
end = timeit.timeit()

In [28]:
print('Training time - DecisionTreeRegressor: {:0.3f} seconds'.format(end-start))

Training time - DecisionTreeRegressor: 0.001 seconds


#### Predict labels for test data

In [29]:
y_pred = model.predict(X_test)
y_pred

array([233.05      , 263.22857143, 233.05      , 125.09210526,
        85.        , 263.22857143,  86.20512821, 225.1875    ,
       147.225     , 225.1875    , 167.42857143, 147.225     ,
       125.09210526,  86.25      , 263.22857143,  86.25      ,
       125.09210526,  86.20512821,  86.25      , 233.05      ,
       233.05      , 147.225     , 167.42857143, 147.225     ,
       225.1875    , 225.1875    ,  86.20512821,  86.20512821,
       167.42857143, 147.225     , 261.25      ,  86.25      ,
       125.09210526, 125.09210526, 167.42857143, 147.225     ,
       125.09210526, 125.09210526,  86.20512821, 225.1875    ,
       125.09210526, 125.09210526, 167.42857143, 167.42857143,
       233.05      ,  86.20512821,  86.20512821, 147.225     ,
       125.09210526, 225.1875    , 125.09210526,  86.25      ,
        85.        , 167.42857143, 225.1875    , 261.25      ,
       225.1875    ,  86.20512821, 125.09210526, 167.42857143,
       225.1875    , 147.225     , 147.225     , 125.09

#### Compare the first ten actual and predicted values

In [30]:
print(y_test[:10])

[321. 215. 127.  64. 175. 275. 179. 232. 142.  99.]


In [31]:
print(y_pred[:10])

[233.05       263.22857143 233.05       125.09210526  85.
 263.22857143  86.20512821 225.1875     147.225      225.1875    ]


#### Results of DecisionTreeRegressor model

In [23]:
print('R-squared value on training dataset: %0.3f' %model.score(X_train, y_train))

R-squared value on training dataset: 0.602


In [24]:
print('R-squared value on test dataset: %0.3f' %model.score(X_test, y_test))

R-squared value on test dataset: 0.022


In [25]:
# Error metrics of the decision tree on the test set. The MSE is computed
# once and reused for the RMSE line.
print('Mean Absolute Error: %0.3f' % metrics.mean_absolute_error(y_test, y_pred))
tree_mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Squared Error: %0.3f' % tree_mse)
print('Root Mean Squared Error: %0.3f' % np.sqrt(tree_mse))

Mean Absolute Error: 56.529
Mean Squared Error: 5017.250
Root Mean Squared Error: 70.833


In [26]:
# Reference scale for the errors printed above: 10% of the mean test-set target.
mean = np.mean(y_test)
print('10% of Mean value of target: {:0.3f}'.format(mean * 0.1))

10% of Mean value of target: 15.422


#### Note: The root mean squared error (70.833) is well above 10 percent of the mean of the target values (15.422)

### Let us try Linear Regression model

#### Create the model

In [27]:
lrmodel = LinearRegression()

#### Train the model

In [28]:
lrmodel.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

#### Predict target labels for test set

In [29]:
y_pred2 = lrmodel.predict(X_test)

#### Results of Linear Regression model

In [30]:
print('R-squared value on training dataset: %0.3f' %lrmodel.score(X_train, y_train))

R-squared value on training dataset: 0.554


In [31]:
print('R-squared value on test dataset: %0.3f' %lrmodel.score(X_test, y_test))

R-squared value on test dataset: 0.332


In [32]:
# Error metrics of the linear regression model on the test set. The MSE is
# computed once and reused for the RMSE line.
print('Mean Absolute Error: %0.3f' % metrics.mean_absolute_error(y_test, y_pred2))
lr_mse = metrics.mean_squared_error(y_test, y_pred2)
print('Mean Squared Error: %0.3f' % lr_mse)
print('Root Mean Squared Error: %0.3f' % np.sqrt(lr_mse))

Mean Absolute Error: 46.174
Mean Squared Error: 3424.317
Root Mean Squared Error: 58.518
