# Assignment 4

## Import the diabetes dataset from scikit-learn

In [1]:
# Load scikit-learn's bundled diabetes example dataset. The returned object
# exposes .DESCR, .data and .target, which the following cells read.
import sklearn.datasets
diabetes = sklearn.datasets.load_diabetes()

In [2]:
# Show the dataset's built-in description (sample count, features, target).
print(diabetes.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, total serum cholesterol
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, total cholesterol / HDL
      - s5      ltg, possibly log of serum triglycerides level
      - s6      glu, blood sugar level

Note: Each of these 1

In [3]:
# x holds the predictive feature columns, y the disease-progression target
# (per the description printed above: 442 samples, 10 features).
x = diabetes.data
y = diabetes.target

In [4]:
# We are going to use ALL the features stored in x to do regression. And we are NOT going to make any plots this time.

## Compare the results of doing regression on this dataset with k-nearest neighbors, linear regression, decision tree regression, and random forest regression

In [5]:
import numpy as np
import seaborn as sns
import pandas as pd
import sklearn.linear_model
import sklearn.model_selection
import sklearn.neighbors
import sklearn.tree
import matplotlib.pyplot as plt
from sklearn import preprocessing, svm
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

### Linear Regression

In [6]:
# Ordinary least-squares linear regression with scikit-learn's default settings.
model1 = sklearn.linear_model.LinearRegression()

In [7]:
# Hold out 20% of the samples as a test set. random_state=42 fixes the shuffle,
# so the same rows land in train/test every time this cell is run.
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
    x, y, test_size=0.2, random_state=42)

In [8]:
# Fit the linear model on the training split only; the test rows stay unseen.
model1.fit(x_train,y_train)

LinearRegression()

In [9]:
# Predict the target for the held-out test rows; scored against y_test below.
y_pred1 = model1.predict(x_test)

In [10]:
# Sanity check: (rows, feature columns) of the training split.
x_train.shape

(353, 10)

In [11]:
# Sanity check: (rows, feature columns) of the held-out test split.
x_test.shape

(89, 10)

In [12]:
# Report the fitted linear model's parameters and the MSE of its predictions
# against the true target values of the held-out test set.
print('Intercept = ', model1.intercept_)
print('Model coefficients = ', model1.coef_)
print('MSE_linrig = ', mean_squared_error(y_test, y_pred1))

Intercept =  151.3456553477407
Model coefficients =  [  37.90031426 -241.96624835  542.42575342  347.70830529 -931.46126093
  518.04405547  163.40353476  275.31003837  736.18909839   48.67112488]
MSE_linrig =  2900.1732878832318


### k-nearest neighbors

In [13]:
# k-nearest-neighbours regressor that averages the 3 closest training samples.
model2 = sklearn.neighbors.KNeighborsRegressor(n_neighbors=3)

In [14]:
def knntest(n=3):
    """Fit a k-nearest-neighbours regressor and print its test-set MSE.

    Parameters
    ----------
    n : int, default 3
        Number of neighbours handed to ``KNeighborsRegressor``.

    The data is split 80/20 with ``random_state=42``; the model is trained on
    the larger part and scored on the smaller one. Nothing is returned.
    """
    # Build the split first, then the model; the two steps are independent.
    fit_x, eval_x, fit_y, eval_y = sklearn.model_selection.train_test_split(
        x, y, test_size=0.2, random_state=42)

    regressor = sklearn.neighbors.KNeighborsRegressor(n_neighbors=n)
    regressor.fit(fit_x, fit_y)

    test_mse = mean_squared_error(eval_y, regressor.predict(eval_x))
    print('MSE_knn = ', test_mse)

In [15]:
# Re-create the train/test split with the same test_size and random_state as
# the linear-regression section, so this model is scored on the same rows.
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
    x, y, test_size=0.2, random_state=42)

In [16]:
# Fit the 3-neighbour model on the training split only.
model2.fit(x_train,y_train)

KNeighborsRegressor(n_neighbors=3)

In [17]:
# Predict the target for the held-out test rows with the k-NN model.
y_pred2 = model2.predict(x_test)

In [18]:
# 5-fold cross-validation of the k-NN model, using the training split only.
# This scorer reports the *negated* MSE, so every entry is <= 0.
loss = cross_val_score(
    estimator=model2,
    X=x_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
loss

array([-4250.91236307, -4645.72143975, -4536.07668232, -4767.94920635,
       -3825.44603175])

In [19]:
# Flip the sign back to MSE, take the per-fold RMSE, then average the folds
# (a mean of RMSEs, not the root of the mean MSE).
np.sqrt(-loss).mean()

66.3218962404476

In [20]:
# Sweep k = 1..19 and record the cross-validated RMSE for each value.
# k_scores[i] is the mean per-fold RMSE for k = k_range[i].
k_range = range(1, 20)
k_scores = []

for k in k_range:
    knn = sklearn.neighbors.KNeighborsRegressor(n_neighbors=k)
    # Use a dedicated name here: reusing `loss` would overwrite the
    # cross-validation result of model2 that the cells above compute and
    # display, so re-running those cells out of order would report k=19.
    fold_neg_mse = cross_val_score(knn,
                                   x_train,
                                   y_train,
                                   cv=5,
                                   scoring='neg_mean_squared_error')
    k_scores.append(np.sqrt(-fold_neg_mse).mean())

In [21]:
# Report the test-set MSE of a k-NN regressor with k = 10 neighbours.
knntest(n=10)

MSE_knn =  3115.359438202248


### Decision Tree Regression

In [22]:
# Decision-tree regressor limited to 3 levels of splits.
model3 = sklearn.tree.DecisionTreeRegressor(max_depth=3)

In [23]:
def dtMax(n=1):
    """Train a depth-limited decision tree and report how it does on held-out data.

    Parameters
    ----------
    n : int, default 1
        Passed to ``DecisionTreeRegressor`` as ``max_depth``.

    Prints the test-set mean squared error, followed by a text rendering of
    the fitted tree's split rules. Nothing is returned.
    """
    # 80/20 partition with a fixed random_state, as used elsewhere in the notebook.
    fit_x, eval_x, fit_y, eval_y = sklearn.model_selection.train_test_split(
        x, y, test_size=0.2, random_state=42)

    tree = sklearn.tree.DecisionTreeRegressor(max_depth=n)
    tree.fit(fit_x, fit_y)

    test_mse = mean_squared_error(eval_y, tree.predict(eval_x))
    print('MSE_depth',n,' = ', test_mse)

    # Dump the learned rules so the splits can be read directly.
    print(sklearn.tree.export_text(tree))

In [24]:
# Re-create the train/test split with the same test_size and random_state as
# the earlier sections, so this model is scored on the same held-out rows.
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
    x, y, test_size=0.2, random_state=42)

In [25]:
# Fit the depth-3 decision tree on the training split only.
model3.fit(x_train,y_train)

DecisionTreeRegressor(max_depth=3)

In [26]:
# Predict the target for the held-out test rows with the decision tree.
y_pred3 = model3.predict(x_test)

In [27]:
# 5-fold cross-validation of the decision tree, using the training split only.
# This scorer reports the *negated* MSE, so every entry is <= 0.
loss = cross_val_score(
    estimator=model3,
    X=x_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
loss

array([-3967.33998151, -5158.02467042, -3516.99932374, -4685.11190454,
       -3919.4934666 ])

In [28]:
# Flip the sign back to MSE, take the per-fold RMSE, then average the folds
# (a mean of RMSEs, not the root of the mean MSE).
np.sqrt(-loss).mean()

65.03284855777811

In [29]:
# Report the test-set MSE of a depth-3 decision tree and print its split rules.
dtMax(n=3)

MSE_depth 3  =  3656.186930948001
|--- feature_2 <= 0.01
|   |--- feature_8 <= 0.01
|   |   |--- feature_8 <= -0.04
|   |   |   |--- value: [80.88]
|   |   |--- feature_8 >  -0.04
|   |   |   |--- value: [109.92]
|   |--- feature_8 >  0.01
|   |   |--- feature_7 <= 0.09
|   |   |   |--- value: [159.57]
|   |   |--- feature_7 >  0.09
|   |   |   |--- value: [256.33]
|--- feature_2 >  0.01
|   |--- feature_2 <= 0.07
|   |   |--- feature_9 <= 0.03
|   |   |   |--- value: [175.80]
|   |   |--- feature_9 >  0.03
|   |   |   |--- value: [230.52]
|   |--- feature_2 >  0.07
|   |   |--- feature_5 <= 0.02
|   |   |   |--- value: [291.22]
|   |   |--- feature_5 >  0.02
|   |   |   |--- value: [225.75]



### Random Forest Regression

In [30]:
# Depth-3 random forest. random_state is fixed so the fit and the
# cross-validation scores computed from this model are the same on every run;
# without it the bootstrap and feature sampling change the numbers each time.
model4 = sklearn.ensemble.RandomForestRegressor(max_depth=3, random_state=42)

In [31]:
def rfMax(n=1):
    """Fit a random forest of the requested depth and report its test-set MSE.

    Parameters
    ----------
    n : int, default 1
        Maximum depth of every tree in the forest (``max_depth``).

    Prints the mean squared error on the 20% held-out split, then a text
    rendering of the first tree in the ensemble. Returns nothing.
    """
    # Build the forest with the caller's depth: the printed label reports
    # ``n``, so the model must actually use it rather than a fixed depth.
    # random_state pins the forest's internal randomness so the reported MSE
    # is identical on every run.
    forest = sklearn.ensemble.RandomForestRegressor(max_depth=n, random_state=42)

    x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
        x, y, test_size=0.2, random_state=42)

    forest.fit(x_train, y_train)
    y_pred4 = forest.predict(x_test)

    print('MSE_depth',n,' = ', mean_squared_error(y_test, y_pred4))

    # The ensemble has many trees; print only the first one as a sample.
    text_representation = sklearn.tree.export_text(forest.estimators_[0])
    print(text_representation)

In [32]:
# Re-create the train/test split with the same test_size and random_state as
# the earlier sections, so this model is scored on the same held-out rows.
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
    x, y, test_size=0.2, random_state=42)

In [33]:
# Fit the random forest on the training split only.
model4.fit(x_train,y_train)

RandomForestRegressor(max_depth=3)

In [34]:
# Predict the target for the held-out test rows with the random forest.
y_pred4 = model4.predict(x_test)

In [35]:
# 5-fold cross-validation of the random forest, using the training split only.
# This scorer reports the *negated* MSE, so every entry is <= 0.
loss = cross_val_score(
    estimator=model4,
    X=x_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
loss

array([-2972.81712804, -4309.63435091, -2914.32531442, -3536.45856485,
       -3181.7104444 ])

In [36]:
# Flip the sign back to MSE, take the per-fold RMSE, then average the folds
# (a mean of RMSEs, not the root of the mean MSE).
np.sqrt(-loss).mean()

58.00612529970745

In [37]:
# Report the test-set MSE of a depth-2 random forest and print its first tree.
rfMax(n=2)

MSE_depth 2  =  3002.8033606871018
|--- feature_2 <= 0.01
|   |--- feature_8 <= 0.01
|   |   |--- value: [103.55]
|   |--- feature_8 >  0.01
|   |   |--- value: [168.41]
|--- feature_2 >  0.01
|   |--- feature_3 <= 0.02
|   |   |--- value: [174.00]
|   |--- feature_3 >  0.02
|   |   |--- value: [242.82]



## Conclusion

#### Linear Regression
2900.1732878832318

#### k-nearest neighbors
3115.359438202248

#### Decision Tree Regression
3656.186930948001

#### Random Forest Regression
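3002.8033606871018 (as printed by `rfMax(n=2)` above; an earlier run gave 2989.943511307444 — the value shifts between runs when the forest is not seeded)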

 <hr> 

#### **<u>Linear Regression</u> performed the best out of the four (lowest test-set MSE).**