In [90]:
#importing required libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error
import pickle

In [2]:
# import data set
diabetes_data = pd.read_csv('diabetes.csv')

In [3]:
diabetes_data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
diabetes_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [7]:
diabetes_data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [8]:
# separate data into features and label
features = diabetes_data[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']]
label = diabetes_data[['Outcome']]


In [9]:
features.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [10]:
label.head()

Unnamed: 0,Outcome
0,1
1,0
2,1
3,0
4,1


In [13]:
features_train, features_test, labels_train, labels_test = train_test_split(features, label, test_size=0.1, shuffle=True)

In [14]:
features_train.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
143,10,108,66,0,0,32.4,0.272,42
450,1,82,64,13,95,21.2,0.415,23
625,4,90,88,47,54,37.7,0.362,29
458,10,148,84,48,237,37.6,1.001,51
189,5,139,80,35,160,31.6,0.361,25


In [15]:
len(features_train)

691

In [16]:
len(features_test)

77

In [17]:
len(labels_train)

691

In [18]:
len(labels_test)

77

In [19]:
# create linear regression model
lr_model = LinearRegression()

In [22]:
# fit data into linear regression model
lr_model.fit(features_train, labels_train)

LinearRegression()

In [24]:
# make prediction on test features
predictions = lr_model.predict(features_test)

In [27]:
predictions[0:20]

array([[ 7.79122712e-01],
       [ 6.92145031e-01],
       [ 2.24342977e-01],
       [ 5.04114470e-01],
       [ 2.57844831e-04],
       [ 3.10760942e-01],
       [ 8.30811036e-01],
       [ 5.35098962e-01],
       [ 3.65606762e-01],
       [-3.10717483e-02],
       [ 7.28161876e-01],
       [-1.65141535e-01],
       [ 3.78859703e-01],
       [ 2.83127671e-01],
       [ 4.29733688e-01],
       [ 9.93338431e-02],
       [ 4.70339113e-01],
       [ 2.07486933e-01],
       [ 2.20193940e-01],
       [ 3.68376963e-02]])

In [28]:
# perform model evaluation with linear regression model
errors = mean_squared_error(labels_test, predictions, squared=False)

In [29]:
# the lower the mean squared error, the more efficient the model
errors

0.40826369606250895

In [30]:
# perform grid search with lasso
lasso_model = Lasso()

In [33]:
# declare lasso parameters
params = {
    'alpha': [0.001, 0.01,0.02, 0.024, 0.025, 0.026, 0.03,0.04, 0.05, 0.06, 0.08, 1, 2, 3, 5, 8, 10, 20, 50, 100],
    'max_iter': [10, 100, 1000, 10000, 100000, 1000000]
}

In [34]:
# create grid search cv with lasso
lasso_cv = GridSearchCV(lasso_model, params)

In [35]:
# fit data into lasso cv
lasso_cv.fit(features_train, labels_train)

  positive)
  positive)
  positive)


GridSearchCV(estimator=Lasso(),
             param_grid={'alpha': [0.001, 0.01, 0.02, 0.024, 0.025, 0.026, 0.03,
                                   0.04, 0.05, 0.06, 0.08, 1, 2, 3, 5, 8, 10,
                                   20, 50, 100],
                         'max_iter': [10, 100, 1000, 10000, 100000, 1000000]})

In [36]:
lasso_cv.best_params_

{'alpha': 0.001, 'max_iter': 100}

In [37]:
lasso_predictions = lasso_cv.predict(features_test)

In [39]:
lasso_predictions[0:20]

array([ 7.82911347e-01,  6.90860905e-01,  2.28205664e-01,  5.06135016e-01,
        1.40696433e-04,  3.08521206e-01,  8.34463654e-01,  5.38008299e-01,
        3.62597508e-01, -2.92600459e-02,  7.25303386e-01, -1.64201194e-01,
        3.79947538e-01,  2.83661670e-01,  4.30374871e-01,  9.96952980e-02,
        4.74281900e-01,  2.09759783e-01,  2.12101120e-01,  3.90225140e-02])

In [78]:
type(lasso_predictions)

numpy.ndarray

In [79]:
lasso_predictions.shape

(77,)

In [40]:
lasso_error = mean_squared_error(labels_test, lasso_predictions, squared=False)

In [41]:
lasso_error

0.4084950787704506

In [42]:
# create ridge classifier
ridge_model = Ridge()

In [43]:
# declare parameters for ridge model
ridge_params = {
    'alpha': [0, 1, 0.1, 0.01, 0.001, 0.0001, 0.02, 0.024, 0.025, 0.026, 0.03,0.04, 0.05, 0.06, 0.08, 1, 2, 3, 5, 8, 10, 20, 50, 100],
    'max_iter': [10, 100, 1000, 10000, 100000, 1000000],
    'fit_intercept': [True, False], 
    'solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}

In [44]:
# create grid search with ridge
ridge_cv = GridSearchCV(ridge_model, ridge_params)

In [45]:
# fit data into ridge cv
ridge_cv.fit(features_train, labels_train)

















































GridSearchCV(estimator=Ridge(),
             param_grid={'alpha': [0, 1, 0.1, 0.01, 0.001, 0.0001, 0.02, 0.024,
                                   0.025, 0.026, 0.03, 0.04, 0.05, 0.06, 0.08,
                                   1, 2, 3, 5, 8, 10, 20, 50, 100],
                         'fit_intercept': [True, False],
                         'max_iter': [10, 100, 1000, 10000, 100000, 1000000],
                         'solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg',
                                    'sag', 'saga']})

In [47]:
ridge_cv.best_params_

{'alpha': 10, 'fit_intercept': True, 'max_iter': 10, 'solver': 'svd'}

In [48]:
ridge_predictions = ridge_cv.predict(features_test)

In [71]:
ridge_predictions.shape

(77, 1)

In [50]:
ridge_error = mean_squared_error(labels_test, ridge_predictions, squared=False)

In [51]:
ridge_error

0.408639938814796

In [52]:
# create decision regressor model
regressor_model = DecisionTreeRegressor()

In [60]:
# create params for decision regressor model
regressor_params = {
    'criterion': ['mse', 'friedman_mse', 'mae', 'poisson'],
    'splitter': ['best','random'],
    'max_features': ['auto','log2','sqrt',None],
}

In [61]:
# create grid search with regressor
regressor_cv = GridSearchCV(regressor_model, regressor_params)

In [62]:
# fit data to regressor cv model
regressor_cv.fit(features_train, labels_train)

GridSearchCV(estimator=DecisionTreeRegressor(),
             param_grid={'criterion': ['mse', 'friedman_mse', 'mae', 'poisson'],
                         'max_features': ['auto', 'log2', 'sqrt', None],
                         'splitter': ['best', 'random']})

In [63]:
regressor_cv.best_params_

{'criterion': 'poisson', 'max_features': 'sqrt', 'splitter': 'random'}

In [80]:
regressor_predictions = regressor_cv.predict(features_test)

In [81]:
regressor_predictions.shape

(77,)

In [77]:
len(labels_test)

77

In [82]:
regressor_error = mean_squared_error(labels_test, regressor_predictions, squared=False)

In [83]:
regressor_error

0.4463509315542547

In [91]:
# export ridge model
with open('diabetes_ridge_model.pickle', 'wb') as f:
    pickle.dump(ridge_cv, f)

In [92]:
# load in ridge model to test
with open('diabetes_ridge_model.pickle', 'rb') as f:
    test_model = pickle.load(f)

In [93]:
preds = test_model.predict(features_test)

In [94]:
preds[0:20]

array([[ 7.86064763e-01],
       [ 6.90353795e-01],
       [ 2.31130706e-01],
       [ 5.07523226e-01],
       [-4.44860300e-04],
       [ 3.05983131e-01],
       [ 8.37904636e-01],
       [ 5.40132271e-01],
       [ 3.60029335e-01],
       [-2.85714310e-02],
       [ 7.23452062e-01],
       [-1.63937383e-01],
       [ 3.80549887e-01],
       [ 2.83843549e-01],
       [ 4.30498441e-01],
       [ 1.00224266e-01],
       [ 4.76948976e-01],
       [ 2.10861285e-01],
       [ 2.05929871e-01],
       [ 4.07222249e-02]])

array([0.04072222])

In [98]:
# test input
user = {
    'Pregnancies': 7,
    'Glucose': 34,
    'BloodPressure': 73.2,
    'SkinThickness': 43,
    'Insulin': 115,
    'BMI': 27.23,
    'DiabetesPedigreeFunction': 0.791,
    'Age': 23
}

In [99]:
user.items()

dict_items([('Pregnancies', 7), ('Glucose', 34), ('BloodPressure', 73.2), ('SkinThickness', 43), ('Insulin', 115), ('BMI', 27.23), ('DiabetesPedigreeFunction', 0.791), ('Age', 23)])

In [105]:
user_keys = user.keys()


In [110]:
user_keys = list(user_keys)

In [111]:
user_keys

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age']

In [115]:
user_df = pd.DataFrame([user])

In [117]:
user_df.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,7,34,73.2,43,115,27.23,0.791,23


In [118]:
def parse_data(data_dict):
    df = pd.DataFrame([data_dict])
    return df

In [119]:
new_df = parse_data(user)

In [120]:
new_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,7,34,73.2,43,115,27.23,0.791,23


In [129]:
preds2 = test_model.predict(new_df)

In [130]:
new_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,7,34,73.2,43,115,27.23,0.791,23


In [149]:
# make test predictions
import requests

#url = 'https://diabetes-ml-model.herokuapp.com/predict'
url = 'http://0.0.0.0:8080/predict'
body = user

In [150]:
# make request
response = requests.post(url, json=body)

In [151]:
response.text.strip()

'{\n  "prediction": [\n    [\n      -0.1733685957014789\n    ]\n  ]\n}'

In [142]:
print(pickle.format_version)

4.0
