In [2]:
import os
import warnings
import sys

import pandas as pd
import numpy as np
from itertools import cycle
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import lasso_path, enet_path
from sklearn import datasets

In [62]:
diabetes_raw = pd.read_csv('diabetes_tab.txt', delimiter='\t', header=0)

In [63]:
diabetes_raw.columns = map(str.lower, diabetes_raw.columns)

display(diabetes_raw)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,y
0,59,2,32.1,101.00,157,93.2,38.0,4.00,4.8598,87,151
1,48,1,21.6,87.00,183,103.2,70.0,3.00,3.8918,69,75
2,72,2,30.5,93.00,156,93.6,41.0,4.00,4.6728,85,141
3,24,1,25.3,84.00,198,131.4,40.0,5.00,4.8903,89,206
4,50,1,23.0,101.00,192,125.4,52.0,4.00,4.2905,80,135
...,...,...,...,...,...,...,...,...,...,...,...
437,60,2,28.2,112.00,185,113.8,42.0,4.00,4.9836,93,178
438,47,2,24.9,75.00,225,166.0,42.0,5.00,4.4427,102,104
439,60,2,24.9,99.67,162,106.6,43.0,3.77,4.1271,95,132
440,36,1,30.0,95.00,201,125.2,42.0,4.79,5.1299,85,220


In [110]:
diabetes_mean = diabetes_raw.mean()
diabetes_std = diabetes_raw.std()

display(diabetes_mean)

display(diabetes_std)

age     48.518100
sex      1.468326
bmi     26.375792
bp      94.647014
s1     189.140271
s2     115.439140
s3      49.788462
s4       4.070249
s5       4.641411
s6      91.260181
y      152.133484
dtype: float64

age    13.109028
sex     0.499561
bmi     4.418122
bp     13.831283
s1     34.608052
s2     30.413081
s3     12.934202
s4      1.290450
s5      0.522391
s6     11.496335
y      77.093005
dtype: float64

In [65]:
def normalize(df):
    dfN = (df-diabetes_mean)/diabetes_std
    dfN["y"] = df["y"]
    return dfN

In [67]:
#diabetes_n = (diabetes_raw-diabetes_mean)/diabetes_std
diabetes_n = normalize(diabetes_raw)

display(diabetes_n)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,y
0,0.799594,1.064282,1.295620,0.459320,-0.928693,-0.731236,-0.911418,-0.054438,0.418057,-0.370569,151
1,-0.039522,-0.937474,-1.080955,-0.552878,-0.177423,-0.402430,1.562643,-0.829361,-1.434962,-1.936285,75
2,1.791277,1.064282,0.933475,-0.119079,-0.957588,-0.718084,-0.679475,-0.054438,0.060087,-0.544537,141
3,-1.870322,-0.937474,-0.243495,-0.769778,0.256002,0.524802,-0.756789,0.720486,0.476443,-0.196600,206
4,0.113044,-0.937474,-0.764079,0.459320,0.082632,0.327519,0.170984,-0.054438,-0.671740,-0.979458,135
...,...,...,...,...,...,...,...,...,...,...,...
437,0.875877,1.064282,0.412892,1.254619,-0.119633,-0.053896,-0.602160,-0.054438,0.655045,0.151337,178
438,-0.115806,1.064282,-0.334032,-1.420477,1.036167,1.662471,-0.602160,0.720486,-0.380388,0.934195,104
439,0.875877,1.064282,-0.334032,0.363161,-0.784218,-0.290636,-0.524846,-0.232670,-0.984533,0.325305,132
440,-0.954922,-0.937474,0.820305,0.025521,0.342687,0.320943,-0.602160,0.557752,0.935103,-0.544537,220


In [68]:
train, test = train_test_split(diabetes_n)

train_x = train.drop(["y"], axis=1)
test_x = test.drop(["y"], axis=1)

train_y = train[["y"]]
test_y = test[["y"]]




In [69]:
# Evaluate metrics
def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2

In [70]:
alpha = 0.01
l1_ratio = 0.01

In [71]:
# Run ElasticNet
lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
lr.fit(train_x, train_y)
predicted_qualities = lr.predict(test_x)
(rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)


In [72]:
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#     print(test_y)
#     print(predicted_qualities)

# Print out ElasticNet model metrics
print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
print("  RMSE: %s" % rmse)
print("  MAE: %s" % mae)
print("  R2: %s" % r2)

Elasticnet model (alpha=0.010000, l1_ratio=0.010000):
  RMSE: 55.8164856610314
  MAE: 46.92032203420434
  R2: 0.4215780642440201


In [87]:
data1 = [{ "age" : 68, "sex" : 1, "bmi": 27.5, "bp": 107, "s1": 241, "s2": 149.6, "s3"  : 64, "s4"  : 4, "s5"  : 4.92, "s6"  : 90, "y": 143}]

# Should predict around 143

df1 = pd.DataFrame(data1)
display(df1)

#df1Norm = (newData1 - diabetes_mean) / diabetes_std
df1Norm = normalize(df1)

display(df1Norm)


Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,y
0,68,1,27.5,107,241,149.6,64,4,4.92,90,143


Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,y
0,1.486144,-0.937474,0.254454,0.893119,1.498487,1.123229,1.098756,-0.054438,0.533297,-0.109616,143


In [88]:
data2 = [{ "age" : 60, "sex" : 2, "bmi": 22.3, "bp": 113, "s1": 186, "s2": 125.8, "s3"  : 46, "s4"  : 4, "s5"  : 4.2627, "s6"  : 94, "y": 71}]


# Should predict around 143

df2 = pd.DataFrame(data2)
display(df2)

#newData2Norm = (newData2 - diabetes_mean) / diabetes_std
df2Norm = normalize(df2)

display(df2Norm)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,y
0,60,2,22.3,113,186,125.8,46,4,4.2627,94,71


Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,y
0,0.875877,1.064282,-0.922517,1.326919,-0.090738,0.340671,-0.292903,-0.054438,-0.724957,0.238321,71


In [159]:
newPred1 = lr.predict(df1Norm.drop(["y"], axis=1))

print("newPred1 %f" % newPred1)

#(rmse1, mae1, r21) = eval_metrics([143], newPred1)

# Print out ElasticNet model metrics
#print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
#print("  RMSE: %s" % rmse1)
#print("  MAE: %s" % mae1)
#print("  R2: %s" % r21)

newPred1 182.430777


In [158]:
newPred2 = lr.predict(df2Norm.drop(["y"], axis=1))

print("newPred2 %f" % newPred2)

#(rmse2, mae2, r22) = eval_metrics([71], newPred2)

# Print out ElasticNet model metrics
#print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
#print("  RMSE: %s" % rmse2)
#print("  MAE: %s" % mae2)
#print("  R2: %s" % r22)

newPred2 134.458799


In [99]:
def predict(df):
    dfNorm = normalize(df)
    dfNorm = dfNorm.drop(["y"],axis=1)
    predictedValue = lr.predict(dfNorm)
    return predictedValue


In [101]:
display(df1)
print(predict(df1))
display(df2)
print(predict(df2))

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,y
0,68,1,27.5,107,241,149.6,64,4,4.92,90,143


[182.43077711]


Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,y
0,60,2,22.3,113,186,125.8,46,4,4.2627,94,71


[134.45879862]


In [149]:
print(diabetes_mean.size)
print(diabetes_mean.dtype)

11
float64


In [150]:
diabetes_mean.to_csv("diabetes_mean_tab.txt", header=False, index=True)
diabetes_std.to_csv("diabetes_std_tab.txt", header=False, index=True)

In [153]:
new_d_mean = pd.read_csv("diabetes_mean_tab.txt", header=None, index_col=0, squeeze =True)

display(new_d_mean)

0
age     48.518100
sex      1.468326
bmi     26.375792
bp      94.647014
s1     189.140271
s2     115.439140
s3      49.788462
s4       4.070249
s5       4.641411
s6      91.260181
y      152.133484
Name: 1, dtype: float64

In [156]:
new_d_std = pd.read_csv("diabetes_std_tab.txt", header=None, index_col=0, squeeze =True)
display(new_d_std)

0
age    13.109028
sex     0.499561
bmi     4.418122
bp     13.831283
s1     34.608052
s2     30.413081
s3     12.934202
s4      1.290450
s5      0.522391
s6     11.496335
y      77.093005
Name: 1, dtype: float64

In [157]:
display( (df1-new_d_mean)/new_d_std )

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,y
0,1.486144,-0.937474,0.254454,0.893119,1.498487,1.123229,1.098756,-0.054438,0.533297,-0.109616,-0.118474
