<a href="https://colab.research.google.com/github/wetherc/data-3000/blob/main/labs/02_20_neural_network_review.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.3-py3-none-any.whl (7.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.3


In [5]:
import pandas as pd
from ucimlrepo import fetch_ucirepo

# Pull dataset 891 from the UCI ML repository.
cdc_diabetes_health_indicators = fetch_ucirepo(id=891)

# Features and targets (both delivered as pandas dataframes).
X, y = (
    cdc_diabetes_health_indicators.data.features,
    cdc_diabetes_health_indicators.data.targets,
)

# Show the dataset-level metadata first, then the per-variable information.
print(
    cdc_diabetes_health_indicators.metadata,
    cdc_diabetes_health_indicators.variables,
    sep='\n',
)

{'uci_id': 891, 'name': 'CDC Diabetes Health Indicators', 'repository_url': 'https://archive.ics.uci.edu/dataset/891/cdc+diabetes+health+indicators', 'data_url': 'https://archive.ics.uci.edu/static/public/891/data.csv', 'abstract': 'The Diabetes Health Indicators Dataset contains healthcare statistics and lifestyle survey information about people in general along with their diagnosis of diabetes. The 35 features consist of some demographics, lab test results, and answers to survey questions for each patient. The target variable for classification is whether a patient has diabetes, is pre-diabetic, or healthy. ', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Tabular', 'Multivariate'], 'num_instances': 253680, 'num_features': 21, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Sex', 'Age', 'Education Level', 'Income'], 'target_col': ['Diabetes_binary'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_

In [6]:
import pandas as pd

# Work on an independent copy of the feature frame. `pd.DataFrame(X)` on its
# own does not copy the underlying data, so the in-place edits made in later
# cells (rescaling columns, adding `diet`, dropping `Age`) may otherwise act
# on data shared with the fetched dataset object.
X = pd.DataFrame(X).copy()

# Columns that are standardized (z-scored) in the next step.
int_cols = [
    'BMI',
    'Education',
    'Income',
    'Age',
    'GenHlth',
    'MentHlth',
    'PhysHlth'
]

In [7]:
# Standardize every column listed in `int_cols`: subtract the column mean and
# divide by the column standard deviation, overwriting the original values.
# NOTE(review): the mean/std are taken over the full frame, i.e. before the
# train/test split further down -- confirm that this is intended.

for col in int_cols:
    column = X[col]
    center, spread = column.mean(), column.std()
    X[col] = column.sub(center).div(spread)

In [8]:
# Summary statistics of the rescaled columns (mean ~0 and std 1 are expected).
X[int_cols].describe()

Unnamed: 0,BMI,Education,Income,Age,GenHlth,MentHlth,PhysHlth
count,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0
mean,-2.455865e-16,-1.281711e-16,6.453368e-17,9.500792000000001e-17,2.097345e-16,-2.0839e-17,8.828566e-17
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-2.478911,-4.108886,-2.440133,-2.302427,-1.414529,-0.4296288,-0.4865915
25%,-0.663121,-1.065593,-0.5088362,-0.6653479,-0.4786178,-0.4296288,-0.4865915
50%,-0.2091735,-0.05116143,0.456812,-0.01051634,-0.4786178,-0.4296288,-0.4865915
75%,0.3960898,0.9632697,0.9396361,0.6443152,0.4572934,-0.1598269,-0.1424739
max,10.53425,0.9632697,0.9396361,1.626563,2.329116,3.617399,2.954584


In [23]:
# Derived feature: fruit and vegetable indicators count in favour, heavy
# alcohol consumption counts against.
X['diet'] = X['Fruits'].add(X['Veggies']).sub(X['HvyAlcoholConsump'])

In [24]:
# Distribution of the derived `diet` score.
X.loc[:, 'diet'].describe()

count    253680.000000
mean          1.389479
std           0.733420
min          -1.000000
25%           1.000000
50%           2.000000
75%           2.000000
max           2.000000
Name: diet, dtype: float64

In [27]:
# Remove the `Age` column. Rebinding `X` (instead of `inplace=True`) together
# with errors='ignore' keeps this cell re-runnable: executing it a second time
# is a no-op rather than a KeyError.
X = X.drop(columns=['Age'], errors='ignore')

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
# 70% of the rows go to training, the rest to testing; the fixed
# random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)

In [31]:
# Sanity check on the split: (rows, columns) of the training features.
X_train.shape

(177576, 21)

In [32]:
import tensorflow as tf

In [33]:
# Feature columns fed to the network. Each name is used both as the key of
# `inputs` and as the Keras layer name, so the two always agree (the
# hand-written version named the 'diet' input 'Diet', which does not match
# its dict key or the DataFrame column).
feature_names = ['GenHlth', 'BMI', 'PhysHlth', 'diet', 'HighBP', 'HighChol']

# One scalar float32 Input layer per feature, keyed by feature name.
inputs = {
    feature: tf.keras.layers.Input(
        shape=(1,),
        dtype=tf.float32,
        name=feature)
    for feature in feature_names
}

In [None]:
# Join the six scalar inputs into one feature tensor.
# Plain indexing is used instead of `.get()` so a misspelled key raises a
# KeyError immediately; the previous `inputs.get('BMHighChol')` silently
# passed None to Concatenate.
preprocessing_layers = tf.keras.layers.Concatenate()(
    [inputs['GenHlth'], inputs['BMI'], inputs['PhysHlth'],
     inputs['diet'], inputs['HighBP'], inputs['HighChol']])

# Two fully connected hidden layers (1000 then 500 units), sigmoid activations.
hidden = tf.keras.layers.Dense(1000, activation='sigmoid')(preprocessing_layers)
hidden = tf.keras.layers.Dense(500, activation='sigmoid')(hidden)

# Single sigmoid unit: one value in (0, 1) per row.
output = tf.keras.layers.Dense(1, activation='sigmoid')(hidden)