In [128]:
import pandas as pd
import numpy as np
import altair as alt
from scipy.stats import chi2_contingency
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [69]:
# Render floats in DataFrame output with a thousands separator and 3 decimals.
pd.set_option('display.float_format', '{:,.3f}'.format)

In [4]:
# Load the raw survey data; the file uses ';' as its field separator.
data = pd.read_csv('diabetes_data.csv', sep=';')

In [8]:
# Preview the first rows to check that the file parsed into the expected columns.
data.head()

Unnamed: 0,age,gender,polyuria,polydipsia,sudden_weight_loss,weakness,polyphagia,genital_thrush,visual_blurring,itching,irritability,delayed_healing,partial_paresis,muscle_stiffness,alopecia,obesity,class
0,40,Male,0,1,0,1,0,0,0,1,0,1,0,1,1,1,1
1,58,Male,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1
2,41,Male,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1
3,45,Male,0,0,1,1,1,1,0,1,0,1,0,0,0,0,1
4,60,Male,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1


In [52]:
# Encode gender as a binary flag: 1 for 'Female', 0 for every other value.
# `isin(['Female', 1])` also matches the already-encoded value, so re-running
# this cell leaves the column unchanged instead of resetting every row to 0.
data['gender'] = data['gender'].isin(['Female', 1]).astype(int)

In [11]:
# Relative frequency of each target label, to see how balanced the classes are.
data['class'].value_counts(normalize=True)

1    0.615385
0    0.384615
Name: class, dtype: float64

In [89]:
def do_chi2(df, cols, target, alpha=0.05):
  """Return the columns whose association with ``target`` is significant.

  For every column in ``cols`` a chi-square test of independence is run on
  the contingency table of that column against ``target`` and the outcome
  is printed.

  Args:
    df: DataFrame containing the candidate columns and the target column.
    cols: Iterable of column names to test.
    target: Name of the target column.
    alpha: Significance level. H0 (independence) is rejected when the
      p-value is below it. Defaults to 0.05.

  Returns:
    List of the column names for which H0 was rejected, in input order.
  """
  # Local import: the chi-square distribution is only needed to report the
  # critical value, and the notebook's import cell does not provide it.
  from scipy.stats import chi2

  significant_cols = []

  for col in cols:
    contingency_table = pd.crosstab(df[col], df[target])
    stat, p_val, dof, _ = chi2_contingency(contingency_table)

    # The critical value depends on the table's degrees of freedom, so it is
    # derived per column instead of assuming a 2x2 table (DOF = 1).
    critical_value = chi2.ppf(1 - alpha, dof) if dof > 0 else float('nan')

    print(f"For {col} p_val: {p_val}, chi2_stat: {round(stat,2)}, critical value: {round(critical_value, 3)}")

    # p_val < alpha is equivalent to stat > critical_value for the correct
    # DOF, so a single comparison is enough.
    if p_val < alpha:
      print("Reject the H0. Variable is significant.")
      significant_cols.append(col)
    else:
      print('Fail to reject the H0.')

  return significant_cols

In [90]:
# Every column except the continuous 'age' and the target 'class' goes into the chi-square test.
columns_for_chi2 = data.columns.drop(['age', 'class']).tolist()

In [91]:
# Keep only the columns whose chi-square test against 'class' rejects H0.
significant_cols = do_chi2(df=data, cols=columns_for_chi2, target='class')

For gender p_val: 3.289703730553294e-24, chi2_stat: 103.04, critical value: 3.841
Reject the H0. Variable is significant.
For polyuria p_val: 1.7409117803442155e-51, chi2_stat: 227.87, critical value: 3.841
Reject the H0. Variable is significant.
For polydipsia p_val: 6.1870096408863144e-49, chi2_stat: 216.17, critical value: 3.841
Reject the H0. Variable is significant.
For sudden_weight_loss p_val: 5.969166262549937e-23, chi2_stat: 97.3, critical value: 3.841
Reject the H0. Variable is significant.
For weakness p_val: 4.869843446585542e-08, chi2_stat: 29.77, critical value: 3.841
Reject the H0. Variable is significant.
For polyphagia p_val: 1.1651584346409135e-14, chi2_stat: 59.6, critical value: 3.841
Reject the H0. Variable is significant.
For genital_thrush p_val: 0.016097902991938178, chi2_stat: 5.79, critical value: 3.841
Reject the H0. Variable is significant.
For visual_blurring p_val: 1.7015036753241196e-08, chi2_stat: 31.81, critical value: 3.841
Reject the H0. Variable is s

In [107]:
# Show the full list of columns that will make up the modelling dataset.
[*significant_cols, 'age', 'class']

['gender',
 'polyuria',
 'polydipsia',
 'sudden_weight_loss',
 'weakness',
 'polyphagia',
 'genital_thrush',
 'visual_blurring',
 'irritability',
 'partial_paresis',
 'muscle_stiffness',
 'alopecia',
 'age',
 'class']

In [120]:
# Min-max scale 'age' and write the result back into the same column.
# NOTE(review): the scaler is fitted on every row before the train/valid/test
# split further down, so held-out rows influence the scaling - consider
# fitting on the training rows only.
scaler = MinMaxScaler()
data['age'] = scaler.fit_transform(data[['age']].to_numpy())

In [123]:
# Modelling frame: the significant categorical columns, then 'age', with the target 'class' last.
dataset = data.loc[:, [*significant_cols, 'age', 'class']]

In [124]:
# Preview the modelling frame; 'class' is expected to be the last column.
dataset.head()

Unnamed: 0,gender,polyuria,polydipsia,sudden_weight_loss,weakness,polyphagia,genital_thrush,visual_blurring,irritability,partial_paresis,muscle_stiffness,alopecia,age,class
0,0,0,1,0,1,0,0,0,0,0,1,1,0.324,1
1,0,0,0,0,1,0,0,1,0,1,0,1,0.568,1
2,0,1,0,0,1,1,0,0,0,0,1,1,0.338,1
3,0,0,0,1,1,1,1,0,0,0,0,0,0.392,1
4,0,1,1,1,1,1,0,1,1,1,1,1,0.595,1


In [125]:
# Features are every column but the last; the last column ('class') is the label.
X = dataset.iloc[:, :-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy()


In [129]:
# Hold out 40% of the rows, then halve that hold-out into validation and test
# parts (roughly 60/20/20 overall). Both splits use a fixed random_state.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=0
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=0
)

In [146]:
# Fix TensorFlow's global seed so the random weight initialisation below is
# repeatable across kernel restarts (the data split already uses a fixed seed).
tf.random.set_seed(0)

# Feed-forward binary classifier: two 16-unit ReLU hidden layers followed by
# a single sigmoid output unit.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

In [147]:
# Training setup: binary cross-entropy loss, Adam with learning rate 0.001,
# and accuracy reported as the metric.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

In [148]:
# Train for 30 epochs in batches of 16, passing the validation split as validation_data.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=16,
    epochs=30,
    validation_data=(X_valid, y_valid),
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fe079a22d90>

In [149]:
# Score the held-out test split; with the compiled metric the result is [loss, accuracy].
model.evaluate(x=X_test, y=y_test)



[0.16923364996910095, 0.9519230723381042]