In [1]:
# read in the necessary packages
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split


In [2]:
# read in the dataset
dataset = pd.read_excel('dataset.xlsx',converters={'诊断结果（0无；1轻度；2重度）':int})

# fill in the missing value with 0
col_set = dataset.columns
for column in col_set :
    dataset[column].fillna(0, inplace = True)

# split the target variable and trainning features
X = dataset.iloc[:,:-1]
y = dataset['诊断结果（0无；1轻度；2重度）']

In [3]:
# split training set and test set
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y_train = np_utils.to_categorical(Y_train)
dummy_y_test = np_utils.to_categorical(Y_test)

In [4]:
# define neural network model
def baseline_model():
	# create model
	model = Sequential()
	model.add(Dense(11, input_dim=47, activation='relu'))
	model.add(Dense(3, activation='softmax'))
	# Compile model
	model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
	return model

In [5]:
estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5, verbose=0)
clf = estimator.fit(X_train,dummy_y_train)

y_predicted = clf.model.predict(X_test)

In [6]:
y_output = []
for each in y_predicted:
    a,b,c = each
    if max(a,b,c) == a:
        output = 0
    elif max(a,b,c) == b:
        output = 1
    else:
        output = 2
    y_output.append(output)

In [7]:
# prediction accuracy
np.mean(y_output == Y_test)

0.8210526315789474

In [8]:

from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
precision_recall_fscore_support(Y_test, y_output, average=None)

(array([0.8       , 0.96296296, 0.6440678 ]),
 array([0.95238095, 0.71559633, 0.97435897]),
 array([0.86956522, 0.82105263, 0.7755102 ]),
 array([ 42, 109,  39], dtype=int64))

In [8]:
# evaluate the model with K-fold cross validation
kfold = KFold(n_splits=5, shuffle=True)
results = cross_val_score(estimator, X_test, dummy_y_test, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Baseline: 66.32% (7.70%)
