## Load data and Pre-processing

In [5]:
import pandas as pd

df  = pd.read_csv('heart.csv', sep=',')

# Check duplicate value
print('Duplicated row:', df.duplicated().sum())
df.drop_duplicates(inplace=True)
print('Duplicated row after dropped:', df.duplicated().sum())

# Checking thal coloumns if there is any null values (represented as 0)
print(df['thal'].value_counts())
df.drop(df[df['thal'] == 0].index, inplace=True) # Drop 0's as they mean missing value
print(df['thal'].value_counts())

data = df.values
features = data[:, 0:13]
target = data[:, 13]

print('Features shape: ', features.shape)
print('Target shape: ', target.shape)

Duplicated row: 1
Duplicated row after dropped: 0
2    165
3    117
1     18
0      2
Name: thal, dtype: int64
2    165
3    117
1     18
Name: thal, dtype: int64
Features shape:  (300, 13)
Target shape:  (300,)


## Split dataset using StratifiedShuffleSplit to make data stratified

In [8]:
from sklearn.model_selection import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(n_splits=2, test_size=0.3, random_state=1)

for train_index, test_index in sss.split(features, target):
    xtrain, xtest = features[train_index], features[test_index]
    ytrain, ytest = target[train_index], target[test_index]

print("xtrain shape", xtrain.shape)
print("ytrain shape", ytrain.shape)
print("xtest shape", xtest.shape)
print("ytest shape", ytest.shape)

xtrain shape (210, 13)
ytrain shape (210,)
xtest shape (90, 13)
ytest shape (90,)


## Standardization using StandardScaler

In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
xtrain = scaler.fit_transform(xtrain)
xtest = scaler.transform(xtest)

## Build classification model using ANN method

In [10]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

for nh in range(1,16): 
    ann = MLPClassifier(hidden_layer_sizes = (nh,), max_iter = 10000, random_state = 1)
    ann.fit(xtrain,ytrain)
    ypred = ann.predict(xtest)
    acc = 100*accuracy_score(ytest,ypred)
    print("Neuron in hidden layer: %g, test accuracy %.2f %%"%(nh, acc))
    ypredtrain = ann.predict(xtrain)
    acc = 100*accuracy_score(ytrain, ypredtrain)
    print("Neuron in hidden layer: %g, train accuracy %.2f %%"%(nh, acc))
    accuracy = cross_val_score(ann, features, target, cv=10)
    print('Mean accuracy: %3.2f%%' %(100*accuracy.mean()))
    print("")

Neuron in hidden layer: 1, test accuracy 80.00 %
Neuron in hidden layer: 1, train accuracy 85.24 %
Mean accuracy: 54.33%

Neuron in hidden layer: 2, test accuracy 82.22 %
Neuron in hidden layer: 2, train accuracy 79.52 %
Mean accuracy: 63.00%

Neuron in hidden layer: 3, test accuracy 83.33 %
Neuron in hidden layer: 3, train accuracy 86.19 %
Mean accuracy: 59.33%

Neuron in hidden layer: 4, test accuracy 84.44 %
Neuron in hidden layer: 4, train accuracy 87.62 %
Mean accuracy: 45.67%

Neuron in hidden layer: 5, test accuracy 87.78 %
Neuron in hidden layer: 5, train accuracy 86.19 %
Mean accuracy: 84.67%

Neuron in hidden layer: 6, test accuracy 85.56 %
Neuron in hidden layer: 6, train accuracy 87.62 %
Mean accuracy: 77.67%

Neuron in hidden layer: 7, test accuracy 87.78 %
Neuron in hidden layer: 7, train accuracy 89.05 %
Mean accuracy: 62.00%

Neuron in hidden layer: 8, test accuracy 81.11 %
Neuron in hidden layer: 8, train accuracy 88.10 %
Mean accuracy: 83.33%

Neuron in hidden layer: 

## Build classification model using Logistic Regression method

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

logr = LogisticRegression(solver='liblinear')
logr.fit(xtrain, ytrain)
ypred = logr.predict(xtest)
acc = 100 * accuracy_score(ytest, ypred)
print('Test accuracy: %2.2f %%'%acc)

ypredtrain = logr.predict(xtrain)
acc = 100 * accuracy_score(ytrain, ypredtrain)
print('Train accuracy: %2.2f %%'%acc)

accuracy = cross_val_score(logr, features, target, cv=10)
print('Mean accuracy: %2.2f%%' %(100*accuracy.mean()))

print(classification_report(ytest, ypred)) #Support adalah jumlah kelas yang sebenarnya

Test accuracy: 86.67 %
Train accuracy: 86.67 %
Mean accuracy: 84.00%
              precision    recall  f1-score   support

         0.0       0.91      0.78      0.84        41
         1.0       0.84      0.94      0.88        49

    accuracy                           0.87        90
   macro avg       0.88      0.86      0.86        90
weighted avg       0.87      0.87      0.87        90



## Classification report on the best model

In [13]:
from sklearn.metrics import classification_report

ann = MLPClassifier(hidden_layer_sizes = (9,), max_iter = 10000, random_state = 1)
ann.fit(xtrain,ytrain)
ypred = ann.predict(xtest)
acc = 100*accuracy_score(ytest,ypred)
print("Neuron in hidden layer: %g, test accuracy %.2f %%"%(9, acc))

ypredtrain = ann.predict(xtrain)
acc = 100*accuracy_score(ytrain, ypredtrain)
print("Neuron in hidden layer: %g, train accuracy %.2f %%"%(9, acc))

accuracy = cross_val_score(ann, features, target, cv=10)
print('Mean accuracy: %3.2f%%' %(100*accuracy.mean()))
print("")

print(classification_report(ytest, ypred)) #Support adalah jumlah kelas yang sebenarnya

Neuron in hidden layer: 9, test accuracy 88.89 %
Neuron in hidden layer: 9, train accuracy 90.48 %
Mean accuracy: 83.00%

              precision    recall  f1-score   support

         0.0       0.90      0.85      0.88        41
         1.0       0.88      0.92      0.90        49

    accuracy                           0.89        90
   macro avg       0.89      0.89      0.89        90
weighted avg       0.89      0.89      0.89        90

