In [206]:
# Load libraries
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

In [207]:
col_names = ['id', 'Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS', 'RestingECG',
            'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope', 'HeartDisease']
data = pd.read_csv("heart_train.csv", header=0, skiprows=1, names=col_names, low_memory=False)

In [208]:
data.head()

Unnamed: 0,id,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
1,1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,4,39,M,NAP,120,339,0,Normal,170,N,0.0,Up,0
5,5,54,M,ATA,110,208,0,Normal,142,N,0.0,Up,0


In [209]:
mapping_sex = {'M': 1, 'F': 0}
mapping_chest_pain = {'TA': 1, 'ATA': 2, 'NAP': 3, 'ASY': 4}
mapping_resting_ecg = {'Normal': 0, 'ST': 1, 'LVH': 2}
mapping_exercise_angina = {'Y': 1, 'N': 0}
mapping_st_slope = {'Up': 1, 'Flat': 2, 'Down': 3}

In [210]:
data['Sex'] = data['Sex'].map(mapping_sex)
data['ChestPainType'] = data['ChestPainType'].map(mapping_chest_pain)
data['RestingECG'] = data['RestingECG'].map(mapping_resting_ecg)
data['ExerciseAngina'] = data['ExerciseAngina'].map(mapping_exercise_angina)
data['ST_Slope'] = data['ST_Slope'].map(mapping_st_slope)

In [211]:
data.head()

Unnamed: 0,id,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
1,1,49,0,3,160,180,0,0,156,0,1.0,2,1
2,2,37,1,2,130,283,0,1,98,0,0.0,1,0
3,3,48,0,4,138,214,0,0,108,1,1.5,2,1
4,4,39,1,3,120,339,0,0,170,0,0.0,1,0
5,5,54,1,2,110,208,0,0,142,0,0.0,1,0


In [212]:
data.drop_duplicates()

Unnamed: 0,id,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
1,1,49,0,3,160,180,0,0,156,0,1.0,2,1
2,2,37,1,2,130,283,0,1,98,0,0.0,1,0
3,3,48,0,4,138,214,0,0,108,1,1.5,2,1
4,4,39,1,3,120,339,0,0,170,0,0.0,1,0
5,5,54,1,2,110,208,0,0,142,0,0.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
638,638,45,1,1,110,264,0,0,132,0,1.2,2,1
639,639,68,1,4,144,193,1,0,141,0,3.4,2,1
640,640,57,1,4,130,131,0,0,115,1,1.2,2,1
641,641,57,0,2,130,236,0,2,174,0,0.0,2,1


In [213]:
independent_vars = ['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS', 'RestingECG',
                    'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope']

X = data[independent_vars]
y = data['HeartDisease']

In [214]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [215]:
cart = DecisionTreeClassifier(ccp_alpha=0.01)
cart.fit(X_train, y_train)
y_pred = cart.predict(X_test)

In [216]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8604651162790697


In [None]:
def plot_confusionmatrix(y_train_pred,y_train,dom):
    print(f'{dom} Confusion matrix')
    cf = confusion_matrix(y_train_pred,y_train)
    sns.heatmap(cf,annot=True,yticklabels=classes
               ,xticklabels=classes,cmap='Blues', fmt='g')
    plt.tight_layout()
    plt.show()

In [217]:
data2 = pd.read_csv("heart_test.csv", header=0, names=col_names[:-1], low_memory=False)

In [218]:
data2.head()

Unnamed: 0,id,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,643,69,M,ASY,122,216,1,LVH,84,Y,0.0,Flat
1,644,74,M,TA,145,216,1,Normal,116,Y,1.8,Flat
2,645,66,F,NAP,146,278,0,LVH,152,N,0.0,Flat
3,646,53,M,ASY,144,300,1,ST,128,Y,1.5,Flat
4,647,41,F,ATA,105,198,0,Normal,168,N,0.0,Up


In [219]:
data2['Sex'] = data2['Sex'].map(mapping_sex)
data2['ChestPainType'] = data2['ChestPainType'].map(mapping_chest_pain)
data2['RestingECG'] = data2['RestingECG'].map(mapping_resting_ecg)
data2['ExerciseAngina'] = data2['ExerciseAngina'].map(mapping_exercise_angina)
data2['ST_Slope'] = data2['ST_Slope'].map(mapping_st_slope)

In [220]:
X_test_2 = data2[independent_vars]
X_id = data2['id']

In [221]:
cart2 = DecisionTreeClassifier(ccp_alpha=0.01)
cart2.fit(X_train, y_train)
y_pred_2 = cart2.predict(X_test_2)

In [222]:
predictions_df = pd.DataFrame({'id': X_id, 'output': y_pred_2})

In [223]:
predictions_df.to_csv('predictions.csv', index=False)