In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('Heart_Disease_Prediction.csv')
df.head()

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,Presence
1,80,0,3,115,564,0,2,160,0,1.6,2,0,7,Absence
2,55,1,2,124,261,0,0,141,0,0.3,1,0,7,Presence
3,65,1,4,128,263,0,0,105,1,0.2,2,1,7,Absence
4,45,0,2,120,269,0,2,121,1,0.2,1,1,3,Absence


In [3]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

In [4]:
df.head()

Unnamed: 0,age,sex,chest_pain_type,bp,cholesterol,fbs_over_120,ekg_results,max_hr,exercise_angina,st_depression,slope_of_st,number_of_vessels_fluro,thallium,heart_disease
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,presence
1,80,0,3,115,564,0,2,160,0,1.6,2,0,7,absence
2,55,1,2,124,261,0,0,141,0,0.3,1,0,7,presence
3,65,1,4,128,263,0,0,105,1,0.2,2,1,7,absence
4,45,0,2,120,269,0,2,121,1,0.2,1,1,3,absence


In [5]:
sex_values = {
    0: "female",
    1: "male"
}
df.sex = df.sex.map(sex_values)

fbs_values = {
    0: "false",
    1: "true"
}
df.fbs_over_120 = df.fbs_over_120.map(fbs_values)

exercise_angina_values = {
    0: "no",
    1: "yes"
}
df.exercise_angina = df.exercise_angina.map(exercise_angina_values)


In [6]:
df.head()

Unnamed: 0,age,sex,chest_pain_type,bp,cholesterol,fbs_over_120,ekg_results,max_hr,exercise_angina,st_depression,slope_of_st,number_of_vessels_fluro,thallium,heart_disease
0,70,male,4,130,322,False,2,109,no,2.4,2,3,3,presence
1,80,female,3,115,564,False,2,160,no,1.6,2,0,7,absence
2,55,male,2,124,261,False,0,141,no,0.3,1,0,7,presence
3,65,male,4,128,263,False,0,105,yes,0.2,2,1,7,absence
4,45,female,2,120,269,False,2,121,yes,0.2,1,1,3,absence


In [7]:
df.heart_disease = (df.heart_disease == "presence").astype(int)

In [8]:
df.head()

Unnamed: 0,age,sex,chest_pain_type,bp,cholesterol,fbs_over_120,ekg_results,max_hr,exercise_angina,st_depression,slope_of_st,number_of_vessels_fluro,thallium,heart_disease
0,70,male,4,130,322,False,2,109,no,2.4,2,3,3,1
1,80,female,3,115,564,False,2,160,no,1.6,2,0,7,0
2,55,male,2,124,261,False,0,141,no,0.3,1,0,7,1
3,65,male,4,128,263,False,0,105,yes,0.2,2,1,7,0
4,45,female,2,120,269,False,2,121,yes,0.2,1,1,3,0


In [9]:
from sklearn.model_selection import train_test_split

In [10]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [11]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [12]:
len(df_train), len(df_val), len(df_test)

(162, 54, 54)

In [13]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [14]:
df_train

Unnamed: 0,age,sex,chest_pain_type,bp,cholesterol,fbs_over_120,ekg_results,max_hr,exercise_angina,st_depression,slope_of_st,number_of_vessels_fluro,thallium,heart_disease
0,62,female,3,130,263,false,0,97,no,1.2,2,1,7,1
1,61,female,4,145,307,false,2,146,yes,1.0,2,0,7,1
2,54,male,4,124,266,false,2,109,yes,2.2,2,1,7,1
3,58,female,4,100,248,false,2,122,no,1.0,2,0,3,0
4,68,male,3,180,274,true,2,150,yes,1.6,2,0,7,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157,44,male,4,112,290,false,2,153,no,0.0,1,1,3,1
158,44,female,3,108,141,false,0,175,no,0.6,2,0,3,0
159,53,male,4,140,203,true,2,155,yes,3.1,3,0,7,1
160,62,male,4,120,267,false,0,99,yes,1.8,2,2,7,1


In [15]:
y_train = df_train.heart_disease.values
y_val = df_val.heart_disease.values
y_test = df_test.heart_disease.values

In [16]:
del df_train['heart_disease']
del df_val['heart_disease']
del df_test['heart_disease']

In [17]:
df_full_train.isnull().sum()

age                        0
sex                        0
chest_pain_type            0
bp                         0
cholesterol                0
fbs_over_120               0
ekg_results                0
max_hr                     0
exercise_angina            0
st_depression              0
slope_of_st                0
number_of_vessels_fluro    0
thallium                   0
heart_disease              0
dtype: int64

In [18]:
df_full_train = df_full_train.reset_index(drop=True)

In [19]:
df.head()

Unnamed: 0,age,sex,chest_pain_type,bp,cholesterol,fbs_over_120,ekg_results,max_hr,exercise_angina,st_depression,slope_of_st,number_of_vessels_fluro,thallium,heart_disease
0,70,male,4,130,322,False,2,109,no,2.4,2,3,3,1
1,80,female,3,115,564,False,2,160,no,1.6,2,0,7,0
2,55,male,2,124,261,False,0,141,no,0.3,1,0,7,1
3,65,male,4,128,263,False,0,105,yes,0.2,2,1,7,0
4,45,female,2,120,269,False,2,121,yes,0.2,1,1,3,0


In [20]:
df_full_train.heart_disease.value_counts(normalize=True)

heart_disease
0    0.550926
1    0.449074
Name: proportion, dtype: float64

In [21]:
df_full_train.heart_disease.mean()

0.44907407407407407

In [22]:
df_full_train.dtypes

age                          int64
sex                         object
chest_pain_type              int64
bp                           int64
cholesterol                  int64
fbs_over_120                object
ekg_results                  int64
max_hr                       int64
exercise_angina             object
st_depression              float64
slope_of_st                  int64
number_of_vessels_fluro      int64
thallium                     int64
heart_disease                int64
dtype: object

In [23]:
numerical = ["age","bp","cholesterol","max_hr","st_depression"]

In [24]:
df_full_train.columns

Index(['age', 'sex', 'chest_pain_type', 'bp', 'cholesterol', 'fbs_over_120',
       'ekg_results', 'max_hr', 'exercise_angina', 'st_depression',
       'slope_of_st', 'number_of_vessels_fluro', 'thallium', 'heart_disease'],
      dtype='object')

In [25]:
categorical = ['sex', 'chest_pain_type', 'fbs_over_120',
       'ekg_results', 'exercise_angina',
       'slope_of_st', 'number_of_vessels_fluro', 'thallium']

In [26]:
df_full_train[categorical].nunique()

sex                        2
chest_pain_type            4
fbs_over_120               2
ekg_results                3
exercise_angina            2
slope_of_st                3
number_of_vessels_fluro    4
thallium                   3
dtype: int64

In [27]:
global_heart_disease_rate = df_full_train.heart_disease.mean()
global_heart_disease_rate

0.44907407407407407

In [28]:
heart_disease_female = df_full_train[df_full_train.sex == 'female'].heart_disease.mean()
heart_disease_female

0.25

In [29]:
heart_disease_male = df_full_train[df_full_train.sex == 'male'].heart_disease.mean()
heart_disease_male

0.5571428571428572

In [30]:
from IPython.display import display

In [31]:
for c in categorical:
    print(c)
    df_group = df_full_train.groupby(c).heart_disease.agg(['mean', 'count'])
    df_group['diff'] = df_group['mean'] - global_heart_disease_rate
    df_group['risk'] = df_group['mean'] / global_heart_disease_rate
    display(df_group)
    print()
    print()

sex


Unnamed: 0_level_0,mean,count,diff,risk
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.25,76,-0.199074,0.556701
male,0.557143,140,0.108069,1.240648




chest_pain_type


Unnamed: 0_level_0,mean,count,diff,risk
chest_pain_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.214286,14,-0.234788,0.477172
2,0.205882,34,-0.243192,0.45846
3,0.229508,61,-0.219566,0.51107
4,0.682243,107,0.233169,1.519222




fbs_over_120


Unnamed: 0_level_0,mean,count,diff,risk
fbs_over_120,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,0.450549,182,0.001475,1.003285
True,0.441176,34,-0.007898,0.982414




ekg_results


Unnamed: 0_level_0,mean,count,diff,risk
ekg_results,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.346154,104,-0.10292,0.770817
1,1.0,1,0.550926,2.226804
2,0.540541,111,0.091466,1.203678




exercise_angina


Unnamed: 0_level_0,mean,count,diff,risk
exercise_angina,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.30137,146,-0.147704,0.671092
yes,0.757143,70,0.308069,1.686009




slope_of_st


Unnamed: 0_level_0,mean,count,diff,risk
slope_of_st,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.271028,107,-0.178046,0.603526
2,0.630435,92,0.181361,1.403855
3,0.588235,17,0.139161,1.309885




number_of_vessels_fluro


Unnamed: 0_level_0,mean,count,diff,risk
number_of_vessels_fluro,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.264,125,-0.185074,0.587876
1,0.617021,47,0.167947,1.373986
2,0.793103,29,0.344029,1.766086
3,0.8,15,0.350926,1.781443




thallium


Unnamed: 0_level_0,mean,count,diff,risk
thallium,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,0.196721,122,-0.252353,0.43806
6,0.625,8,0.175926,1.391753
7,0.790698,86,0.341624,1.760729






In [32]:
from sklearn.metrics import mutual_info_score

In [33]:
def mutual_info_heart_disease_score(series):
    return mutual_info_score(series, df_full_train.heart_disease)

In [34]:
mi = df_full_train[categorical].apply(mutual_info_heart_disease_score)
mi.sort_values(ascending=False)

thallium                   0.179138
chest_pain_type            0.112424
number_of_vessels_fluro    0.105918
exercise_angina            0.094620
slope_of_st                0.064628
sex                        0.045073
ekg_results                0.022870
fbs_over_120               0.000024
dtype: float64

In [35]:
df_full_train[numerical].corrwith(df_full_train.heart_disease).abs()

age              0.204550
bp               0.175619
cholesterol      0.098109
max_hr           0.411571
st_depression    0.405581
dtype: float64

In [36]:
from sklearn.feature_extraction import DictVectorizer

In [37]:
train_dicts = df_train[categorical + numerical].to_dict(orient='records')

In [39]:
dv = DictVectorizer(sparse=False)

In [40]:
X_train = dv.fit_transform(train_dicts)

In [41]:
val_dicts = df_val[categorical + numerical].to_dict(orient='records')

In [42]:
X_val = dv.transform(val_dicts)

In [43]:
from sklearn.linear_model import LogisticRegression

In [44]:
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [45]:
model.predict(X_train)

array([1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0])

In [48]:
model.predict_proba(X_train)

array([[0.27441019, 0.72558981],
       [0.2138076 , 0.7861924 ],
       [0.00833254, 0.99166746],
       [0.77908673, 0.22091327],
       [0.21370307, 0.78629693],
       [0.26071027, 0.73928973],
       [0.02911291, 0.97088709],
       [0.90579712, 0.09420288],
       [0.98527762, 0.01472238],
       [0.99033416, 0.00966584],
       [0.55106069, 0.44893931],
       [0.02980656, 0.97019344],
       [0.97396305, 0.02603695],
       [0.8099471 , 0.1900529 ],
       [0.93101253, 0.06898747],
       [0.99548215, 0.00451785],
       [0.06204985, 0.93795015],
       [0.08724911, 0.91275089],
       [0.982644  , 0.017356  ],
       [0.86695255, 0.13304745],
       [0.80958188, 0.19041812],
       [0.95779989, 0.04220011],
       [0.79763932, 0.20236068],
       [0.94808401, 0.05191599],
       [0.02979706, 0.97020294],
       [0.02036849, 0.97963151],
       [0.74403507, 0.25596493],
       [0.96744137, 0.03255863],
       [0.84856512, 0.15143488],
       [0.08706716, 0.91293284],
       [0.

In [49]:
y_pred = model.predict_proba(X_train)[:,1]

In [51]:
heart_disease_decision = (y_pred >= 0.5)

In [52]:
heart_disease_decision

array([ True,  True,  True, False,  True,  True,  True, False, False,
       False, False,  True, False, False, False, False,  True,  True,
       False, False, False, False, False, False,  True,  True, False,
       False, False,  True,  True, False,  True,  True, False, False,
        True, False, False,  True,  True, False,  True,  True, False,
       False,  True, False,  True, False,  True, False,  True,  True,
        True, False,  True,  True, False, False, False, False, False,
       False, False, False,  True, False, False,  True,  True, False,
       False,  True,  True,  True, False,  True, False,  True,  True,
        True, False, False, False,  True, False,  True,  True, False,
       False,  True,  True,  True,  True,  True, False,  True, False,
       False, False, False,  True,  True, False,  True,  True,  True,
       False, False, False,  True, False, False,  True, False, False,
       False, False,  True,  True,  True,  True,  True, False, False,
        True, False,

In [53]:
(y_train == heart_disease_decision).mean()

0.8950617283950617

In [54]:
df_pred = pd.DataFrame()
df_pred['probability'] = y_pred
df_pred['prediction'] = heart_disease_decision.astype(int)
df_pred['actual'] = y_train

In [55]:
df_pred['correct'] = df_pred.prediction == df_pred.actual

In [56]:
df_pred

Unnamed: 0,probability,prediction,actual,correct
0,0.725590,1,1,True
1,0.786192,1,1,True
2,0.991667,1,1,True
3,0.220913,0,0,True
4,0.786297,1,1,True
...,...,...,...,...
157,0.336313,0,1,False
158,0.009008,0,0,True
159,0.934380,1,1,True
160,0.992243,1,1,True


In [57]:
df_pred.correct.mean()

0.8950617283950617

In [59]:
model = LogisticRegression()
model.fit(X_val, y_val)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [60]:
y_pred = model.predict_proba(X_val)[:,1]

In [62]:
heart_disease_decision

array([False, False, False, False,  True,  True,  True, False,  True,
       False, False, False,  True,  True,  True,  True,  True, False,
       False, False, False,  True, False, False, False,  True, False,
       False, False,  True, False,  True,  True, False,  True, False,
       False, False,  True,  True,  True, False, False,  True, False,
        True,  True,  True, False, False,  True, False,  True, False])

In [63]:
(y_val == heart_disease_decision).mean()

0.8888888888888888

In [64]:
test_dicts = df_test[categorical + numerical].to_dict(orient='records')

In [65]:
X_test = dv.transform(test_dicts)

In [66]:
model = LogisticRegression()
model.fit(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [68]:
y_pred = model.predict_proba(X_test)[:,1]

In [69]:
heart_disease_decision = (y_pred >= 0.5)

In [72]:
(y_test == heart_disease_decision).mean()

0.8888888888888888

In [73]:
dicts_full_train = df_full_train[categorical + numerical].to_dict(orient='records')

In [74]:
X_full_train = dv.transform(dicts_full_train)

In [76]:
y_full_train = df_full_train.heart_disease.values

In [77]:
model = LogisticRegression()
model.fit(X_full_train, y_full_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [78]:
y_pred = model.predict_proba(X_test)[:,1]

In [79]:
heart_disease_decision = (y_pred >= 0.5)

In [80]:
(y_test == heart_disease_decision).mean()

0.8333333333333334