In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import scipy.stats as st

In [2]:
import statsmodels.api as sm

In [3]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.metrics import confusion_matrix
import matplotlib.mlab as mlab
%matplotlib inline
from sklearn import metrics


In [4]:
#!pip install imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.8.0-py3-none-any.whl (206 kB)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.8.0


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from sklearn.model_selection import KFold, StratifiedKFold


# Introduction to DataSet

- World Health Organization has  estimated 12 million deaths occur worldwide, every year due to Heart diseases.

- Half the deaths in the United States and other developed countries are due to cardio vascular diseases. 

- The early prognosis of cardiovascular diseases can aid in making decisions on lifestyle changes in high risk patients and in turn reduce the complications.

- This research intends to pinpoint the most relevant/risk factors of heart disease as well as predict the overall risk using logistic regression.


# Data Preparation
The dataset is publically available and it is from an ongoing ongoing cardiovascular study on residents of the town of Framingham, Massachusetts.  The classification goal is to predict whether the patient has 10-year risk of future coronary heart disease (CHD).The dataset provides the patients’ information. It includes over 4,000 records and 15 attributes.


In [5]:
df=pd.read_csv('US_Heart_Patients.csv')

In [6]:
df.head()

Unnamed: 0,gender,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [7]:
df.shape

(4240, 16)

In [9]:
from pandas_profiling import ProfileReport
hd_profile=ProfileReport(df)
hd_profile.to_file('hd_profil_html')

Summarize dataset:   0%|          | 0/29 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [32]:
table=pd.crosstab(df.TenYearCHD,df.education)
st.chi2_contingency(table)

(31.051850034062287,
 8.289453270159487e-07,
 3,
 array([[1547.80660377, 1062.68584906,  584.35      ,  401.15754717],
        [ 277.19339623,  190.31415094,  104.65      ,   71.84245283]]))

In [12]:
df.education.value_counts(dropna=False)

1.0    1720
2.0    1253
3.0     689
4.0     473
NaN     105
Name: education, dtype: int64

In [15]:
df.education.isnull().sum()

105

In [16]:
df.education=df.education.fillna(1)

In [17]:
df.education.value_counts(dropna=False)

1.0    1825
2.0    1253
3.0     689
4.0     473
Name: education, dtype: int64

In [18]:
df.BPMeds.value_counts(dropna=False)

0.0    4063
1.0     124
NaN      53
Name: BPMeds, dtype: int64

In [20]:
df.BPMeds=df.BPMeds.fillna(0)
df.BPMeds.value_counts(dropna=False)

0.0    4116
1.0     124
Name: BPMeds, dtype: int64

In [21]:
df.isnull().sum()

gender               0
age                  0
education            0
currentSmoker        0
cigsPerDay          29
BPMeds               0
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

In [24]:
df.cigsPerDay.value_counts(dropna=False)

NaN       4207
3.0          4
1.0          3
18.0         2
11.0         2
2.0          2
6.0          2
12.0         1
143.0        1
7.0          1
8.0          1
80.0         1
734.0        1
55.0         1
2145.0       1
130.0        1
9.0          1
100.0        1
22.0         1
218.0        1
5.0          1
210.0        1
67.0         1
121.0        1
56.0         1
Name: cigsPerDay, dtype: int64

In [25]:
df.cigsPerDay=df.cigsPerDay.fillna(0)

In [26]:
df.totChol=df.totChol.fillna(df.totChol.mean())

In [27]:
df.BMI=df.BMI.fillna(df.BMI.mean())

In [28]:
df.heartRate=df.heartRate.fillna(df.heartRate.mean())

In [29]:
df.isnull().sum()

gender               0
age                  0
education            0
currentSmoker        0
cigsPerDay           0
BPMeds               0
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol              0
sysBP                0
diaBP                0
BMI                  0
heartRate            0
glucose            388
TenYearCHD           0
dtype: int64

In [30]:
df.glucose=df.glucose.fillna(df.glucose.mean())

In [31]:
df.isnull().sum()

gender             0
age                0
education          0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64

In [33]:
from statsmodels.tools import add_constant as add_constant
df_constant = add_constant(df)
df_constant.head()


Unnamed: 0,const,gender,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1.0,1,39,4.0,0,2145.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,1.0,0,46,2.0,0,67.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1.0,1,48,1.0,1,18.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,1.0,0,61,3.0,1,100.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,1.0,0,46,3.0,1,9.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [34]:
df_constant.shape

(4240, 17)

In [36]:
from sklearn.model_selection import train_test_split

In [37]:
x=df.drop('TenYearCHD',axis=1)
y=df.TenYearCHD

In [38]:
x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=.2)

In [41]:
y_test.shape

(848,)

In [42]:
from sklearn.linear_model import LogisticRegression

In [51]:
## Using sklearn
from sklearn.linear_model import LogisticRegression
logit_reg = LogisticRegression()
model1 = logit_reg.fit(x_train,y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [49]:
pred_y=model1.predict(x_test)
pred_df=pd.DataFrame(pred_y)
pred_df.value_counts()

0    839
1      9
dtype: int64

#### <font color=darkblue>Logistic regression equation<font>

$$P=\hspace{.2cm}e^{\beta_0 + \beta_1 X_1}\hspace{.2cm}/\hspace{.2cm}1+e^{\beta_0 +\beta_1 X_1}$$

When all features plugged in:

$$logit(p) = log(p/(1-p))=\beta_0 +\beta_1\hspace{.1cm} *\hspace{.2cm} Sexmale\hspace{.2cm}+\beta_2\hspace{.1cm} * \hspace{.1cm}age\hspace{.2cm}+\hspace{.2cm}\beta_3\hspace{.1cm} *\hspace{.1cm} cigsPerDay\hspace{.2cm}+\hspace{.2cm}\beta_4 \hspace{.1cm}*\hspace{.1cm} totChol\hspace{.2cm}+\hspace{.2cm}\beta_5\hspace{.1cm} *\hspace{.1cm} sysBP\hspace{.2cm}+\hspace{.2cm}\beta_6\hspace{.1cm} *\hspace{.1cm} glucose\hspace{.2cm}$$


In [53]:
# new_features=df[['age','gender','education','cigsPerDay','prevalentStroke','prevalentHyp','diabetes','sysBP','diaBP','BMI','heartRate','TenYearCHD']]
new_df = df.drop(['currentSmoker','BPMeds','totChol','glucose'],axis=1)
x=new_df.drop('TenYearCHD',axis=1)
y=new_df.TenYearCHD
from sklearn.model_selection import train_test_split
x_train1,x_test1,y_train1,y_test1=train_test_split(x,y,test_size=.20,random_state=5)


In [54]:
model3=logit_reg.fit(x_train1,y_train1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [55]:
pred_y1=model3.predict(x_test1)
pred_df1=pd.DataFrame(pred_y1)
pred_df1.value_counts()

0    841
1      7
dtype: int64

In [56]:
y_test1.value_counts()

0    715
1    133
Name: TenYearCHD, dtype: int64

In [57]:
from sklearn.metrics import confusion_matrix

In [60]:
cm1=confusion_matrix(pred_y1,y_test1)
cm1

array([[712, 129],
       [  3,   4]], dtype=int64)

In [61]:
#accuracy , recall and precision

In [62]:
from sklearn.metrics import recall_score, precision_score

In [63]:
recall_score(y_test1,pred_y1)

0.03007518796992481

In [64]:
precision_score(y_test1,pred_y1)

0.5714285714285714