# IMPORTING PACKAGES

In [4]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [1]:
import warnings
from warnings import simplefilter

# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

# ignore all deprecation warnings
warnings.filterwarnings(action='ignore',category=DeprecationWarning)

# IMPORTING DATA-SET

In [10]:
data=pd.read_csv('RHF_Data.csv')

# UNDERSTANDING THE DATA 

In [11]:
print(data.shape)
print(data.size)

(355, 51)
18105


In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 355 entries, 0 to 354
Data columns (total 51 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Study Code                          355 non-null    object 
 1   Age                                 351 non-null    float64
 2   Gender                              351 non-null    float64
 3   Diagnosis                           353 non-null    float64
 4   Dyspnea                             355 non-null    int64  
 5   Palpitations                        355 non-null    int64  
 6   Leg Swelling                        353 non-null    float64
 7   Syncope                             355 non-null    int64  
 8   Chest Pain                          354 non-null    float64
 9   Pulmonary Edema                     355 non-null    int64  
 10  Creptations                         352 non-null    float64
 11  Hypertension                        353 non-n

In [13]:
data.describe()

Unnamed: 0,Age,Gender,Diagnosis,Dyspnea,Palpitations,Leg Swelling,Syncope,Chest Pain,Pulmonary Edema,Creptations,...,Atrial Fibrillation,Pulmonary Arterial Hypertension,Diastolic Dysfunction,Statins,Loop Diuretics,Potassium Sparing Diuretics,Beta Blockers,ACE Inhibitors,Calcium Channel Blockers,Angiotensin II receptor blockers
count,351.0,351.0,353.0,355.0,355.0,353.0,355.0,354.0,355.0,352.0,...,351.0,355.0,355.0,355.0,355.0,355.0,355.0,355.0,355.0,355.0
mean,57.777778,1.367521,0.583569,0.833803,0.214085,0.175637,0.078873,0.460452,0.078873,0.517045,...,0.102564,0.233803,0.498592,0.653521,0.642254,0.521127,0.391549,0.242254,0.138028,0.109859
std,12.781981,0.482818,0.493666,0.372783,0.410765,0.381051,0.269921,0.709998,0.269921,0.500421,...,0.303822,0.423846,0.500704,0.47652,0.480014,0.500259,0.488786,0.429052,0.345416,0.313155
min,20.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,49.5,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,59.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
75%,67.0,2.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
max,90.0,2.0,1.0,1.0,1.0,1.0,1.0,10.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


#  INVESTIGATING IRELEVANT COLUMNS

In [14]:
col=['Study Code','In Hospital Mortality']
data=data.drop(col,1)

# DATA PRE-PROCESSING 

## CHECKING FOR NULL-VALUES

In [15]:
data.isnull().sum()

Age                                     4
Gender                                  4
Diagnosis                               2
Dyspnea                                 0
Palpitations                            0
Leg Swelling                            2
Syncope                                 0
Chest Pain                              1
Pulmonary Edema                         0
Creptations                             3
Hypertension                            2
Diabetes                                3
Smoking                                 6
CKDor Dialysis                          3
Alcohol                                 5
Cardiogenic Shock                       0
Myocardial Infarction                   0
RWMA                                  116
NYHA                                   35
Severe LV Dysfunction                   0
BMI                                   288
SBP                                    24
DBP                                    24
PR                                

## OUTLIER ANALYSIS 

In [16]:
z = np.abs(stats.zscore(data.loc[:,:]))
print(z)

[[       nan        nan        nan ... 0.5654225  0.40016337 0.35130858]
 [       nan        nan        nan ... 0.5654225  0.40016337 0.35130858]
 [       nan        nan        nan ... 0.5654225  0.40016337 0.35130858]
 ...
 [       nan        nan        nan ... 0.5654225  0.40016337 0.35130858]
 [       nan        nan        nan ... 0.5654225  0.40016337 0.35130858]
 [       nan        nan        nan ... 0.5654225  0.40016337 0.35130858]]


In [17]:
#Threshold to identify an outlier.
#The first array contains the list of row numbers and second array respective column numbers

threshold = 3
print(np.where(z > 3))

(array([ 15,  29,  30,  48,  49,  54,  56,  58,  65,  74,  77,  86, 110,
       117, 131, 144, 152, 157, 207, 236, 248, 252, 256, 262, 267, 274,
       276, 276, 277, 279, 283, 287, 290, 290, 291, 292, 293, 294, 296,
       297, 298, 303, 310, 311, 314, 318, 320, 321, 325, 330, 331, 334,
       334, 335, 345, 353], dtype=int64), array([8, 8, 8, 6, 6, 8, 8, 6, 8, 8, 8, 6, 8, 8, 8, 8, 6, 8, 6, 6, 8, 6,
       6, 8, 6, 8, 6, 8, 8, 8, 6, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 8, 6,
       6, 8, 6, 8, 8, 6, 8, 6, 8, 6, 8, 8], dtype=int64))


In [18]:
#Data without outliers
odata = data[(z < 3).all(axis=1)]

In [19]:
print(data.shape)
print(odata.shape)

(355, 49)
(0, 49)


# EXPLORATORY DATA ANALYSIS

In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 355 entries, 0 to 354
Data columns (total 49 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Age                                 351 non-null    float64
 1   Gender                              351 non-null    float64
 2   Diagnosis                           353 non-null    float64
 3   Dyspnea                             355 non-null    int64  
 4   Palpitations                        355 non-null    int64  
 5   Leg Swelling                        353 non-null    float64
 6   Syncope                             355 non-null    int64  
 7   Chest Pain                          354 non-null    float64
 8   Pulmonary Edema                     355 non-null    int64  
 9   Creptations                         352 non-null    float64
 10  Hypertension                        353 non-null    float64
 11  Diabetes                            352 non-n

In [21]:
data.loc[:,'Final Outcome'].unique()

array([0, 1, 2], dtype=int64)

In [22]:
data.loc[:,'Final Outcome'].value_counts()

1    179
0    135
2     41
Name: Final Outcome, dtype: int64

In [23]:
columns=['CPK','CPK-Mb','FBS','RWMA']
fdata=data.drop(columns,1)

In [24]:
fdata_corr = fdata.corr()

In [25]:
abs(fdata_corr.loc[:,'Final Outcome'].sort_values())

Time to Event                         0.248691
Diastolic Dysfunction                 0.182482
Beta Blockers                         0.152097
BMI                                   0.137371
ACE Inhibitors                        0.133304
PCV                                   0.102485
Hb                                    0.080095
Statins                               0.077750
DBP                                   0.077729
SBP                                   0.076731
Sodium                                0.070115
Loop Diuretics                        0.068710
Potassium Sparing Diuretics           0.060622
Calcium Channel Blockers              0.050386
Alcohol                               0.032499
Severe LV Dysfunction                 0.031162
Disease Etiology                      0.030158
Length of Hospital Stay               0.027948
Gender                                0.013098
Pulmonary Arterial Hypertension       0.010431
Smoking                               0.001795
Leg Swelling 

In [26]:
sdata=fdata.loc[:,['Diastolic Dysfunction','Beta Blockers','BMI','ACE Inhibitors','PCV','Time to Event','Final Outcome']]

In [27]:
bdata=fdata.loc[:,['Age','Gender','Diagnosis','Chest Pain','Hypertension','Diabetes','Smoking','Alcohol','Time to Event','Blood Urea','Potassium','Myocardial Infarction','Total Count','Cardiogenic Shock','Final Outcome']]

In [28]:
sdata.to_csv('stat.csv',index=False)
bdata.to_csv('bi.csv',index=False)

# STATISTICAL DATA ANALYSIS AND MODELING

In [29]:
sdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 355 entries, 0 to 354
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Diastolic Dysfunction  355 non-null    int64  
 1   Beta Blockers          355 non-null    int64  
 2   BMI                    67 non-null     float64
 3   ACE Inhibitors         355 non-null    int64  
 4   PCV                    314 non-null    float64
 5   Time to Event          355 non-null    float64
 6   Final Outcome          355 non-null    int64  
dtypes: float64(3), int64(4)
memory usage: 19.5 KB


In [30]:
sdata.isnull().sum()

Diastolic Dysfunction      0
Beta Blockers              0
BMI                      288
ACE Inhibitors             0
PCV                       41
Time to Event              0
Final Outcome              0
dtype: int64

In [31]:
stats.skew(sdata)

array([0.00563383, 0.44438199,        nan, 1.20316648,        nan,
       0.76052122, 0.33094413])

In [32]:
sdata.dropna(inplace=True)

In [33]:
print(sdata.shape)
print(sdata.size)

(64, 7)
448


## MODELING 

In [50]:
from sklearn.ensemble import RandomForestClassifier

In [51]:
X=sdata.loc[:,['Time to Event','PCV','ACE Inhibitors','BMI','Beta Blockers','Diastolic Dysfunction']]
y=sdata.loc[:,['Final Outcome']]

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y,stratify=y,random_state=13)

In [53]:
model=RandomForestClassifier(random_state=0)
model.fit(X_train,y_train.values.ravel())
y_pred = model.predict(X_test)
from sklearn.metrics import confusion_matrix,accuracy_score
print(confusion_matrix(y_test,y_pred))
(accuracy_score(y_test,y_pred))*100

[[6 1 1]
 [2 4 0]
 [1 0 1]]


68.75

In [61]:
from xgboost import XGBClassifier
model=XGBClassifier()
model.fit(X_train,y_train.values.ravel())
y_pred = model.predict(X_test)
from sklearn.metrics import confusion_matrix,accuracy_score
print(confusion_matrix(y_test,y_pred))
pd.crosstab(y_test.values.ravel(),y_pred)
accuracy_score(y_test,y_pred)*100

[[6 2 0]
 [1 5 0]
 [1 0 1]]


75.0

# BI DATA ANALYSIS AND MODELING

In [62]:
bdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 355 entries, 0 to 354
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Age                    351 non-null    float64
 1   Gender                 351 non-null    float64
 2   Diagnosis              353 non-null    float64
 3   Chest Pain             354 non-null    float64
 4   Hypertension           353 non-null    float64
 5   Diabetes               352 non-null    float64
 6   Smoking                349 non-null    float64
 7   Alcohol                350 non-null    float64
 8   Time to Event          355 non-null    float64
 9   Blood Urea             331 non-null    float64
 10  Potassium              259 non-null    float64
 11  Myocardial Infarction  355 non-null    int64  
 12  Total Count            319 non-null    float64
 13  Cardiogenic Shock      355 non-null    int64  
 14  Final Outcome          355 non-null    int64  
dtypes: flo

In [63]:
bdata.isnull().sum()

Age                       4
Gender                    4
Diagnosis                 2
Chest Pain                1
Hypertension              2
Diabetes                  3
Smoking                   6
Alcohol                   5
Time to Event             0
Blood Urea               24
Potassium                96
Myocardial Infarction     0
Total Count              36
Cardiogenic Shock         0
Final Outcome             0
dtype: int64

In [64]:
bdata.dropna(inplace=True)

In [65]:
bdata.isnull().sum()

Age                      0
Gender                   0
Diagnosis                0
Chest Pain               0
Hypertension             0
Diabetes                 0
Smoking                  0
Alcohol                  0
Time to Event            0
Blood Urea               0
Potassium                0
Myocardial Infarction    0
Total Count              0
Cardiogenic Shock        0
Final Outcome            0
dtype: int64

## MODELING

In [66]:
X=bdata.loc[:,['Age','Gender','Diagnosis','Chest Pain','Hypertension','Diabetes','Smoking','Alcohol','Time to Event']]
y=bdata.loc[:,['Final Outcome']]

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X, y,stratify=y,random_state=13)

In [40]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()

In [68]:
model=RandomForestClassifier(random_state=1)
model.fit(X_train,y_train.values.ravel())
y_pred = model.predict(X_test)
from sklearn.metrics import confusion_matrix,accuracy_score
print(confusion_matrix(y_test,y_pred))
accuracy_score(y_test,y_pred)*100

[[16  3  0]
 [ 6 27  0]
 [ 4  1  2]]


76.27118644067797

In [69]:
from xgboost import XGBClassifier
model=XGBClassifier()
model.fit(X_train,y_train.values.ravel())
y_pred = model.predict(X_test)
from sklearn.metrics import confusion_matrix,accuracy_score
print(confusion_matrix(y_test,y_pred))
accuracy_score(y_test,y_pred)*100

[[15  2  2]
 [ 4 29  0]
 [ 6  1  0]]


74.57627118644068

In [70]:
X_train, X_test,y_train, y_test = train_test_split(X,y,test_size=0.30, random_state=0)

In [71]:
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier()

In [83]:
#model = Random Forest(max_depth,max_leaf_nodes,n_estimator)
#GRIDSEARCHCV
parameters = {'max_depth': [40,41,42], 'max_leaf_nodes': [20,25],
                     'n_estimators': [117,120]}

grid_model = GridSearchCV(RandomForestClassifier(), parameters )
grid_model.fit(X_train, y_train.values.ravel())
print(grid_model.best_score_)
print(grid_model.best_params_)
y_predict = grid_model.predict(X_test)
print(accuracy_score(y_test,y_predict))
print(classification_report(y_test,y_predict))
print(confusion_matrix(y_test,y_predict))

0.7239263803680982
{'max_depth': 40, 'max_leaf_nodes': 25, 'n_estimators': 120}
0.7605633802816901
              precision    recall  f1-score   support

           0       0.70      0.70      0.70        23
           1       0.84      0.83      0.84        46
           2       0.00      0.00      0.00         2

    accuracy                           0.76        71
   macro avg       0.51      0.51      0.51        71
weighted avg       0.77      0.76      0.77        71

[[16  6  1]
 [ 6 38  2]
 [ 1  1  0]]


In [87]:
#RANDOMIZEDSEARCHCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 20, 25]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 6, 8, 10]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
grid_model = RandomizedSearchCV(RandomForestClassifier(),random_grid ,random_state=10)
grid_model.fit(X_train, y_train.values.ravel())
print(grid_model.best_score_)
print(grid_model.best_params_)
y_predict = grid_model.predict(X_test)
print(accuracy_score(y_test,y_predict))
print(classification_report(y_test,y_predict))
print(confusion_matrix(y_test,y_predict))

0.7116564417177914
{'n_estimators': 1115, 'min_samples_split': 5, 'min_samples_leaf': 8, 'max_features': 'auto', 'max_depth': 100, 'bootstrap': False}
0.7887323943661971
              precision    recall  f1-score   support

           0       0.70      0.70      0.70        23
           1       0.83      0.87      0.85        46
           2       0.00      0.00      0.00         2

    accuracy                           0.79        71
   macro avg       0.51      0.52      0.52        71
weighted avg       0.77      0.79      0.78        71

[[16  7  0]
 [ 6 40  0]
 [ 1  1  0]]


In [88]:
#Grid search on XGBOOST
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

#model = XGBOOST(kernel='rbf',C=1,gamma =.5)

params = {
        'min_child_weight': [1, 5, 10,15,20],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.7,0.8,0.9, 1.0],
        'max_depth': [3, 4, 5]
        }
                    
grid_model = GridSearchCV(XGBClassifier(), params)
grid_model.fit(X_train, y_train.values.ravel())
print(grid_model.best_score_)
print(grid_model.best_params_)
y_predict = grid_model.predict(X_test)
print(accuracy_score(y_test,y_predict))
print(classification_report(y_test,y_predict))
print(confusion_matrix(y_test,y_predict))

0.7361963190184049
{'colsample_bytree': 0.6, 'gamma': 2, 'max_depth': 3, 'min_child_weight': 5, 'subsample': 0.8}
0.8169014084507042
              precision    recall  f1-score   support

           0       0.67      0.87      0.75        23
           1       0.93      0.83      0.87        46
           2       0.00      0.00      0.00         2

    accuracy                           0.82        71
   macro avg       0.53      0.57      0.54        71
weighted avg       0.82      0.82      0.81        71

[[20  3  0]
 [ 8 38  0]
 [ 2  0  0]]
