## Import Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from ngboost import NGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.impute import SimpleImputer

## Load the Data

In [2]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
data = pd.read_csv(url, names=columns)
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [3]:
data.to_csv("Cleveland Heart Disease Dataset.csv",index=False) # Save data to local machine

## Data Preprocessing

In [4]:
data = pd.read_csv("Cleveland Heart Disease Dataset.csv")
print(len(data))
data.head()

303


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    float64
 1   sex       303 non-null    float64
 2   cp        303 non-null    float64
 3   trestbps  303 non-null    float64
 4   chol      303 non-null    float64
 5   fbs       303 non-null    float64
 6   restecg   303 non-null    float64
 7   thalach   303 non-null    float64
 8   exang     303 non-null    float64
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    float64
 11  ca        303 non-null    object 
 12  thal      303 non-null    object 
 13  target    303 non-null    int64  
dtypes: float64(11), int64(1), object(2)
memory usage: 33.3+ KB


In [6]:
# Simple code to display distinct values and their count for each column in the DataFrame
for col in data.columns:
    unique_values = data[col].unique()
    unique_count = len(unique_values)
    print(f"Column: {col}")
    print(f"Distinct Values: {unique_values}")
    print(f"Count of Unique Values: {unique_count}\n")

Column: age
Distinct Values: [63. 67. 37. 41. 56. 62. 57. 53. 44. 52. 48. 54. 49. 64. 58. 60. 50. 66.
 43. 40. 69. 59. 42. 55. 61. 65. 71. 51. 46. 45. 39. 68. 47. 34. 35. 29.
 70. 77. 38. 74. 76.]
Count of Unique Values: 41

Column: sex
Distinct Values: [1. 0.]
Count of Unique Values: 2

Column: cp
Distinct Values: [1. 4. 3. 2.]
Count of Unique Values: 4

Column: trestbps
Distinct Values: [145. 160. 120. 130. 140. 172. 150. 110. 132. 117. 135. 112. 105. 124.
 125. 142. 128. 170. 155. 104. 180. 138. 108. 134. 122. 115. 118. 100.
 200.  94. 165. 102. 152. 101. 126. 174. 148. 178. 158. 192. 129. 144.
 123. 136. 146. 106. 156. 154. 114. 164.]
Count of Unique Values: 50

Column: chol
Distinct Values: [233. 286. 229. 250. 204. 236. 268. 354. 254. 203. 192. 294. 256. 263.
 199. 168. 239. 275. 266. 211. 283. 284. 224. 206. 219. 340. 226. 247.
 167. 230. 335. 234. 177. 276. 353. 243. 225. 302. 212. 330. 175. 417.
 197. 198. 290. 253. 172. 273. 213. 305. 216. 304. 188. 282. 185. 232.
 326. 231. 

### 1. Handeling records with invalid data

In [7]:
# Count occurrences of '?' in 'ca' and 'thal' column
count_question_mark = data[['ca', 'thal']].apply(lambda col: (col == '?').sum())
print(count_question_mark) # Display the result

ca      4
thal    2
dtype: int64


In [8]:
data.replace('?', np.nan, inplace=True)  # Replace "?" with NaN
data.dropna(inplace=True)
print(len(data))
data.head()

297


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


### 2. Convert Target Variable into Binary variable

In [9]:
data['target'] = data['target'].apply(lambda x: 1 if x > 0 else 0)
print(len(data))
data.head()

297


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


### 3. Splitting features and target

In [10]:
# Splitting features and target
X = data.drop(columns='target')
y = data['target']

### 4. Feature Scaling for Numerical attributes

In [11]:
from sklearn.preprocessing import StandardScaler

# Define the numerical columns you want to normalize
numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# Initialize the scaler
scaler = StandardScaler()

# Apply z-score normalization only to the numerical columns
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

# Print the transformed data
print(len(X))
X.head()

297


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,0.936181,1.0,1.0,0.75038,-0.276443,1.0,2.0,0.017494,0.0,1.068965,3.0,0.0,6.0
1,1.378929,1.0,4.0,1.596266,0.744555,0.0,2.0,-1.816334,1.0,0.381773,2.0,3.0,3.0
2,1.378929,1.0,4.0,-0.659431,-0.3535,0.0,2.0,-0.89942,1.0,1.326662,2.0,2.0,7.0
3,-1.94168,1.0,3.0,-0.095506,0.051047,0.0,0.0,1.63301,0.0,2.099753,3.0,0.0,3.0
4,-1.498933,0.0,2.0,-0.095506,-0.835103,0.0,2.0,0.978071,0.0,0.295874,1.0,0.0,3.0


### 5. One hot Encoding for Categorical Variable

In [12]:
# Apply One-Hot Encoding to all categorical columns
X = pd.get_dummies(X, columns=['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal'])
print(len(X))
X.head()

297


Unnamed: 0,age,trestbps,chol,thalach,oldpeak,sex_0.0,sex_1.0,cp_1.0,cp_2.0,cp_3.0,...,slope_1.0,slope_2.0,slope_3.0,ca_0.0,ca_1.0,ca_2.0,ca_3.0,thal_3.0,thal_6.0,thal_7.0
0,0.936181,0.75038,-0.276443,0.017494,1.068965,False,True,True,False,False,...,False,False,True,True,False,False,False,False,True,False
1,1.378929,1.596266,0.744555,-1.816334,0.381773,False,True,False,False,False,...,False,True,False,False,False,False,True,True,False,False
2,1.378929,-0.659431,-0.3535,-0.89942,1.326662,False,True,False,False,False,...,False,True,False,False,False,True,False,False,False,True
3,-1.94168,-0.095506,0.051047,1.63301,2.099753,False,True,False,False,True,...,False,False,True,True,False,False,False,True,False,False
4,-1.498933,-0.095506,-0.835103,0.978071,0.295874,True,False,False,True,False,...,True,False,False,True,False,False,False,True,False,False


### 4. Splitting the Data into Train and Test set

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Ensembles Methods

### 1. Random Forest

In [14]:
rf_model = RandomForestClassifier(
    n_estimators=100,            # Number of trees
    max_depth=10,                # Maximum depth of each tree
    min_samples_split=5,         # Minimum samples to split a node
    min_samples_leaf=2,          # Minimum samples in a leaf node
    max_features='sqrt',         # Number of features to consider for each split
    bootstrap=True,              # Use bootstrapping
    random_state=42              # Set seed for reproducibility
)

In [15]:
# Train the model
rf_model.fit(X_train, y_train)

In [16]:
# Make predictions
y_pred = rf_model.predict(X_test)
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]

In [17]:
y_pred

array([0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1], dtype=int64)

In [18]:
y_pred_proba

array([0.09353609, 0.53852525, 0.02290866, 0.74585714, 0.04650831,
       0.27816414, 0.35469444, 0.57758045, 0.75378571, 0.18096429,
       0.39777381, 0.07269784, 0.06031342, 0.58858478, 0.3405754 ,
       0.02037736, 0.35210134, 0.55256527, 0.64360714, 0.26478319,
       0.73123918, 0.6795    , 0.8413544 , 0.25553053, 0.89905556,
       0.20626377, 0.90869444, 0.57630678, 0.12928945, 0.02670974,
       0.14285531, 0.05386104, 0.98166667, 0.14642922, 0.43293254,
       0.25428571, 0.6827588 , 0.10822524, 0.87957143, 0.66784524,
       0.06954403, 0.96450198, 0.92816667, 0.55720743, 0.01204403,
       0.44963492, 0.46278716, 0.63211905, 0.41350541, 0.0268428 ,
       0.12839042, 0.86471429, 0.30319048, 0.18278573, 0.80223214,
       0.6496163 , 0.04030471, 0.19097367, 0.55210714, 0.71823413])

In [19]:
# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("accuracy :",accuracy)
print("Precission :",precision)
print("Recall :",recall)
print("f1-Score :",f1)

accuracy : 0.8666666666666667
Precission : 0.8076923076923077
Recall : 0.875
f1-Score : 0.84


### 2. Extra Random Tree

In [20]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, f1_score

In [21]:
# Define Extra Trees model
extra_trees_model = ExtraTreesClassifier(
    n_estimators=100,        # Number of trees in the forest
    max_depth=None,          # Maximum depth of each tree; None allows full depth
    min_samples_split=2,     # Minimum samples required to split a node
    min_samples_leaf=1,      # Minimum samples required in a leaf node
    #max_features= ,     # Number of features to consider for best split
    bootstrap=False,         # No replacement sampling by default
    random_state=42          # Seed for reproducibility
)

In [22]:
# Train the model
extra_trees_model.fit(X_train, y_train)

In [23]:
# Make predictions
y_pred_etr = extra_trees_model.predict(X_test)
y_pred_proba_etr = extra_trees_model.predict_proba(X_test)[:, 1]

In [24]:
# Evaluate model performance
accuracy_etr = accuracy_score(y_test, y_pred_etr)
auc_roc_etr = roc_auc_score(y_test, y_pred_proba_etr)
precision_etr = precision_score(y_test, y_pred_etr)
recall_etr = recall_score(y_test, y_pred_etr)
f1_etr = f1_score(y_test, y_pred_etr)

print(f"Extra Trees performance metric:" )
print("accuracy :",accuracy_etr)
print("Precission :",precision_etr)
print("Recall :",recall_etr)
print("f1-Score :",f1_etr)
print("AUC-Score :",auc_roc_etr)

Extra Trees performance metric:
accuracy : 0.85
Precission : 0.7777777777777778
Recall : 0.875
f1-Score : 0.8235294117647058
ROC-Score : 0.9618055555555556


### 3. XGBoost

In [25]:
xgb_model = XGBClassifier(
    n_estimators=100,               # Number of boosting rounds
    learning_rate=0.1,              # Step size shrinkage
    max_depth=6,                    # Maximum tree depth
    subsample=0.8,                  # Fraction of samples used per tree
    colsample_bytree=0.8,           # Fraction of features used per tree
    gamma=1,                        # Minimum loss reduction to make a split
    use_label_encoder=False,        # Disable the use of the label encoder for warnings

     eval_metric='logloss',          # Evaluation metric for binary classification
    random_state=42                 # Set seed for reproducibility
)

In [26]:
# Train the model
xgb_model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [27]:
# Make predictions
y_pred_xgb = xgb_model.predict(X_test)
y_pred_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]

In [28]:
# Evaluate model performance
accuracy_xgb = accuracy_score(y_test, y_pred)
auc_roc_xgb = roc_auc_score(y_test, y_pred_proba)
precision_xgb = precision_score(y_test, y_pred)
recall_xgb = recall_score(y_test, y_pred)
f1_xgb = f1_score(y_test, y_pred)

print(f"XGBoost performance metric:" )
print("accuracy :",accuracy_xgb)
print("Precission :",precision_xgb)
print("Recall :",recall_xgb)
print("f1-Score :",f1_xgb)
print("AUC-Score :",auc_roc_xgb)

XGBoost performance metric:
accuracy : 0.8666666666666667
Precission : 0.8076923076923077
Recall : 0.875
f1-Score : 0.84
ROC-Score : 0.9490740740740741


### 4. NGBoost

In [29]:
from ngboost.distns import Bernoulli  # Distribution for binary classification
from sklearn.tree import DecisionTreeRegressor
from ngboost.scores import LogScore  # Scoring function

In [30]:
ngb_model = NGBClassifier(
    Dist=Bernoulli,                     # Distribution for binary classification
    Score=LogScore,                      # Scoring function
    Base=DecisionTreeRegressor(max_depth=3),  # Base learner
    n_estimators=100,                    # Number of boosting iterations
    learning_rate=0.01,                  # Learning rate
    minibatch_frac=0.8,                  # Fraction of samples per boosting round
    natural_gradient=True,               # Use natural gradients
    random_state=42                      # Set seed for reproducibility
)


In [31]:
# Train the model
ngb_model.fit(X_train, y_train)

[iter 0] loss=0.6900 val_loss=0.0000 scale=4.0000 norm=7.9838


In [32]:
# Make predictions
y_pred = ngb_model.predict(X_test)
y_pred_proba =ngb_model.predict_proba(X_test)[:, 1]

In [33]:
# Evaluate model performance
accuracy_ngb = accuracy_score(y_test, y_pred)
auc_roc_ngb = roc_auc_score(y_test, y_pred_proba)
precision_ngb = precision_score(y_test, y_pred)
recall_ngb = recall_score(y_test, y_pred)
f1_ngb = f1_score(y_test, y_pred)

print(f"XGBoost performance metric:" )
print("accuracy :",accuracy_ngb)
print("Precission :",precision_ngb)
print("Recall :",recall_ngb)
print("f1-Score :",f1_ngb)
print("AUC-Score :",auc_roc_ngb)

XGBoost performance metric:
accuracy : 0.8833333333333333
Precission : 0.84
Recall : 0.875
f1-Score : 0.8571428571428571
ROC-Score : 0.9409722222222222


### 5. Adaboost

In [34]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, f1_score

In [35]:
# Define AdaBoost model with a decision tree as the base estimator
ada_model = AdaBoostClassifier(
    #base_estimator=DecisionTreeClassifier(max_depth=3),  # Base learner
    n_estimators=100,                                    # Number of boosting iterations
    learning_rate=0.01,                                  # Learning rate
    random_state=42                                      # Set seed for reproducibility
)

In [36]:
# Train the model
ada_model.fit(X_train, y_train)



In [37]:
# Make predictions
y_pred_ada = ada_model.predict(X_test)
y_pred_proba_ada = ada_model.predict_proba(X_test)[:, 1]

In [38]:
# Evaluate model performance
accuracy_adb = accuracy_score(y_test, y_pred_ada)
auc_roc_adb = roc_auc_score(y_test, y_pred_proba_ada)
precision_adb = precision_score(y_test, y_pred_ada)
recall_adb = precision_score(y_test, y_pred_ada)
f1_adb = f1_score(y_test, y_pred_ada)


print(f"AdaBoost performance metric:" )
print("accuracy :",accuracy_adb)
print("Precission :",precision_adb)
print("Recall :",recall_adb)
print("f1-Score :",f1_adb)
print("AUC-Score :",auc_roc_adb)

AdaBoost performance metric:
accuracy : 0.9
Precission : 0.875
Recall : 0.875
f1-Score : 0.875
ROC-Score : 0.931712962962963


### 6. LightGBM

In [39]:
#!pip install lightgbm

In [40]:
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, f1_score

In [41]:
# Define LightGBM model
lgbm_model = LGBMClassifier(
    n_estimators=100,              # Number of boosting iterations
    learning_rate=0.01,            # Learning rate
    subsample=0.8,                 # Fraction of samples per boosting round
    colsample_bytree=0.8,          # Fraction of features per boosting round
    random_state=42                # Set seed for reproducibility
)

In [42]:
# Train the model
lgbm_model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 113, number of negative: 124
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000093 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 249
[LightGBM] [Info] Number of data points in the train set: 237, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.476793 -> initscore=-0.092894
[LightGBM] [Info] Start training from score -0.092894


In [43]:
# Make predictions
y_pred_lgbm = lgbm_model.predict(X_test)
y_pred_proba_lgbm = lgbm_model.predict_proba(X_test)[:, 1]

In [44]:
# Evaluate model performance
accuracy_lgbm = accuracy_score(y_test, y_pred_lgbm)
auc_roc_lgbm = roc_auc_score(y_test, y_pred_proba_lgbm)
precision_lgbm = precision_score(y_test, y_pred_lgbm)
recall_lgbm = recall_score(y_test, y_pred_lgbm)
f1_lgbm = f1_score(y_test, y_pred_lgbm)


print(f"LightGBM performance metric:" )
print("accuracy :",accuracy_lgbm)
print("Precission :",precision_lgbm)
print("Recall :",recall_lgbm)
print("f1-Score :",f1_lgbm)
print("AUC-Score :",auc_roc_lgbm)

LightGBM performance metric:
accuracy : 0.8666666666666667
Precission : 0.8076923076923077
Recall : 0.875
f1-Score : 0.84
ROC-Score : 0.9398148148148148


### Comparative analysis

In [45]:
model_name_list = ['Random Forest','Extra Random Tree','XGBoost','NGBoost','Adaboost','LightGBM']
models_list = [rf_model, extra_trees_model,xgb_model,ngb_model,ada_model,lgbm_model]

In [49]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

metrics_list = []

for model_name, model in zip(model_name_list, models_list):
    y_pred = model.predict(X_test)
    
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    auc = roc_auc_score(y_test, y_pred)
    
    metrics_list.append({
        'Model': model_name,
        'Precision': precision,
        'Recall': recall,
        'Accuracy': accuracy,
        'F1 Score': f1,
        'AUC Score': roc,
        
    })

metrics_df = pd.DataFrame(metrics_list)

metrics_df

Unnamed: 0,Model,Precision,Recall,Accuracy,F1 Score,AUC Score
0,Random Forest,0.870136,0.866667,0.866667,0.867429,0.868056
1,Extra Random Tree,0.856566,0.85,0.85,0.851151,0.868056
2,XGBoost,0.851429,0.85,0.85,0.850474,0.868056
3,NGBoost,0.884571,0.883333,0.883333,0.883702,0.868056
4,Adaboost,0.9,0.9,0.9,0.9,0.868056
5,LightGBM,0.870136,0.866667,0.866667,0.867429,0.868056
