In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [25]:
df = pd.read_csv(
    '/Users/uditrawat/Desktop/CardioX/artifacts/data_ingestion/heart_2020_cleaned.csv')

In [26]:
df.drop(['AgeCategory', 'Race', 'GenHealth'], inplace=True, axis=1)

In [28]:
df

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,Diabetic,PhysicalActivity,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.60,Yes,No,No,3.0,30.0,No,Female,Yes,Yes,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,No,Yes,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,Yes,Yes,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,No,No,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,No,Yes,8.0,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,Yes,27.41,Yes,No,No,7.0,0.0,Yes,Male,Yes,No,6.0,Yes,No,No
319791,No,29.84,Yes,No,No,0.0,0.0,No,Male,No,Yes,5.0,Yes,No,No
319792,No,24.24,No,No,No,0.0,0.0,No,Female,No,Yes,6.0,No,No,No
319793,No,32.81,No,No,No,0.0,0.0,No,Female,No,No,12.0,No,No,No


In [29]:
binary_cols = [
            'Smoking',        
            'AlcoholDrinking',
            'Stroke',      
            'DiffWalking',
            'Sex',     
            'Diabetic',
            'PhysicalActivity',
            'Asthma',      
            'KidneyDisease', 
            'SkinCancer',
            'HeartDisease'
        ]

label_encoder = LabelEncoder()
for col in binary_cols:
    df[col] = label_encoder.fit_transform(df[col])


In [30]:
df

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,Diabetic,PhysicalActivity,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,16.60,1,0,0,3.0,30.0,0,0,2,1,5.0,1,0,1
1,0,20.34,0,0,1,0.0,0.0,0,0,0,1,7.0,0,0,0
2,0,26.58,1,0,0,20.0,30.0,0,1,2,1,8.0,1,0,0
3,0,24.21,0,0,0,0.0,0.0,0,0,0,0,6.0,0,0,1
4,0,23.71,0,0,0,28.0,0.0,1,0,0,1,8.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,1,27.41,1,0,0,7.0,0.0,1,1,2,0,6.0,1,0,0
319791,0,29.84,1,0,0,0.0,0.0,0,1,0,1,5.0,1,0,0
319792,0,24.24,0,0,0,0.0,0.0,0,0,0,1,6.0,0,0,0
319793,0,32.81,0,0,0,0.0,0.0,0,0,0,0,12.0,0,0,0


In [32]:
df.corr()['HeartDisease'].sort_values(ascending=False)

HeartDisease        1.000000
DiffWalking         0.201258
Stroke              0.196835
PhysicalHealth      0.170721
Diabetic            0.168553
KidneyDisease       0.145197
Smoking             0.107764
SkinCancer          0.093317
Sex                 0.070040
BMI                 0.051803
Asthma              0.041444
MentalHealth        0.028591
SleepTime           0.008327
AlcoholDrinking    -0.032080
PhysicalActivity   -0.100030
Name: HeartDisease, dtype: float64

In [4]:
df['HeartDisease'].value_counts()

HeartDisease
No     292422
Yes     27373
Name: count, dtype: int64

In [5]:
no_heart_disease = df[df['HeartDisease'] == 'No']
yes_heart_disease = df[df['HeartDisease'] == 'Yes']

no_to_keep = len(yes_heart_disease)

no_undersampled = no_heart_disease.sample(n=no_to_keep, random_state=42)

df_balanced = pd.concat([no_undersampled, yes_heart_disease])

# Shuffle the dataset to mix the classes well
df_balanced = df_balanced.sample(
    frac=1, random_state=42).reset_index(drop=True)

# Check the new distribution
print(df_balanced['HeartDisease'].value_counts())

HeartDisease
No     27373
Yes    27373
Name: count, dtype: int64


In [6]:
train, test = train_test_split(df_balanced, test_size=0.20, random_state=42, stratify=df_balanced['HeartDisease']) 

In [7]:
X_train, y_train = train.drop('HeartDisease', axis=1), train['HeartDisease']
X_test, y_test = test.drop('HeartDisease', axis=1), test['HeartDisease']

In [8]:
def transform_data(X_train, X_test):
        # Apply label encoding to binary columns
        X_train, X_test = label_encoding(X_train, X_test)

        # Apply standard scaling to continuous columns
        X_train, X_test = standard_scaling(X_train, X_test)

        return X_train, X_test

def label_encoding( X_train, X_test):
        # Binary columns to encode
        binary_cols = [
            'Smoking',        
            'AlcoholDrinking',
            'Stroke',      
            'DiffWalking',
            'Sex',     
            'Diabetic',
            'PhysicalActivity',
            'Asthma',      
            'KidneyDisease', 
            'SkinCancer'     
        ]

        label_encoder = LabelEncoder()
        for col in binary_cols:
            X_train[col] = label_encoder.fit_transform(X_train[col])
            X_test[col] = label_encoder.transform(X_test[col])  # Apply the same transformation on test set

        return X_train, X_test

def standard_scaling( X_train, X_test):
        # Ensure train and test sets have the same columns
        X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

        # Standardize continuous columns
        continuous_cols = ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']
        scaler = StandardScaler()

        X_train[continuous_cols] = scaler.fit_transform(X_train[continuous_cols])
        X_test[continuous_cols] = scaler.transform(X_test[continuous_cols])

        return X_train, X_test

In [9]:
X_train, X_test = label_encoding(X_train, X_test)

In [10]:
X_train

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,Diabetic,PhysicalActivity,SleepTime,Asthma,KidneyDisease,SkinCancer
32168,32.41,0,0,0,0.0,0.0,0,1,0,1,8.0,0,0,0
7619,25.02,1,0,1,30.0,30.0,1,0,2,0,6.0,0,0,0
44574,34.96,1,0,0,30.0,0.0,0,1,2,0,6.0,1,0,0
12342,31.47,0,0,0,0.0,0.0,1,0,0,0,7.0,0,0,0
22978,35.73,1,0,0,0.0,0.0,1,1,2,1,8.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
526,27.37,1,0,0,0.0,0.0,0,1,0,1,9.0,0,0,0
41670,44.62,0,0,0,0.0,0.0,0,1,0,1,9.0,0,0,0
4183,24.91,0,0,0,30.0,15.0,0,1,0,0,8.0,0,1,0
18608,41.57,1,0,1,0.0,0.0,1,0,2,0,6.0,0,0,0


In [11]:
X_test

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,Diabetic,PhysicalActivity,SleepTime,Asthma,KidneyDisease,SkinCancer
5167,19.94,1,0,0,0.0,0.0,0,0,0,1,8.0,1,0,0
46095,35.57,0,0,1,30.0,0.0,0,1,0,1,8.0,0,0,0
48134,25.73,1,0,0,0.0,0.0,0,1,2,1,7.0,0,0,0
5179,18.60,0,0,0,0.0,0.0,0,0,0,1,7.0,0,0,0
1155,25.70,1,0,0,0.0,0.0,0,1,2,1,6.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4544,30.27,1,0,0,0.0,0.0,0,1,0,1,8.0,0,0,0
27572,18.29,1,0,0,0.0,0.0,1,0,2,0,6.0,1,0,0
46647,51.21,0,0,0,5.0,2.0,1,0,2,1,2.0,0,0,0
15998,21.29,0,0,0,0.0,0.0,0,1,0,1,8.0,0,0,0


In [15]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, max_depth=5,
                            min_samples_leaf=5, min_samples_split=4)


In [16]:
rf.fit(X_train,y_train)
rf_pred = rf.predict(X_test)

In [17]:
# rfc weighted
print("classification report", '\n', classification_report(y_test, rf_pred))

classification report 
               precision    recall  f1-score   support

          No       0.69      0.75      0.72      5475
         Yes       0.73      0.65      0.69      5475

    accuracy                           0.70     10950
   macro avg       0.71      0.70      0.70     10950
weighted avg       0.71      0.70      0.70     10950



In [50]:
# rfc weighted
print("classification report", '\n', classification_report(y_test, rf_pred))

classification report 
               precision    recall  f1-score   support

          No       0.68      0.75      0.72      5475
         Yes       0.73      0.65      0.69      5475

    accuracy                           0.70     10950
   macro avg       0.70      0.70      0.70     10950
weighted avg       0.70      0.70      0.70     10950



In [31]:
# rfc weighted
print("classification report", '\n', classification_report(y_test, rf_pred))

classification report 
               precision    recall  f1-score   support

          No       0.82      0.77      0.79     10949
         Yes       0.58      0.65      0.62      5475

    accuracy                           0.73     16424
   macro avg       0.70      0.71      0.70     16424
weighted avg       0.74      0.73      0.73     16424



In [32]:
from sklearn.model_selection import GridSearchCV

In [33]:

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],

    'min_samples_split': [2, 5, 10],

    'min_samples_leaf': [1, 2, 4],

    'max_features': ['sqrt', 'log2'],

    'bootstrap': [True, False],
    'class_weight': [None, 'balanced']
}


rf = RandomForestClassifier(random_state=42)


grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=5, scoring='accuracy', verbose=2, n_jobs=-1)


grid_search.fit(X_train, y_train)


print("Best parameters found: ", grid_search.best_params_)
print("Best accuracy score: ", grid_search.best_score_)

Fitting 5 folds for each of 864 candidates, totalling 4320 fits
[CV] END bootstrap=True, class_weight=None, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   3.6s
[CV] END bootstrap=True, class_weight=None, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   3.5s
[CV] END bootstrap=True, class_weight=None, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   3.6s
[CV] END bootstrap=True, class_weight=None, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   3.7s
[CV] END bootstrap=True, class_weight=None, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   3.8s
[CV] END bootstrap=True, class_weight=None, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   7.2s



[CV] END bootstrap=False, class_weight=None, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  14.5s
[CV] END bootstrap=False, class_weight=None, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  30.9s
[CV] END bootstrap=False, class_weight=None, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  13.6s
[CV] END bootstrap=False, class_weight=None, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  13.8s
[CV] END bootstrap=False, class_weight=None, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  30.8s
[CV] END bootstrap=False, class_weight=None, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  14.1s
[CV] END bootstrap=False, class_weight=None, max_dep

  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters found:  {'bootstrap': True, 'class_weight': None, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100}
Best accuracy score:  0.7470583758276886


In [35]:
grid_search.best_params_

{'bootstrap': True,
 'class_weight': None,
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 10,
 'n_estimators': 100}

In [18]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, max_depth=10,
                            min_samples_leaf=2, min_samples_split=10, class_weight='balanced',bootstrap=True,max_features='sqrt')

In [19]:
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

In [20]:
# rfc weighted
print("classification report", '\n', classification_report(y_test, rf_pred))

classification report 
               precision    recall  f1-score   support

          No       0.70      0.74      0.72      5475
         Yes       0.72      0.68      0.70      5475

    accuracy                           0.71     10950
   macro avg       0.71      0.71      0.71     10950
weighted avg       0.71      0.71      0.71     10950



In [53]:
# rfc weighted
print("classification report", '\n', classification_report(y_test, rf_pred))

classification report 
               precision    recall  f1-score   support

          No       0.70      0.74      0.72      5475
         Yes       0.72      0.68      0.70      5475

    accuracy                           0.71     10950
   macro avg       0.71      0.71      0.71     10950
weighted avg       0.71      0.71      0.71     10950



In [21]:
rfc = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
# rfc weighted
print("classification report", '\n', classification_report(y_test, rf_pred))

classification report 
               precision    recall  f1-score   support

          No       0.69      0.74      0.72      5475
         Yes       0.72      0.67      0.70      5475

    accuracy                           0.71     10950
   macro avg       0.71      0.71      0.71     10950
weighted avg       0.71      0.71      0.71     10950



In [54]:
rfc = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
# rfc weighted
print("classification report", '\n', classification_report(y_test, rf_pred))

classification report 
               precision    recall  f1-score   support

          No       0.70      0.74      0.72      5475
         Yes       0.72      0.68      0.70      5475

    accuracy                           0.71     10950
   macro avg       0.71      0.71      0.71     10950
weighted avg       0.71      0.71      0.71     10950



In [38]:
# rfc weighted
print("classification report", '\n', classification_report(y_test, rf_pred))

classification report 
               precision    recall  f1-score   support

          No       0.82      0.75      0.79     10949
         Yes       0.58      0.67      0.62      5475

    accuracy                           0.73     16424
   macro avg       0.70      0.71      0.70     16424
weighted avg       0.74      0.73      0.73     16424



In [22]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
print("classification report", '\n', classification_report(y_test, lr_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


classification report 
               precision    recall  f1-score   support

          No       0.68      0.77      0.72      5475
         Yes       0.74      0.64      0.68      5475

    accuracy                           0.71     10950
   macro avg       0.71      0.71      0.70     10950
weighted avg       0.71      0.71      0.70     10950



In [23]:
from sklearn.svm import SVC
sv = SVC()
sv.fit(X_train, y_train)
sv_pred = sv.predict(X_test)
print("classification report", '\n', classification_report(y_test, sv_pred))

classification report 
               precision    recall  f1-score   support

          No       0.66      0.77      0.71      5475
         Yes       0.73      0.60      0.66      5475

    accuracy                           0.69     10950
   macro avg       0.69      0.69      0.69     10950
weighted avg       0.69      0.69      0.69     10950



In [24]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
dtc_pred = sv.predict(X_test)
print("classification report", '\n', classification_report(y_test, dtc_pred))

classification report 
               precision    recall  f1-score   support

          No       0.66      0.77      0.71      5475
         Yes       0.73      0.60      0.66      5475

    accuracy                           0.69     10950
   macro avg       0.69      0.69      0.69     10950
weighted avg       0.69      0.69      0.69     10950

