# Data Preprocessing and Model Exploration 


## Imports

In [1]:
import warnings

import pandas as pd
import seaborn as sns

warnings.filterwarnings("ignore")

In [2]:
# auto reload libs
%load_ext autoreload
%autoreload 2

## Paths

In [40]:
# https://www.kaggle.com/datasets/sukhmandeepsinghbrar/heart-attack-dataset/data
DATASET = "../data/Medicaldataset.csv"

## Load Data

In [41]:
org_df = pd.read_csv(DATASET)
org_df.head()

Unnamed: 0,Age,Gender,Heart rate,Systolic blood pressure,Diastolic blood pressure,Blood sugar,CK-MB,Troponin,Result
0,63,1,66,160,83,160.0,1.8,0.012,negative
1,20,1,94,98,46,296.0,6.75,1.06,positive
2,56,1,64,160,77,270.0,1.99,0.003,negative
3,66,1,70,120,55,270.0,13.87,0.122,positive
4,54,1,64,112,65,300.0,1.08,0.003,negative


In [42]:
org_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319 entries, 0 to 1318
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       1319 non-null   int64  
 1   Gender                    1319 non-null   int64  
 2   Heart rate                1319 non-null   int64  
 3   Systolic blood pressure   1319 non-null   int64  
 4   Diastolic blood pressure  1319 non-null   int64  
 5   Blood sugar               1319 non-null   float64
 6   CK-MB                     1319 non-null   float64
 7   Troponin                  1319 non-null   float64
 8   Result                    1319 non-null   object 
dtypes: float64(3), int64(5), object(1)
memory usage: 92.9+ KB


In [26]:
org_df.describe()

Unnamed: 0,Age,Gender,Heart rate,Systolic blood pressure,Diastolic blood pressure,Blood sugar,CK-MB,Troponin,Result
count,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0
mean,56.193328,0.659591,78.336619,127.170584,72.269143,146.634344,15.274306,0.360942,0.614102
std,13.638173,0.474027,51.63027,26.12272,14.033924,74.923045,46.327083,1.154568,0.486991
min,14.0,0.0,20.0,42.0,38.0,35.0,0.321,0.001,0.0
25%,47.0,0.0,64.0,110.0,62.0,98.0,1.655,0.006,0.0
50%,58.0,1.0,74.0,124.0,72.0,116.0,2.85,0.014,1.0
75%,65.0,1.0,85.0,143.0,81.0,169.5,5.805,0.0855,1.0
max,103.0,1.0,1111.0,223.0,154.0,541.0,300.0,10.3,1.0


In [5]:
org_df.columns

Index(['Age', 'Gender', 'Heart rate', 'Systolic blood pressure',
       'Diastolic blood pressure', 'Blood sugar', 'CK-MB', 'Troponin',
       'Result'],
      dtype='object')

### Train, Validation, Test Split

In [15]:
from sklearn.model_selection import train_test_split

In [20]:
org_df.Result = org_df.Result.map({"negative": 0, "positive": 1})

In [21]:
X = org_df.drop("Result", axis=1)
y = org_df["Result"]

In [22]:
y.sum(), y[y == 0].shape

(np.int64(810), (509,))

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y, random_state=42)

In [24]:
X_train.shape  # , X_val.shape, X_test.shape

(1187, 8)

## Data Cleaning and pre-processing

### Correlation Metrix calculation

In [25]:
corr_matrix = org_df.corr()
corr_matrix["Result"].sort_values(ascending=False)

Result                      1.000000
Age                         0.238002
Troponin                    0.229376
CK-MB                       0.217720
Gender                      0.094432
Heart rate                  0.006920
Diastolic blood pressure   -0.009659
Systolic blood pressure    -0.020825
Blood sugar                -0.033059
Name: Result, dtype: float64

In [27]:
continues_col = ["Age", "Heart rate", "Systolic blood pressure", "Diastolic blood pressure", "Blood sugar", "CK-MB", "Troponin"]

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.preprocessing import OrdinalEncoder


def create_preprocessor(continuous_cols: list) -> ColumnTransformer:
    continues_xformer = StandardScaler()

    preprocessor = ColumnTransformer(
        transformers=[
            ("cont", continues_xformer, continuous_cols),
        ],
        remainder="passthrough",  # includes the rest of the columns
    )
    return preprocessor

In [29]:
preprocessor = create_preprocessor(continues_col)
preprocessor.fit(X_train)

In [30]:
X_train_processed = preprocessor.transform(X_train)

X_test_processed = preprocessor.transform(X_test)

In [31]:
X_train_processed.shape

(1187, 8)

### Initial model training

class weight calculation

In [32]:
total_0 = y_train[y_train == 0].shape[0]
total_1 = y_train[y_train == 1].shape[0]
total_samples = y_train.shape[0]
weight_for_0 = total_samples / (2 * total_0)
weight_for_1 = total_samples / (2 * total_1)

class_weights = {0: weight_for_0, 1: weight_for_1}
class_weights

{0: 1.2958515283842795, 1: 0.8141289437585734}

In [33]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from xgboost import XGBClassifier

SCORING = "roc_auc"


def find_intial_best_model():
    models = [
        ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
        ("AdaBoost Classifier", AdaBoostClassifier(random_state=42)),
        ("Random Forest", RandomForestClassifier(random_state=42, class_weight=class_weights)),
        ("XGboost Classifier", XGBClassifier(random_state=42)),
        ("Support Vector Machine", SVC(random_state=42)),
        ("Naye base Classifier", GaussianNB()),
    ]

    best_model = None
    best_score = 0.0
    # Iterate over the models and evaluate their performance
    for name, model in models:
        # create a pipeline for each model
        pipeline = Pipeline([("model", model)])

        # perform cross validation
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        mean_roc_auc = cross_val_score(model, X_train_processed, y_train, cv=cv, scoring=SCORING, n_jobs=-1).mean()

        # fit the pipeline on the training data
        pipeline.fit(X_train_processed, y_train)

        # make prediction on the test data
        y_pred = pipeline.predict(X_test_processed)

        # Calculate accuracy score
        score = roc_auc_score(y_test, y_pred)

        # print the performance metrics
        print("Model", name)
        print(f"Cross Validatino {SCORING}: ", mean_roc_auc)
        print("roc_auc_score: ", score)
        print()

        # Check if the current model has the best accuracy
        if score > best_score:
            best_score = score
            best_model = pipeline

    # Retrieve the best model
    print("Best Model: ", best_model)

### MLflow hyper param tuning

In [34]:
import mlflow
import optuna

optuna.logging.set_verbosity(optuna.logging.ERROR)

# mlflow.set_tracking_uri("http://localhost:5000")


def get_or_create_experiment(experiment_name) -> str:
    """
    Retrieve the ID of an existing MLflow experiment or create a new one if it doesn't exist.

    This function checks if an experiment with the given name exists within MLflow.
    If it does, the function returns its ID. If not, it creates a new experiment
    with the provided name and returns its ID.

    Parameters:
    - experiment_name (str): Name of the MLflow experiment.

    Returns:
    - str: ID of the existing or newly created MLflow experiment.
    """

    if experiment := mlflow.get_experiment_by_name(experiment_name):
        return experiment.experiment_id
    else:
        return mlflow.create_experiment(experiment_name)


experiment_id = get_or_create_experiment("Finding the claassifier model")
mlflow.set_experiment(experiment_id=experiment_id)
experiment_id

'876583567520604079'

In [35]:
# https://mlflow.org/docs/latest/traditional-ml/hyperparameter-tuning-with-child-runs/notebooks/hyperparameter-tuning-with-child-runs#configure-the-tracking-server-uri

import xgboost as xgb
from sklearn.metrics import classification_report


# https://www.youtube.com/watch?v=E2b3SKMw934
def objective(trial) -> float:
    with mlflow.start_run(nested=True):
        # Add gradient-boosted models
        classifier = trial.suggest_categorical("classifier", ["XGBoost", "AdaBoost", "RandomForest"])

        if classifier == "XGBoost":
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 100, 1000, 50),
                "max_depth": trial.suggest_int("max_depth", 3, 30),
                "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.3, log=True),
                "subsample": trial.suggest_float("subsample", 0.6, 1.0),
                "scale_pos_weight": trial.suggest_float("scale_pos_weight", 1, 10),  # For imbalance
            }
            model = xgb.XGBClassifier(**params)

        elif classifier == "AdaBoost":
            params = {}
            params["n_estimators"] = trial.suggest_int("n_estimators", 100, 1000, 50)
            params["learning_rate"] = trial.suggest_float("learning_rate", 0.001, 0.3, log=True)
            model = AdaBoostClassifier(**params, random_state=42)

        elif classifier == "RandomForest":
            params = {}
            params["criterion"] = trial.suggest_categorical("criterion", ["gini", "entropy", "log_loss"])
            params["class_weight"] = trial.suggest_categorical("class_weight", ["balanced_subsample", class_weights])
            params["max_depth"] = trial.suggest_int("max_depth", 3, 30)
            params["min_samples_split"] = trial.suggest_int("min_samples_split", 2, 25)
            params["min_samples_leaf"] = trial.suggest_int("min_samples_leaf", 1, 25)
            params["bootstrap"] = trial.suggest_categorical("bootstrap", [True, False])
            params["max_features"] = trial.suggest_categorical("max_features", ["sqrt", "log2", None])

            model = RandomForestClassifier(**params, random_state=42, n_jobs=-1)

        # stratified K-fold for imbalance
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        score = cross_val_score(model, X_train_processed, y_train, cv=cv, scoring=SCORING, n_jobs=-1).mean()
        params["classifier"] = classifier
        mlflow.log_params(params)
        mlflow.log_metric(SCORING, score)

        # log classification report
        y_pred = model.fit(X_train_processed, y_train).predict(X_test_processed)
        report = classification_report(y_test, y_pred, output_dict=True)
        mlflow.log_dict(report, "classification_report")

    return score

In [36]:
import os

n_job = os.cpu_count() - 4
n_job

20

In [37]:
run_name = "Find the best model"
# Initiate the parent run and call the hyperparameter tuning child run logic
with mlflow.start_run(run_name=run_name, nested=True):
    # Initialize the Optuna study
    study = optuna.create_study(direction="maximize")

    # Execute the hyperparameter optimization trials.
    study.optimize(objective, n_trials=250, n_jobs=n_job, show_progress_bar=True)
    # study.optimize(objective, n_trials=250, callbacks=[champion_callback], n_jobs=-1, show_progress_bar=True)

    mlflow.log_params(study.best_params)
    mlflow.log_metric(SCORING, study.best_value)

Best trial: 203. Best value: 0.997307: 100%|██████████| 250/250 [02:13<00:00,  1.87it/s]


In [38]:
# Retrieve the best trial
best_trial = study.best_trial
print("Best trial parameters:", best_trial.params)
print("Best trial ruc score:", best_trial.value)

Best trial parameters: {'classifier': 'XGBoost', 'n_estimators': 750, 'max_depth': 24, 'learning_rate': 0.007659266705050152, 'subsample': 0.8638068341586023, 'scale_pos_weight': 3.0839016606865917}
Best trial ruc score: 0.997306711690068
