In [14]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, precision_recall_curve
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier


In [2]:
# Read the cleaned heart-disease CSV and immediately discard three columns
# so that they never reach the feature matrix built further down.
df = pd.read_csv(
    '/Users/uditrawat/Desktop/CardioX/artifacts/data_ingestion/heart_2020_cleaned.csv'
).drop(columns=['AgeCategory', 'Race', 'GenHealth'])

In [3]:
# Step 1: Preprocess the data (split, encode, etc.)
# Target is the 'HeartDisease' column; every remaining column is a feature.
y = df['HeartDisease']
X = df.drop(columns=['HeartDisease'])

In [15]:
# Load the test and train CSVs stored under artifacts/data_transformation.
data = pd.read_csv(
    '/Users/uditrawat/Desktop/CardioX/artifacts/data_transformation/test.csv'
)
data_s = pd.read_csv(
    '/Users/uditrawat/Desktop/CardioX/artifacts/data_transformation/train.csv'
)

# For each split: features are every column except 'HeartDisease', which is the target.
X_test, y_test = data.drop(columns=['HeartDisease']), data['HeartDisease']
X_train, y_train = data_s.drop(columns=['HeartDisease']), data_s['HeartDisease']

In [16]:
# Replace both feature frames with their plain NumPy array representation.
# After this point X_train / X_test no longer expose DataFrame attributes
# such as `.columns`.
X_train, X_test = X_train.to_numpy(), X_test.to_numpy()

In [4]:
# Convert target to binary labels (1 for 'Yes', 0 for anything else).
# The labels are derived from `df` instead of re-mapping `y` in place so that
# re-running this cell is idempotent: applying the 'Yes' comparison to labels
# that are already 0/1 would silently turn every label into 0.
y = df['HeartDisease'].eq('Yes').astype(int)

# Split the data into training and testing sets (80/20), stratified on the
# target so both splits keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Work on explicit copies so the column assignments below modify these frames
# themselves and do not raise pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Apply Label Encoding to every string-typed column. Each column gets its own
# encoder, fitted on the training split only and then applied to the test
# split; the fitted encoders are kept in `label_encoders` for later reuse.
label_encoders = {}
for col in X_train.columns:
    if X_train[col].dtype == 'object':
        le = LabelEncoder()
        X_train[col] = le.fit_transform(X_train[col])
        X_test[col] = le.transform(X_test[col])
        label_encoders[col] = le

# Step 2: Feature Selection using Chi-Square
# The selector is fitted on the training split only, then reused on the test
# split so both end up with the same six columns.
chi_selector = SelectKBest(chi2, k=6)
X_train_top6 = chi_selector.fit_transform(X_train, y_train)
X_test_top6 = chi_selector.transform(X_test)

# Report the selected column names; the count is taken from the selection
# itself so the message cannot drift away from `k`.
top6_features = X_train.columns[chi_selector.get_support()]
print(f"Top {len(top6_features)} selected features: {list(top6_features)}")

Top 5 selected features: ['Stroke', 'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Diabetic', 'KidneyDisease']


In [7]:
# Class-imbalance weight for XGBoost: ratio of negative to positive training
# labels (the value the XGBoost documentation suggests for `scale_pos_weight`).
# It is computed here so the cell runs on a fresh kernel instead of relying on
# a variable left over from an earlier session.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())
if n_positive == 0:
    raise ValueError(
        "y_train contains no positive samples; cannot compute scale_pos_weight")
scale_pos_weight_xgb = n_negative / n_positive

# Define the parameter grid for XGBoost
param_grid_xgb = {
    # Positive-class weight: the computed ratio and +/-20% around it
    'scale_pos_weight': [scale_pos_weight_xgb * 0.8, scale_pos_weight_xgb, scale_pos_weight_xgb * 1.2],
    'learning_rate': [0.01, 0.05, 0.1],   # Step size shrinkage
    'n_estimators': [100, 200, 300],       # Number of trees
    'max_depth': [3, 5, 7],                # Maximum tree depth
    # Minimum sum of instance weight needed in a child
    'min_child_weight': [1, 3, 5],
    # Minimum loss reduction required for a split
    'gamma': [0, 0.1, 0.2],
    # Fraction of samples used for training each tree
    'subsample': [0.8, 1.0],
    # Fraction of features used for each tree
    'colsample_bytree': [0.8, 1.0]
}

# Initialize XGBoost model
xgb_model = XGBClassifier(random_state=42)

# Perform Randomized Search with cross-validation: 50 sampled parameter
# combinations, 5-fold CV, ranked by ROC AUC, using all available cores.
random_search_xgb = RandomizedSearchCV(
    xgb_model, param_grid_xgb, cv=5, n_iter=50, scoring='roc_auc', random_state=42, n_jobs=-1)

# Fit the RandomizedSearchCV on the chi-square-selected training features
random_search_xgb.fit(X_train_top6, y_train)

# Best parameters from random search
print(f"Best parameters for XGBoost: {random_search_xgb.best_params_}")

Best parameters for XGBoost: {'subsample': 0.8, 'scale_pos_weight': 10.683076080007307, 'n_estimators': 100, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0.1, 'colsample_bytree': 1.0}


In [16]:
# Display the best hyper-parameter combination stored on the search object.
# NOTE(review): the saved output under this cell is a tuple of two dicts
# separated by '\n', which this bare expression would not produce -- it appears
# to come from an earlier version of the cell; re-run to refresh it.
random_search_xgb.best_params_

({'subsample': 1.0,
  'scale_pos_weight': 12.819691296008768,
  'reg_lambda': 0.1,
  'reg_alpha': 0.1,
  'num_leaves': 31,
  'n_estimators': 200,
  'min_child_weight': 3,
  'max_depth': 3,
  'learning_rate': 0.05,
  'colsample_bytree': 0.8},
 '\n',
 {'subsample': 0.8,
  'scale_pos_weight': 10.683076080007307,
  'n_estimators': 100,
  'min_child_weight': 5,
  'max_depth': 3,
  'learning_rate': 0.1,
  'gamma': 0.1,
  'colsample_bytree': 1.0})

In [22]:
# Use only XGBoost model for prediction
# `random_search_xgb` is the fitted RandomizedSearchCV object when the notebook
# is run top to bottom, so the tuned settings live in its `best_params_`
# attribute; unpacking the search object itself raises a TypeError. A later
# cell rebinds the same name to a plain dict of parameters, so fall back to the
# object itself when it has no `best_params_`.
best_params_xgb = getattr(random_search_xgb, 'best_params_', random_search_xgb)
xgb_model_ex = XGBClassifier(**best_params_xgb, random_state=42)

# Fit the XGBoost model on the six chi-square-selected training features
xgb_model_ex.fit(X_train_top6, y_train)

# Make predictions using XGBoost (probability of the positive class)
y_pred_xgb_proba = xgb_model_ex.predict_proba(X_test_top6)[:, 1]

# Convert probabilities to binary predictions based on a threshold (0.5)
y_pred_xgb = (y_pred_xgb_proba >= 0.5).astype(int)

# Step 5: Evaluate the XGBoost model using classification report
print("Classification Report (XGBoost):")
print(classification_report(y_test, y_pred_xgb))


Classification Report (XGBoost):
              precision    recall  f1-score   support

           0       0.96      0.76      0.85     58484
           1       0.20      0.63      0.30      5475

    accuracy                           0.75     63959
   macro avg       0.58      0.69      0.57     63959
weighted avg       0.89      0.75      0.80     63959



In [18]:
# Hard-coded XGBoost hyper-parameters (the same values as the best parameters
# printed by the randomized search earlier in this notebook).
# Note: this rebinds `random_search_xgb` to a plain dict.
random_search_xgb = {
    # Row / column sampling per tree
    'subsample': 0.8,
    # Weight applied to the positive class
    'scale_pos_weight': 10.683076080007307,
    # Ensemble size and tree shape
    'n_estimators': 100,
    'min_child_weight': 5,
    'max_depth': 3,
    # Boosting step size and split regularisation
    'learning_rate': 0.1,
    'gamma': 0.1,
    'colsample_bytree': 1.0,
}

In [19]:
# Use only XGBoost model for prediction
# Here `random_search_xgb` is the plain dict of hyper-parameters, which is
# unpacked straight into the classifier constructor.
xgb_model_ex = XGBClassifier(**random_search_xgb, random_state=42)

# Fit the XGBoost model on the full training feature matrix
xgb_model_ex.fit(X_train, y_train)

# Make predictions using XGBoost (probability of the positive class).
# The model must be scored on the same feature set it was fitted on, so use
# X_test (the counterpart of X_train) rather than the six-column X_test_top6.
y_pred_xgb_proba = xgb_model_ex.predict_proba(X_test)[:, 1]

# Convert probabilities to binary predictions based on a threshold (0.5)
y_pred_xgb = (y_pred_xgb_proba >= 0.5).astype(int)

# Step 5: Evaluate the XGBoost model using classification report
print("Classification Report (XGBoost):")
print(classification_report(y_test, y_pred_xgb))

Classification Report (XGBoost):
              precision    recall  f1-score   support

           0       0.96      0.76      0.85     58484
           1       0.20      0.63      0.30      5475

    accuracy                           0.75     63959
   macro avg       0.58      0.69      0.57     63959
weighted avg       0.89      0.75      0.80     63959



In [23]:
# Conclusion of the day:
# - The transformation pipeline is faulty.
# - TODO: correct the pipeline, push the train and transform pipelines, and work on the evaluation pipeline.