In [1]:
import os

import joblib # Model saving ke liye
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report ,roc_auc_score,f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# --- 1. Data Cleaning Steps (Task 1.1) ---
# The data file has no header row, so the column names are supplied here in
# file order.
column_names = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target',
]
data_path = 'Data/processed.cleveland.data'
# Any '?' field is parsed as NaN so it can be imputed below.
df = pd.read_csv(data_path, header=None, names=column_names, na_values='?')

# Impute missing values in 'ca' and 'thal' with each column's most frequent
# value (its mode). Other columns are left as read.
mode_by_column = {name: df[name].mode()[0] for name in ('ca', 'thal')}
df = df.fillna(value=mode_by_column)

# Collapse the target to binary: any value above 0 becomes 1, everything else 0.
df['target'] = df['target'].gt(0).astype('int64')

print("Data Cleaned and Loaded.")

Data Cleaned and Loaded.


In [2]:
# Separate the label column from the predictor columns.
y = df['target']
X = df.drop(columns=['target'])

# Hold out 20% of the rows for testing. The split is stratified on y and uses
# a fixed seed so it is repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training set size: {X_train.shape[0]}, Testing set size: {X_test.shape[0]}")

Training set size: 242, Testing set size: 61


In [3]:
# Feature groups; each group is routed to its own transformer below.
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
categorical_nominal_features = ['cp', 'restecg', 'slope', 'ca', 'thal']
# The remaining columns (sex, fbs, exang) are in neither list and are handled
# by the transformer's `remainder` setting.

# Standardise the numerical columns.
numeric_step = ('num', StandardScaler(), numerical_features)

# One-hot encode the nominal columns into a dense array; categories unseen at
# fit time are ignored instead of raising.
nominal_step = (
    'cat',
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
    categorical_nominal_features,
)

# Preprocessor: ColumnTransformer (StandardScaler + OneHotEncoder).
# remainder='passthrough' forwards every unlisted column unchanged.
preprocessor = ColumnTransformer(
    transformers=[numeric_step, nominal_step],
    remainder='passthrough',
)

print("Preprocessing Pipeline (ColumnTransformer) created.")

Preprocessing Pipeline (ColumnTransformer) created.


In [4]:
# Logistic Regression pipeline: preprocessing first, then the classifier.
# The preprocessor is cloned so this pipeline owns its own transformer
# instance. Pipeline fits the step objects it is handed in place, so reusing
# the shared `preprocessor` object would tie this pipeline's fitted state to
# every other pipeline built from the same object.
lr_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', LogisticRegression(random_state=42, solver='liblinear')),
])

print("Training Logistic Regression...")
lr_pipeline.fit(X_train, y_train)

# Evaluate on the held-out test split.
lr_pred = lr_pipeline.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_pred)

print("\n--- Logistic Regression Results ---")
print(f"Accuracy: {lr_accuracy:.4f}")
print(classification_report(y_test, lr_pred))

Training Logistic Regression...

--- Logistic Regression Results ---
Accuracy: 0.8852
              precision    recall  f1-score   support

           0       0.93      0.85      0.89        33
           1       0.84      0.93      0.88        28

    accuracy                           0.89        61
   macro avg       0.89      0.89      0.89        61
weighted avg       0.89      0.89      0.89        61



In [5]:
# Random Forest pipeline: preprocessing first, then the classifier.
# The preprocessor is cloned so this pipeline owns its own transformer
# instance. Pipeline fits the step objects it is handed in place, so reusing
# the shared `preprocessor` object would refit the transformer held by any
# other pipeline built from the same object.
rf_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', RandomForestClassifier(random_state=42, n_estimators=100)),
])

print("Training Random Forest...")
rf_pipeline.fit(X_train, y_train)

# Evaluate on the held-out test split.
rf_pred = rf_pipeline.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)

print("\n--- Random Forest Results ---")
print(f"Accuracy: {rf_accuracy:.4f}")
print(classification_report(y_test, rf_pred))

Training Random Forest...

--- Random Forest Results ---
Accuracy: 0.9180
              precision    recall  f1-score   support

           0       0.97      0.88      0.92        33
           1       0.87      0.96      0.92        28

    accuracy                           0.92        61
   macro avg       0.92      0.92      0.92        61
weighted avg       0.92      0.92      0.92        61



In [6]:
# Store the best model in `final_pipeline`. The choice is derived from the
# test accuracies computed in the two training cells rather than hard-coded,
# so it stays correct if the data or hyperparameters change. On a tie the
# Random Forest is kept.
if rf_accuracy >= lr_accuracy:
    final_pipeline, best_model_name = rf_pipeline, "random_forest"
else:
    final_pipeline, best_model_name = lr_pipeline, "logistic_regression"

# --- Deliverable Requirement: Save Model Artifact ---
# Create the 'models' folder (relative to the working directory) if it is
# missing. exist_ok=True replaces the separate exists() check, which could
# race with another process creating the directory.
os.makedirs('models', exist_ok=True)

model_filename = f'models/{best_model_name}_pipeline.pkl'
joblib.dump(final_pipeline, model_filename)

print(f"\nBest Model Pipeline saved to: {model_filename}")


Best Model Pipeline saved to: models/random_forest_pipeline.pkl


In [7]:
# Task 3: MLflow experiment tracking setup

import mlflow

# Name of the experiment that all runs in this notebook are recorded under.
experiment_name = "Heart_Disease_Prediction_Assignment"

# Set the tracking URI explicitly (again, as a safety measure) so runs are
# written under ./mlruns.
mlflow.set_tracking_uri("./mlruns")

# Look the experiment up by name and create it when it does not exist yet.
experiment = mlflow.get_experiment_by_name(experiment_name)
experiment_is_new = experiment is None
if experiment_is_new:
    mlflow.create_experiment(experiment_name)
    print(f"Created new experiment: {experiment_name}")

# Make it the active experiment for the runs started after this point.
mlflow.set_experiment(experiment_name)

print(f"Tracking to experiment: {experiment_name}")

Tracking to experiment: Heart_Disease_Prediction_Assignment


  return FileStore(store_uri, store_uri)


In [8]:
def train_and_log_model(pipeline, X_train, y_train, X_test, y_test, model_name, test_size=0.2):
    """Fit ``pipeline``, score it on the test split and record the run in MLflow.

    Everything happens inside a single MLflow run named ``model_name``.

    Args:
        pipeline: Pipeline-like object exposing ``fit``, ``predict``,
            ``predict_proba`` and ``steps``; its last step must provide
            ``get_params()``. It is (re)fitted in place.
        X_train, y_train: Training features and labels passed to ``fit``.
        X_test, y_test: Held-out features and labels used for the metrics.
        model_name: Used as the MLflow run name, as the ``model_name`` param
            and as the prefix of the registered model name
            (``"<model_name>_Heart_Classifier"``).
        test_size: Test fraction recorded as the ``test_size`` param. It is
            only logged, not used to split anything; the default matches the
            split used in this notebook.

    Returns:
        The ROC AUC computed from column 1 of ``predict_proba(X_test)``.
    """
    # Hyperparameters worth recording. They are looked up on the final
    # estimator itself rather than keyed on `model_name`, so a run with a
    # different name still logs whichever of these the estimator exposes.
    tracked_hyperparams = ("solver", "C", "n_estimators")

    with mlflow.start_run(run_name=model_name):

        # 1. Train the model and predict on the held-out split.
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        # Column 1 of predict_proba is used as the score for ROC AUC.
        y_proba = pipeline.predict_proba(X_test)[:, 1]

        # 2. Compute the metrics.
        accuracy = accuracy_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_proba)
        f1 = f1_score(y_test, y_pred)

        # 3. Log the parameters (for reproducibility).
        mlflow.log_param("test_size", test_size)
        mlflow.log_param("model_name", model_name)

        classifier_params = pipeline.steps[-1][1].get_params()
        hyperparams = {
            key: classifier_params[key]
            for key in tracked_hyperparams
            if key in classifier_params
        }
        if hyperparams:
            mlflow.log_params(hyperparams)

        # 4. Log the metrics (for reporting).
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("f1_score", f1)
        mlflow.log_metric("roc_auc", auc)

        # 5. Save the fitted pipeline as a model artifact and register it.
        # No artifact path/name is passed, so MLflow's own default applies.
        # NOTE(review): older MLflow releases require `artifact_path` here;
        # confirm against the installed version.
        mlflow.sklearn.log_model(
            sk_model=pipeline,
            registered_model_name=f"{model_name}_Heart_Classifier"
        )

        print(f"--- {model_name} Logged to MLflow. ROC AUC: {auc:.4f} ---")
        return auc

In [9]:
# Cell 9: train, evaluate and log both candidate models

# Run 1: Logistic Regression
lr_auc = train_and_log_model(
    pipeline=lr_pipeline,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_name="LogisticRegression",
)

# Run 2: Random Forest
rf_auc = train_and_log_model(
    pipeline=rf_pipeline,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_name="RandomForest",
)

# Pick the winner by ROC AUC. The Random Forest has to be strictly better;
# otherwise (including a tie) Logistic Regression is selected.
if rf_auc > lr_auc:
    best_model_pipeline = rf_pipeline
    best_model_name = "random_forest"
else:
    best_model_pipeline = lr_pipeline
    best_model_name = "logistic_regression"

print(f"\nFinal Decision: {best_model_name} is the best model (ROC AUC: {max(lr_auc, rf_auc):.4f}).")
print("Check MLflow UI for detailed comparison.")

  return FileStore(store_uri)
Registered model 'LogisticRegression_Heart_Classifier' already exists. Creating a new version of this model...
Created version '2' of model 'LogisticRegression_Heart_Classifier'.


--- LogisticRegression Logged to MLflow. ROC AUC: 0.9686 ---
--- RandomForest Logged to MLflow. ROC AUC: 0.9475 ---

Final Decision: logistic_regression is the best model (ROC AUC: 0.9686).
Check MLflow UI for detailed comparison.


Registered model 'RandomForest_Heart_Classifier' already exists. Creating a new version of this model...
Created version '2' of model 'RandomForest_Heart_Classifier'.
