In [1]:
# === Step 1: Import Libraries ===

# Data Handling
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing & ML
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Save Model
import joblib
import os

print("✅ Libraries imported successfully!")

✅ Libraries imported successfully!


In [2]:
# === Step 2: Load Dataset ===

DATA_PATH = r"C:\Users\SUDARSHAN\OneDrive\Desktop\HDP\heart\dataset\heart.csv"

# Load data
df = pd.read_csv(DATA_PATH)
print("✅ Dataset loaded successfully!\n")
print("Shape:", df.shape)
display(df.head())

✅ Dataset loaded successfully!

Shape: (1025, 14)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [3]:
# === Step 3: Inspect Dataset ===

print("\nDataset Info:\n")
df.info()

print("\nMissing Values:\n")
print(df.isnull().sum())

print("\nUnique Values in Target:\n", df['target'].value_counts())


Dataset Info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB

Missing Values:

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0


In [4]:
# === Step 4: Split the Data ===

X = df.drop('target', axis=1)  # Features
y = df['target']               # Target

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"✅ Data split successfully!")
print(f"Training samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")

✅ Data split successfully!
Training samples: 820
Testing samples: 205


In [5]:
# === Step 5: Scale Features ===

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("✅ Feature scaling complete!")
print("Training set shape:", X_train_scaled.shape)
print("Testing set shape:", X_test_scaled.shape)

✅ Feature scaling complete!
Training set shape: (820, 13)
Testing set shape: (205, 13)


In [6]:
# === Step 6: Train Random Forest Model ===

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=250,       # Number of trees
    max_depth=10,           # Prevents overfitting
    min_samples_split=5,    # Minimum samples to split an internal node
    random_state=42
)

model.fit(X_train_scaled, y_train)

print("✅ Model training completed successfully!")

✅ Model training completed successfully!


In [7]:
# === Step 7: Evaluate Model ===

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

y_pred = model.predict(X_test_scaled)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
print(f"\n✅ Model Accuracy: {accuracy:.3f}")

Confusion Matrix:
 [[100   0]
 [  0 105]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       100
           1       1.00      1.00      1.00       105

    accuracy                           1.00       205
   macro avg       1.00      1.00      1.00       205
weighted avg       1.00      1.00      1.00       205


✅ Model Accuracy: 1.000


In [8]:
# === Step 8: Cross-Validation for Stability ===

from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy: {np.mean(scores):.3f} ± {np.std(scores):.3f}")

Cross-Validation Accuracy: 0.994 ± 0.007


In [9]:
from sklearn.pipeline import Pipeline
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Create pipeline (scaler + model)
full_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(
        n_estimators=250,
        max_depth=10,
        min_samples_split=5,
        random_state=42
    ))
])

# Train on full dataset
full_pipeline.fit(X, y)

# Save final model
MODEL_PATH = r"C:\Users\SUDARSHAN\OneDrive\Desktop\HDP\heart\Prediction\ml_models\heart_pipeline_v6.joblib"
joblib.dump(full_pipeline, MODEL_PATH)

print(f"✅ Final model saved at: {MODEL_PATH}")

✅ Final model saved at: C:\Users\SUDARSHAN\OneDrive\Desktop\HDP\heart\Prediction\ml_models\heart_pipeline_v6.joblib
