<a href="https://colab.research.google.com/github/yeshwanthbalaji22/advanced_ml_project/blob/main/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import (
    KFold,
    StratifiedKFold,
    cross_val_score,
    GridSearchCV,
    train_test_split
)

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Load the dataset from heart.csv; the "target" column is the label and
# every remaining column is used as a feature.
df = pd.read_csv("heart.csv")

y = df["target"]
X = df.drop(columns="target")

# Quick sanity summary: overall size and how the labels are distributed.
print("Dataset Shape:", df.shape)
print("Target Distribution:\n", y.value_counts())

# Baseline classifier shared by both cross-validation runs below
# (cross_val_score clones it, so the two runs do not affect each other).
rf_model = RandomForestClassifier(random_state=42)

# --- Plain 5-fold CV: shuffled folds, built without looking at the labels ---
kf = KFold(n_splits=5, shuffle=True, random_state=42)
kf_scores = cross_val_score(rf_model, X, y, cv=kf, scoring="accuracy")

print("\nK-Fold Accuracy Scores:", kf_scores)
print("Mean K-Fold Accuracy:", np.mean(kf_scores))


# --- Stratified 5-fold CV: shuffled folds that preserve the class ratio ---
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
skf_scores = cross_val_score(rf_model, X, y, cv=skf, scoring="accuracy")

print("\nStratified K-Fold Accuracy Scores:", skf_scores)
print("Mean Stratified Accuracy:", np.mean(skf_scores))


# Hold out a stratified 20% test set BEFORE any tuning, so the rows used for
# the final classification report never influence hyperparameter selection
# or the model comparison. (Tuning on the full data and splitting afterwards
# leaks the test rows into model selection and inflates the reported scores.)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# Exact duplicate rows can land on both sides of a split (and in both the
# training and validation folds), which makes every score below optimistic.
# Only report the count here; nothing is dropped.
n_duplicate_rows = int(df.duplicated().sum())
if n_duplicate_rows:
    print(
        f"\nWarning: {n_duplicate_rows} duplicated rows found; "
        "accuracy estimates may be optimistic."
    )

# Random Forest hyperparameter grid (2 * 3 * 2 * 2 = 24 candidates).
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2]
}

# Tune on the training portion only, reusing the shuffled stratified folds
# (skf) so tuning and the model comparison use the same CV strategy.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=skf,
    scoring="accuracy",
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

print("\nBest Random Forest Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)

# With GridSearchCV's default refit=True, best_estimator_ is already fitted
# on all of X_train using the winning parameters.
best_rf = grid_search.best_estimator_


# Candidate models. SVM is wrapped in a pipeline so scaling is learned inside
# each CV training fold rather than on the whole dataset.
models = {
    "Random Forest": best_rf,
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "SVM": Pipeline([
        ("scaler", StandardScaler()),
        ("svm", SVC(kernel="rbf"))
    ])
}

# Compare on the training portion only; cross_val_score clones each
# estimator, so the fitted best_rf is left untouched for the final test.
print("\nModel Comparison (Stratified K-Fold Accuracy):")
for name, model in models.items():
    scores = cross_val_score(
        model,
        X_train,
        y_train,
        cv=skf,
        scoring="accuracy"
    )
    print(f"{name}: Mean Accuracy = {scores.mean():.4f}")


# Final, single evaluation on the untouched hold-out set.
y_pred = best_rf.predict(X_test)

print("\nClassification Report (Random Forest):\n")
print(classification_report(y_test, y_pred))


Dataset Shape: (1025, 14)
Target Distribution:
 target
1    526
0    499
Name: count, dtype: int64

K-Fold Accuracy Scores: [0.98536585 1.         1.         1.         1.        ]
Mean K-Fold Accuracy: 0.9970731707317073

Stratified K-Fold Accuracy Scores: [1.        1.        1.        0.9804878 1.       ]
Mean Stratified Accuracy: 0.9960975609756098

Best Random Forest Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Cross-Validation Accuracy: 0.9970731707317073

Model Comparison (Stratified K-Fold Accuracy):
Random Forest: Mean Accuracy = 0.9961
Decision Tree: Mean Accuracy = 1.0000
SVM: Mean Accuracy = 0.9239

Classification Report (Random Forest):

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       100
           1       1.00      1.00      1.00       105

    accuracy                           1.00       205
   macro avg       1.00      1.00      1.00       205
weighted 