In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib

# Load the heart-disease dataset from a CSV file in the working directory.
dataset_path = 'Heart_Disease_Prediction.csv'
data = pd.read_csv(dataset_path)

# Quick exploratory dump: first rows, schema, and summary statistics.
print("Dataset Head:")
print(data.head())
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
data.info()
print("\nSummary Statistics:")
print(data.describe())

# Per-column count of missing entries.
print("\nMissing Values:")
print(data.isnull().sum())

# Partition the column names by dtype; only the non-numeric ones are encoded.
numeric_cols = data.select_dtypes(include=['number']).columns
categorical_cols = data.select_dtypes(exclude=['number']).columns

# One-hot encode the non-numeric columns. drop_first=True drops the first
# indicator of each encoded column, so a two-valued column becomes a single
# indicator column named "<column>_<value>".
data_encoded = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

# Fill missing values with a model-based imputer.
# NOTE(review): every column of data_encoded takes part in the imputation,
# including whatever column is later used as the prediction target.
imputer = IterativeImputer(random_state=42)
data_imputed = pd.DataFrame(
    imputer.fit_transform(data_encoded),
    columns=data_encoded.columns,
    # Keep the original row labels instead of silently resetting them.
    index=data_encoded.index,
)
# No second get_dummies pass is needed: the imputer output is entirely
# numeric, so there is nothing left to encode.

print("\nColumn Names in Data:")
print(data_imputed.columns)

# Separate features (X) from the target (y). The target is either a column
# named exactly 'Heart Disease' or, failing that, the first column whose name
# contains that text (e.g. its one-hot encoded form).
if 'Heart Disease' in data_imputed.columns:
    X = data_imputed.drop(columns='Heart Disease')
    y = data_imputed['Heart Disease']
else:
    encoded_matches = [name for name in data_imputed.columns if 'Heart Disease' in name]
    if not encoded_matches:
        raise KeyError("Could not find 'Heart Disease' or its encoded form in the DataFrame.")
    heart_disease_col = encoded_matches[0]
    X = data_imputed.drop(columns=heart_disease_col)
    y = data_imputed[heart_disease_col]

# Split first, then fit the scaler on the training rows only. Fitting it on
# the full feature matrix would let test-set means/variances influence the
# transformation applied to the training data (data leakage).
# The rows assigned to each split depend only on the sample count and
# random_state, so the train/test membership is the same as when splitting
# pre-scaled data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled copy of the full feature matrix, kept so that any code that still
# refers to X_scaled continues to work.
X_scaled = scaler.transform(X)

#### Random Forest
# Fit a random forest on the training split and report accuracy, confusion
# matrix and per-class metrics on the held-out split.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_y_pred = rf_model.predict(X_test)
print("\nRandom Forest Results:")
print("Accuracy:", accuracy_score(y_test, rf_y_pred))
# sep="\n" puts each heading on its own line, directly above its report.
print("Confusion Matrix:", confusion_matrix(y_test, rf_y_pred), sep="\n")
print("Classification Report:", classification_report(y_test, rf_y_pred), sep="\n")

#### Naive Bayes
# Fit a Gaussian naive Bayes classifier on the training split and report
# accuracy, confusion matrix and per-class metrics on the held-out split.
nb_model = GaussianNB().fit(X_train, y_train)
nb_y_pred = nb_model.predict(X_test)
print("\nNaive Bayes Results:")
print("Accuracy:", accuracy_score(y_test, nb_y_pred))
# sep="\n" puts each heading on its own line, directly above its report.
print("Confusion Matrix:", confusion_matrix(y_test, nb_y_pred), sep="\n")
print("Classification Report:", classification_report(y_test, nb_y_pred), sep="\n")

#### Support Vector Machine (SVM)
# Fit an SVM on the training split and report accuracy, confusion matrix and
# per-class metrics on the held-out split. probability=True is required for
# the predict_proba call made on this model later in the script.
svm_model = SVC(probability=True, random_state=42).fit(X_train, y_train)
svm_y_pred = svm_model.predict(X_test)
print("\nSVM Results:")
print("Accuracy:", accuracy_score(y_test, svm_y_pred))
# sep="\n" puts each heading on its own line, directly above its report.
print("Confusion Matrix:", confusion_matrix(y_test, svm_y_pred), sep="\n")
print("Classification Report:", classification_report(y_test, svm_y_pred), sep="\n")

#### Decision Tree
# The decision tree is persisted below, so it has to be trained first;
# dumping an undefined dt_model would raise NameError.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
dt_y_pred = dt_model.predict(X_test)
print("\nDecision Tree Results:")
print("Accuracy:", accuracy_score(y_test, dt_y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, dt_y_pred))
print("Classification Report:")
print(classification_report(y_test, dt_y_pred))

# Persist the fitted models and the scaler so they can be reloaded later for
# prediction.
rf_model_filename = 'heart_disease_rf_model.pkl'
dt_model_filename = 'heart_disease_dt_model.pkl'
scaler_filename = 'heart_scaler.pkl'
joblib.dump(rf_model, rf_model_filename)
joblib.dump(dt_model, dt_model_filename)
joblib.dump(scaler, scaler_filename)
print("\nRandom Forest and Decision Tree Models saved as 'heart_disease_rf_model.pkl' and 'heart_disease_dt_model.pkl'.")
print("Scaler saved as 'heart_scaler.pkl'.")

# Interactively collect one value per feature, in the same column order as X.
print("\nProvide new data for prediction:")
input_data = []

feature_names = X.columns
for feature in feature_names:
    # Re-prompt until the entry parses as a finite number, so a single typo
    # does not abort the whole session and NaN/inf never reach the scaler.
    while True:
        raw_value = input(f"Enter value for {feature}: ")
        try:
            value = float(raw_value)
        except ValueError:
            print(f"Invalid number {raw_value!r}; please try again.")
            continue
        if np.isfinite(value):
            break
        print(f"Value must be finite, got {raw_value!r}; please try again.")
    input_data.append(value)

# Single-row 2-D array: one sample, one column per feature.
new_data = np.array([input_data])

# Reload the persisted scaler and apply it to the new sample. The row is
# wrapped in a DataFrame carrying the feature names because the scaler was
# fitted on a DataFrame; a bare ndarray makes scikit-learn warn about
# missing feature names.
scaler = joblib.load(scaler_filename)
new_data_scaled = scaler.transform(pd.DataFrame(new_data, columns=feature_names))

#### Random Forest Prediction
# Reload the persisted random forest and classify the new sample.
rf_model = joblib.load(rf_model_filename)
rf_prediction = rf_model.predict(new_data_scaled)[0]
# Class probabilities for the sample, expressed as percentages.
rf_probabilities = rf_model.predict_proba(new_data_scaled)[0] * 100
# predict_proba columns follow the order of rf_model.classes_, so look up the
# predicted label's position there instead of assuming the labels happen to
# be the integers 0..n-1.
rf_class_index = list(rf_model.classes_).index(rf_prediction)
print(f"\nRandom Forest Heart Disease Prediction: {rf_prediction}")
print(f"Prediction Probability: {rf_probabilities[rf_class_index]:.2f}%")



#### Naive Bayes Prediction
# Classify the new sample with the in-memory naive Bayes model.
nb_prediction = nb_model.predict(new_data_scaled)[0]
# Class probabilities for the sample, expressed as percentages.
nb_probabilities = nb_model.predict_proba(new_data_scaled)[0] * 100
# predict_proba columns follow the order of nb_model.classes_, so look up the
# predicted label's position there instead of assuming the labels happen to
# be the integers 0..n-1.
nb_class_index = list(nb_model.classes_).index(nb_prediction)
print(f"\nNaive Bayes Heart Disease Prediction: {nb_prediction}")
print(f"Prediction Probability: {nb_probabilities[nb_class_index]:.2f}%")

#### SVM Prediction
# Classify the new sample with the in-memory SVM model.
svm_prediction = svm_model.predict(new_data_scaled)[0]
# Class probabilities for the sample, expressed as percentages.
svm_probabilities = svm_model.predict_proba(new_data_scaled)[0] * 100
# predict_proba columns follow the order of svm_model.classes_, so look up
# the predicted label's position there instead of assuming the labels happen
# to be the integers 0..n-1.
# NOTE(review): scikit-learn documents that SVC's predict_proba may disagree
# with predict, so the percentage shown can be below 50% — confirm that this
# is acceptable for the report.
svm_class_index = list(svm_model.classes_).index(svm_prediction)
print(f"\nSVM Heart Disease Prediction: {svm_prediction}")
print(f"Prediction Probability: {svm_probabilities[svm_class_index]:.2f}%")


Dataset Head:
   Age  Sex  Chest pain type   BP  Cholesterol  FBS over 120  EKG results  \
0   70    1                4  130          322             0            2   
1   67    0                3  115          564             0            2   
2   57    1                2  124          261             0            0   
3   64    1                4  128          263             0            0   
4   74    0                2  120          269             0            2   

   Max HR  Exercise angina  ST depression  Slope of ST  \
0     109                0            2.4            2   
1     160                0            1.6            2   
2     141                0            0.3            1   
3     105                1            0.2            2   
4     121                1            0.2            1   

   Number of vessels fluro  Thallium Heart Disease  
0                        3         3      Presence  
1                        0         7       Absence  
2              

