<a href="https://colab.research.google.com/github/viveknarapongu2801/heart_predict/blob/main/Heart.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import numpy as np
df=pd.read_csv('/content/heart.csv')


In [8]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [11]:
df.shape

(918, 12)

In [12]:
from sklearn.model_selection import train_test_split

# Separate features (X) and target (y)
X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']

# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)

Training data shape: (734, 11)
Testing data shape: (184, 11)


In [13]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Identify categorical columns
categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

# Create a column transformer with OneHotEncoder
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough' # Keep the remaining columns
)

# Apply the transformer to the training and testing data
X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)

print("Encoded training data shape:", X_train_encoded.shape)
print("Encoded testing data shape:", X_test_encoded.shape)

Encoded training data shape: (734, 20)
Encoded testing data shape: (184, 20)


In [15]:
import xgboost as xgb

# Initialize XGBoost classifier
model = xgb.XGBClassifier(random_state=42)

# Train the model
model.fit(X_train_encoded, y_train)

print("XGBoost model trained successfully!")

XGBoost model trained successfully!


In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Make predictions on the testing data
y_pred = model.predict(X_test_encoded)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Accuracy: 0.8804
Precision: 0.9126
Recall: 0.8785
F1-score: 0.8952


In [18]:
# ===============================
# ❤️ HEART DISEASE DATA PREPROCESSING
# ===============================

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Load dataset
df = pd.read_csv("heart.csv")

# ------------------------------
# 1️⃣ Separate Features and Target
# ------------------------------
X = df.drop("HeartDisease", axis=1)
y = df["HeartDisease"]

# ------------------------------
# 2️⃣ Identify Column Types
# ------------------------------
numeric_features = ["Age", "RestingBP", "Cholesterol", "MaxHR", "Oldpeak"]
binary_features = ["Sex", "ExerciseAngina", "FastingBS"]
categorical_features = ["ChestPainType", "RestingECG", "ST_Slope"]

# ------------------------------
# 3️⃣ Label Encode Binary Columns
# ------------------------------
le = LabelEncoder()
for col in binary_features:
    X[col] = le.fit_transform(X[col])

# ------------------------------
# 4️⃣ One-Hot Encode Nominal Columns
# ------------------------------
ct = ColumnTransformer(
    transformers=[
        ("onehot", OneHotEncoder(drop='first', sparse_output=False), categorical_features)
    ],
    remainder='passthrough'  # keep other columns
)

X_encoded = pd.DataFrame(ct.fit_transform(X))

# Fix column names after encoding
ohe_cols = ct.named_transformers_["onehot"].get_feature_names_out(categorical_features)
X_encoded.columns = list(ohe_cols) + list(X.columns.drop(categorical_features))

# ------------------------------
# 5️⃣ Optional Scaling (for linear models only)
# ------------------------------
# ⚠️ Skip this if you're using XGBoost or Random Forest
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X_encoded), columns=X_encoded.columns)

# ------------------------------
# 6️⃣ Train-Test Split
# ------------------------------
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [19]:
from sklearn.linear_model import LogisticRegression

# Initialize Logistic Regression model
model = LogisticRegression(random_state=42)

# Train the model using the scaled data
model.fit(X_scaled, y)

print("Logistic Regression model trained successfully!")

Logistic Regression model trained successfully!


In [20]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Make predictions on the testing data
y_pred = model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Accuracy: 0.5435
Precision: 0.8966
Recall: 0.2430
F1-score: 0.3824


In [21]:
import xgboost as xgb

# Initialize XGBoost classifier
model_xgb = xgb.XGBClassifier(random_state=42)

# Train the model using the encoded data (not scaled for XGBoost)
model_xgb.fit(X_train, y_train)

print("XGBoost model trained successfully!")

XGBoost model trained successfully!


In [22]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Make predictions on the testing data using the XGBoost model
y_pred_xgb = model_xgb.predict(X_test)

# Calculate evaluation metrics for the XGBoost model
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb)
recall_xgb = recall_score(y_test, y_pred_xgb)
f1_xgb = f1_score(y_test, y_pred_xgb)

print(f"XGBoost Accuracy: {accuracy_xgb:.4f}")
print(f"XGBoost Precision: {precision_xgb:.4f}")
print(f"XGBoost Recall: {recall_xgb:.4f}")
print(f"XGBoost F1-score: {f1_xgb:.4f}")

XGBoost Accuracy: 0.8750
XGBoost Precision: 0.8962
XGBoost Recall: 0.8879
XGBoost F1-score: 0.8920


In [32]:
%%writefile app.py
import streamlit as st
import pandas as pd
import joblib
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np

# Load your trained model here
model = joblib.load('model_heart.pkl')

st.title('Heart Disease Prediction App')

st.write("""
This app predicts the likelihood of heart disease based on your inputs.
Please fill in the details below:
""")

# Define the features and their types (based on your preprocessing in the notebook)
numeric_features = ["Age", "RestingBP", "Cholesterol", "MaxHR", "Oldpeak"]
binary_features = ["Sex", "ExerciseAngina", "FastingBS"]
categorical_features = ["ChestPainType", "RestingECG", "ST_Slope"]

# Create input fields for your model's features
age = st.slider('Age', 18, 100, 50)
sex = st.selectbox('Sex', ['M', 'F'])
chest_pain_type = st.selectbox('Chest Pain Type', ['ATA', 'NAP', 'ASY', 'TA'])
resting_bp = st.number_input('Resting Blood Pressure (RestingBP)', 50, 200, 120)
cholesterol = st.number_input('Cholesterol', 0, 600, 200)
fasting_bs = st.selectbox('Fasting Blood Sugar > 120 mg/dL (FastingBS)', [0, 1])
resting_ecg = st.selectbox('Resting Electrocardiogram (RestingECG)', ['Normal', 'ST', 'LVH'])
max_hr = st.number_input('Maximum Heart Rate Achieved (MaxHR)', 60, 220, 150)
exercise_angina = st.selectbox('Exercise Induced Angina (ExerciseAngina)', ['N', 'Y'])
oldpeak = st.number_input('Oldpeak', 0.0, 6.2, 1.0)
st_slope = st.selectbox('ST Slope', ['Up', 'Flat', 'Down'])


# Create a dictionary with user inputs
user_input = {
    'Age': age,
    'Sex': sex,
    'ChestPainType': chest_pain_type,
    'RestingBP': resting_bp,
    'Cholesterol': cholesterol,
    'FastingBS': fasting_bs,
    'RestingECG': resting_ecg,
    'MaxHR': max_hr,
    'ExerciseAngina': exercise_angina,
    'Oldpeak': oldpeak,
    'ST_Slope': st_slope
}

# Convert user input to a pandas DataFrame
input_df = pd.DataFrame([user_input])

# Preprocess the input data
# You need to replicate the preprocessing steps from your notebook
# Label encode binary features
le_sex = LabelEncoder()
input_df['Sex'] = le_sex.fit_transform(input_df['Sex']) # Fit and transform on a small sample that includes both 'M' and 'F' if possible to avoid errors

le_exercise_angina = LabelEncoder()
input_df['ExerciseAngina'] = le_exercise_angina.fit_transform(input_df['ExerciseAngina']) # Fit and transform on a small sample that includes both 'N' and 'Y' if possible to avoid errors

# One-hot encode categorical features
# To ensure consistent columns, fit on the original training data features if possible
# For a simple app, we can define the transformer with the known categories
ct = ColumnTransformer(
    transformers=[
        ("onehot", OneHotEncoder(drop='first', sparse_output=False, categories=[['ATA', 'NAP', 'ASY', 'TA'], ['Normal', 'ST', 'LVH'], ['Up', 'Flat', 'Down']]), categorical_features)
    ],
    remainder='passthrough'  # keep other columns
)

# Create a dummy DataFrame with all possible categories to fit the ColumnTransformer
# This is a workaround to ensure the transformer has all categories even if the user input doesn't include them all
dummy_data = pd.DataFrame({
    'Sex': ['M', 'F'],
    'ChestPainType': ['ATA', 'NAP', 'ASY', 'TA'],
    'RestingECG': ['Normal', 'ST', 'LVH'],
    'ExerciseAngina': ['N', 'Y'],
    'ST_Slope': ['Up', 'Flat', 'Down'],
    'Age': [50, 50],
    'RestingBP': [120, 120],
    'Cholesterol': [200, 200],
    'FastingBS': [0, 1],
    'MaxHR': [150, 150],
    'Oldpeak': [1.0, 1.0]
})

# Apply the same label encoding to dummy data for fitting the ColumnTransformer
le_sex_dummy = LabelEncoder()
dummy_data['Sex'] = le_sex_dummy.fit_transform(dummy_data['Sex'])

le_exercise_angina_dummy = LabelEncoder()
dummy_data['ExerciseAngina'] = le_exercise_angina_dummy.fit_transform(dummy_data['ExerciseAngina'])


ct.fit(dummy_data[categorical_features + numeric_features + binary_features]) # Fit ColumnTransformer with all features

input_encoded = ct.transform(input_df)

# Convert the numpy array back to a DataFrame with correct column names if needed for your model (XGBoost usually works with numpy)
# ohe_cols = ct.named_transformers_["onehot"].get_feature_names_out(categorical_features)
# encoded_col_names = list(ohe_cols) + [col for col in input_df.columns if col not in categorical_features]
# input_encoded_df = pd.DataFrame(input_encoded, columns=encoded_col_names)


# Make prediction when a button is clicked
if st.button('Predict'):
    prediction = model.predict(input_encoded)
    prediction_proba = model.predict_proba(input_encoded)[:, 1] # For probability

    # Display the prediction result
    if prediction[0] == 1:
        st.error('Prediction: High risk of heart disease')
    else:
        st.success('Prediction: Low risk of heart disease')

    st.write(f'Confidence: {prediction_proba[0]:.2f}')

st.write("Please fill in all the required fields and click 'Predict'.")

Writing app.py


In [30]:
import joblib

# Save the trained XGBoost model
joblib.dump(model_xgb, 'model_heart.pkl')

print("XGBoost model saved as model_heart.pkl")

XGBoost model saved as model_heart.pkl


In [31]:
%%writefile requirements.txt
pandas
scikit-learn
xgboost
streamlit
joblib

Writing requirements.txt
