# Prediction of Cardiovascular Disease using Machine Learning Techniques

# Cardiovascular disease (CVD) is one of the leading causes of death worldwide. Early detection and preventive measures can save lives. This project uses machine learning models, Logistic Regression and Random Forest, to predict the likelihood of cardiovascular disease using patient health metrics such as age, gender, blood pressure, cholesterol, glucose levels, and lifestyle factors. The models are trained, evaluated, and deployed for predictive analytics

# import library

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib

# 2. Load dataset

In [3]:
df = pd.read_csv("/content/NACC_APOE_CVD_filtered (2).csv")

In [4]:
df.head()

Unnamed: 0,NACCID,SEX,BIRTHYR,NACCAPOE,DEMENTED,CVHATT,HATTMULT,CVAFIB,CVANGIO,CVBYPASS,...,STROKE,STROKIF,STROKDEC,STKIMAG,CVD,CVDIF,VASC,VASCIF,VASCPS,VASCPSIF
0,NACC000011,2,1944,1.0,0,0.0,,0.0,0.0,0.0,...,0.0,7.0,,,,,0.0,7.0,,
1,NACC000034,2,1935,4.0,0,0.0,8.0,0.0,0.0,0.0,...,,,8.0,8.0,0.0,7.0,,,,
2,NACC000067,1,1952,1.0,0,0.0,,0.0,0.0,0.0,...,0.0,7.0,,,,,0.0,7.0,0.0,7.0
3,NACC000095,1,1926,2.0,1,0.0,,0.0,0.0,0.0,...,0.0,7.0,,,,,0.0,7.0,0.0,7.0
4,NACC000144,1,1930,1.0,0,0.0,,1.0,0.0,0.0,...,0.0,8.0,,,,,8.0,8.0,8.0,8.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40686 entries, 0 to 40685
Data columns (total 43 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   NACCID    40686 non-null  object 
 1   SEX       40686 non-null  int64  
 2   BIRTHYR   40686 non-null  int64  
 3   NACCAPOE  40686 non-null  float64
 4   DEMENTED  40686 non-null  int64  
 5   CVHATT    29582 non-null  float64
 6   HATTMULT  7713 non-null   float64
 7   CVAFIB    29536 non-null  float64
 8   CVANGIO   29624 non-null  float64
 9   CVBYPASS  29633 non-null  float64
 10  CVPACDEF  7742 non-null   float64
 11  CVPACE    21901 non-null  float64
 12  CVCHF     29598 non-null  float64
 13  CVANGINA  7738 non-null   float64
 14  CVHVALVE  7738 non-null   float64
 15  CVOTHR    29536 non-null  float64
 16  CVOTHRX   3347 non-null   object 
 17  MYOINF    18764 non-null  float64
 18  CONGHRT   18764 non-null  float64
 19  AFIBRILL  18764 non-null  float64
 20  ANGINA    18764 non-null  fl

# 3. Handle missing / categorical data

In [6]:
if df['SEX'].dtype == 'object':
    le = LabelEncoder()
    df['SEX'] = le.fit_transform(df['SEX'])

# Convert age from days to years

In [7]:
# Assuming a current year for age calculation
current_year = 2023
df['age_years'] = current_year - df['BIRTHYR']

# Calculate BMI

In [8]:
# df['BMI'] = df['weight'] / ((df['height']/100)**2)
# The 'weight' and 'height' columns are not found in the dataset.
# Please ensure these columns exist or provide a way to derive them if BMI is required.

# Missing values

In [11]:
df.fillna(df.mean(numeric_only=True), inplace=True)

In [12]:
df.isnull().sum()

Unnamed: 0,0
NACCID,0
SEX,0
BIRTHYR,0
NACCAPOE,0
DEMENTED,0
CVHATT,0
HATTMULT,0
CVAFIB,0
CVANGIO,0
CVBYPASS,0


# 4. Features & Target

In [13]:
X = df.drop(['CVD', 'NACCID', 'CVOTHRX'], axis=1)
y = df['CVD']
# Convert y to binary (0 or 1) for classification
y = (y > 0).astype(int)

# 5. Train-Test Split

In [14]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42 )

# 6. Feature Scaling

In [15]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# 7. Model Building

# Logistic Regression

In [16]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

In [17]:
# Re-running Train-Test Split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42 )

In [18]:
# Re-running Feature Scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [19]:
# Re-running Logistic Regression model training
log_model = LogisticRegression()
log_model.fit(X_train, y_train)
y_pred_log = log_model.predict(X_test)

# Random Forest

In [20]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# 8. Evaluation

In [21]:
def evaluate_model(y_test, y_pred, model_name):
    print(f"=== {model_name} ===")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("\n")

evaluate_model(y_test, y_pred_log, "Logistic Regression")
evaluate_model(y_test, y_pred_rf, "Random Forest")

=== Logistic Regression ===
Accuracy: 0.9965593511919391
Confusion Matrix:
 [[3472    0]
 [  28 4638]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      3472
           1       1.00      0.99      1.00      4666

    accuracy                           1.00      8138
   macro avg       1.00      1.00      1.00      8138
weighted avg       1.00      1.00      1.00      8138



=== Random Forest ===
Accuracy: 0.9963135905627919
Confusion Matrix:
 [[3470    2]
 [  28 4638]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      3472
           1       1.00      0.99      1.00      4666

    accuracy                           1.00      8138
   macro avg       1.00      1.00      1.00      8138
weighted avg       1.00      1.00      1.00      8138





# 9. Save models

In [22]:
joblib.dump(log_model, "cvd_logistic_model.pkl")
joblib.dump(rf_model, "cvd_rf_model.pkl")
joblib.dump(sc, "cvd_scaler.pkl")

['cvd_scaler.pkl']

# Step 1: Required files

# Tumhare paas ye files already trained honi chahiye:

# cvd_rf_model.pkl

# cvd_scaler.pkl

# (yeh hum pehle bana chuke hain)

# Step 2: Enhanced Streamlit App Code

# install library

In [26]:
!pip install streamlit



# import libraries

In [27]:
import streamlit as st
import pandas as pd
import joblib

# Load Model & Scaler

In [29]:
model = joblib.load("cvd_rf_model.pkl")
scaler = joblib.load("cvd_scaler.pkl")

# App Title

In [31]:
st.set_page_config(page_title="CVD Prediction App", layout="centered")
st.title("‚ù§Ô∏è Cardiovascular Disease Prediction")
st.write("Fill patient details to check CVD risk")

2026-01-27 17:19:38.391 
  command:

    streamlit run /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py [ARGUMENTS]


# User Inputs

In [33]:
age = st.number_input("Age (years)", min_value=10, max_value=100, value=45)
gender = st.selectbox("Gender", ["Male", "Female"])
height = st.number_input("Height (cm)", min_value=120, max_value=220, value=170)
weight = st.number_input("Weight (kg)", min_value=30, max_value=200, value=70)

ap_hi = st.number_input("Systolic Blood Pressure", min_value=80, max_value=250, value=120)
ap_lo = st.number_input("Diastolic Blood Pressure", min_value=50, max_value=150, value=80)

cholesterol = st.selectbox("Cholesterol Level", ["Normal", "Above Normal", "High"])
gluc = st.selectbox("Glucose Level", ["Normal", "Above Normal", "High"])

smoke = st.selectbox("Smoking", ["No", "Yes"])
alco = st.selectbox("Alcohol Intake", ["No", "Yes"])
active = st.selectbox("Physically Active", ["No", "Yes"])

2026-01-27 17:20:33.182 Session state does not function when running a script without `streamlit run`


# Encode Inputs

In [34]:
gender = 1 if gender == "Male" else 2
cholesterol = {"Normal":1, "Above Normal":2, "High":3}[cholesterol]
gluc = {"Normal":1, "Above Normal":2, "High":3}[gluc]
smoke = 1 if smoke == "Yes" else 0
alco = 1 if alco == "Yes" else 0
active = 1 if active == "Yes" else 0

# Prediction Button

In [38]:
if st.button("üîç Predict CVD Risk"):
    # BMI calculation
    bmi = weight / ((height / 100) ** 2)

    # Create DataFrame
    input_data = pd.DataFrame([{
        'gender': gender,
        'height': height,
        'weight': weight,
        'ap_hi': ap_hi,
        'ap_lo': ap_lo,
        'cholesterol': cholesterol,
        'gluc': gluc,
        'smoke': smoke,
        'alco': alco,
        'active': active,
        'age_years': age,
        'BMI': bmi
    }])

    # Scale data
    input_scaled = scaler.transform(input_data)

    # Predict
    prediction = model.predict(input_scaled)[0]

    # Output
    st.subheader("Result")
    if prediction == 1:
        st.error("‚ö†Ô∏è HIGH Risk of Cardiovascular Disease")
    else:
        st.success("‚úÖ LOW Risk of Cardiovascular Disease")

    st.write(f"**Calculated BMI:** {bmi:.2f}")



In [47]:
!pip install pyngrok



# ‚ù§Ô∏è Cardiovascular Disease (CVD) Prediction App

This project is a **Machine Learning‚Äìbased web application** that predicts whether a patient is at **HIGH or LOW risk of Cardiovascular Disease (CVD)** based on medical and lifestyle data.

The application is built using:
- **Python**
- **Scikit-learn**
- **Gradio** (for online interactive interface)

---

## üìå Project Objective

The main objective of this project is to:
- Predict the presence of cardiovascular disease at an early stage
- Assist healthcare professionals in decision-making
- Demonstrate the practical use of Machine Learning in healthcare

---

## üß† Machine Learning Model

- **Algorithm Used:** Random Forest Classifier  
- **Problem Type:** Binary Classification  
  - `0` ‚Üí No CVD (Low Risk)  
  - `1` ‚Üí CVD Present (High Risk)

The model is trained on a cardiovascular dataset containing patient health records.

---

## üìä Input Features

The application takes the following inputs:

| Feature | Description |
|------|------------|
| Age | Age of the patient (years) |
| Gender | 1 = Male, 2 = Female |
| Height | Height in centimeters |
| Weight | Weight in kilograms |
| Systolic BP | Upper blood pressure |
| Diastolic BP | Lower blood pressure |
| Cholesterol | 1 = Normal, 2 = Above Normal, 3 = High |
| Glucose | 1 = Normal, 2 = Above Normal, 3 = High |
| Smoking | 0 = No, 1 = Yes |
| Alcohol | 0 = No, 1 = Yes |
| Physical Activity | 0 = No, 1 = Yes |

---

## ‚öôÔ∏è Feature Engineering

- **BMI (Body Mass Index)** is calculated automatically:
  


In [50]:
import gradio as gr
import pandas as pd
import joblib

# Load model & scaler
model = joblib.load("cvd_rf_model.pkl")
scaler = joblib.load("cvd_scaler.pkl")

def predict_cvd(age, gender, height, weight, ap_hi, ap_lo,
                cholesterol, gluc, smoke, alco, active):

    bmi = weight / ((height / 100) ** 2)

    data = pd.DataFrame([{
        'gender': gender,
        'height': height,
        'weight': weight,
        'ap_hi': ap_hi,
        'ap_lo': ap_lo,
        'cholesterol': cholesterol,
        'gluc': gluc,
        'smoke': smoke,
        'alco': alco,
        'active': active,
        'age_years': age,
        'BMI': bmi
    }])

    data_scaled = scaler.transform(data)
    prediction = model.predict(data_scaled)[0]

    if prediction == 1:
        return f"‚ö†Ô∏è HIGH Risk of CVD\nBMI: {bmi:.2f}"
    else:
        return f"‚úÖ LOW Risk of CVD\nBMI: {bmi:.2f}"

app = gr.Interface(
    fn=predict_cvd,
    inputs=[
        gr.Number(label="Age (years)"),
        gr.Radio([1,2], label="Gender (1=Male, 2=Female)"),
        gr.Number(label="Height (cm)"),
        gr.Number(label="Weight (kg)"),
        gr.Number(label="Systolic BP"),
        gr.Number(label="Diastolic BP"),
        gr.Radio([1,2,3], label="Cholesterol (1=Normal,2=Above,3=High)"),
        gr.Radio([1,2,3], label="Glucose (1=Normal,2=Above,3=High)"),
        gr.Radio([0,1], label="Smoking (0=No,1=Yes)"),
        gr.Radio([0,1], label="Alcohol (0=No,1=Yes)"),
        gr.Radio([0,1], label="Physically Active (0=No,1=Yes)")
    ],
    outputs="text",
    title="‚ù§Ô∏è Cardiovascular Disease Prediction App",
    description="Enter patient data to predict CVD risk"
)

app.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://64722b4aa5cb02db96.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


