# Prediction of Cardiovascular Disease using Machine Learning Techniques

# Cardiovascular disease (CVD) is one of the leading causes of death worldwide. Early detection and preventive measures can save lives. This project uses machine learning models, Logistic Regression and Random Forest, to predict the likelihood of cardiovascular disease using patient health metrics such as age, gender, blood pressure, cholesterol, glucose levels, and lifestyle factors. The models are trained, evaluated, and deployed for predictive analytics

# import library

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib

# 2. Load dataset

In [3]:
df = pd.read_csv("/content/NACC_APOE_CVD_filtered (1).csv")

In [4]:
df.head()

Unnamed: 0,NACCID,SEX,BIRTHYR,NACCAPOE,DEMENTED,CVHATT,HATTMULT,CVAFIB,CVANGIO,CVBYPASS,...,STROKE,STROKIF,STROKDEC,STKIMAG,CVD,CVDIF,VASC,VASCIF,VASCPS,VASCPSIF
0,NACC000011,2,1944,1.0,0,0.0,,0.0,0.0,0.0,...,0.0,7.0,,,,,0.0,7.0,,
1,NACC000034,2,1935,4.0,0,0.0,8.0,0.0,0.0,0.0,...,,,8.0,8.0,0.0,7.0,,,,
2,NACC000067,1,1952,1.0,0,0.0,,0.0,0.0,0.0,...,0.0,7.0,,,,,0.0,7.0,0.0,7.0
3,NACC000095,1,1926,2.0,1,0.0,,0.0,0.0,0.0,...,0.0,7.0,,,,,0.0,7.0,0.0,7.0
4,NACC000144,1,1930,1.0,0,0.0,,1.0,0.0,0.0,...,0.0,8.0,,,,,8.0,8.0,8.0,8.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40686 entries, 0 to 40685
Data columns (total 43 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   NACCID    40686 non-null  object 
 1   SEX       40686 non-null  int64  
 2   BIRTHYR   40686 non-null  int64  
 3   NACCAPOE  40686 non-null  float64
 4   DEMENTED  40686 non-null  int64  
 5   CVHATT    29582 non-null  float64
 6   HATTMULT  7713 non-null   float64
 7   CVAFIB    29536 non-null  float64
 8   CVANGIO   29624 non-null  float64
 9   CVBYPASS  29633 non-null  float64
 10  CVPACDEF  7742 non-null   float64
 11  CVPACE    21901 non-null  float64
 12  CVCHF     29598 non-null  float64
 13  CVANGINA  7738 non-null   float64
 14  CVHVALVE  7738 non-null   float64
 15  CVOTHR    29536 non-null  float64
 16  CVOTHRX   3347 non-null   object 
 17  MYOINF    18764 non-null  float64
 18  CONGHRT   18764 non-null  float64
 19  AFIBRILL  18764 non-null  float64
 20  ANGINA    18764 non-null  fl

# 3. Handle missing / categorical data

In [7]:
if df['SEX'].dtype == 'object':
    le = LabelEncoder()
    df['SEX'] = le.fit_transform(df['SEX'])

# Convert age from days to years

In [35]:
# Assuming a current year for age calculation
current_year = 2023
df['age_years'] = current_year - df['BIRTHYR']

# Calculate BMI

In [44]:
# df['BMI'] = df['weight'] / ((df['height']/100)**2)
# The 'weight' and 'height' columns are not found in the dataset.
# Please ensure these columns exist or provide a way to derive them if BMI is required.

In [42]:
def predict_new_patient(data_dict):
    """
    Predicts cardiovascular disease risk for a new patient.

    Args:
        data_dict (dict): A dictionary containing patient's health metrics.
                          It must contain all the features the model was trained on,
                          which are the columns of the global `X` DataFrame.

                          Example required keys (and their types/ranges):
                          - 'SEX': int (1 or 2)
                          - 'BIRTHYR': int (e.g., 1950)
                          - 'NACCAPOE': float (e.g., 1.0)
                          - 'DEMENTED': int (0 or 1)
                          - 'CVHATT': float (e.g., 0.0)
                          - 'HATTMULT': float (e.g., 2.8, mean-filled)
                          - 'CVAFIB': float (e.g., 0.0)
                          - 'CVANGIO': float (e.g., 0.0)
                          - 'CVBYPASS': float (e.g., 0.0)
                          - 'CVPACDEF': float (e.g., 0.0)
                          - 'CVPACE': float (e.g., 0.0)
                          - 'CVCHF': float (e.g., 0.0)
                          - 'CVANGINA': float (e.g., 0.0)
                          - 'CVHVALVE': float (e.g., 0.0)
                          - 'CVOTHR': float (e.g., 0.0)
                          - 'MYOINF': float (e.g., 0.0)
                          - 'CONGHRT': float (e.g., 0.0)
                          - 'AFIBRILL': float (e.g., 0.0)
                          - 'ANGINA': float (e.g., 0.0)
                          - 'ANGIOCP': float (e.g., 0.0)
                          - 'ANGIOPCI': float (e.g., 0.0)
                          - 'PACEMAKE': float (e.g., 0.0)
                          - 'HVALVE': float (e.g., 0.0)
                          - 'CBSTROKE': float (e.g., 0.0)
                          - 'STROKMUL': float (e.g., 2.8, mean-filled)
                          - 'NACCSTYR': float (e.g., 2005.0)
                          - 'CBTIA': float (e.g., 0.0)
                          - 'TIAMULT': float (e.g., 2.8, mean-filled)
                          - 'NACCTIYR': float (e.g., 2005.0)
                          - 'HXSTROKE': float (e.g., 0.0)
                          - 'PREVSTK': float (e.g., 0.0)
                          - 'STROKE': float (e.g., 0.0)
                          - 'STROKIF': float (e.g., 7.1, mean-filled)
                          - 'STROKDEC': float (e.g., 0.0)
                          - 'STKIMAG': float (e.g., 0.0)
                          - 'CVDIF': float (e.g., 7.1, mean-filled)
                          - 'VASC': float (e.g., 0.0)
                          - 'VASCIF': float (e.g., 7.1, mean-filled)
                          - 'VASCPS': float (e.g., 0.0)
                          - 'VASCPSIF': float (e.g., 7.1, mean-filled)
                          - 'age_years': int (e.g., 60)
    Returns:
        str: "HIGH risk of CVD" or "LOW risk of CVD"
    """
    # Create DataFrame from the input dictionary
    df_new = pd.DataFrame([data_dict])

    # Ensure the DataFrame has the exact same columns and order as X used for training
    # Any missing columns in data_dict will be filled with NaN (or a specified fill_value),
    # but it's best practice for data_dict to provide all necessary features.
    df_new = df_new.reindex(columns=X.columns, fill_value=0) # Using 0 as a default for missing features, adapt if needed.

    # Scale the new data using the fitted scaler
    df_new_scaled = sc.transform(df_new)

    # Predict using the Random Forest model
    pred = rf_model.predict(df_new_scaled)[0]

    return "HIGH risk of CVD" if pred == 1 else "LOW risk of CVD"

In [43]:
new_patient = {
    'SEX': 1, 'BIRTHYR': 1965, 'NACCAPOE': 2.0, 'DEMENTED': 0, 'CVHATT': 0.0,
    'HATTMULT': 2.801387, 'CVAFIB': 0.0, 'CVANGIO': 0.0, 'CVBYPASS': 0.0,
    'CVPACDEF': 0.0, 'CVPACE': 0.0, 'CVCHF': 0.0, 'CVANGINA': 0.0,
    'CVHVALVE': 0.0, 'CVOTHR': 0.0, 'MYOINF': 0.0, 'CONGHRT': 0.0,
    'AFIBRILL': 0.0, 'ANGINA': 0.0, 'ANGIOCP': 0.0, 'ANGIOPCI': 0.0,
    'PACEMAKE': 0.0, 'HVALVE': 0.0, 'CBSTROKE': 0.0, 'STROKMUL': 2.801387,
    'NACCSTYR': 2005.0, 'CBTIA': 0.0, 'TIAMULT': 2.801387, 'NACCTIYR': 2005.0,
    'HXSTROKE': 0.0, 'PREVSTK': 0.0, 'STROKE': 0.0, 'STROKIF': 7.128011,
    'STROKDEC': 0.0, 'STKIMAG': 0.0, 'CVDIF': 7.128011, 'VASC': 0.0,
    'VASCIF': 7.128011, 'VASCPS': 0.0, 'VASCPSIF': 7.128011, 'age_years': 58
}
print("Prediction for new patient:", predict_new_patient(new_patient))

Prediction for new patient: HIGH risk of CVD


# Drop original age column if desired

In [37]:
# df.drop('age', axis=1, inplace=True)
# The 'age' column was not found in the dataset.
# The 'age_years' column has been created from 'BIRTHYR'.
# If you intended to drop 'BIRTHYR', please modify this line.

# Encode categorical columns if needed

In [38]:
# categorical_cols = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']
# for col in categorical_cols:
#     if col in df.columns and df[col].dtype == 'object':
#         le = LabelEncoder()
#         df[col] = le.fit_transform(df[col])
# The columns 'gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active'
# were not found in the dataset. The 'SEX' column (representing gender)
# was already checked for encoding in a previous step.

# Missing values

In [9]:
df.fillna(df.mean(numeric_only=True), inplace=True)

In [11]:
df.isnull().sum()

Unnamed: 0,0
NACCID,0
SEX,0
BIRTHYR,0
NACCAPOE,0
DEMENTED,0
CVHATT,0
HATTMULT,0
CVAFIB,0
CVANGIO,0
CVBYPASS,0


# 4. Features & Target

In [22]:
X = df.drop(['CVD', 'NACCID', 'CVOTHRX'], axis=1)
y = df['CVD']
# Convert y to binary (0 or 1) for classification
y = (y > 0).astype(int)

# 5. Train-Test Split

In [18]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42 )

# 6. Feature Scaling

In [19]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# 7. Model Building

# Logistic Regression

In [28]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

In [25]:
# Re-running Train-Test Split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42 )

In [26]:
# Re-running Feature Scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [27]:
# Re-running Logistic Regression model training
log_model = LogisticRegression()
log_model.fit(X_train, y_train)
y_pred_log = log_model.predict(X_test)

# Random Forest

In [29]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# 8. Evaluation

In [30]:
def evaluate_model(y_test, y_pred, model_name):
    print(f"=== {model_name} ===")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("\n")

evaluate_model(y_test, y_pred_log, "Logistic Regression")
evaluate_model(y_test, y_pred_rf, "Random Forest")

=== Logistic Regression ===
Accuracy: 0.9965593511919391
Confusion Matrix:
 [[3472    0]
 [  28 4638]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      3472
           1       1.00      0.99      1.00      4666

    accuracy                           1.00      8138
   macro avg       1.00      1.00      1.00      8138
weighted avg       1.00      1.00      1.00      8138



=== Random Forest ===
Accuracy: 0.9961907102482183
Confusion Matrix:
 [[3469    3]
 [  28 4638]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      3472
           1       1.00      0.99      1.00      4666

    accuracy                           1.00      8138
   macro avg       1.00      1.00      1.00      8138
weighted avg       1.00      1.00      1.00      8138





# 9. Save models

In [32]:
joblib.dump(log_model, "cvd_logistic_model.pkl")
joblib.dump(rf_model, "cvd_rf_model.pkl")
joblib.dump(sc, "cvd_scaler.pkl")

['cvd_scaler.pkl']

# 11. Prediction for New Patient

In [40]:
def predict_new_patient(data_dict):
    """
    Predicts cardiovascular disease risk for a new patient.

    Args:
        data_dict (dict): A dictionary containing patient's health metrics.
                          It must contain all the features the model was trained on,
                          which are the columns of the global `X` DataFrame.

                          Example required keys (and their types/ranges):
                          - 'SEX': int (1 or 2)
                          - 'BIRTHYR': int (e.g., 1950)
                          - 'NACCAPOE': float (e.g., 1.0)
                          - 'DEMENTED': int (0 or 1)
                          - 'CVHATT': float (e.g., 0.0)
                          - 'HATTMULT': float (e.g., 2.8, mean-filled)
                          - 'CVAFIB': float (e.g., 0.0)
                          - 'CVANGIO': float (e.g., 0.0)
                          - 'CVBYPASS': float (e.g., 0.0)
                          - 'CVPACDEF': float (e.g., 0.0)
                          - 'CVPACE': float (e.g., 0.0)
                          - 'CVCHF': float (e.g., 0.0)
                          - 'CVANGINA': float (e.g., 0.0)
                          - 'CVHVALVE': float (e.g., 0.0)
                          - 'CVOTHR': float (e.g., 0.0)
                          - 'MYOINF': float (e.g., 0.0)
                          - 'CONGHRT': float (e.g., 0.0)
                          - 'AFIBRILL': float (e.g., 0.0)
                          - 'ANGINA': float (e.g., 0.0)
                          - 'ANGIOCP': float (e.g., 0.0)
                          - 'ANGIOPCI': float (e.g., 0.0)
                          - 'PACEMAKE': float (e.g., 0.0)
                          - 'HVALVE': float (e.g., 0.0)
                          - 'CBSTROKE': float (e.g., 0.0)
                          - 'STROKMUL': float (e.g., 2.8, mean-filled)
                          - 'NACCSTYR': float (e.g., 2005.0)
                          - 'CBTIA': float (e.g., 0.0)
                          - 'TIAMULT': float (e.g., 2.8, mean-filled)
                          - 'NACCTIYR': float (e.g., 2005.0)
                          - 'HXSTROKE': float (e.g., 0.0)
                          - 'PREVSTK': float (e.g., 0.0)
                          - 'STROKE': float (e.g., 0.0)
                          - 'STROKIF': float (e.g., 7.1, mean-filled)
                          - 'STROKDEC': float (e.g., 0.0)
                          - 'STKIMAG': float (e.g., 0.0)
                          - 'CVDIF': float (e.g., 7.1, mean-filled)
                          - 'VASC': float (e.g., 0.0)
                          - 'VASCIF': float (e.g., 7.1, mean-filled)
                          - 'VASCPS': float (e.g., 0.0)
                          - 'VASCPSIF': float (e.g., 7.1, mean-filled)
                          - 'age_years': int (e.g., 60)
    Returns:
        str: "HIGH risk of CVD" or "LOW risk of CVD"
    """
    # Create DataFrame from the input dictionary
    df_new = pd.DataFrame([data_dict])

    # Ensure the DataFrame has the exact same columns and order as X used for training
    # Any missing columns in data_dict will be filled with NaN (or a specified fill_value),
    # but it's best practice for data_dict to provide all necessary features.
    df_new = df_new.reindex(columns=X.columns, fill_value=0) # Using 0 as a default for missing features, adapt if needed.

    # Scale the new data using the fitted scaler
    df_new_scaled = sc.transform(df_new)

    # Predict using the Random Forest model
    pred = rf_model.predict(df_new_scaled)[0]

    return "HIGH risk of CVD" if pred == 1 else "LOW risk of CVD"

# Example usage

In [41]:
new_patient = {
    'SEX': 1, 'BIRTHYR': 1965, 'NACCAPOE': 2.0, 'DEMENTED': 0, 'CVHATT': 0.0,
    'HATTMULT': 2.801387, 'CVAFIB': 0.0, 'CVANGIO': 0.0, 'CVBYPASS': 0.0,
    'CVPACDEF': 0.0, 'CVPACE': 0.0, 'CVCHF': 0.0, 'CVANGINA': 0.0,
    'CVHVALVE': 0.0, 'CVOTHR': 0.0, 'MYOINF': 0.0, 'CONGHRT': 0.0,
    'AFIBRILL': 0.0, 'ANGINA': 0.0, 'ANGIOCP': 0.0, 'ANGIOPCI': 0.0,
    'PACEMAKE': 0.0, 'HVALVE': 0.0, 'CBSTROKE': 0.0, 'STROKMUL': 2.801387,
    'NACCSTYR': 2005.0, 'CBTIA': 0.0, 'TIAMULT': 2.801387, 'NACCTIYR': 2005.0,
    'HXSTROKE': 0.0, 'PREVSTK': 0.0, 'STROKE': 0.0, 'STROKIF': 7.128011,
    'STROKDEC': 0.0, 'STKIMAG': 0.0, 'CVDIF': 7.128011, 'VASC': 0.0,
    'VASCIF': 7.128011, 'VASCPS': 0.0, 'VASCPSIF': 7.128011, 'age_years': 58
}
print("Prediction for new patient:", predict_new_patient(new_patient))

Prediction for new patient: HIGH risk of CVD


In [45]:
"LOW risk of CVD"  # ya
"HIGH risk of CVD"


'HIGH risk of CVD'

# New Patient Data Format