In [None]:
import pandas as pd
df=pd.read_csv("/content/drive/MyDrive/Project/Cardiovascular_Cleaned.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308854 entries, 0 to 308853
Data columns (total 21 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Height_(cm)                   308854 non-null  int64  
 1   Weight_(kg)                   308854 non-null  float64
 2   BMI                           308854 non-null  float64
 3   Alcohol_Consumption           308854 non-null  int64  
 4   Fruit_Consumption             308854 non-null  int64  
 5   Green_Vegetables_Consumption  308854 non-null  int64  
 6   FriedPotato_Consumption       308854 non-null  int64  
 7   Age                           308854 non-null  int64  
 8   Checkup_Encoded               308854 non-null  int64  
 9   General_Health_Encoded        308854 non-null  int64  
 10  Exercise_Encoded              308854 non-null  int64  
 11  Heart_Disease_Encoded         308854 non-null  int64  
 12  Skin_Cancer_Encoded           308854 non-nul

In [None]:
df.Heart_Disease_Encoded.value_counts()


Unnamed: 0_level_0,count
Heart_Disease_Encoded,Unnamed: 1_level_1
0,283883
1,24971


In [None]:
median_diabetes = df['Diabetes_Encoded'].median()
df['Diabetes_Encoded'].fillna(median_diabetes, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Diabetes_Encoded'].fillna(median_diabetes, inplace=True)


In [None]:
from sklearn.model_selection import train_test_split

X = df.drop('Heart_Disease_Encoded', axis=1)
y = df['Heart_Disease_Encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (216197, 20)
Shape of X_test: (92657, 20)
Shape of y_train: (216197,)
Shape of y_test: (92657,)


In [None]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

print("Shape of X_resampled:", X_resampled.shape)
print("Shape of y_resampled:", y_resampled.shape)

Shape of X_resampled: (397434, 20)
Shape of y_resampled: (397434,)


In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_resampled_scaled = scaler.fit_transform(X_resampled)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for AdaBoost
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0]
}

# Initialize AdaBoost classifier
adaboost = AdaBoostClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(adaboost, param_grid, cv=5, scoring='recall', n_jobs=-1)

# Fit GridSearchCV on the resampled and scaled training data
grid_search.fit(X_resampled_scaled, y_resampled)

# Print the best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best recall score:", grid_search.best_score_)

Best parameters: {'learning_rate': 0.01, 'n_estimators': 50}
Best recall score: 0.8927116197211203


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Get the best AdaBoost model from GridSearchCV
best_adaboost_model = grid_search.best_estimator_

# Make predictions on the scaled test data
y_pred = best_adaboost_model.predict(X_test_scaled)

# Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Confusion Matrix:
 [[48153 37013]
 [ 1802  5689]]

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.57      0.71     85166
           1       0.13      0.76      0.23      7491

    accuracy                           0.58     92657
   macro avg       0.55      0.66      0.47     92657
weighted avg       0.90      0.58      0.67     92657



In [None]:
import pickle

# Save the best AdaBoost model to a pickle file
filename = 'best_adaboost_model.pkl'
pickle.dump(best_adaboost_model, open(filename, 'wb'))

print(f"Model saved to {filename}")

Model saved to best_adaboost_model.pkl


In [38]:
import pickle

# Assuming 'scaler' is the StandardScaler object you fitted on your training data
# For example, after this line in your training code:
# X_resampled_scaled = scaler.fit_transform(X_resampled)

# Save the scaler to a pickle file
filename = 'scaler.pkl'
pickle.dump(scaler, open(filename, 'wb'))

print(f"Scaler saved to {filename}")

Scaler saved to scaler.pkl


In [39]:
!pip freeze > requirements.txt

In [40]:
import streamlit as st
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load the trained model
try:
    with open('best_adaboost_model.pkl', 'rb') as f:
        model = pickle.load(f)
except FileNotFoundError:
    st.error("Model file 'best_adaboost_model.pkl' not found. Please ensure it's in the same directory.")
    st.stop() # Stop the app if the model file is not found

# Load the scaler used during training
try:
    with open('scaler.pkl', 'rb') as f:
        scaler = pickle.load(f)
except FileNotFoundError:
    st.error("Scaler file 'scaler.pkl' not found. Please ensure it's in the same directory.")
    st.stop()

# Define feature names - make sure these match the order used during training
# You can get this from the columns of your X_train or X_resampled DataFrame
feature_names = ['Height_(cm)', 'Weight_(kg)', 'BMI', 'Alcohol_Consumption',
                 'Fruit_Consumption', 'Green_Vegetables_Consumption',
                 'FriedPotato_Consumption', 'Age', 'Checkup_Encoded',
                 'General_Health_Encoded', 'Exercise_Encoded', 'Skin_Cancer_Encoded',
                 'Other_Cancer_Encoded', 'Depression_Encoded', 'Arthritis_Encoded',
                 'Diabetes_Encoded', 'Smoking_History_Encoded', 'Female', 'Male',
                 'BMI_Category']


st.title("Cardiovascular Disease Prediction")

st.write("""
Enter the patient's information below to predict the likelihood of heart disease.
""")

# Create input fields for each feature
# You'll need to adjust the min_value, max_value, and step for each input based on your data's range
height = st.number_input("Height (cm)", min_value=50, max_value=300, value=170)
weight = st.number_input("Weight (kg)", min_value=10.0, max_value=500.0, value=70.0, step=0.1)
bmi = st.number_input("BMI", min_value=10.0, max_value=100.0, value=25.0, step=0.1)
alcohol_consumption = st.number_input("Alcohol Consumption (drinks per week)", min_value=0, max_value=100, value=0)
fruit_consumption = st.number_input("Fruit Consumption (servings per day)", min_value=0, max_value=100, value=30)
green_vegetables_consumption = st.number_input("Green Vegetables Consumption (servings per day)", min_value=0, max_value=100, value=30)
fried_potato_consumption = st.number_input("Fried Potato Consumption (servings per week)", min_value=0, max_value=100, value=0)
age = st.number_input("Age", min_value=18, max_value=120, value=50)
checkup = st.selectbox("Last Checkup", options=[0, 1, 2, 3, 4], format_func=lambda x: ["Within past year", "Within past 2 years", "Within past 5 years", "5+ years ago", "Never"][x])
general_health = st.selectbox("General Health", options=[0, 1, 2, 3, 4], format_func=lambda x: ["Excellent", "Very Good", "Good", "Fair", "Poor"][x])
exercise = st.selectbox("Exercise", options=[0, 1], format_func=lambda x: ["No", "Yes"][x])
skin_cancer = st.selectbox("Skin Cancer", options=[0, 1], format_func=lambda x: ["No", "Yes"][x])
other_cancer = st.selectbox("Other Cancer", options=[0, 1], format_func=lambda x: ["No", "Yes"][x])
depression = st.selectbox("Depression", options=[0, 1], format_func=lambda x: ["No", "Yes"][x])
arthritis = st.selectbox("Arthritis", options=[0, 1], format_func=lambda x: ["No", "Yes"][x])
diabetes = st.selectbox("Diabetes", options=[0, 1], format_func=lambda x: ["No", "Yes"][x]) # Assuming 0 for no, 1 for yes based on your data info
smoking_history = st.selectbox("Smoking History", options=[0, 1], format_func=lambda x: ["No", "Yes"][x])
gender = st.selectbox("Gender", options=["Female", "Male"])

# Encode gender
female = 1 if gender == "Female" else 0
male = 1 if gender == "Male" else 0

# Calculate BMI Category based on BMI input
# Ensure these ranges match how you categorized BMI in your training data
if bmi < 18.5:
    bmi_category = 0  # Underweight
elif 18.5 <= bmi < 25:
    bmi_category = 1  # Healthy Weight
elif 25 <= bmi < 30:
    bmi_category = 2  # Overweight
else:
    bmi_category = 3  # Obese


# Create a button to make a prediction
if st.button("Predict"):
    # Prepare the input data as a pandas DataFrame
    user_input = pd.DataFrame([[height, weight, bmi, alcohol_consumption, fruit_consumption,
                                green_vegetables_consumption, fried_potato_consumption, age,
                                checkup, general_health, exercise, skin_cancer, other_cancer,
                                depression, arthritis, diabetes, smoking_history, female, male,
                                bmi_category]],
                              columns=feature_names) # Use the defined feature names

    # Scale the user input using the loaded scaler
    user_input_scaled = scaler.transform(user_input)

    # Make prediction
    prediction_proba = model.predict_proba(user_input_scaled)[:, 1] # Probability of Heart Disease (class 1)

    st.subheader("Prediction Result:")
    st.write(f"Probability of Heart Disease: **{prediction_proba[0]:.2f}**")

    # You can add conditional messages based on the probability
    if prediction_proba[0] > 0.5: # You can adjust this threshold
        st.warning("Based on the provided information, there is a higher likelihood of heart disease.")
    else:
        st.info("Based on the provided information, there is a lower likelihood of heart disease.")

