In [5]:
## For some arithmetic and Matrix Operations
import numpy as np

## Dataframe Manipulation
import pandas as pd

## For Visualization
import matplotlib.pyplot as plt

## For Visualization too
#import seaborn as sns

## Creating Pipeline
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline


## Creating a function transformer
from sklearn.preprocessing import FunctionTransformer

## For Column Transformer
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector


## For preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

## For missing values
from sklearn.impute import SimpleImputer

## Getting the recall score on our train set
from sklearn.metrics import recall_score

## Getting the accuracy score on train set
from sklearn.metrics import accuracy_score

## Getting the classification report from our train set
from sklearn.metrics import classification_report

## Cross validation
from sklearn.model_selection import cross_val_score

## Gridsearch CV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV




## Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB


## preprocessing

In [6]:
df = pd.read_csv('CVD_cleaned.csv')
def head(df,shape_only=False):
    print(df.shape)

    if shape_only:
        return
    else:
        return df.head()
## Viewing the dataframe and shape
head(df,shape_only=False) 

(308854, 19)


Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150.0,32.66,14.54,Yes,0.0,30.0,16.0,12.0
1,Very Good,Within the past year,No,Yes,No,No,No,Yes,No,Female,70-74,165.0,77.11,28.29,No,0.0,30.0,0.0,4.0
2,Very Good,Within the past year,Yes,No,No,No,No,Yes,No,Female,60-64,163.0,88.45,33.47,No,4.0,12.0,3.0,16.0
3,Poor,Within the past year,Yes,Yes,No,No,No,Yes,No,Male,75-79,180.0,93.44,28.73,No,0.0,30.0,30.0,8.0
4,Good,Within the past year,No,No,No,No,No,No,No,Male,80+,191.0,88.45,24.37,Yes,0.0,8.0,4.0,0.0


In [7]:
## Creating numerical and categorical columns
numerical = df.select_dtypes(include=['float64']).columns.sort_values()
categorical = df.select_dtypes(include=['object']).columns.sort_values()

# Printing the length and names of categorical and numerical columns
print(f'There are {len(categorical)} Categorical variables:')
print(categorical.tolist())  # Print the names of categorical columns

print(f'There are {len(numerical)} Numerical variables:')
print(numerical.tolist())  # Print the names of numerical columns

There are 12 Categorical variables:
['Age_Category', 'Arthritis', 'Checkup', 'Depression', 'Diabetes', 'Exercise', 'General_Health', 'Heart_Disease', 'Other_Cancer', 'Sex', 'Skin_Cancer', 'Smoking_History']
There are 7 Numerical variables:
['Alcohol_Consumption', 'BMI', 'FriedPotato_Consumption', 'Fruit_Consumption', 'Green_Vegetables_Consumption', 'Height_(cm)', 'Weight_(kg)']


In [8]:
# Check the unique values in the Checkup column
print(df['Checkup'].unique())


['Within the past 2 years' 'Within the past year' '5 or more years ago'
 'Within the past 5 years' 'Never']


In [9]:
# Normalize the Checkup column to ensure it matches the mapping keys
df['Checkup'] = df['Checkup'].str.strip().str.lower()

# Update the mapping to match the normalized format
checkup_mapping = {
    "within the past year": 1,
    "within the past 2 years": 2,
    "within the past 5 years": 3,
    "5 or more years ago": 4,
    "never": 5
}

# Apply the mapping again
df['Checkup'] = df['Checkup'].map(checkup_mapping)

# Verify the changes
print(df[['Checkup']].head())


   Checkup
0        2
1        1
2        1
3        1
4        1


In [10]:
# Check if there are any unmapped values
unmapped_values = df[df['Checkup'].isna()]['Checkup'].unique()
print(unmapped_values)


[]


In [11]:
# Save the updated DataFrame to a new CSV file
df.to_csv('CVD_cleaned_mapped.csv', index=False)


In [12]:
yes_no_mapping = {
    "Yes": 1,
    "No": 0
}


In [13]:
sex_mapping = {
    "Male": 1,
    "Female": 0
}


In [14]:
# Create a mapping for the Diabetes column
diabetes_mapping = {
    "No": 0.0,
    "Yes": 1.0,
    "No, pre-diabetes or borderline diabetes": 0.5,
    "Yes, but female told only during pregnancy": 0.7
}

# Apply the mapping
df['Diabetes'] = df['Diabetes'].map(diabetes_mapping)

# Verify the changes
print(df['Diabetes'].head())


0    0.0
1    1.0
2    1.0
3    1.0
4    0.0
Name: Diabetes, dtype: float64


In [15]:
# Create a mapping for the Diabetes column
general_mapping = {
    "Very Good": 1,
    "Good": 2,
    "Excellent": 3,
    "Fair": 4,
    "Poor":5
}

# Apply the mapping
df['General_Health'] = df['General_Health'].map(general_mapping)

# Verify the changes
print(df['General_Health'].head())


0    5
1    1
2    1
3    5
4    2
Name: General_Health, dtype: int64


In [16]:
# Check the unique values in the Checkup column
print(df['Age_Category'].unique())


['70-74' '60-64' '75-79' '80+' '65-69' '50-54' '45-49' '18-24' '30-34'
 '55-59' '35-39' '40-44' '25-29']


In [17]:
# Create a mapping for the Diabetes column
age_mapping = {
    "65-69": 1,
    "60-64": 2,
    "70-74": 3,
    "55-59": 4,
    "50-54":5,
    "75-79":6,
    "80+":7,
    "45-49":8,
    "18-24":9,
    "30-34":10,
    "35-39":11,
    "40-44":12,
    "25-29":13

}

# Apply the mapping
df['Age_Category'] = df['Age_Category'].map(age_mapping)

# Verify the changes
print(df['Age_Category'].head())


0    3
1    3
2    2
3    6
4    7
Name: Age_Category, dtype: int64


In [18]:
# Apply the Yes/No mapping to relevant columns
yes_no_columns = ['Arthritis', 'Depression', 'Exercise', 'Heart_Disease', 
                  'Other_Cancer', 'Skin_Cancer', 'Smoking_History']

for column in yes_no_columns:
    df[column] = df[column].map(yes_no_mapping)

# Apply the Male/Female mapping to the 'Sex' column
df['Sex'] = df['Sex'].map(sex_mapping)

# Verify the changes
print(df[yes_no_columns + ['Sex']].head())


   Arthritis  Depression  Exercise  Heart_Disease  Other_Cancer  Skin_Cancer  \
0          1           0         0              0             0            0   
1          0           0         0              1             0            0   
2          0           0         1              0             0            0   
3          0           0         1              1             0            0   
4          0           0         0              0             0            0   

   Smoking_History  Sex  
0                1    0  
1                0    0  
2                0    0  
3                0    1  
4                1    1  


In [19]:
df.to_csv('CVD_cleaned_mapped_full.csv', index=False)


In [20]:
# Check for missing values in the DataFrame
missing_values = df.isnull().sum()
print(missing_values)


General_Health                  0
Checkup                         0
Exercise                        0
Heart_Disease                   0
Skin_Cancer                     0
Other_Cancer                    0
Depression                      0
Diabetes                        0
Arthritis                       0
Sex                             0
Age_Category                    0
Height_(cm)                     0
Weight_(kg)                     0
BMI                             0
Smoking_History                 0
Alcohol_Consumption             0
Fruit_Consumption               0
Green_Vegetables_Consumption    0
FriedPotato_Consumption         0
dtype: int64


In [21]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import joblib

# Separate the input features and target column
input_features = df.drop(columns=['Heart_Disease'])  # Replace with actual target column
target = df['Heart_Disease']

# Fit the scaler only on the input features
scaler = StandardScaler()
input_features_scaled = pd.DataFrame(scaler.fit_transform(input_features), columns=input_features.columns)

# Add the target column back to the scaled dataframe
df_scaled = pd.concat([input_features_scaled, target.reset_index(drop=True)], axis=1)

# Save the scaler for use in Gradio
joblib.dump(scaler, 'scaler.joblib')

# Save the scaled data (with target column included)
df_scaled.to_csv('CVD_cleaned_scaled_standard.csv', index=False)

# Now load the scaled data
df_scaled = pd.read_csv('CVD_cleaned_scaled_standard.csv')

# Define features and target for the model
X = df_scaled.drop(columns=['Heart_Disease'])  # Features
y = df_scaled['Heart_Disease']  # Target


In [22]:
# Check class distribution
print(df['Heart_Disease'].value_counts())


Heart_Disease
0    283883
1     24971
Name: count, dtype: int64


In [23]:
from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score


# Ensure the target column contains integers
y_train = y_train.astype(int)
y_test = y_test.astype(int)



model = LogisticRegression(class_weight='balanced')


# Train the model again
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred))

Accuracy: 0.7187838953554256
              precision    recall  f1-score   support

           0       0.97      0.72      0.82     56774
           1       0.18      0.71      0.29      4997

    accuracy                           0.72     61771
   macro avg       0.57      0.72      0.56     61771
weighted avg       0.90      0.72      0.78     61771



In [25]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)


[[40835 15939]
 [ 1432  3565]]


In [26]:
from joblib import dump, load

# Save the trained model
dump(model, 'heart_disease_model.joblib')


['heart_disease_model.joblib']

## Find the threshold with the best trade-off 

In [27]:
from sklearn.metrics import precision_recall_curve
import numpy as np

# Predict probabilities
y_proba = model.predict_proba(X_test)[:, 1]

# Compute precision-recall pairs for different probability thresholds
precision, recall, thresholds = precision_recall_curve(y_test, y_proba, pos_label=3)

# Find the threshold with the best trade-off (you can adjust this as needed)
best_threshold = thresholds[np.argmax(recall - (1 - precision))]
print(f'Best Threshold: {best_threshold}')

# Apply the threshold to make new predictions
y_pred_new = (y_proba >= best_threshold).astype(int)

# Evaluate the model with the new threshold
from sklearn.metrics import accuracy_score, classification_report

print(f'Accuracy: {accuracy_score(y_test, y_pred_new)}')
print(classification_report(y_test, y_pred_new))


Best Threshold: 0.011444575856354031
Accuracy: 0.08089556588042933
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     56774
           1       0.08      1.00      0.15      4997

    accuracy                           0.08     61771
   macro avg       0.04      0.50      0.07     61771
weighted avg       0.01      0.08      0.01     61771



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Find the best threshold based on the F1 score

In [28]:
from sklearn.metrics import f1_score

# Compute precision-recall pairs
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)

# Compute F1 score for each threshold
f1_scores = 2 * (precision * recall) / (precision + recall)

# Find the best threshold based on the F1 score
best_threshold = thresholds[np.argmax(f1_scores)]
print(f'Best Threshold based on F1-score: {best_threshold}')

# Apply the new threshold
y_pred_new = (y_proba >= best_threshold).astype(int)

# Evaluate the model with the new threshold
print(f'Accuracy: {accuracy_score(y_test, y_pred_new)}')
print(classification_report(y_test, y_pred_new))


Best Threshold based on F1-score: 0.6932249294561542
Accuracy: 0.8536691975198718
              precision    recall  f1-score   support

           0       0.95      0.89      0.92     56774
           1       0.26      0.44      0.33      4997

    accuracy                           0.85     61771
   macro avg       0.60      0.67      0.62     61771
weighted avg       0.89      0.85      0.87     61771



## decision tree model

In [29]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

# Ensure the target column contains integers
y_train = y_train.astype(int)
y_test = y_test.astype(int)

# Initialize the Decision Tree Classifier
model1 = DecisionTreeClassifier(class_weight='balanced', random_state=42)

# Train the model
model1.fit(X_train, y_train)

# Make predictions
y_pred = model1.predict(X_test)

# Evaluate the model
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred))


Accuracy: 0.8731929222450665
              precision    recall  f1-score   support

           0       0.93      0.93      0.93     56774
           1       0.20      0.19      0.20      4997

    accuracy                           0.87     61771
   macro avg       0.57      0.56      0.56     61771
weighted avg       0.87      0.87      0.87     61771



In [30]:
from joblib import dump, load

# Save the trained model
dump(model1, 'heart_disease_modelDT.joblib')


['heart_disease_modelDT.joblib']

## scalling

In [31]:
# Ensure the target columns are integers
y_train = y_train.astype(int)
y_test = y_test.astype(int)

# 2. Initialize and fit the scaler on the training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Save the scaler for later use
dump(scaler, 'scaler.joblib')

# Transform the test data using the fitted scaler
X_test_scaled = scaler.transform(X_test)

# 3. Train the Decision Tree model
model = DecisionTreeClassifier(class_weight='balanced', random_state=42)
model.fit(X_train_scaled, y_train)

# Save the trained Decision Tree model
dump(model, 'heart_disease_modelDT.joblib')

# 4. Evaluate the model
y_pred = model.predict(X_test_scaled)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(classification_report(y_test, y_pred))


Accuracy: 0.8732091110715384
              precision    recall  f1-score   support

           0       0.93      0.93      0.93     56774
           1       0.20      0.19      0.20      4997

    accuracy                           0.87     61771
   macro avg       0.57      0.56      0.56     61771
weighted avg       0.87      0.87      0.87     61771



# gradio app

In [32]:
import numpy as np
import gradio as gr
from joblib import load

# Load the trained Decision Tree model
model = load('heart_disease_modelDT.joblib')
scaler = load('scaler.joblib')  # Load the saved scaler

# Optimized threshold based on F1-score
best_threshold = 0.5  # You can adjust this threshold as needed

def preprocess_input(
    general_health,
    checkup,                
    exercise,               
    sex,
    age_category,
    height_cm,
    weight_kg,
    bmi,
    smoking_history,
    alcohol_consumption,
    fruit_consumption,
    green_vegetables_consumption,
    fried_potato_consumption,
    arthritis,
    depression,
    diabetes,
    other_cancer,
    skin_cancer
):
    
    # Correct mappings for each categorical input
    general_health_mapping = {"Excellent": 3, "Very Good": 1, "Good": 2, "Fair": 4, "Poor": 5}
    yes_no_mapping = {"Yes": 1, "No": 0}
    sex_mapping = {"Male": 1, "Female": 0}
    smoking_history_mapping = {"Never Smoked": 0, "Former Smoker": 1, "Current Smoker": 2}
    alcohol_mapping = {"Never": 0, "Occasionally": 1, "Regularly": 2}
    checkup_mapping = {
        "Within the past year": 1,
        "Within the past 2 years": 2,
        "Within the past 5 years": 3,
        "5 or more years ago": 4,
        "Never": 5
    }

    # Apply mappings
    general_health = general_health_mapping[general_health]
    exercise = yes_no_mapping[exercise]
    sex = sex_mapping[sex]
    smoking_history = smoking_history_mapping[smoking_history]
    alcohol_consumption = alcohol_mapping[alcohol_consumption]
    checkup = checkup_mapping[checkup]
    arthritis = yes_no_mapping[arthritis]
    depression = yes_no_mapping[depression]
    diabetes = yes_no_mapping[diabetes]
    other_cancer = yes_no_mapping[other_cancer]
    skin_cancer = yes_no_mapping[skin_cancer]

    # Combine into a single feature array (with 19 features as expected)
    input_data = np.array([[
        general_health, exercise, sex, age_category, height_cm, weight_kg, bmi, smoking_history,
        alcohol_consumption, fruit_consumption, green_vegetables_consumption, fried_potato_consumption,
        arthritis, checkup, depression, diabetes, other_cancer, skin_cancer
    ]])

    # Debug: Print the shape of input data
    print(f"Input Data Shape (before scaling): {input_data.shape}")

    # Load the scaler and scale the input data
    scaler = load('scaler.joblib')
    input_data_scaled = scaler.transform(input_data)

    print(f"Input Data Scaled: {input_data_scaled}")
    return input_data_scaled

def predict_health_condition(
    general_health,
    checkup,
    exercise,
    sex,
    age_category,
    height_cm,
    weight_kg,
    bmi,
    smoking_history,
    alcohol_consumption,
    fruit_consumption,
    green_vegetables_consumption,
    fried_potato_consumption,
    arthritis,
    depression,
    diabetes,
    other_cancer,
    skin_cancer
):
    # Preprocess and scale the input
    input_data_scaled = preprocess_input(
        general_health, checkup, exercise, sex, age_category, height_cm, weight_kg, bmi,
        smoking_history, alcohol_consumption, fruit_consumption, green_vegetables_consumption,
        fried_potato_consumption, arthritis, depression, diabetes, other_cancer, skin_cancer
    )

    # Make prediction
    probability = model.predict_proba(input_data_scaled)[0, 1]  # Get the probability of having the disease
    print(f"Raw Model Probability: {probability}")
    # Apply the optimized threshold
    prediction = (probability >= best_threshold).astype(int)

    # Determine risk level based on probability
    risk_level = "High Risk" if prediction == 1 else "Low Risk"

    # Debug output
    print(f"Model Prediction: {prediction}")
    print(f"Model Probability: {probability}")

    return f"{risk_level} with a {probability:.2%} chance of having heart disease."

# Create the Gradio interface
def create_gradio_interface():
    with gr.Blocks() as demo:
        gr.Markdown("# Health Monitoring System")
        gr.Markdown("Enter your health details to get insights into potential health risks.")

        with gr.Row():
            general_health = gr.Dropdown(
                choices=["Excellent", "Very Good", "Good", "Fair", "Poor"],
                label="General Health"
            )
            checkup = gr.Dropdown(
                choices=[
                    "Within the past year",
                    "Within the past 2 years",
                    "Within the past 5 years",
                    "5 or more years ago",
                    "Never"
                ],
                label="Regular Medical Checkup"
            )

        with gr.Row():
            exercise = gr.Radio(
                choices=["Yes", "No"],
                label="Regular Exercise"
            )
            sex = gr.Radio(
                choices=["Male", "Female"],
                label="Sex"
            )
            age_category = gr.Slider(
                minimum=0,
                maximum=100,
                step=1,
                label="Age"
            )

        with gr.Row():
            height_cm = gr.Number(label="Height (cm)", value=170)
            weight_kg = gr.Number(label="Weight (kg)", value=70)
            bmi = gr.Number(label="BMI", value=25)

        with gr.Row():
            smoking_history = gr.Dropdown(
                choices=["Never Smoked", "Former Smoker", "Current Smoker"],
                label="Smoking History"
            )
            alcohol_consumption = gr.Dropdown(
                choices=["Never", "Occasionally", "Regularly"],
                label="Alcohol Consumption"
            )

        with gr.Row():
            fruit_consumption = gr.Slider(
                minimum=0,
                maximum=14,
                step=1,
                label="Fruit Consumption (per week)"
            )
            green_vegetables_consumption = gr.Slider(
                minimum=0,
                maximum=14,
                step=1,
                label="Green Vegetables Consumption (per week)"
            )
            fried_potato_consumption = gr.Slider(
                minimum=0,
                maximum=14,
                step=1,
                label="Fried Potato Consumption (per week)"
            )

        with gr.Row():
            arthritis = gr.Radio(
                choices=["Yes", "No"],
                label="Arthritis"
            )
            depression = gr.Radio(
                choices=["Yes", "No"],
                label="Depression"
            )
            diabetes = gr.Radio(
                choices=["Yes", "No"],
                label="Diabetes"
            )
            other_cancer = gr.Radio(
                choices=["Yes", "No"],
                label="Other Cancer"
            )
            skin_cancer = gr.Radio(
                choices=["Yes", "No"],
                label="Skin Cancer"
            )

        # Define the predict button and output text box
        predict_button = gr.Button("Predict Health Risk")
        output = gr.Textbox(label="Prediction")

        # The predict_button.click function goes here
        predict_button.click(
            fn=predict_health_condition,
            inputs=[
                general_health,
                checkup,
                exercise,
                sex,
                age_category,
                height_cm,
                weight_kg,
                bmi,
                smoking_history,
                alcohol_consumption,
                fruit_consumption,
                green_vegetables_consumption,
                fried_potato_consumption,
                arthritis,
                depression,
                diabetes,
                other_cancer,
                skin_cancer
            ],
            outputs=output
        )

    return demo

# Launch the Gradio interface
if __name__ == "__main__":
    interface = create_gradio_interface()
    interface.launch(share=True)


* Running on local URL:  http://127.0.0.1:7863
* Running on public URL: https://b660b30d0f42eca02d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Input Data Shape (before scaling): (1, 18)
Input Data Scaled: [[ 5.00227274e+00 -5.68978468e-04  1.00021401e+00  8.28760331e+01
   1.70007592e+02  8.99818755e+01  2.89735217e+01  1.99965162e+00
   1.99892418e+00 -7.09308990e-04 -1.67111278e-03  8.97760678e+00
   9.96082079e-01  5.00097276e+00  9.99639787e-01  1.00066170e+00
   1.00068271e+00  1.00502356e+00]]
Raw Model Probability: 0.0
Model Prediction: 0
Model Probability: 0.0


