<a href="https://colab.research.google.com/github/vincewang7/BTC2120_FinalProject/blob/main/BTC2120_Final_Report.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing Libraries

In [None]:
## Module Import
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
import seaborn as sns
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, mean_squared_error, r2_score, roc_curve
from sklearn.preprocessing import StandardScaler, scale, OrdinalEncoder, LabelEncoder
import sklearn.linear_model as skl_lm
import statsmodels.api as sm
import statsmodels.formula.api as smf

import warnings
warnings.filterwarnings('ignore')

Importing Data

In [None]:
# Load the cleaned cardiovascular-disease survey data from the working directory
cvd_df = pd.read_csv('CVD_cleaned.csv')

# Move the dependent variable to the last position: pop() removes the column,
# and assigning it back appends it after every remaining column
target_column = cvd_df.pop('Heart_Disease')
cvd_df['Heart_Disease'] = target_column

# Preview the first rows
cvd_df.head()

Exploratory Data Analysis and Visualization

In [None]:
# Descriptive statistics of the dataframe as loaded, before the encoding cell below runs.
# (pandas' describe() summarises only the numeric columns of a mixed-type frame by default.)
cvd_df.describe()

Encoding categorical values as numerical values

*   Exercise: 1 = Yes; 0 = No
*   Skin_Cancer: 1 = Yes; 0 = No
*   Other_Cancer: 1 = Yes; 0 = No
*   Depression: 1 = Yes; 0 = No
*   Arthritis: 1 = Yes; 0 = No
*   Sex: 1 = Male; 0 = Female
*   Age_Category: 12 = 80+; 11 = 75-79; 10 = 70-74; 9 = 65-69; 8 = 60-64; 7 = 55-59; 6 = 50-54; 5 = 45-49; 4 = 40-44; 3 = 35-39; 2 = 30-34; 1 = 25-29; 0 = 18-24
*   Smoking_History: 1 = Yes; 0 = No
*   Heart_Disease: 1 = Yes; 0 = No
*   Diabetes: 1 = Yes; 0 = No

Checkup was coded as 5 levels:
*    Never: 0
*    5 or more years ago: 1
*    Within the past 5 years: 2
*    Within the past 2 years: 3
*    Within the past year: 4
        
        
General_Health was coded as 5 levels
*   Poor = 0
*   Fair = 1
*   Good = 2
*   Very Good = 3
*   Excellent = 4

In [None]:
# Columns that are label-encoded below with sklearn's LabelEncoder, which assigns
# integer codes following the sorted order of each column's string labels, so no
# explicit mapping is supplied for them.
# NOTE(review): Age_Category is ordinal; this presumably relies on labels such as
# '18-24' ... '80+' sorting in age order as strings - confirm against the data.
cat_cols = ['Exercise', 'Skin_Cancer', 'Other_Cancer', 'Depression', 'Arthritis', 'Sex', 'Age_Category', 'Smoking_History', 'Heart_Disease']

def func_labelencoder(list1,features):
    """Label-encode one column of a DataFrame in place.

    Parameters
    ----------
    list1 : label of the column to encode (this notebook passes a single
        column name per call).
    features : DataFrame holding that column; it is modified in place.

    The column is cast to ``str`` first and then replaced by the integer
    codes produced by a freshly fitted ``LabelEncoder``. Returns ``None``.
    """
    encoder = LabelEncoder()
    column_as_text = features[list1].astype(str)
    features[list1] = encoder.fit_transform(column_as_text)

# Encode on an independent copy. A plain `cvd_df2 = cvd_df` only creates a second
# name for the same object, so the encoding would overwrite the raw frame and
# re-running this cell would feed already-encoded values back into the
# translate_* helpers (producing NaNs / a TypeError).
cvd_df2 = cvd_df.copy()

# Label-encode every column listed in cat_cols, in place on the copy
for i in cat_cols:
    func_labelencoder(i,cvd_df2)

# Adjust general health variable
def translate_general_health(health_series):
    """Translate General_Health text labels into ordinal codes.

    The five labels are coded from worst to best: Poor=0, Fair=1, Good=2,
    Very Good=3, Excellent=4. A label that is not one of these five is
    translated to ``None``.

    Parameters
    ----------
    health_series : pandas Series of General_Health labels.

    Returns
    -------
    pandas Series with one translated value per input element.
    """
    ordered_labels = ("Poor", "Fair", "Good", "Very Good", "Excellent")
    health_codes = {label: code for code, label in enumerate(ordered_labels)}

    def _to_code(label):
        # dict.get yields None for labels outside the mapping
        return health_codes.get(label)

    return health_series.apply(_to_code)

# Store the ordinal General_Health codes in cvd_df2 (labels are read from cvd_df)
cvd_df2["General_Health"] = translate_general_health(cvd_df["General_Health"])

# Adjust checkup variable
def translate_checkup(checkup_series):
    """Translate Checkup text labels into ordinal codes.

    Labels are coded from least to most recent checkup: Never=0,
    "5 or more years ago"=1, "Within the past 5 years"=2,
    "Within the past 2 years"=3, "Within the past year"=4. Any other
    label is translated to ``None``.

    Parameters
    ----------
    checkup_series : pandas Series of Checkup labels.

    Returns
    -------
    pandas Series with one translated value per input element.
    """
    recency_levels = [
        "Never",
        "5 or more years ago",
        "Within the past 5 years",
        "Within the past 2 years",
        "Within the past year",
    ]
    checkup_codes = {label: code for code, label in enumerate(recency_levels)}

    def _to_code(label):
        # dict.get yields None for labels outside the mapping
        return checkup_codes.get(label)

    return checkup_series.apply(_to_code)

# Store the ordinal Checkup codes in cvd_df2 (labels are read from cvd_df)
cvd_df2["Checkup"] = translate_checkup(cvd_df["Checkup"])

# Adjust diabetes variable

def translate_diabetes(diabetes_series):
    """Collapse free-text Diabetes answers into a binary code.

    An answer containing the word "yes" (any case) becomes 1, including
    qualified answers such as "Yes, but ..."; otherwise an answer containing
    the word "no" becomes 0, including "No, ...". Everything else, including
    non-string values such as NaN, becomes ``None``.

    Parameters
    ----------
    diabetes_series : pandas Series of Diabetes answers.

    Returns
    -------
    pandas Series with one translated value per input element.
    """
    def regex_translate(value):
        # Missing / non-text entries cannot be searched with a regex
        if not isinstance(value, str):
            return None
        # Match whole words only: a bare substring search for "No" would also
        # hit unrelated answers such as "Unknown" or "Not sure" and code them 0
        if re.search(r"\byes\b", value, re.IGNORECASE):
            return 1
        elif re.search(r"\bno\b", value, re.IGNORECASE):
            return 0
        else:
            return None

    return diabetes_series.apply(regex_translate)

# Store the binary Diabetes codes in cvd_df2 (answers are read from cvd_df)
cvd_df2["Diabetes"] = translate_diabetes(cvd_df["Diabetes"])

In [None]:
# Descriptive statistics of cvd_df2 after the categorical variables were encoded above
cvd_df2.describe()

In [None]:
# EDA: frequency plots (categorical variables) and box-and-whisker plots (numerical variables),
# each broken down by Heart_Disease status.
# NOTE(review): `product` and `path_effects` are imported here but not referenced in this cell.
from itertools import product
import matplotlib.patheffects as path_effects

# Columns shown as frequency plots and columns shown as box-and-whisker plots.
# General_Health, Checkup and Age_Category appear in both lists.
countplot_cols = ['General_Health','Checkup', 'Age_Category', 'Exercise', 'Skin_Cancer', 'Other_Cancer', 'Depression', 'Arthritis','Sex', 'Smoking_History']
boxplot_cols = ['General_Health', 'Checkup', 'Height_(cm)', 'Weight_(kg)', 'Age_Category', 'BMI', 'Alcohol_Consumption', 'Fruit_Consumption', 'Green_Vegetables_Consumption', 'FriedPotato_Consumption']

# Frequency plots: one figure per column, bars split by Heart_Disease and
# labelled with the within-group proportion
for i, column in enumerate(countplot_cols):
    ax = sns.countplot(x=column, hue = 'Heart_Disease', data=cvd_df2)
    # Sorted distinct values of the plotted column (sort() is in place)
    groups = cvd_df2[column].unique()
    groups.sort()
    # Share of each Heart_Disease value within each group of `column`,
    # indexed by (group value, Heart_Disease value)
    proportions = cvd_df2.groupby(column)['Heart_Disease'].value_counts(normalize=True)
    for c in ax.containers:
        # The container's position in ax.containers is used as the Heart_Disease code.
        # NOTE(review): assumes one bar container per hue level, ordered 0 then 1, with bars
        # in sorted group order, and that every group contains both classes - confirm.
        labels = [f'{proportions.loc[g, ax.containers.index(c)]:.1%}' for g in groups]
        ax.bar_label(c, labels)
    ax.set_title(f'Incidence of Heart Disease by {column.replace("_", " ")}')
    plt.show()

# Box-and-whisker plots: one figure per column, grouped by Heart_Disease and
# annotated with each group's median
for i, column in enumerate(boxplot_cols):
    ax = sns.boxplot(x='Heart_Disease', y=column, data=cvd_df2)
    # Median of the column for each Heart_Disease value
    medians = cvd_df2.groupby(['Heart_Disease'])[column].median()
    # Shift the label down by 5% of the column's overall median
    vertical_offset = cvd_df2[column].median() * -0.05
    for xtick in ax.get_xticks():
        # NOTE(review): the tick position is used to index `medians`; assumes the ticks
        # 0 and 1 coincide with the Heart_Disease codes - confirm.
        text = ax.text(xtick,medians[xtick] + vertical_offset,medians[xtick], horizontalalignment='center',
                       size='x-small', color='white', weight='semibold', bbox=dict(facecolor='#445A64'))
    ax.set_title(f'{column.replace("_", " ")} by Heart Disease status')
    plt.show()

Checking for correlation

In [None]:
# Pairwise correlations between the predictors only (the target column is excluded)
corr_matrix = cvd_df2.drop(columns=['Heart_Disease']).corr()

# Boolean mask covering the upper triangle and the diagonal, so each pair of
# variables is drawn only once
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Draw the annotated heatmap
plt.figure(figsize=(10, 8))  # Adjust size as needed
heatmap_options = dict(mask=mask, annot=True, fmt=".2f", cmap='coolwarm', linewidths=.5)
sns.heatmap(corr_matrix, **heatmap_options)
plt.title("Correlation Heatmap")
plt.show()

SyntaxError: invalid syntax (<ipython-input-1-0e7a625c90c8>, line 2)

### MODEL 1: Univariate Logistic Regressions

In [None]:
# Split the encoded frame into the binary target (y) and the predictors (X)
y = cvd_df2['Heart_Disease']
X = cvd_df2.drop('Heart_Disease', axis=1)

In [None]:
# Inspect the data types of the predictors and the target.
# The columns do not all share one data type, which was reported to be throwing an error;
# this cell only prints the types, it does not convert anything.
print("Data types in X", X.dtypes, sep="\n")
print("Data types in y", y.dtypes, sep="\n")

Data types in X


NameError: name 'X' is not defined

In [None]:
# Univariate logistic regression of Heart_Disease on every independent variable in turn.
# For each column:
#   1. fit a statsmodels Logit on the full data and print its coefficient summary;
#   2. fit a scikit-learn LogisticRegression on a 70/30 split and report the AUC
#      and misclassification rate on the held-out 30%.
for column in X:
    # Design matrix = intercept + the single predictor, rebuilt from the full data
    # on every iteration. It is kept under its own name and the full target `y` is
    # used directly: previously the split below rebound X_train / y_train, so from
    # the second column onwards Logit received a 70% target against a full-length
    # design matrix.
    X_design = sm.add_constant(cvd_df2[[column]])
    est = sm.Logit(y, X_design).fit()
    print(est.summary())

    X_train, X_test, y_train, y_test = train_test_split(X_design, y, test_size=0.3, random_state=42)
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Predicting probabilities for the test set
    y_prob = model.predict_proba(X_test)[:, 1] # Get the probabilities of the positive class

    # Calculating the AUC
    auc = roc_auc_score(y_test, y_prob)
    print(f"AUC: {auc}")

    # Predicting labels for the test set
    y_pred = model.predict(X_test)

    # Calculating the misclassification rate
    misclassification_rate = 1 - accuracy_score(y_test, y_pred)
    print(f"Misclassification Rate: {misclassification_rate}")

NameError: name 'y' is not defined

#### Note for the next models
Each model is trained using a 70/30 split and a constant random state

### MODEL 2A: Multivariate Logistic Regression

In [None]:
# Multivariate logistic regression on every predictor
X = cvd_df2.drop(columns=['Heart_Disease'])
y = cvd_df2['Heart_Disease']

# statsmodels does not add an intercept on its own, so the constant is added
# explicitly (as in the univariate models); without it the coefficients and
# p-values describe a model forced through the origin.
est_mv = sm.Logit(y, sm.add_constant(X)).fit()
print(est_mv.summary())

# 70/30 split with a fixed random state
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Training the logistic regression model (scikit-learn fits its own intercept)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predicting probabilities for the test set
y_prob = model.predict_proba(X_test)[:, 1] # Get the probabilities of the positive class

# Calculating the AUC
auc = roc_auc_score(y_test, y_prob)
print(f"AUC: {auc}")

# Predicting labels for the test set
y_pred = model.predict(X_test)

# Calculating the misclassification rate
misclassification_rate = 1 - accuracy_score(y_test, y_pred)
print(f"Misclassification Rate: {misclassification_rate}")

**Significant Variables** : General_Health, Checkup, Skin_Cancer, Other_Cancer, Depression, Diabetes, Arthritis, Sex,
Age_Category, Height_(cm), Weight_(kg), BMI, Smoking_History, Alcohol_Consumption

**Insignificant Variables** : Exercise, Fruit_Consumption, Green_Vegetables_Consumption, FriedPotato_Consumption


In [None]:
# Multivariate regression using only the variables found significant in the multivariate regression above
X = cvd_df2.drop(columns=['Heart_Disease', 'Exercise', 'Fruit_Consumption', 'Green_Vegetables_Consumption',
                          'FriedPotato_Consumption'])
y = cvd_df2['Heart_Disease']

# statsmodels does not add an intercept on its own, so the constant is added
# explicitly (as in the univariate models); without it the coefficients and
# p-values describe a model forced through the origin.
est_mv = sm.Logit(y, sm.add_constant(X)).fit()
print(est_mv.summary())

# 70/30 split with a fixed random state
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Training the logistic regression model (scikit-learn fits its own intercept)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predicting probabilities for the test set
y_prob = model.predict_proba(X_test)[:, 1] # Get the probabilities of the positive class

# Calculating the AUC
auc = roc_auc_score(y_test, y_prob)
print(f"AUC: {auc}")

# Predicting labels for the test set
y_pred = model.predict(X_test)

# Calculating the misclassification rate
misclassification_rate = 1 - accuracy_score(y_test, y_pred)
print(f"Misclassification Rate: {misclassification_rate}")

# Compute the ROC curve (false/true positive rates over all thresholds)
fpr, tpr, _ = roc_curve(y_test, y_prob)

# Plotting the ROC curve against the diagonal chance line
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()

### MODEL 2B: Multivariate Regression (remove multicollinearity)

Removing correlated variables: BMI is derived from weight and height, so Height_(cm) and Weight_(kg) are removed and only BMI is kept, since it standardizes the distribution.

In [None]:
# Same predictors as the reduced Model 2A, minus Height_(cm) and Weight_(kg)
X = cvd_df2.drop(columns=['Heart_Disease', 'Exercise', 'Fruit_Consumption', 'Green_Vegetables_Consumption',
                          'FriedPotato_Consumption', 'Height_(cm)', 'Weight_(kg)'])
y = cvd_df2['Heart_Disease']

# statsmodels does not add an intercept on its own, so the constant is added
# explicitly (as in the univariate models); without it the coefficients and
# p-values describe a model forced through the origin.
est_mv = sm.Logit(y, sm.add_constant(X)).fit()
print(est_mv.summary())

# 70/30 split with a fixed random state
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Training the logistic regression model (scikit-learn fits its own intercept)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predicting probabilities for the test set
y_prob = model.predict_proba(X_test)[:, 1] # Get the probabilities of the positive class

# Calculating the AUC
auc = roc_auc_score(y_test, y_prob)
print(f"AUC: {auc}")

# Predicting labels for the test set
y_pred = model.predict(X_test)

# Calculating the misclassification rate
misclassification_rate = 1 - accuracy_score(y_test, y_pred)
print(f"Misclassification Rate: {misclassification_rate}")

# Compute the ROC curve (false/true positive rates over all thresholds)
fpr, tpr, _ = roc_curve(y_test, y_prob)

# Plotting the ROC curve against the diagonal chance line
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()

### MODEL 2C: Multivariate Regression + Interaction Terms
As seen in the correlation matrix and based on prior domain knowledge, height is strongly dependent on a person's sex (male vs. female). Using an interaction term between the variables Sex and Height could prove useful for the analysis. Additionally, the following interaction terms were also generated:
*   General_Health x Checkup
*   Skin_Cancer x Other Cancer
*   Smoking History x Alcohol Consumption

In [None]:
# Build the interaction terms on an independent copy. A plain `cvd_df3 = cvd_df2`
# only creates a second name for the same frame, so the new columns would also
# appear in cvd_df2 and be picked up silently by any cell that rebuilds X from
# cvd_df2 afterwards (e.g. the Lasso model, or Models 2A/2B on a re-run).
cvd_df3 = cvd_df2.copy()
cvd_df3['Sex*Height'] = cvd_df3['Sex'] * cvd_df3['Height_(cm)']
cvd_df3['General_Health*Checkup'] = cvd_df3['General_Health'] * cvd_df3['Checkup']
cvd_df3['Smoking_History*Alcohol_Consumption'] = cvd_df3['Alcohol_Consumption'] * cvd_df3['Smoking_History']
# NOTE(review): the accompanying text also lists a Skin_Cancer x Other_Cancer
# interaction, which is not created here - confirm whether it should be.

X = cvd_df3.drop(columns=['Heart_Disease', 'Exercise', 'Fruit_Consumption', 'Green_Vegetables_Consumption',
                          'FriedPotato_Consumption'])
y = cvd_df3['Heart_Disease']

# statsmodels does not add an intercept on its own, so the constant is added
# explicitly (as in the univariate models); without it the coefficients and
# p-values describe a model forced through the origin.
est_mv = sm.Logit(y, sm.add_constant(X)).fit()
print(est_mv.summary())

# 70/30 split with a fixed random state
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Training the logistic regression model (scikit-learn fits its own intercept)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predicting probabilities for the test set
y_prob = model.predict_proba(X_test)[:, 1] # Get the probabilities of the positive class

# Calculating the AUC
auc = roc_auc_score(y_test, y_prob)
print(f"AUC: {auc}")

# Predicting labels for the test set
y_pred = model.predict(X_test)

# Calculating the misclassification rate
misclassification_rate = 1 - accuracy_score(y_test, y_pred)
print(f"Misclassification Rate: {misclassification_rate}")

# Compute the ROC curve (false/true positive rates over all thresholds)
fpr, tpr, _ = roc_curve(y_test, y_prob)

# Plotting the ROC curve against the diagonal chance line
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()

### MODEL 3: Lasso Regularization
Lasso (L1 regularization) is a penalty technique that can be used in regression to shrink the weight/effect of unwanted variables, in some cases all the way to zero, while keeping them in the model. This can potentially provide better classification performance by favouring the more meaningful predictors when there are many features to choose from.

In [None]:
# Preparing the data: every predictor in cvd_df2 is offered to the penalised model
X = cvd_df2.drop(columns=['Heart_Disease'])
y = cvd_df2['Heart_Disease']

# Splitting the dataset (70/30, fixed random state)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# The L1 penalty acts on coefficient size, so the features are standardised first.
# The scaler is fitted on the training split only and then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Setting up the model with L1 penalty. C is the inverse regularisation strength
# (smaller = stronger penalty). random_state is fixed because the liblinear solver
# shuffles the data, so repeated runs could otherwise give slightly different fits.
model = LogisticRegression(penalty='l1', solver='liblinear', C=0.013, max_iter=1000, random_state=42)

# Training the model
model.fit(X_train_scaled, y_train)

# Making predictions
y_prob = model.predict_proba(X_test_scaled)[:, 1] # Probabilities of the positive class
y_pred = model.predict(X_test_scaled) # Predicted classes

# Evaluating the model
auc = roc_auc_score(y_test, y_prob)
accuracy = accuracy_score(y_test, y_pred)
misclassification_rate = 1 - accuracy

print(f"AUC: {auc}")
print(f"Misclassification Rate: {misclassification_rate}")


# Compute the ROC curve (false/true positive rates over all thresholds)
fpr, tpr, _ = roc_curve(y_test, y_prob)

# Plotting the ROC curve against the diagonal chance line
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()


# Examine the coefficients, largest first; the values refer to the standardised features
coefficients = pd.DataFrame(model.coef_.flatten(), index=X.columns, columns=['Coefficient']).sort_values(by='Coefficient', ascending=False)
print(coefficients)

AUC: 0.8368007209816706
Accuracy: 0.918991549478183
Misclassification Rate: 0.08100845052181704
                              Coefficient
Age_Category                     0.947643
Sex                              0.398707
Smoking_History                  0.189377
Diabetes                         0.176747
Checkup                          0.149512
Arthritis                        0.119240
Depression                       0.096978
Skin_Cancer                      0.030511
BMI                              0.014815
Other_Cancer                     0.010548
Green_Vegetables_Consumption     0.007460
Fruit_Consumption                0.000000
FriedPotato_Consumption          0.000000
Weight_(kg)                      0.000000
Exercise                        -0.009768
Height_(cm)                     -0.030109
Alcohol_Consumption             -0.066397
General_Health                  -0.596210
