Data Import and Cleaning

In [None]:
# Load the raw patient records; the path is relative to the working directory.
health_data <- read.csv("health_data.csv")

library(dplyr)
library(tidyr)
library(ggplot2)
library(GGally)
library(fastDummies)
library(reshape2)
library(randomForest)
library(ranger)
library(nnet)
library(speedglm)

# First rows and per-column summaries of the raw data.
head(health_data)
summary(health_data)

# Split the "<numerator>/<denominator>" BloodPressure string into two columns.
health_data <- health_data %>%
  separate(
    BloodPressure,
    into = c("BloodPressureNumerator", "BloodPressureDenominator"),
    sep = "/"
  )

# separate() yields character columns, so as.numeric() alone is enough.
# Fragments that are not numbers become NA (with a coercion warning) and are
# surfaced by the missing-value count below.
health_data$BloodPressureNumerator <-
  as.numeric(health_data$BloodPressureNumerator)
health_data$BloodPressureDenominator <-
  as.numeric(health_data$BloodPressureDenominator)

# Number of missing values in every column (across() replaces the superseded
# summarise_all()).
health_data %>% summarise(across(everything(), ~ sum(is.na(.x))))

# Views of the numeric and the character columns. vapply() guarantees a
# logical mask, and drop = FALSE keeps a data frame even if only one column
# matches.
numeric_data <- health_data[
  , vapply(health_data, is.numeric, logical(1)),
  drop = FALSE
]
cat_data <- health_data[
  , vapply(health_data, is.character, logical(1)),
  drop = FALSE
]

# Work on a copy; the untouched `health_data` is reused by later cells.
health_data_clean <- health_data

# Explicit level order for each categorical column. The first entry becomes
# the factor's first level; any value not listed here is turned into NA by
# factor(). `levels` is spelled out in full (the Gender line previously relied
# on partial matching of `level`).
factor_levels <- list(
  Gender = c("Male", "Female", "Other"),
  SmokingStatus = c("Never", "Former", "Current"),
  AlcoholConsumption = c("Never", "Occasionally", "Regularly"),
  ExerciseFrequency = c("Never", "Sometimes", "Often", "Rarely"),
  PhysicalActivityLevel = c("Low", "Medium", "High"),
  DietQuality = c("Average", "Good", "Poor"),
  MedicationAdherence = c("Medium", "High", "Low"),
  Outcome = c("Healthy", "Critical", "At Risk")
)
health_data_clean[names(factor_levels)] <- Map(
  function(values, lvls) factor(values, levels = lvls),
  health_data_clean[names(factor_levels)],
  factor_levels
)

library(fastDummies)

# One indicator column per category of every listed variable.
health_data_lm <- dummy_cols(
  health_data_clean,
  select_columns = c(
    "Gender", "SmokingStatus", "AlcoholConsumption", "ExerciseFrequency",
    "Diabetes", "HeartDisease", "PhysicalActivityLevel", "DietQuality",
    "MedicationAdherence", "Outcome"
  )
)

# Drop the original categorical columns (now represented by their dummies)
# and the patient identifier.
health_data_lm <- subset(
  health_data_lm,
  select = -c(
    Gender, SmokingStatus, AlcoholConsumption, ExerciseFrequency,
    Diabetes, HeartDisease, PhysicalActivityLevel, DietQuality,
    MedicationAdherence, Outcome, PatientID
  )
)

# The generated name contains a space; replace it with a syntactic name.
names(health_data_lm)[names(health_data_lm) == "Outcome_At Risk"] <-
  "Outcome_at_risk"


Heart Disease Log Regression

In [None]:
# Indicator columns for every categorical variable of the raw data; the
# original columns are still present after dummy_cols().
health_data_log_reg <- dummy_cols(
  health_data,
  select_columns = c(
    "Gender", "SmokingStatus", "AlcoholConsumption", "ExerciseFrequency",
    "Diabetes", "HeartDisease", "PhysicalActivityLevel", "DietQuality",
    "MedicationAdherence", "Outcome"
  )
)

# The generated name contains a space; replace it with a syntactic name.
names(health_data_log_reg)[names(health_data_log_reg) == "Outcome_At Risk"] <-
  "Outcome_at_risk"

# Modelling table for the HeartDisease target.
# NOTE(review): the raw `Outcome` column is not in the drop list, so it stays
# in the table next to its Outcome_* dummies - confirm this is intended.
health_data_log_reg_heart_disease <- subset(
  health_data_log_reg,
  select = -c(
    # cost and identifier
    HealthcareCost, PatientID,
    # indicator columns derived from the target itself
    HeartDisease_No, HeartDisease_Yes,
    # raw categorical columns
    Gender, SmokingStatus, AlcoholConsumption, ExerciseFrequency,
    PhysicalActivityLevel, DietQuality, MedicationAdherence, Diabetes,
    # one indicator level of each of these five variables
    Gender_Other, MedicationAdherence_Medium, ExerciseFrequency_Sometimes,
    AlcoholConsumption_Regularly, SmokingStatus_Never
  )
)

# The target must be a factor for a classification forest.
health_data_log_reg_heart_disease$HeartDisease <-
  as.factor(health_data_log_reg_heart_disease$HeartDisease)

In [None]:
# Probability forest for HeartDisease using every other column of the
# modelling table as a predictor; impurity importance is stored for the
# variable-importance plot.
rand_forest_log_heart_disease <- ranger(
  formula = HeartDisease ~ .,
  data = health_data_log_reg_heart_disease,
  num.trees = 150,
  mtry = 3,
  importance = "impurity",
  probability = TRUE
)
print(rand_forest_log_heart_disease)

In [None]:
# Impurity importance of each predictor, as a two-column data frame.
importance_scores_heart_disease <-
  rand_forest_log_heart_disease$variable.importance
important_vars_df_log_heart_disease <- data.frame(
  Variables = names(importance_scores_heart_disease),
  Score = importance_scores_heart_disease
)

# Horizontal bar chart, predictors ordered by importance.
ggplot(
  important_vars_df_log_heart_disease,
  aes(x = reorder(Variables, Score), y = Score)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Variable Importance - LG Heart Disease",
    x = "Feature",
    y = "Importance"
  ) +
  theme_minimal()

In [None]:
# Class probabilities for the same rows the forest was fitted on (in-sample);
# the target column is removed before predicting.
predictions_heart_disease <- predict(
  rand_forest_log_heart_disease,
  data = dplyr::select(health_data_log_reg_heart_disease, -HeartDisease)
)$predictions
predictions_heart_disease <- as.data.frame(predictions_heart_disease)

# A row is labelled 0 only when the probability in the `No` column exceeds
# 0.8; every other row is labelled 1.
predictions_heart_disease <- predictions_heart_disease %>%
  mutate(Pred = ifelse(No > 0.8, 0, 1))

# Observed target on the same 0/1 coding.
actual_heart_disease <- ifelse(
  health_data_log_reg_heart_disease$HeartDisease == "No", 0, 1
)

# Both factors get the same explicit levels so they still match when one of
# the classes never occurs among the predictions.
actual_factor_heart_disease <- factor(actual_heart_disease, levels = c(0, 1))

# confusionMatrix() comes from caret, which none of the library() calls at the
# top of this notebook attaches, so it is called through its namespace.
conf_matrix <- caret::confusionMatrix(
  factor(predictions_heart_disease$Pred, levels = c(0, 1)),
  actual_factor_heart_disease
)
print(conf_matrix)

Diabetes Log Regression

In [None]:
# Indicator columns for every categorical variable of the raw data; the
# original columns are still present after dummy_cols().
health_data_log_reg <- dummy_cols(
  health_data,
  select_columns = c(
    "Gender", "SmokingStatus", "AlcoholConsumption", "ExerciseFrequency",
    "Diabetes", "HeartDisease", "PhysicalActivityLevel", "DietQuality",
    "MedicationAdherence", "Outcome"
  )
)

# The generated name contains a space; replace it with a syntactic name.
names(health_data_log_reg)[names(health_data_log_reg) == "Outcome_At Risk"] <-
  "Outcome_at_risk"

# Modelling table for the Diabetes target.
# NOTE(review): the raw `HeartDisease` and `Outcome` columns are not in the
# drop list, so they stay in the table next to their indicator columns -
# confirm this is intended.
health_data_log_reg_heart_diabetes <- subset(
  health_data_log_reg,
  select = -c(
    # cost and identifier
    HealthcareCost, PatientID,
    # indicator columns derived from the target itself
    Diabetes_No, Diabetes_Yes,
    # raw categorical columns
    Gender, SmokingStatus, AlcoholConsumption, ExerciseFrequency,
    PhysicalActivityLevel, DietQuality, MedicationAdherence,
    # one indicator level of each of these five variables
    Gender_Other, MedicationAdherence_Medium, ExerciseFrequency_Sometimes,
    AlcoholConsumption_Regularly, SmokingStatus_Never
  )
)

# The target must be a factor for a classification forest.
health_data_log_reg_heart_diabetes$Diabetes <-
  as.factor(health_data_log_reg_heart_diabetes$Diabetes)

In [None]:
# Probability forest for Diabetes using every other column of the modelling
# table as a predictor; impurity importance is stored for the
# variable-importance plot.
rand_forest_log_heart_diabetes <- ranger(
  formula = Diabetes ~ .,
  data = health_data_log_reg_heart_diabetes,
  num.trees = 100,
  mtry = 3,
  importance = "impurity",
  probability = TRUE
)
print(rand_forest_log_heart_diabetes)

In [None]:
# Impurity importance of each predictor, as a two-column data frame.
importance_scores_diabetes <-
  rand_forest_log_heart_diabetes$variable.importance
important_vars_df_log_diabetes <- data.frame(
  Variables = names(importance_scores_diabetes),
  Score = importance_scores_diabetes
)

# Horizontal bar chart, predictors ordered by importance.
ggplot(
  important_vars_df_log_diabetes,
  aes(x = reorder(Variables, Score), y = Score)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Variable Importance - LG Diabetes",
    x = "Feature",
    y = "Importance"
  ) +
  theme_minimal()

In [None]:
# Class probabilities for the same rows the forest was fitted on (in-sample).
# The target column is removed with `-Diabetes`: a bare dplyr::select(df)
# selects nothing and would hand predict() a frame without any predictors.
# No `type` is passed because "prob" is not a predict.ranger type; a forest
# fitted with probability = TRUE already returns class probabilities.
predictions_diabetes <- predict(
  rand_forest_log_heart_diabetes,
  data = dplyr::select(health_data_log_reg_heart_diabetes, -Diabetes)
)$predictions
predictions_diabetes <- as.data.frame(predictions_diabetes)

# A row is labelled 0 only when the probability in the `No` column exceeds
# 0.8; every other row is labelled 1.
predictions_heart_diabetes <- predictions_diabetes %>%
  mutate(Pred = ifelse(No > 0.8, 0, 1))

# Observed target on the same 0/1 coding.
actual_heart_diabetes <- ifelse(
  health_data_log_reg_heart_diabetes$Diabetes == "No", 0, 1
)

In [None]:
# Both factors get the same explicit levels so they still match when one of
# the classes never occurs among the predictions.
actual_factor_diabetes <- factor(actual_heart_diabetes, levels = c(0, 1))

# confusionMatrix() comes from caret, which none of the library() calls at the
# top of this notebook attaches, so it is called through its namespace.
conf_matrix_diabetes <- caret::confusionMatrix(
  factor(predictions_heart_diabetes$Pred, levels = c(0, 1)),
  actual_factor_diabetes
)
print(conf_matrix_diabetes)