In [1]:
library(ggplot2)
library(tidyverse)
library(MASS)
library(janitor)
library(dplyr)
library(pROC)

-- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
v dplyr     1.1.4     v readr     2.1.5
v forcats   1.0.0     v stringr   1.5.0
v lubridate 1.9.3     v tibble    3.2.1
v purrr     1.0.2     v tidyr     1.3.1
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Attaching package: 'MASS'


The following object is masked from 'package:dplyr':

    select



Attaching package: 'janitor'


The following objects are masked from 'package:stats':

    chisq.test, fisher.test


In [2]:
# Read the cleaned CSV from disk, then preview its structure with
# snake_case column names (the stored object keeps its original names).
cleaned_data <- read.csv("../data/cleaned_data.csv")
glimpse(clean_names(cleaned_data))

Rows: 253,680
Columns: 22
$ diabetes_012           [3m[90m<int>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 2, 0, 0,~
$ high_bp                [3m[90m<int>[39m[23m 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,~
$ high_chol              [3m[90m<int>[39m[23m 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,~
$ chol_check             [3m[90m<int>[39m[23m 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
$ bmi                    [3m[90m<int>[39m[23m 40, 25, 28, 27, 24, 25, 30, 25, 30, 24, 25, 34,~
$ smoker                 [3m[90m<int>[39m[23m 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0,~
$ stroke                 [3m[90m<int>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,~
$ heart_diseaseor_attack [3m[90m<int>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,~
$ phys_activity          [3m[90m<int>[39m[23m 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1,~
$ fruits                 [3m[90m<int>[39m[23m 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 

In [3]:
# Prepare a raw data frame for modelling.
#
# Steps performed:
#   1. Column names are normalised with `clean_names()`.
#   2. `bmi` is binned into six ordered weight classes (`bmi_category`).
#   3. `phys_hlth` is binned into "0 day" / "few days" / "many days"
#      (`phys_category`).
#   4. The raw `bmi`, `ment_hlth` and `phys_hlth` columns are dropped.
#   5. Every remaining column is converted to a factor.
#
# Args:
#   data: A data frame that, after name cleaning, contains numeric `bmi` and
#     `phys_hlth` columns.
#
# Returns:
#   The transformed data frame, with every column stored as a factor.
process_data <- function(data) {
  # Clean column names
  data <- clean_names(data)

  required_cols <- c("bmi", "phys_hlth")
  missing_cols <- setdiff(required_cols, names(data))
  if (length(missing_cols) > 0) {
    stop(
      "Missing required column(s): ", paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  bmi_levels <- c(
    "Underweight", "Normal weight", "Overweight",
    "Obesity class 1", "Obesity class 2", "Obesity class 3"
  )
  phys_levels <- c("0 day", "few days", "many days")

  # Classify BMI using contiguous half-open bins [lower, upper), so that values
  # such as 24.95 or 29.9 land in the class below the next threshold instead
  # of falling through to the last class. NA stays NA.
  categorize_bmi <- function(bmi) {
    cut(
      bmi,
      breaks = c(-Inf, 18.5, 25, 30, 35, 40, Inf),
      labels = bmi_levels,
      right = FALSE
    )
  }

  # Classify a day count: exactly 0, (0, 7], or (7, 30]. Anything else
  # (negative, above 30, or NA) becomes NA rather than a silent mislabel.
  categorize_phys <- function(phys) {
    category <- rep(NA_character_, length(phys))
    category[which(phys == 0)] <- "0 day"
    category[which(phys > 0 & phys <= 7)] <- "few days"
    category[which(phys > 7 & phys <= 30)] <- "many days"
    factor(category, levels = phys_levels)
  }

  # Create the new bmi_category and phys_category columns; both are factors
  # with a fixed level order.
  data$bmi_category <- categorize_bmi(data$bmi)
  data$phys_category <- categorize_phys(data$phys_hlth)

  # Remove the columns that are no longer needed
  dropped_cols <- c("bmi", "ment_hlth", "phys_hlth")
  data <- data[, setdiff(names(data), dropped_cols), drop = FALSE]

  # Convert every column to a factor (existing factors keep their levels)
  data[] <- lapply(data, as.factor)
  data
}

Your code contains a unicode char which cannot be displayed in your
current locale and R will silently convert it to an escaped form when the
R kernel executes this code. This can lead to subtle errors if you use
such chars to do comparisons. For more information, please see
https://github.com/IRkernel/repr/wiki/Problems-with-unicode-on-windows

In [4]:
# Replace the raw data with its processed form, then preview the result.
cleaned_data <- cleaned_data |> process_data()
cleaned_data |> glimpse()

Rows: 253,680
Columns: 21
$ diabetes_012           [3m[90m<fct>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 2, 0, 0,~
$ high_bp                [3m[90m<fct>[39m[23m 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,~
$ high_chol              [3m[90m<fct>[39m[23m 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,~
$ chol_check             [3m[90m<fct>[39m[23m 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
$ smoker                 [3m[90m<fct>[39m[23m 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0,~
$ stroke                 [3m[90m<fct>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,~
$ heart_diseaseor_attack [3m[90m<fct>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,~
$ phys_activity          [3m[90m<fct>[39m[23m 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1,~
$ fruits                 [3m[90m<fct>[39m[23m 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,~
$ veggies                [3m[90m<fct>[39m[23m 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 

In [5]:
# Randomly split a data frame into training and testing sets by ratio.
#
# Args:
#   data: Data frame (or matrix) whose rows are to be split.
#   train_ratio: Fraction of rows, between 0 and 1, assigned to the training
#     set. The training size is floor(train_ratio * nrow(data)).
#   seed: Seed passed to set.seed() before shuffling so the split is
#     reproducible. Note that this resets the session's RNG state. Pass NULL
#     to leave the RNG untouched.
#
# Returns:
#   A list with elements `train_data` and `test_data`. Together they contain
#   every row of `data` exactly once.
train_test_split <- function(data, train_ratio = 0.8, seed = 123) {
  stopifnot(
    is.numeric(train_ratio),
    length(train_ratio) == 1L,
    !is.na(train_ratio),
    train_ratio >= 0,
    train_ratio <= 1
  )

  # Set the seed to make the split reproducible
  if (!is.null(seed)) {
    set.seed(seed)
  }

  n_rows <- nrow(data)

  # Randomly shuffle the row indices
  shuffled_indices <- sample(seq_len(n_rows))

  # Determine the number of training samples
  train_size <- floor(train_ratio * n_rows)

  # seq_len() yields an empty index when a partition has zero rows, whereas
  # `1:0` or `(n + 1):n` would count backwards and select wrong rows.
  train_indices <- shuffled_indices[seq_len(train_size)]
  test_indices <- shuffled_indices[train_size + seq_len(n_rows - train_size)]

  # drop = FALSE keeps a one-column input as a data frame
  list(
    train_data = data[train_indices, , drop = FALSE],
    test_data = data[test_indices, , drop = FALSE]
  )
}

Your code contains a unicode char which cannot be displayed in your
current locale and R will silently convert it to an escaped form when the
R kernel executes this code. This can lead to subtle errors if you use
such chars to do comparisons. For more information, please see
https://github.com/IRkernel/repr/wiki/Problems-with-unicode-on-windows

In [9]:
# Split the processed data with the default ratio, then print the
# diabetes_012 counts of each partition.
split_cleaned_data <- cleaned_data |> train_test_split()

split_cleaned_data$train_data$diabetes_012 |> table() |> print()
split_cleaned_data$test_data$diabetes_012 |> table() |> print()


     0      1      2 
170989   3676  28279 

    0     1     2 
42714   955  7067 


In [10]:
# Save both partitions as CSV files, without a row-name column.
split_cleaned_data$train_data |>
  write.csv(file = "../data/train_data.csv", row.names = FALSE)
split_cleaned_data$test_data |>
  write.csv(file = "../data/test_data.csv", row.names = FALSE)