<a href="https://colab.research.google.com/github/yyRegis/projeto-idc/blob/main/ScriptIDC_KNN_no_R.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Abrir colab com R

Link: https://colab.research.google.com/#create=true&language=r

Atualizado em: 01.2025

### KNN


In [2]:
# <> ----------------------------
# <> Instalando bibliotecas
install.packages("tidymodels")
install.packages("kknn")


Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘shape’, ‘numDeriv’, ‘progressr’, ‘SQUAREM’, ‘diagram’, ‘lava’, ‘prodlim’, ‘warp’, ‘future.apply’, ‘iterators’, ‘listenv’, ‘parallelly’, ‘lhs’, ‘DiceDesign’, ‘sfd’, ‘sparsevctrs’, ‘patchwork’, ‘globals’, ‘clock’, ‘gower’, ‘ipred’, ‘timeDate’, ‘furrr’, ‘slider’, ‘doFuture’, ‘foreach’, ‘future’, ‘GPfit’, ‘modelenv’, ‘dials’, ‘hardhat’, ‘infer’, ‘modeldata’, ‘parsnip’, ‘recipes’, ‘rsample’, ‘tune’, ‘workflows’, ‘workflowsets’, ‘yardstick’


Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependency ‘igraph’




In [3]:
# <> ----------------------------
# Lendo bibliotecas
library(tidymodels)
library(kknn)


── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 1.3.0 ──

[32m✔[39m [34mbroom       [39m 1.0.7     [32m✔[39m [34mrecipes     [39m 1.2.0
[32m✔[39m [34mdials       [39m 1.4.0     [32m✔[39m [34mrsample     [39m 1.2.1
[32m✔[39m [34mdplyr       [39m 1.1.4     [32m✔[39m [34mtibble      [39m 3.2.1
[32m✔[39m [34mggplot2     [39m 3.5.1     [32m✔[39m [34mtidyr       [39m 1.3.1
[32m✔[39m [34minfer       [39m 1.0.7     [32m✔[39m [34mtune        [39m 1.3.0
[32m✔[39m [34mmodeldata   [39m 1.4.0     [32m✔[39m [34mworkflows   [39m 1.2.0
[32m✔[39m [34mparsnip     [39m 1.3.1     [32m✔[39m [34mworkflowsets[39m 1.1.0
[32m✔[39m [34mpurrr       [39m 1.0.4     [32m✔[39m [34myardstick   [39m 1.3.2

── [1mConflicts[22m ───────────────────────────────────────── tidymodels_conflicts() ──
[31m✖[39m [34mpurrr[39m::[32mdiscard()[39m masks [34mscales[39m::discard()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m

In [4]:
data <- "/content/student_admission_record_dirty.csv"

In [5]:
df <- read.csv(data, stringsAsFactors = FALSE)

In [6]:
# Remover colunas irrelevantes
df <- df[, !(names(df) %in% c("Name", "City"))]

# Corrigir valores negativos
df$Age[df$Age < 0] <- NA
df$Admission.Test.Score[df$Admission.Test.Score < 0] <- NA
df$High.School.Percentage[df$High.School.Percentage < 0] <- NA

# Remover outliers extremos
percentile_99 <- apply(df[, c("Admission.Test.Score", "High.School.Percentage")], 2, quantile, 0.99, na.rm = TRUE)
df <- df[df$Admission.Test.Score <= percentile_99["Admission.Test.Score"] &
         df$High.School.Percentage <= percentile_99["High.School.Percentage"], ]

# Preencher valores ausentes com mediana (para números) e moda (para categorias)
df$Age[is.na(df$Age)] <- median(df$Age, na.rm = TRUE)
df$Admission.Test.Score[is.na(df$Admission.Test.Score)] <- median(df$Admission.Test.Score, na.rm = TRUE)
df$High.School.Percentage[is.na(df$High.School.Percentage)] <- median(df$High.School.Percentage, na.rm = TRUE)

# Remover valores ausentes em 'Admission Status'
df <- df[!is.na(df$Admission.Status), ]

# Converter variáveis categóricas para fator
df$Gender <- as.factor(df$Gender)
df$Admission.Status <- as.factor(df$Admission.Status)


In [7]:
# <> ----------------------------
# <> Para conflitos de funções, usar a do tidymodels
tidymodels_prefer()

In [8]:
# <> ----------------------------
# <> Selecionando Amostra de Treinamento e Teste
set.seed(123)
cell_split <- initial_split(df, strata = Admission.Status, prop = 0.7)

In [9]:
# <> Dividindo em treinamento e teste
data_train <- training(cell_split)
data_test  <- testing(cell_split)

In [44]:
# <> ----------------------------
# <> Definir o modelo kNN
knn_spec <- nearest_neighbor(mode = "classification", neighbors = 5) |>
  set_engine("kknn")

In [45]:
# <> ----------------------------
# <> Treinamento do modelo
set.seed(1)
knn_fit <- knn_spec |>
  fit(Admission.Status ~ ., data = data_train)

In [46]:
# <> ----------------------------
# <> Fazendo previsão no conjunto de teste
dt_testing_pred <- predict(knn_fit, data_test) |>
  bind_cols(predict(knn_fit, data_test, type = "prob")) |>
  bind_cols(data_test |> select(Admission.Status))


In [47]:
# <> ----------------------------
# <> Avaliação do modelo
dt_testing_pred |>
  accuracy(truth = Admission.Status, .pred_class)



.metric,.estimator,.estimate
<chr>,<chr>,<dbl>
accuracy,multiclass,0.55


--------

## Aplicando k-fold

In [49]:
# <> ----------------------------
# <> Definindo o modelo kNN
knn_spec <-
  nearest_neighbor(neighbors = 5) |> # Define k=5 (ajustável)
  set_mode("classification") |>
  set_engine("kknn")

In [50]:
# <> ----------------------------
# <> Criando o workflow de treinamento
knn_wf <-
  workflow() |>
  add_model(knn_spec) |>
  add_formula(Admission.Status ~ .)

In [51]:
# <> ----------------------------
# <> Treinamento com reamostragem k-fold
set.seed(345)
folds <- vfold_cv(data_train, v = 5, strata = Admission.Status)


In [52]:
# <> ----------------------------
# <> Treinando o modelo final
set.seed(1)
knn_fit <-
  knn_wf |>
  fit(data = data_train)

In [53]:
# <> ----------------------------
# <> Fazendo a predição na amostra de teste
knn_testing_pred <-
  predict(knn_fit, data_test) |>
  bind_cols(predict(knn_fit, data_test, type = "prob")) |>
  bind_cols(data_test |> select(Admission.Status))

In [54]:
# <> ----------------------------
# <> Avaliando a acurácia do modelo
knn_testing_pred |>
  accuracy(truth = Admission.Status, .pred_class)

.metric,.estimator,.estimate
<chr>,<chr>,<dbl>
accuracy,multiclass,0.55
