In [None]:
# Helper packages
library(tidyverse)
library(rsample) # for data splitting
# Modeling packages
library(caret) # for logistic regression modeling
# Model interpretability packages
library(vip) # variable importance

In [12]:
attrition <- read_csv('data/attrition.csv')
attrition <- mutate(attrition, Attrition = recode(Attrition, `Yes` = 1, `No` =  0))


Parsed with column specification:
cols(
  .default = col_double(),
  Attrition = [31mcol_character()[39m,
  BusinessTravel = [31mcol_character()[39m,
  Department = [31mcol_character()[39m,
  Education = [31mcol_character()[39m,
  EducationField = [31mcol_character()[39m,
  EnvironmentSatisfaction = [31mcol_character()[39m,
  Gender = [31mcol_character()[39m,
  JobInvolvement = [31mcol_character()[39m,
  JobRole = [31mcol_character()[39m,
  JobSatisfaction = [31mcol_character()[39m,
  MaritalStatus = [31mcol_character()[39m,
  OverTime = [31mcol_character()[39m,
  PerformanceRating = [31mcol_character()[39m,
  RelationshipSatisfaction = [31mcol_character()[39m,
  WorkLifeBalance = [31mcol_character()[39m
)

See spec(...) for full column specifications.



In [13]:
df <- attrition %>% mutate_if(is.ordered, factor, ordered = FALSE)
# Create training (70%) and test (30%) sets for the
# rsample::attrition data.

set.seed(123) # for reproducibility
churn_split <- initial_split(df, prop = .7, strata = "Attrition")
churn_train <- training(churn_split)
churn_test <- testing(churn_split)

In [16]:
# Set up binomial/logistic regressions

model1 <- glm(Attrition ~ MonthlyIncome, family = "binomial",
    data = churn_train)

tidy(model1)

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
(Intercept),-0.9236247934,0.1550905456,-5.955391,2.594512e-09
MonthlyIncome,-0.0001302806,2.64429e-05,-4.926867,8.355852e-07


In [17]:
model2 <- glm(Attrition ~ OverTime, family = "binomial",
    data = churn_train)

tidy(model2)

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
(Intercept),-2.183802,0.1217895,-17.930951,6.761317e-72
OverTimeYes,1.406394,0.1757048,8.004299,1.20149e-15


In [18]:
# Translate the odds intro probabilities:

exp(coef(model1))
exp(coef(model2))

the odds of an employee attriting in model1 increase multiplicatively by
1 for every one dollar increase in MonthlyIncome, whereas the odds of attriting
in model2 increase multiplicatively by 4.081 for employees that work OverTime
compared to those that do not.

In [19]:
# Get confidence intervals

confint(model1)
confint(model2)


Waiting for profiling to be done...



Unnamed: 0,2.5 %,97.5 %
(Intercept),-1.226775496,-0.6180062
MonthlyIncome,-0.0001849796,-8.107634e-05


Waiting for profiling to be done...



Unnamed: 0,2.5 %,97.5 %
(Intercept),-2.430458,-1.95233
OverTimeYes,1.063246,1.752879


In [21]:
# Multiple logistic regression

model3 <- glm(
Attrition ~ MonthlyIncome + OverTime,
family = "binomial",
data = churn_train
)

tidy(model3)

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
(Intercept),-1.4296820006,0.1763808,-8.105654,5.246248e-16
MonthlyIncome,-0.0001392218,2.703737e-05,-5.149235,2.615514e-07
OverTimeYes,1.4682839234,0.1799996,8.15715,3.430225e-16


In [22]:
set.seed(123)
cv_model1 <- train(
Attrition ~ MonthlyIncome,
data = churn_train,
method = "glm",
family = "binomial",
trControl = trainControl(method = "cv", number = 10)
)

"You are trying to do regression and your outcome only has two possible values Are you trying to do classification? If so, use a 2 level factor as your outcome column."


In [24]:
set.seed(123)
cv_model2 <- train(
Attrition ~ MonthlyIncome + OverTime,
data = churn_train,
    method = "glm",
family = "binomial",
trControl = trainControl(method = "cv", number = 10)
)

"You are trying to do regression and your outcome only has two possible values Are you trying to do classification? If so, use a 2 level factor as your outcome column."


In [25]:
set.seed(123)
cv_model3 <- train(
Attrition ~ .,
data = churn_train,
method = "glm",
family = "binomial",
trControl = trainControl(method = "cv", number = 10)
)

"You are trying to do regression and your outcome only has two possible values Are you trying to do classification? If so, use a 2 level factor as your outcome column."


In [26]:
# extract out of sample performance measures
summary(
resamples(
list(
model1 = cv_model1,
model2 = cv_model2,
model3 = cv_model3
)
)
)$statistics$Accuracy

NULL

In [28]:
# extract out of sample performance measures
summary(
resamples(
list(
model1 = cv_model1,
model2 = cv_model2,
model3 = cv_model3
)
)
)


Call:
summary.resamples(object = resamples(list(model1 = cv_model1, model2
 = cv_model2, model3 = cv_model3)))

Models: model1, model2, model3 
Number of resamples: 10 

MAE 
            Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
model1 0.2061740 0.2562651 0.2610893 0.2630697 0.2774781 0.3084759    0
model2 0.1868568 0.2287106 0.2477165 0.2423774 0.2606637 0.2846597    0
model3 0.1038123 0.1565367 0.1827211 0.1698659 0.1896536 0.1907121    0

RMSE 
            Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
model1 0.2669236 0.3571445 0.3593717 0.3605247 0.3842628 0.4279211    0
model2 0.2518288 0.3210373 0.3652731 0.3456446 0.3704597 0.4047592    0
model3 0.1962488 0.2791780 0.3105109 0.2996882 0.3356790 0.3443849    0

Rsquared 
              Min.    1st Qu.     Median       Mean    3rd Qu.      Max. NA's
model1 0.001104525 0.01613462 0.02659892 0.03746085 0.04143442 0.1034247    0
model2 0.002305286 0.07855734 0.11573486 0.12910230 0.13484299 0.3741519  

In [None]:
# predict class
pred_class <- predict(cv_model3, churn_train)

In [38]:
pred_class

In [41]:
# create confusion matrix
confusionMatrix(predicted = pred_class, actual = churn_train$Attrition)

ERROR: Error in is.factor(data): argument "data" is missing, with no default


In [None]:
# predict class
pred_class <- predict(cv_model3, churn_train)
# create confusion matrix
confusionMatrix(
data = relevel

In [None]:
# predict class
pred_class <- predict(cv_model3, churn_train)
# create confusion matrix
confusionMatrix(
data = relevel(pred_class, ref = ”Yes”),
reference = relevel(churn_train$Attrition, ref = ”Yes”)
)

In [47]:
# Plot AUC

library(ROCR)
# Compute predicted probabilities
m1_prob <- predict(cv_model1, churn_train, type = "prob")$Yes
m3_prob <- predict(cv_model3, churn_train, type = "prob")$Yes

# Compute AUC metrics for cv_model1 and cv_model3
perf1 <- prediction(m1_prob, churn_train$Attrition) %>%
performance(measure = "tpr", x.measure = "fpr")
perf2 <- prediction(m3_prob, churn_train$Attrition) %>%
performance(measure = "tpr", x.measure = "fpr")

# Plot ROC curves for cv_model1 and cv_model3
plot(perf1, col = "black", lty = 2)
plot(perf2, add = TRUE, col = "blue")
legend(0.8, 0.2, legend = c("cv_model1", "cv_model3"),
col = c("black", "blue"), lty = 2:1, cex = 0.6)

"package 'ROCR' was built under R version 4.0.3"


ERROR: Error in dimnames(out)[[2]] <- modelFit$obsLevels: length of 'dimnames' [2] not equal to array extent
