# Predicting online learning performance using LMS data

*Sample lecture for UVA SDS (Introduction to Data Science - using R)*

---

**Roadmap:**

0. setup
1. import data
2. process data
3. explore data

    *-- what we already know --*

4. partition data
5. train the model using training data
6. assess predictive accuracy using test data

    *-- what we will work on today and in the lab session --*

7. tune the model

    *-- what we will work on in the next 4 weeks and in our data science careers --*

---

========================
### Step 0: Setup
========================

In [None]:
# set working directory
# The path below is specific to one machine, so only switch to it when it
# actually exists. The lecture data are loaded from the 'dataedu' package
# further down, so the analysis itself does not need this directory.
data_dir <- "~/Dropbox (Brown)/SDS/data"
if (dir.exists(data_dir)) {
  setwd(data_dir)
} else {
  message("Directory not found, keeping working directory: ", getwd())
}

In [None]:
# load the packages

## includes the packages that you're likely to use in everyday data analyses
library(tidyverse)
library(tidylog)

## Classification And REgression Training
library(caret)

## Data used for today's lecture.
## The 'dataedu' package is installed only when it is missing, so this cell
## can be re-run without reinstalling anything (no need to comment it out).
if (!requireNamespace("dataedu", quietly = TRUE)) {
  ### devtools provides install_github(); install it first if needed
  if (!requireNamespace("devtools", quietly = TRUE)) {
    install.packages("devtools", repos = "https://cran.us.r-project.org")
  }
  ### install the dataedu package from GitHub
  devtools::install_github("data-edu/dataedu")
}

library(dataedu)

========================
### Step 1: Import data
========================

In [None]:
# clear workspace
# ls() prints the names of the objects in the current environment;
# rm(list=ls()) then removes all of them (attached packages are not affected)
ls()

rm(list=ls())

In [None]:
# get the data
# take the 'sci_mo_with_text' dataset shipped with the dataedu package
data <- dataedu::sci_mo_with_text

In [None]:
# view the data table
# (View() opens a spreadsheet-style viewer; it needs an interactive session)
View(data)

In [None]:
# check data structure
# (column names, types and the first few values of each column)
str(data)

In [None]:
# basic summary
# (per-column summary statistics)
summary(data)

========================
### Step 2: Process data
========================

In [None]:
# keep only the columns used in the rest of this lecture
data_select <- data %>%
  select(
    int, uv, pc, time_spent, final_grade,
    subject, enrollment_reason, semester,
    cogproc, social, posemo, negemo,
    n
  )

In [None]:
# view the new data
View(data_select)

In [None]:
# call na.omit() to drop EVERY row that has ANY missing value
data_select <- na.omit(data_select)

    ## note: how to handle missing values is a practical decision;
    ## dropping incomplete rows is only one of the available options

In [None]:
# check whether the na.omit() call worked as expected
# (the original lecture run reported 464 rows after this step)
nrow(data_select)

In [None]:
# view the new data in an output panel
View(data_select)

In [None]:
# turn every character (text) column into a factor; other columns are untouched
data_select <- data_select %>%
  mutate(across(where(is.character), as.factor))

In [None]:
# check the outcome variable
summary(data_select$final_grade)

# we will use the continuous variable for the regression tasks in Week 9

In [None]:
# create a dummy indicator = 1 if final_grade >= 85 (GPA >= B), 0 otherwise
# Built in a single vectorised step. Assigning into data_select$gpaB[...]
# before the column exists only works because R grows a NULL vector.
data_select$gpaB <- ifelse(data_select$final_grade >= 85, 1, 0)

# counts and shares of the two classes
table(data_select$gpaB)

prop.table(table(data_select$gpaB))

# convert the outcome to a factor
data_select$gpaB <- as.factor(data_select$gpaB)

In [None]:
# attach readable labels to the two outcome classes (0 -> "GPA < B", 1 -> "GPA >= B")
data_select[["gpaB"]] <- factor(
  data_select[["gpaB"]],
  levels = c(0, 1),
  labels = c("GPA < B", "GPA >= B")
)

In [None]:
# 'final_grade' is intentionally NOT removed here.
# The Step 3 plots still use it, and it is dropped at the end of Step 3,
# before the data are partitioned. Removing it at this point makes those
# plots fail.

========================
### Step 3: Explore data
========================

In [None]:
# summary statistics of the key X variable: number of online posts
summary(data_select[["n"]])

In [None]:
# give 'n' a more descriptive name: 'num_posts'
data_select <- data_select %>% rename(num_posts = n)

In [None]:
# histogram of 'num_posts' (the renamed 'n')
ggplot(data = data_select, mapping = aes(x = num_posts)) +
  geom_histogram(bins = 40, fill = "#69b3a2", position = "identity")

In [None]:
# create a dummy indicator = 1 if num_posts >= median and check the distribution
# The cut-off is computed from the data instead of being hard-coded, and the
# column is built in one vectorised step rather than by indexed assignment
# into a column that does not exist yet.
posts_median <- median(data_select$num_posts)
data_select$post_above_median <- ifelse(data_select$num_posts >= posts_median, 1, 0)

table(data_select$post_above_median)

prop.table(table(data_select$post_above_median))

In [None]:
# histogram of final_grade, side by side for posts below / at-or-above the median
ggplot(
  data = data_select,
  mapping = aes(x = final_grade, fill = as.factor(post_above_median))
) +
  geom_histogram(position = "dodge", bins = 20)

In [None]:
# scatter plot: final_grade against number of posts
ggplot(data = data_select, mapping = aes(x = num_posts, y = final_grade)) +
  geom_point(color = "cyan3")

In [None]:
# distribution of outcome by posts + linear fitted line
# (the column 'n' was renamed to 'num_posts' earlier in this step, so the
#  x aesthetic must use the new name)
ggplot(data_select, aes(x = num_posts, y = final_grade)) +
  geom_point(color = "cyan3") +
  geom_smooth(formula = y ~ x, method = lm, color = "red", se = FALSE)

In [None]:
# convert the outcome to a factor
# (gpaB was already converted to a factor in Step 2, so this call leaves it as is)
data_select$gpaB <- as.factor(data_select$gpaB)

In [None]:
# label variables
# gpaB already received its labels in Step 2. Calling factor(levels = c(0, 1))
# on the labelled factor matches none of its levels and turns every value
# into NA, so only relabel when the raw 0/1 codes are still present.
if (all(levels(data_select$gpaB) %in% c("0", "1"))) {
  data_select$gpaB <- factor(data_select$gpaB,
                      levels = c(0, 1),
                      labels = c("GPA < B", "GPA >= B"))
}

In [None]:
# remove 'final_grade'
# any_of() drops the column when it is present and does nothing otherwise,
# so this cell can be re-run (or run after an earlier drop) without the
# "column doesn't exist" error that select(-final_grade) raises.
data_select <-
            data_select %>%
            select(
                -any_of("final_grade")
            )

### Step 4: Partition data (train & test)

In [None]:
# fix the random seed so the train/test split is reproducible
set.seed(20210524)

In [None]:
# share of rows that goes to the training sample (80% training, 20% test)
split <- 0.8
# row indices of the training sample, returned as a matrix (list = FALSE)
trainIndex <- createDataPartition(y = data_select$gpaB, p = split, list = FALSE)

In [None]:
# training sample: the rows picked by createDataPartition()
train <- data_select[trainIndex, ]

In [None]:
View(train)

In [None]:
# test sample: every row that is not in the training sample
test <- data_select[-trainIndex, ]

In [None]:
View(test)

### Step 5: train the model

In [None]:
# fix the random seed so the model fit is reproducible
set.seed(20210524)

In [None]:
# fit a decision tree (rpart) for gpaB using all other columns of the training
# data; tuneLength = 10 sets how many tuning values caret tries, and the best
# one is chosen by Accuracy
tree <- train(
  gpaB ~ .,
  data = train,
  method = "rpart",
  tuneLength = 10,
  metric = "Accuracy"
)

In [None]:
# get a summary of the model we just built
# (printing the caret object shows the resampling results)
tree

In [None]:
# final model
# (the fitted model object stored inside the caret result)
tree$finalModel

In [None]:
# tree graph
library(rpart.plot)

# uncomment the pdf() / dev.off() pair to save the plot to ./figure/tree.pdf
#pdf("./figure/tree.pdf")

rpart.plot(tree$finalModel)

#dev.off()

### Step 6: Predictive accuracy on the test data

In [None]:
# setting a seed for reproducibility
set.seed(20210524)

In [None]:
# Create a new object for the testing data including predicted values.
# The fitted model is `tree` (no object called `rf` exists in this notebook),
# and the outcome is `gpaB` (`final_grade` was removed before partitioning).
test_augmented <-
    test %>%
    mutate(pred = predict(tree, test),
           obs  = gpaB)

In [None]:
# Convert to a plain data frame and compute the performance summary from
# the `obs` and `pred` columns
defaultSummary(as.data.frame(test_augmented))

### Variable importance

In [None]:
# setting a seed for reproducibility
set.seed(20210524)

In [None]:
# Fit a random forest (ranger) for the same outcome and request
# permutation-based variable importance.
# The outcome is `gpaB`: `final_grade` is no longer a column of `train`.
rf_imp <- train(gpaB ~ .,
                data = train,
                method = "ranger",
                importance = "permutation")

# Extract the variable importance from this new model
varImp(rf_imp)

In [None]:
# variable importance of the tree model as a horizontal bar chart
# take the first element of the varImp() result and move its row names
# into a 'var' column
tree_importance <-
  varImp(tree) %>%
  pluck(1) %>%
  rownames_to_column("var")

ggplot(tree_importance, aes(x = reorder(var, Overall), y = Overall)) +
  geom_col(fill = "cyan4") +
  coord_flip() +
  theme_dataedu()

In [None]:
# resampling setup for re-fitting the tree
# Uses leave-group-out cross-validation. `repeats` is dropped because caret
# only uses it for repeated k-fold CV, and `savePredictions` is spelled out
# in full instead of relying on partial argument matching.
ctrl <- trainControl(
                     method = "LGOCV",
                     savePredictions = TRUE,
                     verboseIter = TRUE,
                     preProcOptions = list(thresh = 0.95)
                    )

# NOTE(review): not used by the train() call below; kept in case later cells rely on it
preProcessInTrain <- c("center", "scale")
# gpaB is a two-level factor, so this is a classification task:
# select the model by Accuracy rather than RMSE
metric_used <- "Accuracy"

# the outcome is `gpaB`; `final_grade` is no longer a column of `train`
tree <- train(gpaB ~ .,
              data = train,
              method = "rpart",
              trControl = ctrl,
              metric = metric_used,
              tuneLength = 10
              )

In [None]:
# plot the final tree of the re-fitted model
# (rpart.plot is attached again here so this cell also works when run on its own)
library(rpart.plot)
rpart.plot(tree$finalModel)

### Tune the model: grow a larger tree

### Decision tree vs. random forest

next:

- tuning the model

- add a roadmap:
    - big picture -> example -> coding -> summary & theory

In [436]:
# end-of-notebook message
print("Well done!")

[1] "Well done!"
