# Web Scraping of The Great Courses Plus with R

## Loading required packages

In [2]:
library(httr)
library(dplyr)
library(xml2)
library(rvest)
library(RSelenium)
library(stringr)
library(googledrive)

## Entering course information

In [None]:
# Course landing page to scrape.
course_url <- "https://www.thegreatcoursesplus.com/understanding-calculus"

# Account credentials used for the Selenium login step.
# They are read from the TGCP_EMAIL / TGCP_PASSWORD environment variables so
# that real credentials never have to be saved in the notebook; when a variable
# is unset, the original placeholder value is used instead.
user_email <- Sys.getenv("TGCP_EMAIL", unset = "sample@email.com")
user_password <- Sys.getenv("TGCP_PASSWORD", unset = "sample_password")

# Google Drive target folder; the guidebook upload step reads `gd$path`.
# Uncomment (and authenticate with googledrive) to enable uploads.
#gd <- drive_get("Understanding Calculus: Problems, Solutions and Tips") 

## Getting the lecture information for the course

In [6]:
# Build `tbl`, a data frame with one row per lecture (Title, URL), by parsing
# the public course page.
page <- read_html(course_url)

# Common XPath prefix of the lecture list container on the course page.
lecture_list_xpath <- '//*[@id="page-content"]/section/div[3]/div[2]/div/div/div'

# The list header text starts with the lecture count; take its first word.
num_lec <- page %>%
    html_nodes(xpath = paste0(lecture_list_xpath, "/h2")) %>%
    html_text() %>%
    strsplit(x = ., split = " ")
if (length(num_lec) == 0) {
    stop("Lecture list header not found on ", course_url, call. = FALSE)
}
num <- as.numeric(num_lec[[1]][1])
if (is.na(num)) {
    stop("Could not parse the number of lectures from the page header",
         call. = FALSE)
}

# Return the first match, or NA when the XPath matched nothing, so that a
# single malformed entry does not abort the whole scrape.
first_or_na <- function(x) {
    if (length(x) > 0) as.character(x[[1]]) else NA_character_
}

# seq_len() keeps the iteration empty when num is 0 (1:0 would yield c(1, 0)).
lecture_idx <- seq_len(num)

title <- vapply(lecture_idx, function(i) {
    page %>%
        html_nodes(xpath = paste0(lecture_list_xpath, "/div/a[", i, "]/div[4]/div[1]")) %>%
        html_text() %>%
        first_or_na()
}, character(1))

id <- vapply(lecture_idx, function(i) {
    page %>%
        html_nodes(xpath = paste0(lecture_list_xpath, "/div/a[", i, "]/div[3]/div")) %>%
        html_attr("data-object-id") %>%
        first_or_na()
}, character(1))

if (anyNA(id)) {
    warning("No film id found for lecture(s): ",
            paste(which(is.na(id)), collapse = ", "), call. = FALSE)
}

# Leave the URL as NA when the id is missing instead of building "...filmId=NA".
player_url <- ifelse(
    is.na(id),
    NA_character_,
    paste0("https://www.thegreatcoursesplus.com/embed/player?filmId=", id)
)

# stringsAsFactors = FALSE keeps both columns as character on R < 4.0 as well.
tbl <- data.frame(Title = title, URL = player_url, stringsAsFactors = FALSE)
nrow(tbl)
tbl

Title,URL
A Preview of Calculus,https://www.thegreatcoursesplus.com/embed/player?filmId=0000014c-ca3f-d119-af4d-ca3fbc950000
"Review—Graphs, Models, and Functions",https://www.thegreatcoursesplus.com/embed/player?filmId=0000014c-ca3f-d119-af4d-ca3fe6770000
Review—Functions and Trigonometry,https://www.thegreatcoursesplus.com/embed/player?filmId=0000014c-ca40-d119-af4d-ca55125b0000
Finding Limits,https://www.thegreatcoursesplus.com/embed/player?filmId=0000014c-ca40-d119-af4d-ca553d940000
An Introduction to Continuity,https://www.thegreatcoursesplus.com/embed/player?filmId=0000014c-ca40-d119-af4d-ca5566ed0000
Infinite Limits and Limits at Infinity,https://www.thegreatcoursesplus.com/embed/player?filmId=0000014c-ca40-d119-af4d-ca5590160000
The Derivative and the Tangent Line Problem,https://www.thegreatcoursesplus.com/embed/player?filmId=0000014c-ca40-d119-af4d-ca55bade0000
Basic Differentiation Rules,https://www.thegreatcoursesplus.com/embed/player?filmId=0000014c-ca40-d119-af4d-ca55e70c0000
Product and Quotient Rules,https://www.thegreatcoursesplus.com/embed/player?filmId=0000014c-ca41-d119-af4d-ca55164f0000
The Chain Rule,https://www.thegreatcoursesplus.com/embed/player?filmId=0000014c-ca41-d119-af4d-ca55403d0000


## Navigating to "The Great Courses Plus" sign-in page and logging in

In [15]:
# Start a Selenium server/browser pair and keep a handle on the client; the
# later cells drive the same browser session through `remDr`.
rD <- rsDriver(verbose = FALSE)
remDr <- rD$client

# Open the sign-in page.
url <- "https://www.thegreatcoursesplus.com/sign-in"
remDr$navigate(url)

# Both credential inputs sit under the same form inside the sign-in modal.
login_form_xpath <- '//*[@id="modal"]/div/div/div[2]/div[1]/form'

# Type the e-mail address into the first input.
email <- remDr$findElement(
    using = "xpath",
    value = paste0(login_form_xpath, "/p[1]/input")
)
email$sendKeysToElement(list(user_email))

# Type the password into the second input, then press Enter to submit.
password <- remDr$findElement(
    using = "xpath",
    value = paste0(login_form_xpath, "/p[2]/input")
)
password$sendKeysToElement(list(user_password, key = "enter"))

# Show the resulting page title as a quick visual check.
remDr$getTitle()

## Navigating to the course page and downloading the course guidebook

In [9]:
# Open the course page in the Selenium-driven browser and show its title.
remDr$navigate(course_url)
remDr$getTitle()

# Locate the guidebook link and read its target address.
# getElementAttribute() returns a list, hence the book_url[[1]] used later on.
guidebook_info <- remDr$findElement(
    using = "xpath",
    value = '//*[@id="page-content"]/section/div[1]/div/div[2]/div/div[2]/span[1]/a'
)
book_url <- guidebook_info$getElementAttribute("href")

# File name of the guidebook: everything from the first digit of the URL on.
book_title <- str_extract(string = book_url, pattern = "[0-9]+.*")
book_title

# Course id: the leading digits of the file name, up to the first "%" or "_"
# separator, with that separator stripped.
id_with_separator <- str_extract(string = book_title, pattern = "[0-9]*[%_]")
course_id <- gsub(pattern = "[%_]", replacement = "", x = id_with_separator)
course_id

In [9]:
# Download the guidebook into the current working directory.
# download.file() returns 0 on success; stop on anything else so that a
# partial or missing file is never uploaded.
status <- download.file(url = book_url[[1]], destfile = book_title, method = "curl")
if (status != 0) {
    stop("Guidebook download failed with status ", status, call. = FALSE)
}

# Upload to Google Drive only when the target folder `gd` has been set up
# (its drive_get() line in the settings cell is commented out by default).
# The local copy is removed only after the upload step has run; otherwise it
# is kept so the download is not lost.
if (exists("gd")) {
    drive_upload(media = book_title, path = paste0(gd$path, book_title))
    file.remove(book_title)
} else {
    message("`gd` is not defined; skipping the Drive upload and keeping ",
            book_title, " locally.")
}

Local file:
  * 1007%20Calculus.pdf
uploaded into Drive file:
  * 1007%20Calculus.pdf: 1xKMI7c_2fabR4cBTfbbigjavwnj56LGw
with MIME type:
  * application/pdf


## Navigating to each lecture video and downloading the video and subtitles

In [7]:
# The lecture downloads below use relative file names, so the videos and
# subtitles are written into this directory.
# NOTE(review): hard-coded Windows path -- adjust it per machine; setwd()
# raises an error when the directory does not exist.
setwd("D:/TTL")

In [17]:
# Download the video and the SRT subtitle file for a range of lectures.
# Uses `tbl` (lecture titles and player URLs), `course_id` and the logged-in
# Selenium client `remDr` created by the earlier cells.

# Lectures (row numbers of `tbl`) to fetch in this run.
lecture_range <- 26:36

# Fail early with a clear error instead of navigating to an NA URL when the
# requested range does not exist in the lecture table.
stopifnot(all(lecture_range %in% seq_len(nrow(tbl))))

for(i in lecture_range){
    # as.character() guards against factor columns (data.frame() on R < 4.0).
    lecture_url <- as.character(tbl[i,2])
    remDr$navigate(lecture_url)
    # Return value is unused inside the loop; kept as a round-trip to the
    # browser before the video element is queried.
    remDr$getTitle()
    
    # Read the media address from the player's <video> element and strip the
    # query string.
    film_info <- remDr$findElement(using="xpath", '//*[@id="snagHtml5Player"]/div[2]/video')
    film_url <- film_info$getElementAttribute("src")
    old_url <- gsub(pattern = "\\?.*", replacement = "", x = film_url[[1]])
    print(old_url)
    
    # Rewrite any "<n>kbps" token in the address to 9216kbps, then switch the
    # scheme to plain http.
    # NOTE(review): presumably http is used to keep the curl download simple --
    # confirm that https is not required.
    new_url <- gsub(pattern = "[0-9]*kbps", replacement = "9216kbps", x = old_url)
    new_url <- gsub(pattern = "https://", replacement = "http://", x = new_url)
    print(new_url)
    
    # Subtitle files are named <course_id>_<two-digit lecture number>.srt.
    # sprintf() zero-pads the number; the name is built once and reused for
    # both the remote URL and the local file.
    srt_name <- paste0(course_id,"_",sprintf("%02d", i),".srt")
    sub_url <- paste0("http://vtgcmp4-viewlift.akamaized.net/Captions/SRT/",course_id,"_SRT/",srt_name)
    print(sub_url)
    
    # Drop characters from the title that are not allowed in file names.
    new_title <- gsub(pattern = "[:?/]", replacement = "", x = tbl[i,1])
    download.file(new_url, destfile = paste0(i,". ", new_title,".mp4"), method = "curl")
    download.file(sub_url, destfile = srt_name)
    
    #mp4 <- list.files(pattern = ".mp4")
    #srt <- list.files(pattern = ".srt")
    #drive_upload(media = mp4, path = paste0(gd$path, mp4))
    #drive_upload(media = srt, path = paste0(gd$path, srt))
    #file.remove(mp4)
    #file.remove(srt)
}

[1] "https://vtgcmp4-snagfilms.akamaized.net/video_assets/2015/mp4/understanding-calculus/exponential-function/1148423545001_3696453012001_18170-anon-eastbaymedia-drm-courses-1007-m4v-TGC-1007-Lect26-HighSchoolLevelCalculus.mp4"
[1] "http://vtgcmp4-snagfilms.akamaized.net/video_assets/2015/mp4/understanding-calculus/exponential-function/1148423545001_3696453012001_18170-anon-eastbaymedia-drm-courses-1007-m4v-TGC-1007-Lect26-HighSchoolLevelCalculus.mp4"
[1] "http://vtgcmp4-viewlift.akamaized.net/Captions/SRT/1007_SRT/1007_26.srt"
[1] "https://vtgcmp4-snagfilms.akamaized.net/video_assets/2015/mp4/understanding-calculus/bases-other-than-em-e-em/1148423545001_3696452989001_18170-anon-eastbaymedia-drm-courses-1007-m4v-TGC-1007-Lect27-HighSchoolLevelCalculus.mp4"
[1] "http://vtgcmp4-snagfilms.akamaized.net/video_assets/2015/mp4/understanding-calculus/bases-other-than-em-e-em/1148423545001_3696452989001_18170-anon-eastbaymedia-drm-courses-1007-m4v-TGC-1007-Lect27-HighSchoolLevelCalculus.mp4"
