---
title: "scrap_r_mailist"
author: "TC"
date: "6/14/2019"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(xml2)
library(dplyr)
```
## Background
https://d.cosx.org/d/420739-r
## Download all thread html
```{r}
# Collect the URL of every monthly thread index linked from the r-devel
# archive page. The capture group is the relative path
# "<year>-<Month>/thread.html". Note that "(19|20)" is an alternation; a
# bracketed "[19|20]" would instead match any single one of the characters
# 1, 9, |, 2, 0.
thread.l <- readLines("https://stat.ethz.ch/pipermail/r-devel/") %>%
  grep("thread.html", ., value = TRUE, fixed = TRUE) %>%
  sub(
    ".*?((19|20)[0-9]{2}-[A-Za-z]+/thread\\.html).*",
    "https://stat.ethz.ch/pipermail/r-devel/\\1",
    .,
    perl = TRUE
  )

# One local file per month, named after the archive directory
# (e.g. "./tmp/2019-June.html").
outf <- paste0("./tmp/", basename(dirname(thread.l)), ".html")

# Re-knitting should not warn just because the cache directory already exists.
dir.create("./tmp/", showWarnings = FALSE)
```
```{r download, eval=FALSE}
# Download each monthly thread index that is not already cached on disk.
# seq_along() yields an empty sequence when thread.l is empty, whereas
# 1:length(thread.l) would iterate over c(1, 0) and try to download NA.
for (i in seq_along(thread.l)) {
  if (!file.exists(outf[i])) {
    download_html(thread.l[i], file = outf[i])
  }
}
```
## Data
```{r}
# Paths of every file currently present in the local cache directory.
thread.l.local <- list.files(path = "./tmp/", full.names = TRUE)
```
## Scrape it
```{r}
# Parse one locally saved pipermail "thread.html" page into a tibble.
#
# Args:
#   file_thread: path to a saved thread index whose file name is
#     "<year>-<Month>.html" (the name is used to rebuild the archive URL).
#
# Returns:
#   A tibble with one row per top-level <li> of the second <ul> in <body>:
#     title    - trimmed text of the first <a> inside the item
#     link     - absolute URL built from that <a>'s href
#     reps     - number of <li> elements nested below the item
#     year_mon - the file name without its ".html" extension
scrap_thread_loc <- function(file_thread) {
  # Escape the dot and anchor the match so only a trailing ".html" is removed.
  year_mon <- sub("\\.html$", "", basename(file_thread))

  # Ends in "/" already, so hrefs are appended directly (no "//" in links).
  base_url <- paste0("https://stat.ethz.ch/pipermail/r-devel/", year_mon, "/")

  threads <- read_html(file_thread) %>%
    xml_find_all("//body/ul[2]/li")

  # xml_find_first() returns exactly one (possibly missing) node per thread,
  # which keeps title and link the same length as `threads`.
  first_link <- xml_find_first(threads, ".//a")

  # xml_text() yields one string per node; trimming drops the newline that
  # precedes the closing tag in the archive markup.
  title <- xml_text(first_link, trim = TRUE)
  link <- paste0(base_url, xml_attr(first_link, "href"))

  # vapply() guarantees an integer vector even when `threads` is empty.
  reps <- vapply(
    threads,
    function(node) length(xml_find_all(node, ".//li")),
    integer(1)
  )

  tibble(title, link, reps, year_mon = year_mon)
}
```
## scrape all
```{r}
# Parse every cached thread index, showing a progress bar while doing so.
all.rslt <- pbapply::pblapply(X = thread.l.local, FUN = scrap_thread_loc)
```
```{r}
# Stack the per-month tibbles into one. bind_rows() returns an empty tibble
# for an empty list, where do.call(rbind, list()) would return NULL.
all.rslt.df <- dplyr::bind_rows(all.rslt)
```
## Top 10 threads by reply count
```{r}
# Show the ten rows with the highest reply count as a table.
knitr::kable(
  head(arrange(all.rslt.df, desc(reps)), 10)
)
```
## Combine threads with the same title
```{r}
# Merge rows that share a title, add up their reply counts, and show the ten
# titles with the most replies. max() on link and year_mon compares strings,
# so the value kept per title is the lexicographically largest one.
all.rslt.df %>%
  group_by(title) %>%
  summarise(
    link = max(link),
    reps = sum(reps),
    year_mon = max(year_mon)
  ) %>%
  arrange(desc(reps)) %>%
  head(10) %>%
  knitr::kable()
```