/
newyorkparolescraper.Rmd
264 lines (188 loc) · 8.4 KB
/
newyorkparolescraper.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
---
title: "NY Parole Board Scraper"
output: html_notebook
author: daniel robert turner
date: 03-03-2023
---
This script retrieves data from 'https://publicapps.doccs.ny.gov/ParoleBoardCalendar/'.
V1 was pure rvest but couldn't access DIN-level information.
V2 uses Selenium to spoof a browser and scrape by clicking on elements directly.
V3 scrapes additional information from each DIN page and improves error handling.
V4 adds columns to help locate missing data.
## General Setup
```{r setup}
library( tidyverse )
library( rvest )
library( RSelenium )
library( tictoc )
```
# Version 1--Using rvest to submit the form directly
Retracing some of the steps using rvest since the data extraction will be similar
```{r}
# Open a browsing session on the Parole Board calendar page
url_base <- 'https://publicapps.doccs.ny.gov/ParoleBoardCalendar/Default'
sesh <- session(url_base)
# Scrape a single month for now; this could later sit inside a loop to
# cover every month/year combination without much extra work.
form_values <- list(
  'ctl00$MainContent$ddlMonth' = "01",
  'ctl00$MainContent$ddlYear' = "2022"
)
# Grab the last form on the page and fill in the month/year dropdowns
calendar_form <- sesh %>%
  read_html() %>%
  html_form() %>%
  last() %>%
  html_form_set(!!!form_values)
# Post the filled-in form; the response carries the calendar listing
calendar_response <- html_form_submit(calendar_form)
# Sanity-check the raw response
print(calendar_response %>% read_html())
# Pull the results table (one row per inmate) out of the response
calendar_table <- calendar_response %>%
  read_html() %>%
  html_nodes('table#MainContent_manyResultsTable') %>%
  html_table() %>%
  as.data.frame()
# Peek at a dozen random rows of the calendar table
sample_n(calendar_table, 12)
```
# Scraping with Selenium
## Bug check
Note there is a common bug in rsDriver() that can be circumvented by installing the latest version of JDK:
https://www.oracle.com/java/technologies/downloads/
```{r include=FALSE}
# Spin up a Selenium-driven Firefox instance and keep a handle to the client
# (chromever = NULL suppresses the Chrome-driver download we don't need)
rD <- rsDriver(browser = "firefox", chromever = NULL)
remDr <- rD$client
```
## Scraper chunk
```{r include=FALSE}
# Scrape target: which calendar year to pull
the.year <- "2022"
# Interactive-testing defaults; both are overwritten by the loops below
the.letter <- "G"
the.month <- "October"
# ---------------------------------------------------------------------------
# Main scraper: three nested loops -- month -> letter -> DIN.
# All state lives in the Selenium session (remDr); the navigate/click/goBack
# order below is load-bearing, so the code is annotated rather than restyled.
# Outputs: pbTank (rows with DIN-level detail where available) and
# pbStack (every raw letter table, bound as scraped).
# ---------------------------------------------------------------------------
tic("scraping")
# Create the Tank, which we will populate with inmates with DIN-level data
pbTank = data.frame()
# Create the Stack, which will populate with inmates with any data
pbStack = data.frame()
# Loop the months
for( the.month in month.name ){
# Start each month from the calendar landing page so the form is in a known state
remDr$navigate( url_base )
# Set the month/year dropdowns by typing into them
remDr$findElement(using = "id", value = "MainContent_ddlMonth")$sendKeysToElement(list(the.month))
remDr$findElement(using = "id", value = "MainContent_ddlMonth")$sendKeysToElement(list(the.month)) # turns out to help in a lot of cases to select the month twice.
remDr$findElement(using = "id", value = "MainContent_ddlYear")$sendKeysToElement(list(the.year))
remDr$findElement(using = "id", value = "MainContent_ddlYear")$sendKeysToElement(list(the.year)) # just to be safe
# Submit the form by selecting the button and clicking
webElem <- remDr$findElement(using = 'xpath', '//*[@id="MainContent_btnSubmit"]')
webElem$clickElement()
# Read back the month/year heading the page actually shows, so we can later
# verify the site honored our query (see expected_month below)
letterMonth <- remDr$getPageSource()[[1]] %>%
read_html() %>%
html_nodes('div#MainContent_manyResultsDiv > h2') %>%
html_text() %>%
str_remove("\n") %>%
trimws()
# From Z to A, loop through the alphabet to get results table
for( the.letter in rev( LETTERS ) ){ # We go in reverse alphabetical order so load each result the same way
# Navigate to the letter and click on it
webElem <- remDr$findElement(using = 'link text', the.letter)
webElem$clickElement()
# Show a progress message since this takes a while
message(paste("Processing:", letterMonth, the.letter))
# Parse the results table for this letter out of the page source
letterTable = remDr$getPageSource()[[1]] %>%
read_html() %>%
html_nodes('table#MainContent_manyResultsTable') %>%
html_table() %>%
as.data.frame()
# Add the raw letterTable to the Stack
# NOTE(review): this bind happens BEFORE the month columns are attached, so
# pbStack rows lack stated_month/queried_month -- confirm that is intended.
pbStack = bind_rows( pbStack, letterTable )
# Record the queried vs. displayed month so missing data can be traced later
letterTable$stated_month = letterMonth
letterTable$queried_month = paste(the.month, the.year)
letterTable$expected_month = letterTable$stated_month == letterTable$queried_month
# Are there inmates with DINs to scrape?
# NOTE(review): str_detect() over a whole data frame coerces it column-wise
# to character; it works here but checking a specific column would be sturdier.
letterEmpty = any( str_detect( letterTable, "No inmate interviews exist for this name/month.") )
# If there are DINs to scrape, go to work...
if(!letterEmpty){
# Collect the DINS
the.dins = letterTable$DIN
# Loop the DINS, clicking and scraping as we go
for( the.din in the.dins ){
# Navigate to, and click on the din link
webElem <- remDr$findElement(using = 'link text', the.din)
webElem$clickElement()
# Show a message when we scrape a DIN page
message("Retrieving additional data for: ", the.din)
# Extract the top table (parolee information) from the DIN detail page
dinTable = remDr$getPageSource()[[1]] %>%
read_html() %>%
html_nodes('table#MainContent_paroleeInformation') %>%
html_table() %>%
as.data.frame()
# In rare cases there's no data on the DIN page
# in which case we would skip the following and just go back to the letter page
if( length(dinTable) != 0 ){
# Pivot the table to make the first row colnames and the second row values
dinTable = dinTable %>%
rownames_to_column() %>%
pivot_longer(-rowname) %>%
pivot_wider(names_from=rowname, values_from=value)
# Add colnames from the first row; remove colons as we go
names( dinTable ) = as.character(unlist( dinTable[1,])) %>%str_replace_all( ":", "" )
# Subset out the first row (it became the column names above)
dinTable = dinTable[-1,]
# Delete any missing or NA fields
dinTable = dinTable %>%
select_if(~!(all(is.na(.)) | all(. == "")))
# Extract the bottom table, consisting of convictions
conTable = remDr$getPageSource()[[1]] %>%
read_html() %>%
html_nodes('table#MainContent_offenseInformationTable') %>%
html_table() %>%
as.data.frame( )
# Append the second table data as lists in string format (CSV friendly, much easier than relational databases)
dinTable$Crimes_Names = paste(conTable$Crime.of.conviction, sep = "", collapse = ',' )
dinTable$Crimes_Class = paste(conTable$Class, sep = "", collapse = ',' )
dinTable$Crimes_County = paste(conTable$County.of.commitment, sep = "", collapse = ',' )
# Clean up the output
# NOTE(review): select(-X1) errors if the pivoted table has no X1 column --
# presumably rownames_to_column always yields one here; verify on a live page.
dinTable = dinTable %>%
select(-X1)
# Join the letter and DIN-level data to the Tank
tankRow = merge( letterTable, dinTable, by="DIN" )
tankRow$dinParsed = TRUE
} else { # no parse-able DIN-level data
# Fall back to letter-level data only, flagged so it can be re-tried later
tankRow = letterTable %>% subset(DIN == the.din)
tankRow$dinParsed = FALSE
} # end empty DIN conditional
# Add tankRow to Tank
pbTank = bind_rows( pbTank, tankRow )
# Go back to the letter page so the next DIN link is findable
remDr$goBack()
} # end din loop
} # end empty letter conditional
} # end letter loop
} # end month loop
# end timing
toc()
# Write results into CSVs.
# NOTE(review): here::here() resolves to the project root, so these paths
# assume the project root is the user's home directory (targeting ~/Desktop).
# Adjust out_dir before running on another machine.
out_dir <- file.path(here::here(), "Desktop")
write_csv(pbTank, file.path(out_dir, paste0("nypb_", the.year, "_results.csv")))
# write_csv( pbStack, "/Users/dt/Exports/nyscraper-2022.csv") # not used
# Record rows whose DIN detail page could not be parsed, so they can be re-tried
DINerr <- pbTank %>% filter(dinParsed == FALSE)
write_csv(DINerr, file.path(out_dir, paste0("nypb_", the.year, "_DINerr.csv")))
```
# Wrap up
```{r}
# Close the browser window...
remDr$close()
# ...then shut down the Selenium server process itself
rD$server$stop()
```
# Check out the data
```{r}
# How many distinct rows ended up in the Tank?
pbTank %>% unique() %>% nrow()
# Interview counts broken down by the month the page reported
table( pbTank$stated_month )
```