
added canonical internal data
trinker committed Dec 30, 2017
1 parent 0782eb2 commit b32bb15
Showing 4 changed files with 75 additions and 7 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
@@ -21,4 +21,5 @@ tools/textclean_logo/resize_icon.txt
 inst/staticdocs
 inst/extra_statdoc
 Thumbs.db
+inst/scraping_scripts

15 changes: 8 additions & 7 deletions DESCRIPTION
@@ -1,17 +1,18 @@
 Package: textclean
 Title: Text Cleaning Tools
 Version: 0.6.1
-Authors@R: c( person("Tyler", "Rinker", email = "tyler.rinker@gmail.com", role = c("aut", "cre")), person("ctwheels",
-    "StackOverflow", role = "ctb") )
+Authors@R: c( person("Tyler", "Rinker", email = "tyler.rinker@gmail.com", role = c("aut",
+    "cre")), person("ctwheels", "StackOverflow", role = "ctb") )
 Maintainer: Tyler Rinker <tyler.rinker@gmail.com>
-Description: Tools to clean and process text. Tools are geared at checking for substrings that are not optimal for
-    analysis and replacing or removing them with more analysis friendly substrings. For example, emoticons
-    are often used in text but not always easily handled by analysis algorithms. The 'replace_emoticon()'
-    function replaces emoticons with word equivalents.
+Description: Tools to clean and process text. Tools are geared at checking for substrings
+    that are not optimal for analysis and replacing or removing them with more analysis
+    friendly substrings. For example, emoticons are often used in text but not always
+    easily handled by analysis algorithms. The 'replace_emoticon()' function replaces
+    emoticons with word equivalents.
 Depends: R (>= 3.2.3)
 Imports: english(>= 1.0-2), lexicon (>= 0.7.2), qdapRegex, stringi, textshape(>= 1.0.1), utils
 Suggests: testthat
-Date: 2017-12-21
+Date: 2017-12-30
 License: GPL-2
 LazyData: TRUE
 Roxygen: list(wrap = FALSE)
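
A minimal sketch of the 'replace_emoticon()' behavior the Description cites; the
replacement words come from the emoticon hash in the 'lexicon' package, so exact
output may vary by lexicon version:

library(textclean)

## an emoticon such as ':-)' is replaced with a word equivalent (e.g., 'smiley')
replace_emoticon("Great work on the release :-)")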
Binary file added R/sysdata.rda
Binary file not shown.
66 changes: 66 additions & 0 deletions inst/scraping_scripts/google_ngram_to_canonical.R
@@ -0,0 +1,66 @@
## Build a canonical-spelling lookup from the Google Books 1-gram files and save
## it as internal package data (R/sysdata.rda) for textclean
pacman::p_load(dplyr, data.table, stringi, R.utils)


#letter <- 'b'

## Download the gzipped 1-gram file for a single starting letter and unzip it
## into the local 'google_ngram' directory (assumed to already exist)
get_google_ngram_data <- function(letter, ...){

    loc <- sprintf('http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-1gram-20120701-%s.gz', letter) %>%
        textreadr::download()

    R.utils::gunzip(loc, destname = sprintf('google_ngram/googlebooks-eng-all-1gram-20120701-%s', letter), remove = FALSE)

}


## Read one unzipped 1-gram file and build a two-column lookup: 'canonical' (the
## word with repeated letters collapsed) mapped to its most frequent spelling
make_canonical_hash <- function(letter, ...){

    dat <- fread(sprintf('google_ngram/googlebooks-eng-all-1gram-20120701-%s', letter), sep = '\t', header = FALSE)[, 1:3  # word, year, count
        ][V2 == 2008,                                                    # keep only the 2008 counts
        ][, V1 := tolower(stri_replace_all_regex(V1, '_[A-Z]+$', ''))   # strip POS tags (e.g., '_NOUN'); lowercase
        ][stri_detect_regex(V1, '[^a-z\'-]', negate = TRUE),            # keep words of letters/apostrophes/hyphens only
        ][, list(V3 = sum(V3)), by = c('V1')                            # total count per surface spelling
        ][, canonical := gsub("([a-z])(\\1+)", '\\1', V1, perl = TRUE)  # collapse repeated letters -> canonical key
        ][, cnt := .N, by = 'canonical'
        ][cnt > 1,                                                      # keep keys with competing spellings
        ][, cnt := NULL
        #][order(canonical, -V3)
        ][, .SD[which.max(V3)], by = 'canonical'                        # most frequent spelling wins per key
        ][, V3 := NULL
        ][]

    setnames(dat, c('V1'), c('word'))
    setkey(dat, 'canonical')

    #setnames(dat, c('V1', 'V3'), c('word', 'n'))
    #setcolorder(dat, c("canonical", "word", "n"))

    dat
}

## Loop over a-z: each pass downloads, unzips, and builds the per-letter lookup
canonical <- lapply(letters, function(letter){
    gc()
    print(letter); flush.console()
    try(get_google_ngram_data(letter))
    try(make_canonical_hash(letter))
})

## check for errors
canonical %>%
sapply(inherits, 'try-error') %>%
sum()


## combine the per-letter tables and cache the result to disk
canonical %>%
rbindlist() %>%
unique() %>%
saveRDS('canonical.rds')


canonical <- readRDS('C:\\Users\\Tyler\\Desktop/canonical.rds')  # author's machine-specific copy of the cached file

devtools::use_data(canonical, internal = TRUE)  # writes the lookup to R/sysdata.rda




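A toy illustration of the canonicalization idea the script implements (invented
words and counts, not real 1-gram data): repeated letters are collapsed to form a
canonical key, and the most frequent surface spelling wins for each key.

library(data.table)

x <- data.table(
    word = c('so', 'soo', 'sooo', 'good', 'god'),
    n    = c(500L, 20L, 5L, 900L, 300L)
)

## 'so', 'soo', 'sooo' all share key 'so'; 'good' and 'god' share key 'god'
x[, canonical := gsub('([a-z])\\1+', '\\1', word, perl = TRUE)
  ][, .SD[which.max(n)], by = 'canonical']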