
added canonical internal data
trinker committed Dec 30, 2017
1 parent 0782eb2 commit b32bb15
Showing 4 changed files with 75 additions and 7 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
@@ -21,4 +21,5 @@ tools/textclean_logo/resize_icon.txt
 inst/staticdocs
 inst/extra_statdoc
 Thumbs.db
+inst/scraping_scripts

15 changes: 8 additions & 7 deletions DESCRIPTION
@@ -1,17 +1,18 @@
 Package: textclean
 Title: Text Cleaning Tools
 Version: 0.6.1
-Authors@R: c( person("Tyler", "Rinker", email = "tyler.rinker@gmail.com", role = c("aut", "cre")), person("ctwheels",
-    "StackOverflow", role = "ctb") )
+Authors@R: c( person("Tyler", "Rinker", email = "tyler.rinker@gmail.com", role = c("aut",
+    "cre")), person("ctwheels", "StackOverflow", role = "ctb") )
 Maintainer: Tyler Rinker <tyler.rinker@gmail.com>
-Description: Tools to clean and process text. Tools are geared at checking for substrings that are not optimal for
-    analysis and replacing or removing them with more analysis friendly substrings. For example, emoticons
-    are often used in text but not always easily handled by analysis algorithms. The 'replace_emoticon()'
-    function replaces emoticons with word equivalents.
+Description: Tools to clean and process text. Tools are geared at checking for substrings
+    that are not optimal for analysis and replacing or removing them with more analysis
+    friendly substrings. For example, emoticons are often used in text but not always
+    easily handled by analysis algorithms. The 'replace_emoticon()' function replaces
+    emoticons with word equivalents.
 Depends: R (>= 3.2.3)
 Imports: english(>= 1.0-2), lexicon (>= 0.7.2), qdapRegex, stringi, textshape(>= 1.0.1), utils
 Suggests: testthat
-Date: 2017-12-21
+Date: 2017-12-30
 License: GPL-2
 LazyData: TRUE
 Roxygen: list(wrap = FALSE)
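
A minimal sketch of the 'replace_emoticon()' behavior the Description cites; the
replacement words come from the emoticon hash in the 'lexicon' package, so exact
output may vary by lexicon version:

library(textclean)

## an emoticon such as ':-)' is replaced with a word equivalent (e.g., 'smiley')
replace_emoticon("Great work on the release :-)")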
Binary file added R/sysdata.rda
Binary file not shown.
66 changes: 66 additions & 0 deletions inst/scraping_scripts/google_ngram_to_canonical.R
@@ -0,0 +1,66 @@
## Build a canonical-spelling lookup from the Google Books 1-gram files and save
## it as internal package data (R/sysdata.rda) for textclean
pacman::p_load(dplyr, data.table, stringi, R.utils)


#letter <- 'b'

## Download the gzipped 1-gram file for a single starting letter and unzip it
## into the local 'google_ngram' directory (assumed to already exist)
get_google_ngram_data <- function(letter, ...){

    loc <- sprintf('http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-1gram-20120701-%s.gz', letter) %>%
        textreadr::download()

    R.utils::gunzip(loc, destname = sprintf('google_ngram/googlebooks-eng-all-1gram-20120701-%s', letter), remove = FALSE)

}


## Read one unzipped 1-gram file and build a two-column lookup: 'canonical' (the
## word with repeated letters collapsed) mapped to its most frequent spelling
make_canonical_hash <- function(letter, ...){

    dat <- fread(sprintf('google_ngram/googlebooks-eng-all-1gram-20120701-%s', letter), sep = '\t', header = FALSE)[, 1:3  # word, year, count
        ][V2 == 2008,                                                    # keep only the 2008 counts
        ][, V1 := tolower(stri_replace_all_regex(V1, '_[A-Z]+$', ''))   # strip POS tags (e.g., '_NOUN'); lowercase
        ][stri_detect_regex(V1, '[^a-z\'-]', negate = TRUE),            # keep words of letters/apostrophes/hyphens only
        ][, list(V3 = sum(V3)), by = c('V1')                            # total count per surface spelling
        ][, canonical := gsub("([a-z])(\\1+)", '\\1', V1, perl = TRUE)  # collapse repeated letters -> canonical key
        ][, cnt := .N, by = 'canonical'
        ][cnt > 1,                                                      # keep keys with competing spellings
        ][, cnt := NULL
        #][order(canonical, -V3)
        ][, .SD[which.max(V3)], by = 'canonical'                        # most frequent spelling wins per key
        ][, V3 := NULL
        ][]

    setnames(dat, c('V1'), c('word'))
    setkey(dat, 'canonical')

    #setnames(dat, c('V1', 'V3'), c('word', 'n'))
    #setcolorder(dat, c("canonical", "word", "n"))

    dat
}

## Loop over a-z: each pass downloads, unzips, and builds the per-letter lookup
canonical <- lapply(letters, function(letter){
    gc()
    print(letter); flush.console()
    try(get_google_ngram_data(letter))
    try(make_canonical_hash(letter))
})

## check for errors
canonical %>%
sapply(inherits, 'try-error') %>%
sum()


## combine the per-letter tables and cache the result to disk
canonical %>%
rbindlist() %>%
unique() %>%
saveRDS('canonical.rds')


canonical <- readRDS('C:\\Users\\Tyler\\Desktop/canonical.rds')  # author's machine-specific copy of the cached file

devtools::use_data(canonical, internal = TRUE)  # writes the lookup to R/sysdata.rda




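A toy illustration of the canonicalization idea the script implements (invented
words and counts, not real 1-gram data): repeated letters are collapsed to form a
canonical key, and the most frequent surface spelling wins for each key.

library(data.table)

x <- data.table(
    word = c('so', 'soo', 'sooo', 'good', 'god'),
    n    = c(500L, 20L, 5L, 900L, 300L)
)

## 'so', 'soo', 'sooo' all share key 'so'; 'good' and 'god' share key 'god'
x[, canonical := gsub('([a-z])\\1+', '\\1', word, perl = TRUE)
  ][, .SD[which.max(n)], by = 'canonical']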