added count_string and count_vector

trinker · Aug 3, 2015 · 6a03273 · 6a03273
1 parent a491ae0
commit 6a03273
Show file tree

Hide file tree

Showing 16 changed files with 365 additions and 4 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -18,4 +18,4 @@ inst/syllable_logo
 inst/staticdocs
 inst/extra_statdoc
 inst/maintenance.R
-
+inst/syllable_dictionary_scraping
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,5 +1,7 @@
 # Generated by roxygen2 (4.1.1): do not edit by hand
 
 export(compute_syllable_counts)
+export(count_string)
 export(lookup_syllable_counts)
+export(sum_string)
 importFrom(data.table,setDT)
diff --git a/R/compute_syllable_counts.R b/R/compute_syllable_counts.R
@@ -18,6 +18,7 @@
 compute_syllable_counts <- function(x) {
 
     m <- gsub("[ye]{1,2}ing", "XX", tolower(x))
+    m <- gsub("'s", "", m)
     m <- gsub("ie(st|r)$", "XX", m)
     m <- gsub("([aeiouy][^td]*?)ed$", "\\1", m)
     m <- gsub("ely$", "ly", m)

diff --git a/R/count_string.R b/R/count_string.R
@@ -0,0 +1,28 @@
+#' String Syllables Counts
+#'
+#' Syllable counts for the words in a single string.
+#'
+#' @param x A character string.
+#' @param \ldots ignored
+#' @return Returns a vector of integer counts.
+#' @export
+#' @examples
+#' count_string("I like chicken and eggs for breakfast")
+#' count_string(hamlets_soliloquy)
+#' library(stringi)
+#'
+#' data.frame(
+#'     word = stri_extract_all_words(stri_trans_tolower(hamlets_soliloquy))[[1]],
+#'     syllables = count_string(hamlets_soliloquy)
+#' )
+count_string <- function(x, ...){
+
+    if (length(x) > 1) {
+        stop("`count_string` operates on a string.\n",
+        "Consider using `count_vector` instead")
+    }
+
+    syllable_count_long_vector(x)
+}
+
+
diff --git a/R/sum_string.R b/R/sum_string.R
@@ -0,0 +1,21 @@
+#' String Syllables Sum
+#'
+#' Syllable sum for the words in a single string.
+#'
+#' @param x A character string.
+#' @param \ldots ignored
+#' @return Returns a single integer of the total number of syllables in the string.
+#' @export
+#' @examples
+#' sum_string("I like chicken and eggs for breakfast")
+#' sum_string(hamlets_soliloquy)
+sum_string <- function(x, ...){
+
+    if (length(x) > 1) {
+        stop("`count_string` operates on a string.\n",
+        "Consider using `count_vector` instead")
+    }
+
+    sum(syllable_count_long_vector(x), na.rm = TRUE)
+}
+
diff --git a/R/syllable-package.R b/R/syllable-package.R
@@ -27,3 +27,36 @@ NULL
 NULL
 
 
+
+
+#' Hamlet's Soliloquy
+#'
+#' A dataset containing a character string of Hamlet's soliloquy.
+#'
+#' @docType data
+#' @keywords datasets
+#' @name hamlets_soliloquy
+#' @usage data(hamlets_soliloquy)
+#' @format A character vector with 1 element
+NULL
+
+
+#' 2012 U.S. Presidential Debates
+#'
+#' A dataset containing a cleaned version of all three presidential debates for
+#' the 2012 election.
+#'
+#' @details
+#' \itemize{
+#'   \item person. The speaker
+#'   \item tot. Turn of talk
+#'   \item dialogue. The words spoken
+#'   \item time. Variable indicating which of the three debates the dialogue is from
+#' }
+#'
+#' @docType data
+#' @keywords datasets
+#' @name presidential_debates_2012
+#' @usage data(presidential_debates_2012)
+#' @format A data frame with 2912 rows and 4 variables
+NULL
diff --git a/R/utils.R b/R/utils.R
@@ -9,7 +9,76 @@ hash_lookup_helper <- function(terms, key) {
 
 
 make_words <- function(x){
-    x <- gsub("^\\s+|\\s+$|[^\\p{L}' -]+", "", tolower(x), perl = TRUE)
-    x <- paste(x, collapse = " & ")
-    stringi::stri_split_regex(x, "\\s+")
+    stringi::stri_extract_all_words(gsub("\\d", "", stringi::stri_trans_tolower(x)))
+}
+
+count_row_length <- function(x){
+    x <- stringi::stri_count_words(gsub("\\d", "", x))
+    x[is.na(x) | x == 0] <- 1
+    x
+}
+
+
+relist_vector <- function(vector, lens){
+    ends <- cumsum(as.numeric(lens))
+    starts <- c(1, utils::head(c(ends + 1), -1))
+    Map(function(s, e) {vector[s:e]}, starts, ends)
+}
+
+add_row_id <- function(lens){
+    rep(seq_along(lens), lens)
+}
+
+syllable_count_long_df <- function(x){
+
+    # found the number of words per string
+    lens <- count_row_length(x)
+
+    # split into bag of words
+    y <- unlist(make_words(x))
+
+    # find NAs
+    NAs <- which(is.na(y))
+
+    # lookup syllable counts
+    counts <- lookup_syllable_counts(y)
+
+    # find words that could not be found
+    not_found <- which(is.na(counts))
+    not_found <- not_found[!not_found %in% NAs]
+
+    # compute syllable count on not found words
+    counts[not_found] <- compute_syllable_counts(y[not_found])
+
+    # make a syllable dataframe long version
+    data.frame(
+        string_number = add_row_id(lens),
+        count = counts,
+        stringsAsFactors = FALSE
+
+    )
+
+}
+
+
+syllable_count_long_vector <- function(x){
+
+    # split into bag of words
+    y <- unlist(make_words(x))
+
+    # find NAs
+    NAs <- which(is.na(y))
+
+    # lookup syllable counts
+    counts <- lookup_syllable_counts(y)
+
+    # find words that could not be found
+    not_found <- which(is.na(counts))
+    not_found <- not_found[!not_found %in% NAs]
+
+    # compute syllable count on not found words
+    counts[not_found] <- compute_syllable_counts(y[not_found])
+
+    counts
+
 }
diff --git a/data/hamlets_soliloquy.rda b/data/hamlets_soliloquy.rda
diff --git a/data/presidential_debates_2012.rda b/data/presidential_debates_2012.rda
diff --git a/inst/syllable_dictionary_scraping/scrape_syllables.R b/inst/syllable_dictionary_scraping/scrape_syllables.R
@@ -0,0 +1,100 @@
+p_load(dplyr, XML, qdapDictionaries, parallel)
+
+#syllable_df <- list2df(as.list.environment(qdap::env.syl), "syllables", "word")[, 2:1]
+#no_syllable <- GradyAugmented[!GradyAugmented %in% syllable_df$word]
+
+syllable_scrape <- function(x){
+    URL <- sprintf('http://www.poetrysoup.com/syllables/%s', x)
+    Table <- XML::readHTMLTable(URL, FALSE, which=2, stringsAsFactors = FALSE)[1:2, 2] 
+    NAdet <- function(x) grepl("^N/A$", x)
+    data.frame(word = x, syllables = as.numeric(Table[1]), 
+        in_dict = NAdet(Table[2]), stringsAsFactors = FALSE)
+}
+
+
+syllable_scrape('jo-ann')
+
+
+words <- sort(unique(tolower(c(GradyAugmented, tolower(NAMES[[1]])))))
+iter_list <- split((iter <- seq_along(words)), cut(iter, 20))
+n <- 1
+
+n <- 1 + n; n
+#=====================
+tic()
+###########
+
+
+cl <- makeCluster(mc <- getOption("cl.cores", 6))
+clusterExport(cl=cl, varlist=c("words", "syllable_scrape"), envir=environment())
+
+sylls <- parallel::parLapply(cl, iter_list[[n]], function(i) {
+
+    Sys.sleep(.1)
+    try(syllable_scrape(words[i]))
+
+})
+
+saveRDS(sylls, file = sprintf("sylls/sylls_%s.rds", n))
+stopCluster(cl)  
+
+
+redos <- sapply(sylls, inherits, "try-error")
+
+grab_again <- words[iter_list[[n]]][redos]
+if (!identical(character(0), grab_again)){
+
+    sylls[redos] <- lapply(grab_again, function(x){
+        Sys.sleep(.1)
+        try(syllable_scrape(x))
+    })
+
+    (wrds <- words[iter_list[[n]]][sapply(sylls, inherits, "try-error")])
+    cat(sprintf("%s errors\n", length(wrds)))
+    if (length(wrds) > 0) cat(paste(wrds, collapse =", "), "\n")
+
+    sylls[sapply(sylls, inherits, "try-error")] <- lapply(wrds, function(x){
+        out <- syllable_count(x)
+        data_frame(word = out[1,1], syllables = out[1, 2], in_dict = ifelse(out[1,3] == "NF", FALSE, TRUE))
+    })
+
+    saveRDS(sylls, file = sprintf("sylls/sylls_%s.rds", n))
+} else {
+    cat("no errors\n")
+}
+#############
+toc()
+#=====================
+
+
+syll_df <- dplyr::rbind_all(sylls[!sapply(sylls, inherits, "try-error")])
+saveRDS(syll_df, file = sprintf("syll_df/syll_df_%s.rds", n))
+
+
+syllable_counts_data <- dplyr::rbind_all(lapply(1:20, function(i){
+
+    readRDS(sprintf("syll_df/syll_df_%s.rds", i))
+}))
+
+
+
+
+sylls_original <- syllable_counts_data %>%
+    filter(!in_dict) %>%
+    select(word) %>%
+    inner_join(syllable_df) %>%
+    `[`(1:47,) 
+
+
+syllable_counts_data <- syllable_counts_data[!syllable_counts_data[["word"]] %in% sylls_original[["word"]], ]
+
+sylls_original[["in_dict"]] <- TRUE 
+
+syllable_counts_data <- dplyr::rbind_all(list(syllable_counts_data, sylls_original)) %>%
+    arrange(word) %>%
+    select(-in_dict) %>%
+    as.data.frame(, stringsAsFactors = FALSE) 
+
+
+data.table::setDT(syllable_counts_data)
+data.table::setkey(syllable_counts_data, word) 
diff --git a/man/count_string.Rd b/man/count_string.Rd
@@ -0,0 +1,30 @@
+% Generated by roxygen2 (4.1.1): do not edit by hand
+% Please edit documentation in R/count_string.R
+\name{count_string}
+\alias{count_string}
+\title{String Syllables Counts}
+\usage{
+count_string(x, ...)
+}
+\arguments{
+\item{x}{A character string.}
+
+\item{\ldots}{ignored}
+}
+\value{
+Returns a vector of integer counts.
+}
+\description{
+Syllable counts for the words in a single string.
+}
+\examples{
+count_string("I like chicken and eggs for breakfast")
+count_string(hamlets_soliloquy)
+library(stringi)
+
+data.frame(
+    word = stri_extract_all_words(stri_trans_tolower(hamlets_soliloquy))[[1]],
+    syllables = count_string(hamlets_soliloquy)
+)
+}
+
diff --git a/man/hamlets_soliloquy.Rd b/man/hamlets_soliloquy.Rd
@@ -0,0 +1,15 @@
+% Generated by roxygen2 (4.1.1): do not edit by hand
+% Please edit documentation in R/syllable-package.R
+\docType{data}
+\name{hamlets_soliloquy}
+\alias{hamlets_soliloquy}
+\title{Hamlet's Soliloquy}
+\format{A character vector with 1 element}
+\usage{
+data(hamlets_soliloquy)
+}
+\description{
+A dataset containing a character string of Hamlet's soliloquy.
+}
+\keyword{datasets}
+
diff --git a/man/presidential_debates_2012.Rd b/man/presidential_debates_2012.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2 (4.1.1): do not edit by hand
+% Please edit documentation in R/syllable-package.R
+\docType{data}
+\name{presidential_debates_2012}
+\alias{presidential_debates_2012}
+\title{2012 U.S. Presidential Debates}
+\format{A data frame with 2912 rows and 4 variables}
+\usage{
+data(presidential_debates_2012)
+}
+\description{
+A dataset containing a cleaned version of all three presidential debates for
+the 2012 election.
+}
+\details{
+\itemize{
+  \item person. The speaker
+  \item tot. Turn of talk
+  \item dialogue. The words spoken
+  \item time. Variable indicating which of the three debates the dialogue is from
+}
+}
+\keyword{datasets}
+
diff --git a/man/sum_string.Rd b/man/sum_string.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2 (4.1.1): do not edit by hand
+% Please edit documentation in R/sum_string.R
+\name{sum_string}
+\alias{sum_string}
+\title{String Syllables Sum}
+\usage{
+sum_string(x, ...)
+}
+\arguments{
+\item{x}{A character string.}
+
+\item{\ldots}{ignored}
+}
+\value{
+Returns a single integer of the total number of syllables in the string.
+}
+\description{
+Syllable sum for the words in a single string.
+}
+\examples{
+sum_string("I like chicken and eggs for breakfast")
+sum_string(hamlets_soliloquy)
+}
+
diff --git a/tests/testthat/test-count_string.R b/tests/testthat/test-count_string.R
@@ -0,0 +1,7 @@
+context("Checking count_string")
+
+test_that("count_string ...",{
+
+
+})
+