Skip to content

Commit

Permalink
added count_string and count_vector
Browse files Browse the repository at this point in the history
  • Loading branch information
trinker committed Aug 3, 2015
1 parent a491ae0 commit 6a03273
Show file tree
Hide file tree
Showing 16 changed files with 365 additions and 4 deletions.
2 changes: 1 addition & 1 deletion .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ inst/syllable_logo
inst/staticdocs
inst/extra_statdoc
inst/maintenance.R

inst/syllable_dictionary_scraping
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Generated by roxygen2 (4.1.1): do not edit by hand

export(compute_syllable_counts)
export(count_string)
export(lookup_syllable_counts)
export(sum_string)
importFrom(data.table,setDT)
1 change: 1 addition & 0 deletions R/compute_syllable_counts.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
compute_syllable_counts <- function(x) {

m <- gsub("[ye]{1,2}ing", "XX", tolower(x))
m <- gsub("'s", "", m)
m <- gsub("ie(st|r)$", "XX", m)
m <- gsub("([aeiouy][^td]*?)ed$", "\\1", m)
m <- gsub("ely$", "ly", m)
Expand Down
28 changes: 28 additions & 0 deletions R/count_string.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#' String Syllables Counts
#'
#' Syllable counts for the words in a single string.
#'
#' @param x A character string.
#' @param \ldots ignored
#' @return Returns a vector of integer counts.
#' @export
#' @examples
#' count_string("I like chicken and eggs for breakfast")
#' count_string(hamlets_soliloquy)
#' library(stringi)
#'
#' data.frame(
#' word = stri_extract_all_words(stri_trans_tolower(hamlets_soliloquy))[[1]],
#' syllables = count_string(hamlets_soliloquy)
#' )
count_string <- function(x, ...){

if (length(x) > 1) {
stop("`count_string` operates on a string.\n",
"Consider using `count_vector` instead")
}

syllable_count_long_vector(x)
}


21 changes: 21 additions & 0 deletions R/sum_string.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#' String Syllables Sum
#'
#' Syllable sum for the words in a single string.
#'
#' @param x A character string.
#' @param \ldots ignored
#' @return Returns a single integer of the total number of syllables in the string.
#' @export
#' @examples
#' sum_string("I like chicken and eggs for breakfast")
#' sum_string(hamlets_soliloquy)
sum_string <- function(x, ...){

if (length(x) > 1) {
stop("`count_string` operates on a string.\n",
"Consider using `count_vector` instead")
}

sum(syllable_count_long_vector(x), na.rm = TRUE)
}

33 changes: 33 additions & 0 deletions R/syllable-package.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,36 @@ NULL
NULL




#' Hamlet's Soliloquy
#'
#' A dataset containing a character string of Hamlet's soliloquy.
#'
#' @docType data
#' @keywords datasets
#' @name hamlets_soliloquy
#' @usage data(hamlets_soliloquy)
#' @format A character vector with 1 element
NULL


#' 2012 U.S. Presidential Debates
#'
#' A dataset containing a cleaned version of all three presidential debates for
#' the 2012 election.
#'
#' @details
#' \itemize{
#' \item person. The speaker
#' \item tot. Turn of talk
#' \item dialogue. The words spoken
#' \item time. Variable indicating which of the three debates the dialogue is from
#' }
#'
#' @docType data
#' @keywords datasets
#' @name presidential_debates_2012
#' @usage data(presidential_debates_2012)
#' @format A data frame with 2912 rows and 4 variables
NULL
75 changes: 72 additions & 3 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,76 @@ hash_lookup_helper <- function(terms, key) {


make_words <- function(x){
x <- gsub("^\\s+|\\s+$|[^\\p{L}' -]+", "", tolower(x), perl = TRUE)
x <- paste(x, collapse = " & ")
stringi::stri_split_regex(x, "\\s+")
stringi::stri_extract_all_words(gsub("\\d", "", stringi::stri_trans_tolower(x)))
}

count_row_length <- function(x){
x <- stringi::stri_count_words(gsub("\\d", "", x))
x[is.na(x) | x == 0] <- 1
x
}


relist_vector <- function(vector, lens){
ends <- cumsum(as.numeric(lens))
starts <- c(1, utils::head(c(ends + 1), -1))
Map(function(s, e) {vector[s:e]}, starts, ends)
}

add_row_id <- function(lens){
rep(seq_along(lens), lens)
}

syllable_count_long_df <- function(x){

# found the number of words per string
lens <- count_row_length(x)

# split into bag of words
y <- unlist(make_words(x))

# find NAs
NAs <- which(is.na(y))

# lookup syllable counts
counts <- lookup_syllable_counts(y)

# find words that could not be found
not_found <- which(is.na(counts))
not_found <- not_found[!not_found %in% NAs]

# compute syllable count on not found words
counts[not_found] <- compute_syllable_counts(y[not_found])

# make a syllable dataframe long version
data.frame(
string_number = add_row_id(lens),
count = counts,
stringsAsFactors = FALSE

)

}


syllable_count_long_vector <- function(x){

# split into bag of words
y <- unlist(make_words(x))

# find NAs
NAs <- which(is.na(y))

# lookup syllable counts
counts <- lookup_syllable_counts(y)

# find words that could not be found
not_found <- which(is.na(counts))
not_found <- not_found[!not_found %in% NAs]

# compute syllable count on not found words
counts[not_found] <- compute_syllable_counts(y[not_found])

counts

}
Binary file added data/hamlets_soliloquy.rda
Binary file not shown.
Binary file added data/presidential_debates_2012.rda
Binary file not shown.
100 changes: 100 additions & 0 deletions inst/syllable_dictionary_scraping/scrape_syllables.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
p_load(dplyr, XML, qdapDictionaries, parallel)

#syllable_df <- list2df(as.list.environment(qdap::env.syl), "syllables", "word")[, 2:1]
#no_syllable <- GradyAugmented[!GradyAugmented %in% syllable_df$word]

syllable_scrape <- function(x){
URL <- sprintf('http://www.poetrysoup.com/syllables/%s', x)
Table <- XML::readHTMLTable(URL, FALSE, which=2, stringsAsFactors = FALSE)[1:2, 2]
NAdet <- function(x) grepl("^N/A$", x)
data.frame(word = x, syllables = as.numeric(Table[1]),
in_dict = NAdet(Table[2]), stringsAsFactors = FALSE)
}


syllable_scrape('jo-ann')


words <- sort(unique(tolower(c(GradyAugmented, tolower(NAMES[[1]])))))
iter_list <- split((iter <- seq_along(words)), cut(iter, 20))
n <- 1

n <- 1 + n; n
#=====================
tic()
###########


cl <- makeCluster(mc <- getOption("cl.cores", 6))
clusterExport(cl=cl, varlist=c("words", "syllable_scrape"), envir=environment())

sylls <- parallel::parLapply(cl, iter_list[[n]], function(i) {

Sys.sleep(.1)
try(syllable_scrape(words[i]))

})

saveRDS(sylls, file = sprintf("sylls/sylls_%s.rds", n))
stopCluster(cl)


redos <- sapply(sylls, inherits, "try-error")

grab_again <- words[iter_list[[n]]][redos]
if (!identical(character(0), grab_again)){

sylls[redos] <- lapply(grab_again, function(x){
Sys.sleep(.1)
try(syllable_scrape(x))
})

(wrds <- words[iter_list[[n]]][sapply(sylls, inherits, "try-error")])
cat(sprintf("%s errors\n", length(wrds)))
if (length(wrds) > 0) cat(paste(wrds, collapse =", "), "\n")

sylls[sapply(sylls, inherits, "try-error")] <- lapply(wrds, function(x){
out <- syllable_count(x)
data_frame(word = out[1,1], syllables = out[1, 2], in_dict = ifelse(out[1,3] == "NF", FALSE, TRUE))
})

saveRDS(sylls, file = sprintf("sylls/sylls_%s.rds", n))
} else {
cat("no errors\n")
}
#############
toc()
#=====================


syll_df <- dplyr::rbind_all(sylls[!sapply(sylls, inherits, "try-error")])
saveRDS(syll_df, file = sprintf("syll_df/syll_df_%s.rds", n))


syllable_counts_data <- dplyr::rbind_all(lapply(1:20, function(i){

readRDS(sprintf("syll_df/syll_df_%s.rds", i))
}))




sylls_original <- syllable_counts_data %>%
filter(!in_dict) %>%
select(word) %>%
inner_join(syllable_df) %>%
`[`(1:47,)


syllable_counts_data <- syllable_counts_data[!syllable_counts_data[["word"]] %in% sylls_original[["word"]], ]

sylls_original[["in_dict"]] <- TRUE

syllable_counts_data <- dplyr::rbind_all(list(syllable_counts_data, sylls_original)) %>%
arrange(word) %>%
select(-in_dict) %>%
as.data.frame(, stringsAsFactors = FALSE)


data.table::setDT(syllable_counts_data)
data.table::setkey(syllable_counts_data, word)
30 changes: 30 additions & 0 deletions man/count_string.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/count_string.R
\name{count_string}
\alias{count_string}
\title{String Syllables Counts}
\usage{
count_string(x, ...)
}
\arguments{
\item{x}{A character string.}

\item{\ldots}{ignored}
}
\value{
Returns a vector of integer counts.
}
\description{
Syllable counts for the words in a single string.
}
\examples{
count_string("I like chicken and eggs for breakfast")
count_string(hamlets_soliloquy)
library(stringi)

data.frame(
word = stri_extract_all_words(stri_trans_tolower(hamlets_soliloquy))[[1]],
syllables = count_string(hamlets_soliloquy)
)
}

15 changes: 15 additions & 0 deletions man/hamlets_soliloquy.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/syllable-package.R
\docType{data}
\name{hamlets_soliloquy}
\alias{hamlets_soliloquy}
\title{Hamlet's Soliloquy}
\format{A character vector with 1 element}
\usage{
data(hamlets_soliloquy)
}
\description{
A dataset containing a character string of Hamlet's soliloquy.
}
\keyword{datasets}

24 changes: 24 additions & 0 deletions man/presidential_debates_2012.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/syllable-package.R
\docType{data}
\name{presidential_debates_2012}
\alias{presidential_debates_2012}
\title{2012 U.S. Presidential Debates}
\format{A data frame with 2912 rows and 4 variables}
\usage{
data(presidential_debates_2012)
}
\description{
A dataset containing a cleaned version of all three presidential debates for
the 2012 election.
}
\details{
\itemize{
\item person. The speaker
\item tot. Turn of talk
\item dialogue. The words spoken
\item time. Variable indicating which of the three debates the dialogue is from
}
}
\keyword{datasets}

24 changes: 24 additions & 0 deletions man/sum_string.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/sum_string.R
\name{sum_string}
\alias{sum_string}
\title{String Syllables Sum}
\usage{
sum_string(x, ...)
}
\arguments{
\item{x}{A character string.}

\item{\ldots}{ignored}
}
\value{
Returns a single integer of the total number of syllables in the string.
}
\description{
Syllable sum for the words in a single string.
}
\examples{
sum_string("I like chicken and eggs for breakfast")
sum_string(hamlets_soliloquy)
}

7 changes: 7 additions & 0 deletions tests/testthat/test-count_string.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
context("Checking count_string")

test_that("count_string ...",{


})

0 comments on commit 6a03273

Please sign in to comment.