added tidy_dtm & tidy_colo_dtm close #3
trinker committed Dec 26, 2016
1 parent 4f06580 commit 69a5d70
Showing 15 changed files with 447 additions and 49 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -6,9 +6,9 @@ Authors@R: c(person("Tyler", "Rinker", email =
Maintainer: Tyler Rinker <tyler.rinker@gmail.com>
Description: Tools that can be used to reshape text data.
Depends: R (>= 3.2.2)
Imports: data.table, stringi, utils
Imports: data.table, slam, stringi, utils
Suggests: testthat
Date: 2016-12-22
Date: 2016-12-25
License: GPL-2
LazyData: TRUE
Roxygen: list(wrap = FALSE)
7 changes: 7 additions & 0 deletions NAMESPACE
@@ -24,6 +24,8 @@ S3method(split_token,data.frame)
S3method(split_token,default)
S3method(split_word,data.frame)
S3method(split_word,default)
S3method(unique_pairs,data.table)
S3method(unique_pairs,default)
export(as_list)
export(bind_list)
export(bind_table)
@@ -45,5 +47,10 @@ export(split_token)
export(split_transcript)
export(split_word)
export(starts)
export(tidy_colo_dtm)
export(tidy_colo_tdm)
export(tidy_dtm)
export(tidy_tdm)
export(unique_pairs)
importFrom(data.table,":=")
importFrom(data.table,.N)
24 changes: 23 additions & 1 deletion R/textshape-package.R
@@ -49,6 +49,28 @@ NULL
#' @usage data(hamlet)
#' @format A data frame with 2007 rows and 7 variables
#' @references
#' \url{http://www.gutenberg.org}
#' http://www.gutenberg.org
NULL



#' Simple \code{\link[tm]{DocumentTermMatrix}}
#'
#' A dataset containing a simple \code{\link[tm]{DocumentTermMatrix}}.
#'
#' @details
#' \describe{
#' \item{i}{The document locations}
#' \item{j}{The term locations}
#' \item{v}{The count of terms for that particular element position}
#' \item{nrow}{The number of rows}
#' \item{ncol}{The number of columns}
#' \item{dimnames}{The document and term names}
#' }
#'
#' @docType data
#' @keywords datasets
#' @name simple_dtm
#' @usage data(simple_dtm)
#' @format A list with 6 elements
NULL
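
For orientation, here is a minimal sketch of how such a triplet structure stores only the non-zero counts. It assumes the **slam** package (already in Imports) and invents a tiny 2-document, 3-term example:

    library(slam)

    # Only the four non-zero cells are stored as (i, j, v) triplets.
    m <- simple_triplet_matrix(
        i = c(1, 1, 2, 2),  # document (row) positions
        j = c(1, 3, 2, 3),  # term (column) positions
        v = c(2, 1, 5, 3),  # counts at those positions
        nrow = 2, ncol = 3,
        dimnames = list(Docs = c("doc_1", "doc_2"),
                        Terms = c("alpha", "beta", "gamma"))
    )
    as.matrix(m)  # dense view for inspection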
67 changes: 67 additions & 0 deletions R/tidy_colo_dtm.R
@@ -0,0 +1,67 @@
#' Convert a \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}} into Collocating Words in Tidy Form
#'
#' Converts non-zero elements of a
#' \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}} into
#' a tidy data set made of collocating words.
#'
#' @param x A \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}}.
#' @param \ldots ignored.
#' @return Returns a tidied data.frame.
#' @rdname tidy_colo_dtm
#' @export
#' @seealso \code{\link[textshape]{unique_pairs}}
#' @examples
#' tidied <- tidy_colo_dtm(simple_dtm)
#' tidied
#' unique_pairs(tidied)
#'
#' \dontrun{
#' if (!require("pacman")) install.packages("pacman")
#' pacman::p_load_current_gh('trinker/gofastr', 'trinker/lexicon')
#' pacman::p_load(tidyverse, magrittr, ggstance)
#'
#' my_dtm <- with(presidential_debates_2012, q_dtm(dialogue, paste(time, tot, sep = "_")))
#'
#' tidy_colo_dtm(my_dtm) %>%
#' tbl_df() %>%
#' filter(!term_1 %in% c('i', lexicon::sw_onix) & !term_2 %in% lexicon::sw_onix) %>%
#' filter(term_1 != term_2) %>%
#' unique_pairs() %>%
#' filter(n > 15) %>%
#' complete(term_1, term_2, fill = list(n = 0)) %>%
#' ggplot(aes(x = term_1, y = term_2, fill = n)) +
#' geom_tile() +
#' scale_fill_gradient(low = 'white', high = 'red') +
#' theme(axis.text.x = element_text(angle = 45, hjust = 1))
#' }
tidy_colo_tdm <- function(x, ...){

term_1 <- NULL

x <- slam::as.simple_triplet_matrix(slam::tcrossprod_simple_triplet_matrix(x, y = NULL))

data.table::data.table(
term_1 = x[['dimnames']][['Terms']][x[['i']]],
term_2 = x[['dimnames']][['Terms']][x[['j']]],
n = x[['v']]
)[order(term_1), ]
}



#' @rdname tidy_colo_dtm
#' @export
tidy_colo_dtm <- function(x, ...){

term_1 <- NULL

x <- slam::as.simple_triplet_matrix(slam::crossprod_simple_triplet_matrix(x, y = NULL))

data.table::data.table(
term_1 = x[['dimnames']][['Terms']][x[['i']]],
term_2 = x[['dimnames']][['Terms']][x[['j']]],
n = x[['v']]
)[order(term_1), ]
}
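
As a rough usage sketch (assuming the `simple_dtm` data documented above): the crossproduct keeps self-pairs and both orderings of each pair, so a typical follow-up is to drop the diagonal and reduce mirror-image rows with `unique_pairs()`:

    tidied <- tidy_colo_dtm(simple_dtm)
    tidied <- tidied[term_1 != term_2, ]  # drop a term collocating with itself
    unique_pairs(tidied)                  # keep one of each (A, B)/(B, A) pair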


77 changes: 77 additions & 0 deletions R/tidy_dtm.R
@@ -0,0 +1,77 @@
#' Convert a \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}}
#' into Tidy Form
#'
#' Converts non-zero elements of a
#' \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}} into
#' a tidy data set.
#'
#'
#' @param x A \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}}.
#' @param \ldots ignored.
#' @return Returns a tidied data.frame.
#' @rdname tidy_dtm
#' @export
#' @examples
#' data(simple_dtm)
#'
#' tidy_dtm(simple_dtm)
#'
#' \dontrun{
#' if (!require("pacman")) install.packages("pacman")
#' pacman::p_load_current_gh('trinker/gofastr')
#' pacman::p_load(tidyverse, magrittr, ggstance)
#'
#' my_dtm <- with(presidential_debates_2012, q_dtm(dialogue, paste(time, tot, sep = "_")))
#'
#' tidy_dtm(my_dtm) %>%
#' tidyr::extract(doc, c("time", "turn", "sentence"), "(\\d)_(\\d+)\\.(\\d+)") %>%
#' mutate(
#' time = as.numeric(time),
#' turn = as.numeric(turn),
#' sentence = as.numeric(sentence)
#' ) %>%
#' tbl_df() %T>%
#' print() %>%
#' group_by(time, term) %>%
#' summarize(n = sum(n)) %>%
#' group_by(time) %>%
#' arrange(desc(n)) %>%
#' slice(1:10) %>%
#' mutate(
#' term = factor(paste(term, time, sep = "__"),
#' levels = rev(paste(term, time, sep = "__")))
#' ) %>%
#' ggplot(aes(x = n, y = term)) +
#' geom_barh(stat='identity') +
#' facet_wrap(~time, ncol=2, scales = 'free_y') +
#' scale_y_discrete(labels = function(x) gsub("__.+$", "", x))
#' }
tidy_dtm <- function(x, ...){

doc <- NULL

data.table::data.table(
doc = x[['dimnames']][['Docs']][x[['i']]],
term = x[['dimnames']][['Terms']][x[['j']]],
n = x[['v']],
i = x[['i']],
j = x[['j']]
)[order(doc), ]
}



#' @rdname tidy_dtm
#' @export
tidy_tdm <- function(x, ...){

doc <- NULL

data.table::data.table(
doc = x[['dimnames']][['Docs']][x[['j']]],
term = x[['dimnames']][['Terms']][x[['i']]],
n = x[['v']],
i = x[['j']],
j = x[['i']]
)[order(doc), ]
}
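
A quick sketch of the tidied output in use, leaning on the data.table the function returns — here, total counts per term across documents, again assuming the `simple_dtm` data:

    tidied <- tidy_dtm(simple_dtm)
    tidied[, list(n = sum(n)), by = "term"][order(-n), ]  # top terms overall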
48 changes: 48 additions & 0 deletions R/unique_pairs.R
@@ -0,0 +1,48 @@
#' Extract Only Unique Pairs of Collocating Words in \code{\link[textshape]{tidy_colo_dtm}}
#'
#' \code{\link[textshape]{tidy_colo_dtm}} utilizes the entire matrix to generate
#' the tidied data.frame. This means that the upper and lower triangles are
#' used redundantly. This function eliminates this redundancy by dropping one
#' set of the pairs from a tidied data.frame.
#'
#' @param x A \code{\link[base]{data.frame}} with two columns that contain
#' redundant pairs.
#' @param col1 A string naming column 1.
#' @param col2 A string naming column 2.
#' @param \ldots ignored.
#' @return Returns a filtered \code{\link[base]{data.frame}}.
#' @export
#' @seealso \code{\link[textshape]{tidy_colo_dtm}}
#' @examples
#' dat <- data.frame(
#' term_1 = LETTERS[1:10],
#' term_2 = LETTERS[10:1],
#' stringsAsFactors = FALSE
#' )
#'
#' unique_pairs(dat)
unique_pairs <- function(x, col1 = 'term_1', col2 = 'term_2', ...) {

UseMethod('unique_pairs')
}

#' @export
#' @rdname unique_pairs
#' @method unique_pairs default
unique_pairs.default <- function(x, col1 = 'term_1', col2 = 'term_2', ...) {

x[!duplicated(apply(data.table::data.table(x[, c(col1, col2)]), 1, sorter)),]
}

#' @export
#' @rdname unique_pairs
#' @method unique_pairs data.table
unique_pairs.data.table <- function(x, col1 = 'term_1', col2 = 'term_2', ...) {

x[!duplicated(apply(data.table::data.table(x[, c(col1, col2), with = FALSE]), 1, sorter)),]
}

## Canonical key for a pair: sort first so both orderings match.
sorter <- function(x) paste(sort(x), collapse = "")
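
To see why this works, note that both orderings of a pair collapse to one key, so `duplicated()` drops the second occurrence:

    sorter(c("B", "A"))  # "AB"
    sorter(c("A", "B"))  # "AB"

One caveat worth noting: pasting with no separator means `c("ab", "c")` and `c("a", "bc")` both yield `"abc"`; a separator that cannot appear inside a term would rule out such collisions.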



2 changes: 1 addition & 1 deletion README.Rmd
@@ -396,7 +396,7 @@ split_word(x)

## Putting It Together

[Eduardo Flores](https://www.linkedin.com/in/eduardo-flores-16850523) blogged about [What the candidates say, analyzing republican debates using R](http://enelmargen.org/r-english/datascience/2015/11/26/What-the-candidates-say-analyzing-republican-debates-using-R2.html) where he demonstrated some scraping and analysis techniques. Here I highlight a combination usage of **textshape** tools to scrape and structure the text from 4 of the 2015 Republican debates within a [**magrittr**](https://github.com/smbache/magrittr) pipeline. The result is a single [**data.table**](https://github.com/Rdatatable/data.table) containing the dialogue from all 4 debates. The code highlights the conciseness and readability of **textshape** by restructuring Flores scraping with **textshape** replacements.
Eduardo Flores blogged about [What the candidates say, analyzing republican debates using R](https://www.r-bloggers.com/what-the-candidates-say-analyzing-republican-debates-using-r) where he demonstrated some scraping and analysis techniques. Here I highlight a combined usage of **textshape** tools to scrape and structure the text from 4 of the 2015 Republican debates within a [**magrittr**](https://github.com/smbache/magrittr) pipeline. The result is a single [**data.table**](https://github.com/Rdatatable/data.table) containing the dialogue from all 4 debates. The code highlights the conciseness and readability of **textshape** by restructuring Flores' scraping with **textshape** replacements.

```{r}
if (!require("pacman")) install.packages("pacman")
75 changes: 33 additions & 42 deletions README.md
@@ -254,30 +254,30 @@ columns in a `data.frame`.
bind_vector(x)

## id content
## 1: Alaska F
## 2: California A
## 3: Alabama B
## 4: Alaska B
## 5: Alabama E
## 1: Arkansas E
## 2: Alabama F
## 3: Alabama E
## 4: California A
## 5: Arizona F
## ---
## 996: Arizona B
## 997: Arizona A
## 998: Arkansas C
## 999: Arkansas C
## 1000: California B
## 996: Alaska F
## 997: Arizona B
## 998: Alabama D
## 999: Arizona E
## 1000: Alaska C

#### A Table

x <- table(sample(LETTERS[1:6], 1000, TRUE))
bind_table(x)

## id content
## 1: A 180
## 2: B 164
## 3: C 136
## 4: D 195
## 5: E 158
## 6: F 167
## 1: A 143
## 2: B 155
## 3: C 181
## 4: D 157
## 5: E 188
## 6: F 176

Combining
---------
@@ -392,29 +392,29 @@ counts.
(dat <- data.frame(matrix(sample(c("A", "B"), 30, TRUE), ncol=3)))

## X1 X2 X3
## 1 A B A
## 2 B A B
## 3 B B B
## 4 A A B
## 1 A A B
## 2 B B A
## 3 A A A
## 4 B A B
## 5 B A A
## 6 B B B
## 7 B A B
## 8 B A B
## 9 A A A
## 10 B A B
## 6 A B A
## 7 A B A
## 8 A B A
## 9 B B B
## 10 B B B

mtabulate(dat)

## A B
## X1 3 7
## X2 7 3
## X3 3 7
## X1 5 5
## X2 4 6
## X3 6 4

t(mtabulate(dat))

## X1 X2 X3
## A 3 7 3
## B 7 3 7
## A 5 4 6
## B 5 6 4

Spanning
--------
@@ -490,14 +490,6 @@ The `duration` function calculates start-end durations as n words.
#### Gantt Plot

library(ggplot2)

##
## Attaching package: 'ggplot2'

## The following object is masked from 'package:qdapRegex':
##
## %+%

ggplot(duration(DATA), aes(x = start, xend = end, y = person, yend = person, color = sex)) +
geom_segment(size=4) +
xlab("Duration (Words)") +
@@ -1318,10 +1310,9 @@ The `split_word` function splits data into words.
Putting It Together
-------------------

[Eduardo Flores](https://www.linkedin.com/in/eduardo-flores-16850523)
blogged about [What the candidates say, analyzing republican debates
using
R](http://enelmargen.org/r-english/datascience/2015/11/26/What-the-candidates-say-analyzing-republican-debates-using-R2.html)
Eduardo Flores blogged about [What the candidates say, analyzing
republican debates using
R](https://www.r-bloggers.com/what-the-candidates-say-analyzing-republican-debates-using-r)
where he demonstrated some scraping and analysis techniques. Here I
highlight a combination usage of **textshape** tools to scrape and
structure the text from 4 of the 2015 Republican debates within a
Binary file added data/simple_dtm.rda
Binary file not shown.
4 changes: 2 additions & 2 deletions inst/build.R
@@ -25,9 +25,9 @@ document(x)
install(x, quick = quick, build_vignettes = FALSE, dependencies = TRUE)

path <- find.package(pack)
system(paste(shQuote(file.path(R.home("bin"), "R")),
system(paste(shQuote(file.path(R.home("bin"), "R")),
"CMD", "Rd2pdf", shQuote(path)))

qman(repo, dir=loc)
setwd(curd)
message("Done!")
message("Done!")
