added tidy_dtm & tidy_colo_dtm close #3
trinker committed Dec 26, 2016
1 parent 4f06580 commit 69a5d70
Showing 15 changed files with 447 additions and 49 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -6,9 +6,9 @@ Authors@R: c(person("Tyler", "Rinker", email =
Maintainer: Tyler Rinker <tyler.rinker@gmail.com>
Description: Tools that can be used to reshape text data.
Depends: R (>= 3.2.2)
Imports: data.table, stringi, utils
Imports: data.table, slam, stringi, utils
Suggests: testthat
Date: 2016-12-22
Date: 2016-12-25
License: GPL-2
LazyData: TRUE
Roxygen: list(wrap = FALSE)
7 changes: 7 additions & 0 deletions NAMESPACE
@@ -24,6 +24,8 @@ S3method(split_token,data.frame)
S3method(split_token,default)
S3method(split_word,data.frame)
S3method(split_word,default)
S3method(unique_pairs,data.table)
S3method(unique_pairs,default)
export(as_list)
export(bind_list)
export(bind_table)
@@ -45,5 +47,10 @@ export(split_token)
export(split_transcript)
export(split_word)
export(starts)
export(tidy_colo_dtm)
export(tidy_colo_tdm)
export(tidy_dtm)
export(tidy_tdm)
export(unique_pairs)
importFrom(data.table,":=")
importFrom(data.table,.N)
24 changes: 23 additions & 1 deletion R/textshape-package.R
@@ -49,6 +49,28 @@ NULL
#' @usage data(hamlet)
#' @format A data frame with 2007 rows and 7 variables
#' @references
#' \url{http://www.gutenberg.org}
#' http://www.gutenberg.org
NULL



#' Simple \code{\link[tm]{DocumentTermMatrix}}
#'
#' A dataset containing a simple \code{\link[tm]{DocumentTermMatrix}}.
#'
#' @details
#' \describe{
#' \item{i}{The document locations}
#' \item{j}{The term locations}
#' \item{v}{The count of terms for that particular element position}
#' \item{nrow}{The number of rows}
#' \item{ncol}{The number of columns}
#' \item{dimnames}{The document and term names}
#' }
#'
#' @docType data
#' @keywords datasets
#' @name simple_dtm
#' @usage data(simple_dtm)
#' @format A list with 6 elements
NULL
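
For orientation, here is a minimal sketch of how such a triplet structure stores only the non-zero counts. It assumes the **slam** package (already in Imports) and invents a tiny 2-document, 3-term example:

    library(slam)

    # Only the four non-zero cells are stored as (i, j, v) triplets.
    m <- simple_triplet_matrix(
        i = c(1, 1, 2, 2),  # document (row) positions
        j = c(1, 3, 2, 3),  # term (column) positions
        v = c(2, 1, 5, 3),  # counts at those positions
        nrow = 2, ncol = 3,
        dimnames = list(Docs = c("doc_1", "doc_2"),
                        Terms = c("alpha", "beta", "gamma"))
    )
    as.matrix(m)  # dense view for inspection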
67 changes: 67 additions & 0 deletions R/tidy_colo_dtm.R
@@ -0,0 +1,67 @@
#' Convert a \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}} into Collocating Words in Tidy Form
#'
#' Converts non-zero elements of a
#' \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}} into
#' a tidy data set made of collocating words.
#'
#' @param x A \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}}.
#' @param \ldots ignored.
#' @return Returns a tidied data.frame.
#' @rdname tidy_colo_dtm
#' @export
#' @seealso \code{\link[textshape]{unique_pairs}}
#' @examples
#' tidied <- tidy_colo_dtm(simple_dtm)
#' tidied
#' unique_pairs(tidied)
#'
#' \dontrun{
#' if (!require("pacman")) install.packages("pacman")
#' pacman::p_load_current_gh('trinker/gofastr', 'trinker/lexicon')
#' pacman::p_load(tidyverse, magrittr, ggstance)
#'
#' my_dtm <- with(presidential_debates_2012, q_dtm(dialogue, paste(time, tot, sep = "_")))
#'
#' tidy_colo_dtm(my_dtm) %>%
#' tbl_df() %>%
#' filter(!term_1 %in% c('i', lexicon::sw_onix) & !term_2 %in% lexicon::sw_onix) %>%
#' filter(term_1 != term_2) %>%
#' unique_pairs() %>%
#' filter(n > 15) %>%
#' complete(term_1, term_2, fill = list(n = 0)) %>%
#' ggplot(aes(x = term_1, y = term_2, fill = n)) +
#' geom_tile() +
#' scale_fill_gradient(low = 'white', high = 'red') +
#' theme(axis.text.x = element_text(angle = 45, hjust = 1))
#' }
tidy_colo_tdm <- function(x, ...){

term_1 <- NULL

x <- slam::as.simple_triplet_matrix(slam::tcrossprod_simple_triplet_matrix(x, y = NULL))

data.table::data.table(
term_1 = x[['dimnames']][['Terms']][x[['i']]],
term_2 = x[['dimnames']][['Terms']][x[['j']]],
n = x[['v']]
)[order(term_1), ]
}



#' @rdname tidy_colo_dtm
#' @export
tidy_colo_dtm <- function(x, ...){

term_1 <- NULL

x <- slam::as.simple_triplet_matrix(slam::crossprod_simple_triplet_matrix(x, y = NULL))

data.table::data.table(
term_1 = x[['dimnames']][['Terms']][x[['i']]],
term_2 = x[['dimnames']][['Terms']][x[['j']]],
n = x[['v']]
)[order(term_1), ]
}
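
As a rough usage sketch (assuming the `simple_dtm` data documented above): the crossproduct keeps self-pairs and both orderings of each pair, so a typical follow-up is to drop the diagonal and reduce mirror-image rows with `unique_pairs()`:

    tidied <- tidy_colo_dtm(simple_dtm)
    tidied <- tidied[term_1 != term_2, ]  # drop a term collocating with itself
    unique_pairs(tidied)                  # keep one of each (A, B)/(B, A) pair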


77 changes: 77 additions & 0 deletions R/tidy_dtm.R
@@ -0,0 +1,77 @@
#' Convert a \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}}
#' into Tidy Form
#'
#' Converts non-zero elements of a
#' \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}} into
#' a tidy data set.
#'
#'
#' @param x A \code{\link[tm]{DocumentTermMatrix}}/\code{\link[tm]{TermDocumentMatrix}}.
#' @param \ldots ignored.
#' @return Returns a tidied data.frame.
#' @rdname tidy_dtm
#' @export
#' @examples
#' data(simple_dtm)
#'
#' tidy_dtm(simple_dtm)
#'
#' \dontrun{
#' if (!require("pacman")) install.packages("pacman")
#' pacman::p_load_current_gh('trinker/gofastr')
#' pacman::p_load(tidyverse, magrittr, ggstance)
#'
#' my_dtm <- with(presidential_debates_2012, q_dtm(dialogue, paste(time, tot, sep = "_")))
#'
#' tidy_dtm(my_dtm) %>%
#' tidyr::extract(doc, c("time", "turn", "sentence"), "(\\d)_(\\d+)\\.(\\d+)") %>%
#' mutate(
#' time = as.numeric(time),
#' turn = as.numeric(turn),
#' sentence = as.numeric(sentence)
#' ) %>%
#' tbl_df() %T>%
#' print() %>%
#' group_by(time, term) %>%
#' summarize(n = sum(n)) %>%
#' group_by(time) %>%
#' arrange(desc(n)) %>%
#' slice(1:10) %>%
#' mutate(
#' term = factor(paste(term, time, sep = "__"),
#' levels = rev(paste(term, time, sep = "__")))
#' ) %>%
#' ggplot(aes(x = n, y = term)) +
#' geom_barh(stat='identity') +
#' facet_wrap(~time, ncol=2, scales = 'free_y') +
#' scale_y_discrete(labels = function(x) gsub("__.+$", "", x))
#' }
tidy_dtm <- function(x, ...){

doc <- NULL

data.table::data.table(
doc = x[['dimnames']][['Docs']][x[['i']]],
term = x[['dimnames']][['Terms']][x[['j']]],
n = x[['v']],
i = x[['i']],
j = x[['j']]
)[order(doc), ]
}



#' @rdname tidy_dtm
#' @export
tidy_tdm <- function(x, ...){

doc <- NULL

data.table::data.table(
doc = x[['dimnames']][['Docs']][x[['j']]],
term = x[['dimnames']][['Terms']][x[['i']]],
n = x[['v']],
i = x[['j']],
j = x[['i']]
)[order(doc), ]
}
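
A quick sketch of the tidied output in use, leaning on the data.table the function returns — here, total counts per term across documents, again assuming the `simple_dtm` data:

    tidied <- tidy_dtm(simple_dtm)
    tidied[, list(n = sum(n)), by = "term"][order(-n), ]  # top terms overall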
48 changes: 48 additions & 0 deletions R/unique_pairs.R
@@ -0,0 +1,48 @@
#' Extract Only Unique Pairs of Collocating Words in \code{\link[textshape]{tidy_colo_dtm}}
#'
#' \code{\link[textshape]{tidy_colo_dtm}} utilizes the entire matrix to generate
#' the tidied data.frame. This means that the upper and lower triangles are
#' used redundantly. This function eliminates this redundancy by dropping one
#' set of the pairs from a tidied data.frame.
#'
#' @param x A \code{\link[base]{data.frame}} with two columns that contain
#' redundant pairs.
#' @param col1 A string naming column 1.
#' @param col2 A string naming column 2.
#' @param \ldots ignored.
#' @return Returns a filtered \code{\link[base]{data.frame}}.
#' @export
#' @seealso \code{\link[textshape]{tidy_colo_dtm}}
#' @examples
#' dat <- data.frame(
#' term_1 = LETTERS[1:10],
#' term_2 = LETTERS[10:1],
#' stringsAsFactors = FALSE
#' )
#'
#' unique_pairs(dat)
unique_pairs <- function(x, col1 = 'term_1', col2 = 'term_2', ...) {

UseMethod('unique_pairs')
}

#' @export
#' @rdname unique_pairs
#' @method unique_pairs default
unique_pairs.default <- function(x, col1 = 'term_1', col2 = 'term_2', ...) {

x[!duplicated(apply(data.table::data.table(x[, c(col1, col2)]), 1, sorter)),]
}

#' @export
#' @rdname unique_pairs
#' @method unique_pairs data.table
unique_pairs.data.table <- function(x, col1 = 'term_1', col2 = 'term_2', ...) {

x[!duplicated(apply(data.table::data.table(x[, c(col1, col2), with = FALSE]), 1, sorter)),]
}

## Canonical key for a pair: sort first so both orderings match.
sorter <- function(x) paste(sort(x), collapse = "")
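
To see why this works, note that both orderings of a pair collapse to one key, so `duplicated()` drops the second occurrence:

    sorter(c("B", "A"))  # "AB"
    sorter(c("A", "B"))  # "AB"

One caveat worth noting: pasting with no separator means `c("ab", "c")` and `c("a", "bc")` both yield `"abc"`; a separator that cannot appear inside a term would rule out such collisions.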



2 changes: 1 addition & 1 deletion README.Rmd
@@ -396,7 +396,7 @@ split_word(x)

## Putting It Together

[Eduardo Flores](https://www.linkedin.com/in/eduardo-flores-16850523) blogged about [What the candidates say, analyzing republican debates using R](http://enelmargen.org/r-english/datascience/2015/11/26/What-the-candidates-say-analyzing-republican-debates-using-R2.html) where he demonstrated some scraping and analysis techniques. Here I highlight a combination usage of **textshape** tools to scrape and structure the text from 4 of the 2015 Republican debates within a [**magrittr**](https://github.com/smbache/magrittr) pipeline. The result is a single [**data.table**](https://github.com/Rdatatable/data.table) containing the dialogue from all 4 debates. The code highlights the conciseness and readability of **textshape** by restructuring Flores scraping with **textshape** replacements.
Eduardo Flores blogged about [What the candidates say, analyzing republican debates using R](https://www.r-bloggers.com/what-the-candidates-say-analyzing-republican-debates-using-r) where he demonstrated some scraping and analysis techniques. Here I highlight a combined usage of **textshape** tools to scrape and structure the text from 4 of the 2015 Republican debates within a [**magrittr**](https://github.com/smbache/magrittr) pipeline. The result is a single [**data.table**](https://github.com/Rdatatable/data.table) containing the dialogue from all 4 debates. The code highlights the conciseness and readability of **textshape** by restructuring Flores' scraping with **textshape** replacements.

```{r}
if (!require("pacman")) install.packages("pacman")
75 changes: 33 additions & 42 deletions README.md
@@ -254,30 +254,30 @@ columns in a `data.frame`.
bind_vector(x)

## id content
## 1: Alaska F
## 2: California A
## 3: Alabama B
## 4: Alaska B
## 5: Alabama E
## 1: Arkansas E
## 2: Alabama F
## 3: Alabama E
## 4: California A
## 5: Arizona F
## ---
## 996: Arizona B
## 997: Arizona A
## 998: Arkansas C
## 999: Arkansas C
## 1000: California B
## 996: Alaska F
## 997: Arizona B
## 998: Alabama D
## 999: Arizona E
## 1000: Alaska C

#### A Table

x <- table(sample(LETTERS[1:6], 1000, TRUE))
bind_table(x)

## id content
## 1: A 180
## 2: B 164
## 3: C 136
## 4: D 195
## 5: E 158
## 6: F 167
## 1: A 143
## 2: B 155
## 3: C 181
## 4: D 157
## 5: E 188
## 6: F 176

Combining
---------
@@ -392,29 +392,29 @@ counts.
(dat <- data.frame(matrix(sample(c("A", "B"), 30, TRUE), ncol=3)))

## X1 X2 X3
## 1 A B A
## 2 B A B
## 3 B B B
## 4 A A B
## 1 A A B
## 2 B B A
## 3 A A A
## 4 B A B
## 5 B A A
## 6 B B B
## 7 B A B
## 8 B A B
## 9 A A A
## 10 B A B
## 6 A B A
## 7 A B A
## 8 A B A
## 9 B B B
## 10 B B B

mtabulate(dat)

## A B
## X1 3 7
## X2 7 3
## X3 3 7
## X1 5 5
## X2 4 6
## X3 6 4

t(mtabulate(dat))

## X1 X2 X3
## A 3 7 3
## B 7 3 7
## A 5 4 6
## B 5 6 4

Spanning
--------
@@ -490,14 +490,6 @@ The `duration` function calculates start-end durations as n words.
#### Gantt Plot

library(ggplot2)

##
## Attaching package: 'ggplot2'

## The following object is masked from 'package:qdapRegex':
##
## %+%

ggplot(duration(DATA), aes(x = start, xend = end, y = person, yend = person, color = sex)) +
geom_segment(size=4) +
xlab("Duration (Words)") +
@@ -1318,10 +1310,9 @@ The `split_word` function splits data into words.
Putting It Together
-------------------

[Eduardo Flores](https://www.linkedin.com/in/eduardo-flores-16850523)
blogged about [What the candidates say, analyzing republican debates
using
R](http://enelmargen.org/r-english/datascience/2015/11/26/What-the-candidates-say-analyzing-republican-debates-using-R2.html)
Eduardo Flores blogged about [What the candidates say, analyzing
republican debates using
R](https://www.r-bloggers.com/what-the-candidates-say-analyzing-republican-debates-using-r)
where he demonstrated some scraping and analysis techniques. Here I
highlight a combination usage of **textshape** tools to scrape and
structure the text from 4 of the 2015 Republican debates within a
Binary file added data/simple_dtm.rda
Binary file not shown.
4 changes: 2 additions & 2 deletions inst/build.R
@@ -25,9 +25,9 @@ document(x)
install(x, quick = quick, build_vignettes = FALSE, dependencies = TRUE)

path <- find.package(pack)
system(paste(shQuote(file.path(R.home("bin"), "R")),
system(paste(shQuote(file.path(R.home("bin"), "R")),
"CMD", "Rd2pdf", shQuote(path)))

qman(repo, dir=loc)
setwd(curd)
message("Done!")
message("Done!")
