-
Notifications
You must be signed in to change notification settings - Fork 26
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
*
replace_word_elongation
added to replace word elongations (a.k.a.…
… "word lengthening") with the most likely normalized word form. See http://www.aclweb.org/anthology/D11-105 for details. Related to trinker/sentimentr#66
- Loading branch information
Showing
11 changed files
with
317 additions
and
78 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
#' Replace Word Elongations | ||
#' | ||
#' In informal writing people may use a form of text embellishment to emphasize | ||
#' or alter word meanings called elongation (a.k.a. "word lengthening"). For | ||
#' example, the use of "Whyyyyy" conveys frustration. Other times the usage may | ||
#' be to be more sexy (e.g., "Heyyyy there"). Other times it may be used for | ||
#' emphasis (e.g., "This is so gooood"). This function uses an augmented form | ||
#' of Armstrong & Fogarty's (2007) algorithm. The algorithm first attempts to | ||
#' replace the elongation with known semantic replacements (optional; default is | ||
#' \code{FALSE}). After this the algorithm locates all places were the same | ||
#' letter (case insensitive) appears 3 times consecutively. These elements are | ||
#' then further processed. The matches are replaces via \code{fgsub} by first | ||
#' taking the elongation to it's canonical form (drop all > 1 consecutive letters | ||
#' to a single letter) and then replacing with the most common word used in 2008 | ||
#' in Google's ngram data set that takes the canonical form. If the canonical | ||
#' form is not found in the Google data set then the canonical form is used as | ||
#' the replacement. | ||
#' | ||
#' @param x The text variable. | ||
#' @param impart.meaning logical. If \code{TRUE}, known elongation semantics | ||
#' are used as replacements (see \code{textclean:::meaning_elongations} for | ||
#' known elongation semantics and replacements). | ||
#' @param \ldots ignored. | ||
#' @return Returns a vector with word elongations replaced. | ||
#' @references | ||
#' Armstrong, D. B., Fogarty, G. J., & Dingsdag, D. (2007). Scales measuring | ||
#' characteristics of small business information systems. Proceedings of the | ||
#' 2011 Conference on Empirical Methods in Natural Language Processing (pp. | ||
#' 562-570). Edinburgh, Scotland. Retrieved from | ||
#' http://www.aclweb.org/anthology/D11-1052 \cr \cr | ||
#' \url{http://storage.googleapis.com/books/ngrams/books/datasetsv2.html} \cr \cr | ||
#' \url{https://www.theatlantic.com/magazine/archive/2013/03/dragging-it-out/309220} \cr \cr | ||
#' \url{https://english.stackexchange.com/questions/189517/is-there-a-name-term-for-multiplied-vowels} | ||
#' @export | ||
#' @examples | ||
#' x <- c('look', 'noooooo!', 'real coooool!', "it's sooo goooood", 'fsdfds', | ||
#' 'fdddf', 'as', "aaaahahahahaha", "aabbccxccbbaa", 'I said heyyy!', | ||
#' "I'm liiiike whyyyyy me?", "Wwwhhatttt!", NA) | ||
#' | ||
#' replace_word_elongation(x) | ||
#' replace_word_elongation(x, impart.meaning = TRUE) | ||
replace_word_elongation <- function(x, impart.meaning = FALSE, ...){ | ||
|
||
## replace with meaningful | ||
if (isTRUE(impart.meaning)){ | ||
x <- mgsub(x, meaning_elongations[['x']], meaning_elongations[['y']], | ||
fixed = FALSE, perl = TRUE, ignore.case = TRUE) | ||
} | ||
|
||
## consider only groupings with a triple letter | ||
locs <- stringi::stri_detect_regex(x, elongation_search_pattern, | ||
opts_regex = list(case_insensitive = TRUE)) | ||
|
||
locs[is.na(locs)] <- FALSE | ||
|
||
if (sum(locs) == 0) return(x) | ||
|
||
txt <- x[locs] | ||
canonicalk <- data.table::data.table(canonical) | ||
|
||
## replace tripple letter words with most common form or else canonical form | ||
x[locs] <- .fgsub(txt, elongation_pattern, function(x, can = canonical){ | ||
|
||
y <- gsub("([a-z])(\\1+)", '\\1', tolower(x), perl = TRUE) | ||
|
||
z <- data.table::data.table(canonical = y) | ||
out <- merge(z, can, by = 'canonical')$word | ||
|
||
if (length(out) == 0 || is.na(out)) out <- y | ||
out | ||
|
||
}) | ||
|
||
x | ||
|
||
} | ||
|
||
|
||
|
||
# Known with meaning | ||
b2 <- "(?<=^|[^A-Za-z'-])(%s)(?=$|[^A-Za-z'-])" | ||
meaning_elongations <- data.frame( | ||
x = sprintf(b2, c('hey{2,}', 'fi{3,}ne', 'no{3,}', 'sor{3,}y|sory{2,}|sor{3,}y{2,}', | ||
'thanks{2,}', 'tha{2,}nks', 'ri{3,}ght', 'why{3,}', 'real{2,}y' | ||
)), | ||
y = c('hey sexy', 'not fine', 'sarcastic', 'not sorry', | ||
'not thankful', 'very thankful', 'not correct', 'frustration', 'surprised' | ||
), | ||
stringsAsFactors = FALSE | ||
) | ||
|
||
elongation_search_pattern <- "(?i)([a-z])(\\1{2,})" | ||
elongation_pattern <- "\\b\\w*([a-z])(\\1{2,})\\w*\\b" | ||
|
||
|
||
|
||
|
||
|
||
|
||
|
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.