* hash_internet_slang added as a lexicon to map slang to understood meaning.
trinker committed Dec 21, 2017
1 parent 5c13c3c commit 589e322
Showing 8 changed files with 132 additions and 4 deletions.
5 changes: 3 additions & 2 deletions DESCRIPTION
@@ -1,12 +1,12 @@
Package: lexicon
Title: Lexicons for Text Analysis
Version: 0.7.1
Version: 0.7.2
Authors@R: c(person("Tyler", "Rinker", email = "tyler.rinker@gmail.com", role = c("aut", "cre")))
Maintainer: Tyler Rinker <tyler.rinker@gmail.com>
Description: A collection of lexical hash tables, dictionaries, and word lists.
Depends: R (>= 3.2.2)
Imports: data.table, syuzhet (>= 1.0.1)
Date: 2017-12-16
Date: 2017-12-20
License: MIT + file LICENSE
LazyData: TRUE
Roxygen: list(wrap = FALSE)
@@ -25,6 +25,7 @@ Collate:
'grady_augmented.R'
'hash_emoticons.R'
'hash_grady_pos.R'
'hash_internet_slang.R'
'hash_lemmas.R'
'hash_power.R'
'hash_sentiment_emojis.R'
2 changes: 2 additions & 0 deletions NEWS
@@ -31,6 +31,8 @@ NEW FEATURES
* `hash_sentiment_socal_google` sentiment hash table added for use in the
**sentimentr** package.

* `hash_internet_slang` added as a lexicon to map slang to understood meaning.

MINOR FEATURES

IMPROVEMENTS
2 changes: 2 additions & 0 deletions NEWS.md
@@ -31,6 +31,8 @@ lexicon 0.7.0 -
* `hash_sentiment_socal_google` sentiment hash table added for use in the
**sentimentr** package.

* `hash_internet_slang` added as a lexicon to map slang to understood meaning.

**MINOR FEATURES**

**IMPROVEMENTS**
18 changes: 18 additions & 0 deletions R/hash_internet_slang.R
@@ -0,0 +1,18 @@
#' List of Internet Slang and Corresponding Meanings
#'
#' A dataset containing Internet slang terms and corresponding meaning. The data
#' set is an augmented version of \url{http://www.smart-words.org/abbreviations/text.html}.
#'
#' @details
#' \itemize{
#' \item x. The slang term.
#' \item y. The meaning.
#' }
#'
#' @docType data
#' @keywords datasets
#' @name hash_internet_slang
#' @usage data(hash_internet_slang)
#' @format A data frame with 175 rows and 2 variables
#' @references \url{http://www.smart-words.org/abbreviations/text.html}
NULL
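The documented object is a data.table keyed on the slang column, so terms can be looked up directly by row index, mirroring the `hash_internet_slang['YMMV']` check in the scraping script below. A minimal usage sketch, assuming the package is installed:

```r
library(lexicon)      # assumes the lexicon package is installed
library(data.table)

data(hash_internet_slang)

## keyed lookup: x is the slang term, y the meaning
hash_internet_slang["LMK"]   # row mapping "LMK" to "let me know"
```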
Binary file added data/hash_internet_slang.rda
4 changes: 2 additions & 2 deletions inst/CITATION
@@ -5,11 +5,11 @@ citEntry(entry = "manual",
title = "{lexicon}: Lexicon Data",
author = "Tyler W. Rinker",
address = "Buffalo, New York",
note = "version 0.7.1",
note = "version 0.7.2",
year = "2017",
url = "http://github.com/trinker/lexicon",
textVersion = paste("Rinker, T. W. (2017).",
"lexicon: Lexicon Data",
"version 0.7.1.",
"version 0.7.2.",
"http://github.com/trinker/lexicon")
)
81 changes: 81 additions & 0 deletions inst/scraping_scripts/slang_scrape.R
@@ -0,0 +1,81 @@
if (!require("pacman")) install.packages("pacman"); library(pacman)
pacman::p_load(rvest, magrittr, dplyr, xml2, robotstxt, textshape)

url <- 'http://www.smart-words.org/abbreviations/text.html'

## check robots.txt
paths_allowed(url)

rtxt <- robotstxt(domain='smart-words.org')
rtxt


## scrape

url %>%
read_html() -> txt

## Parse

## break out cells holding two terms into separate rows (lengthen x)
## replace slash-separated variant parts
## drop ambiguous entries (e.g., 'SO')
## add missing terms (e.g., 'B4')
hash_internet_slang <- txt %>%
html_nodes(xpath = "//table[@class='tabdin']") %>%
html_table() %>%
setNames(c(rep('Acronym', 2), rep('Abbreviation', 2))) %>%
#tidy_list('Type') %>%
bind_rows() %>%
filter(!grepl('^\\s*$', X1)) %>%
filter(!grepl('^Abbrev|Acron', X1)) %>%
setNames(c('x', 'y')) %>%
mutate(
x = x %>%
{gsub('(\\()([^)]+)(\\))', '\\2', .)} %>%
textclean::mgsub(c('J/K', 'N/A'), c('J/K / JK', 'N/A / NA')) %>%
stringi::stri_split_regex(' / | '),
y = y %>%
tolower() %>%
{gsub('[.?!, ]+$', '', .)} %>%
trimws() %>%
stringi::stri_replace_first_regex(
'(fuck the world / )|(remember / )|( / remember)|( / honest)|(available / )|( \\(fr.+$)|(, original post)',
''
) %>%
textclean::mgsub(
c(' / sex / ', 'thread / .. text / .. transmission', 'whoomp, there it is; meaning "hooray"', 'its friday', 'fine manual', '(just)'),
c(', sex, ', 'text', 'hooray', "it's friday", 'fucking manual', 'just')
) %>%
stringi::stri_replace_first_regex('\\s*/.+$', '')
) %>%
filter(!x %in% c('SO')) %>%
tidyr::unnest() %>%
select(x, y) %>%
mutate(
y = case_when(x == "FYI" ~ 'for your information', TRUE ~ y) %>%
trimws(),
x = x %>% trimws()
) %>%
rbind(
data_frame(
x = c('B4', 'LMAO', 'LMFAO', 'LMGTFY', 'LMK', 'LULZ', 'NVMD', 'PITA', 'PPL', 'TBH', 'TIL',
'YOLO', 'KMS', 'KTHX', 'G2G', 'ETA', 'CYA', 'C-YA', 'BFN', 'BBS'
),
y = c('before', 'laughing my ass off', 'laughing my fucking ass off', 'let me google that for you',
'let me know', 'laugh out loud at bad evil', 'nevermind', 'pain in the ass', 'people',
'to be honest', 'today i learned', 'you only live once', 'kill myself', 'ok thanks',
'got to go', 'estimated time of arrival', rep('see you later', 2), 'bye for now', 'be back soon'
)
)
) %>%
arrange(x) %>%
distinct() %>%
filter(y != 'end of day') %>%
data.table::data.table()

data.table::setkey(hash_internet_slang, "x")
hash_internet_slang['YMMV']

pax::new_data(hash_internet_slang, stand.alone = TRUE)

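Because the resulting table is keyed on `x`, a keyed join makes token-level slang expansion cheap. A hypothetical helper, not part of this commit (`expand_slang` is an illustrative name; it assumes space-separated tokens and uppercase keys in the hash):

```r
library(data.table)

## Illustrative sketch: replace slang tokens in a string with their meanings
expand_slang <- function(text, hash) {
    tokens <- unlist(strsplit(text, " "))
    hits <- hash[toupper(tokens)]          # keyed join on x; non-matches yield NA
    out <- ifelse(is.na(hits$y), tokens, hits$y)
    paste(out, collapse = " ")
}

expand_slang("LMK B4 you go", hash_internet_slang)
## "let me know before you go"
```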
24 changes: 24 additions & 0 deletions man/hash_internet_slang.Rd

