# R Code

## Configuration

In [None]:
!pip install rpy2==3.5.1

In [None]:
%load_ext rpy2.ipython

In [None]:
%%R
## Install packages needed
# One call with a character vector installs the same set of packages
install.packages(c(
  "data.table",
  "jsonlite",
  "stringr",
  "dplyr",
  "magrittr",
  "gutenbergr",
  "textdata",
  "tm",
  "tidytext",
  "janeaustenr",
  "ggplot2"
))

In [None]:
%%R
## Load packages needed
# library() stops with an error when a package is not installed. require()
# only warns and returns FALSE, which would let later cells fail with a
# confusing "could not find function" error instead.
library(data.table)
library(jsonlite)
library(stringr)
library(magrittr)
library(dplyr)
library(gutenbergr)
library(textdata)
library(tm)
library(tidytext)
library(janeaustenr)
library(ggplot2)

## Data Formats

### Vector

In [None]:
%%R
#### Vector ####

# Build a small three-document example: one row per document, holding a
# numeric document id and the raw text of that document
text_df <- data.frame(
  doc = c(1, 2, 3),
  text = c(
    "Welcome to the RPM meeting",
    "In this session we are covering text mining",
    "By the end you'll be familiar with the basics of the tidytext package"
  )
)
text_df


In [None]:
%%R
# The text column on its own is a plain vector
text_df[["text"]]


### Corpus

In [None]:
%%R
#### Corpus ####

# Using the tm package: wrap the text vector in a source, then build a corpus
# (VCorpus) from that source
text_corpus <- text_df$text %>%
  VectorSource() %>%
  VCorpus()
text_corpus


In [None]:
%%R
# Show the first document in the corpus
text_corpus[[1]]

In [None]:
%%R
# Show the metadata stored with the first document
text_corpus[[1]][["meta"]]

In [None]:
%%R
# Show the text content of the first document
text_corpus[[1]][["content"]]

In [None]:
%%R
# Load acq, an example corpus of Reuters news articles from the tm package
data("acq")
acq

In [None]:
%%R
# Show the metadata stored with the first article
acq[[1]][["meta"]]

In [None]:
%%R
# Show the text content of the first article
acq[[1]][["content"]]

In [None]:
%%R
# tm_map applies a function to every document in a corpus, which is how text
# gets cleaned up for analysis. Here every document is lower-cased.
acq_lower <- acq %>%
  tm_map(content_transformer(tolower))

# First document before the transformation...
acq[[1]][["content"]]

In [None]:
%%R
# ... and the same document after lower-casing
acq_lower[[1]][["content"]]

### Document Term Matrix

In [None]:
%%R
#### DTM ####

# Build a document-term matrix (DTM) from the corpus and display it
text_dtm <- text_corpus %>%
  DocumentTermMatrix()
inspect(text_dtm)


### Tidy

In [None]:
%%R
#### Tidy ####

# Tokenize the text column into a new `word` column to get a tidy data frame
# https://www.rdocumentation.org/packages/tidytext/versions/0.4.1/topics/unnest_tokens
text_tidy <- unnest_tokens(text_df, output = word, input = text)
text_tidy


### Converting between formats

In [None]:
%%R
# So far we have shown vector -> corpus, corpus -> DTM and vector -> tidy

# corpus -> vector/dataframe: the tidy function converts the corpus to a table
text_corpus_df <- tidy(text_corpus)
text_corpus_df

In [None]:
%%R
# The table now carries metadata we do not need for now, so keep only the id
# and text columns, and convert the tibble to a plain data frame
text_corpus_df <- as.data.frame(select(text_corpus_df, id, text))
text_corpus_df

In [None]:
%%R
# corpus -> tidy therefore takes two steps: corpus -> df, then df -> tidy.
# This is the second step: tokenize the text column of the data frame.
text_corpus_tidy <- unnest_tokens(text_corpus_df, output = word, input = text)
text_corpus_tidy

In [None]:
%%R
# tidy -> DTM starts by summarizing the tidy data: count how many times each
# word occurs in each document
text_summarized <- count(text_tidy, doc, word, sort = FALSE)
text_summarized

In [None]:
%%R
## Cast the per-document word counts to a DTM
text_tidy_dtm <- cast_dtm(
  text_summarized,
  document = doc,
  term = word,
  value = n
)
inspect(text_tidy_dtm)

In [None]:
%%R
# List the unique terms in the matrix
text_tidy_dtm %>% Terms()

In [None]:
%%R
# DTM -> tidy uses the tidy function again. Note that words shorter than 3
# characters are missing from the result: tm's default settings excluded them
# when text_dtm was built, so the conversion itself does not drop them.
text_dtm_tidy <- text_dtm %>%
  tidy()
text_dtm_tidy


## Load Jane Austen data

In [None]:
%%R
# Read the Jane Austen novels from the janeaustenr package and tokenize them

austen_books()
tidy_books <- austen_books() %>%
  group_by(book) %>%
  # Number the lines within each book
  mutate(linenumber = row_number()) %>%
  # Assign chapters with a running count of lines that look like 'Chapter X'
  mutate(
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    )
  ) %>%
  ungroup() %>%
  unnest_tokens(output = word, input = text)
tidy_books


## Term Frequency

In [None]:
%%R
# Count how often each word occurs in each novel, most common words first
book_words <- count(tidy_books, book, word, sort = TRUE)
book_words


In [None]:
%%R
# The most common words so far (such as 'the') are not very meaningful for
# analyzing text. Load a list of these "stop words" and remove them.
data(stop_words)

# Name the join key explicitly: rows are dropped when their `word` appears in
# stop_words, no matter which other columns the two tables share.
tidy_books <- tidy_books %>%
  anti_join(stop_words, by = "word")

stop_words

In [None]:
%%R
# Recompute the per-novel word counts from the current tidy_books
book_words <- count(tidy_books, book, word, sort = TRUE)
book_words


In [None]:
%%R
# Top words for Mansfield Park
filter(book_words, book == "Mansfield Park")

In [None]:
%%R
# Top words for Emma
filter(book_words, book == "Emma")

In [None]:
%%R
# Total number of words in each novel, joined back onto the per-word counts
total_words <- book_words %>%
  group_by(book) %>%
  summarize(total = sum(n))

# Join on `book` explicitly. A `total` column left over from an earlier run of
# this cell is dropped first, so re-running never yields total.x / total.y.
book_words <- book_words %>%
  select(-any_of("total")) %>%
  left_join(total_words, by = "book")
book_words


In [None]:
%%R
# Frequency of each word within its novel, as a proportion of that novel's
# total. The division is row-wise, so no grouping is needed.
freq <- mutate(book_words, term_frequency = n / total)
freq


In [None]:
%%R
# bind_tf_idf calculates the tf-idf of each word in each novel, which gives a
# better measure of the importance of a word than its raw frequency
book_tf_idf <- bind_tf_idf(book_words, term = word, document = book, n = n)
book_tf_idf


In [None]:
%%R
# Sorting by tf-idf, highest first, shows the words that most distinguish a
# document. Here they are all characters unique to each book, which makes a
# lot of sense.
book_tf_idf %>% arrange(desc(tf_idf))

## Sentiment Analysis

In [None]:
%%R
# tidytext provides several sentiment dictionaries. We use AFINN, which
# scores sentiment from -5 (negative) to +5 (positive).

afinn <- get_sentiments(lexicon = "afinn")


In [None]:
%%R
# Attach sentiment scores with an inner join, so words that have no entry in
# the dictionary are dropped. The join key is named explicitly so the match
# is on `word` only.
jane_austen_sentiment <- tidy_books %>%
  inner_join(afinn, by = "word")
jane_austen_sentiment


In [None]:
%%R
# The ten rows from Pride & Prejudice with the highest sentiment scores
most_positive <- jane_austen_sentiment %>%
  filter(book == "Pride & Prejudice") %>%
  arrange(desc(value)) %>%
  head(n = 10)
most_positive

In [None]:
%%R
# The ten rows from Pride & Prejudice with the lowest sentiment scores
most_negative <- jane_austen_sentiment %>%
  filter(book == "Pride & Prejudice") %>%
  arrange(value) %>%
  head(n = 10)
most_negative

In [None]:
%%R
# Summarize to 80-line chunks: integer division of the line number gives the
# chunk index, and the sentiment scores are summed within each chunk.
# .groups = "drop" returns an ungrouped table; otherwise the result stays
# grouped by book and summarise() prints a message about the grouping.
jane_austen_sentiment_summarized <- jane_austen_sentiment %>%
  group_by(book, index = linenumber %/% 80) %>%
  summarise(sentiment = sum(value), .groups = "drop")
jane_austen_sentiment_summarized


In [None]:
%%R
# Plot the sentiment score of each chunk in order, one panel per novel
ggplot(
  data = jane_austen_sentiment_summarized,
  mapping = aes(x = index, y = sentiment, fill = book)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ book, ncol = 2, scales = "free_x")
