# 1. Install and load all necessary packages

In [1]:
# Usually, we would install packages like this - but this takes forever on Colab notebooks (at least 15 min.)
# install.packages("dplyr")
# install.packages("quanteda")
# install.packages("RCurl")
# install.packages("quanteda.textplots")
# install.packages("quanteda.textstats")
# install.packages("udpipe")

# For this session, we therefore "only" load an already compiled, zipped file with all R packages
# This speeds up the installation process (but is a "workaround")

# Create a folder called "library" for the precompiled packages.
# dir.create() needs no shell (so it also works on Windows) and, with
# showWarnings = FALSE, stays quiet if the folder already exists on re-run.
dir.create("library", showWarnings = FALSE)

# Download the R environment file containing the compiled packages.
# mode = "wb" forces a binary transfer so the archive is not corrupted on
# platforms that distinguish text and binary mode.
R_environment_file <- "https://drive.usercontent.google.com/download?id=1vmeZC68FTNNyanEl3c6DRWOvjumE1RMu&export=download&authuser=0&confirm=t&uuid=843db779-c069-4811-a326-2a9847eb1bbc&at=APZUnTUXVHG1k7a4evb5pcCJ3XQc:1717563767799"
download.file(R_environment_file, destfile = "./library.tar.gz", mode = "wb")

# Unpack the compressed R library file 'library.tar.gz'.
# NOTE: the second positional argument of untar() is `files`, not `exdir`, so
# this extracts the archive entry named "library" into the working directory
# (presumably the archive holds a top-level 'library' folder -- TODO confirm).
untar("library.tar.gz", files = "library")

# Register './library' as an R library directory so library() finds the packages
.libPaths("library")

In [None]:
# Attach the packages used in the rest of this notebook
library(dplyr)
library(quanteda)
library(RCurl)
library(quanteda.textplots)
library(quanteda.textstats)
library(udpipe)

# 2. Getting Text into R/Python


In [3]:
# Fetch the csv-file (ratings and content of TV series) from the Github
# repository as a single string ...
url <- getURL(
  "https://raw.githubusercontent.com/valeriehase/Salamanca-CSS-SummerSchool/main/Processing%20text%20and%20text%20as%20data/data_tvseries.csv"
)

# ... and parse that string; read.csv2() expects semicolon-separated fields
data <- read.csv2(text = url)

In [None]:
# Check the data by inspecting its first rows via head()
data %>%
  head()

In [None]:
# Inspect the odd-looking value of the variable "Year" for the first observation:
# keep only the first row, then only the column "Year"
data %>%
  slice(1) %>%
  select(Year)

# 3. Cleaning/Normalizing Text



## Cleaning Text via Regular Expressions



In [None]:
# Let's strip the leading number, point and blank space in front of each
# TV series name in our variable "Title" using gsub().
# Pattern: start of string, one or more digits, one punctuation character,
# one space.
data <- data %>%
  mutate(
    Title = gsub(
      pattern = "^[0-9]+[[:punct:]] ",
      replacement = "",
      x = Title
    )
  )

# Inspect the first five rows of the resulting data frame
head(data, n = 5)

In [None]:
# Ok, let's have some fun with this.
# Using the grepl() function, we find all TV series that contain the word "drama" in the variable "Description".
# We use filter() to identify these observations.
data %>%

  # filter all observations containing the word "drama" or "Drama".
  # Inside a bracket expression "|" is a literal character, not "or":
  # "[Dd]" matches an upper- or lower-case d and nothing else.
  filter(grepl("[Dd]rama", Description)) %>%

  # see first 5 rows of data set
  head(5)

In [None]:
# Let's get all observations that contain the word
# "drama" or the word "crime" in the variable "Description"
data %>%

  # filter all observations containing the word "drama" or the word "crime".
  # The "|" between the two alternatives means "or"; inside a bracket
  # expression "|" would be a literal character, so the classes are written
  # as "[Dd]" and "[Cc]".
  filter(grepl("[Dd]rama|[Cc]rime", Description)) %>%

  # see first rows of data set
  head(5)

In [None]:
# Your turn!
# Can you identify all series that are set in Spain?

In [None]:
# Your turn!
# Can you identify all series that deal with superheroes
# and replace the term "superhero"/"superheroes" in the variable "Description"
# with "fancy R programmers"?

## Normalizing text



In [10]:
# Run the preprocessing steps using tokens() and its helper functions:
# split into words (dropping punctuation and numbers), lower-case,
# remove English stop words, then stem.
tokens <- data$Description %>%
  tokens(
    what = "word",
    remove_punct = TRUE,
    remove_numbers = TRUE
  ) %>%
  tokens_tolower() %>%
  tokens_remove(stopwords("english")) %>%
  tokens_wordstem()

In [None]:
# Look at the original (unprocessed) first text
data[["Description"]][1]

In [None]:
# Look at the preprocessed version of the first text
# (first element of the `tokens` object created in the preprocessing step)
tokens[1]

In [12]:
# Your turn!
# Can you create a list of 3-5 stop words that you think are unique to this corpus
# and remove these as part of the existing preprocessing pipeline?

# 4. Text-as-Data Representations



## Bag-of-words approach: Document-feature matrix

In [None]:
# Create a document-feature matrix from the preprocessed tokens
dfm <- dfm(tokens)

# Check the result
dfm

In [None]:
# Check the 10 most frequent features.
# Piping into as.data.frame() yields a single column named ".",
# which is renamed to "count".
topfeatures <- dfm %>%
  topfeatures(n = 10) %>%
  as.data.frame() %>%
  rename(count = ".")

topfeatures

In [None]:
# Visualize the results with a word cloud of at most 100 words
dfm %>%
  textplot_wordcloud(max_words = 100)

## Beyond bag-of-words: Ngrams

In [None]:
# Get collocations that occur at least 10 times, sort them by descending
# lambda and show the first 10
textstat_collocations(tokens, min_count = 10) %>%
  arrange(-lambda) %>%
  head(n = 10)

## Beyond bag-of-words: Part-of-speech tagging

In [None]:
# Part-of-speech tagging of the descriptions with udpipe
data$Description %>%

  # change format for udpipe package: a table with the columns
  # "doc_id" and "text"
  as_tibble() %>%
  # seq_len(n()) rather than 1:n(): for empty input 1:n() would give c(1, 0)
  mutate(doc_id = paste0("text", seq_len(n()))) %>%
  rename(text = value) %>%

  # for simplicity, run for fewer documents (only the first one)
  slice(1) %>%

  # part-of-speech tagging, include only related variables
  udpipe("english") %>%
  select(doc_id, sentence_id, token_id, token, upos) %>%
  head(10)