# 1. Install and load all necessary packages

In [None]:
# Install relevant packages (only those that are not installed yet, so that
# re-running this cell does not reinstall everything from scratch)
required_packages <- c("dplyr", "RCurl", "quanteda", "stm", "reshape2", "ggplot2")
is_installed <- vapply(required_packages, requireNamespace, logical(1),
                       quietly = TRUE)
missing_packages <- required_packages[!is_installed]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

In [None]:
# Attach the packages used throughout this notebook.
# The order is kept as-is because it determines which package masks which.
library(dplyr)
library(RCurl)
library(quanteda)
library(stm)
library(reshape2)
library(ggplot2)

# 2. Load & Preprocess data


In [5]:
# We load data (a csv-file with ratings and content of TV series) from the Github repository.
# getURL() downloads the file content as text; read.csv2() then parses that text.
csv_text <- getURL("https://raw.githubusercontent.com/valeriehase/Salamanca-CSS-SummerSchool/main/Topic%20modeling/data_tvseries.csv")
data <- read.csv2(text = csv_text)

In [None]:
# Inspect the first six rows of the loaded data
head(data, n = 6L)

In [None]:
# Preprocessing, step by step.
# Note: the results are stored under the names `tokens` and `dfm`, which are
# also the names of the quanteda functions; later cells rely on `dfm`.

# 1) Split descriptions into word tokens, dropping punctuation and numbers
tokens <- tokens(data$Description, what = "word",
                 remove_punct = TRUE, remove_numbers = TRUE)
# 2) Lowercase everything
tokens <- tokens_tolower(tokens)
# 3) Drop English stopwords
tokens <- tokens_remove(tokens, stopwords("english"))
# 4) Reduce words to their stems
tokens <- tokens_wordstem(tokens)

# Additional step: relative pruning. Build the document-feature matrix, then
# keep only features occurring in at least 0.5% and at most 99% of documents.
dfm <- dfm(tokens)
dfm <- dfm_trim(dfm,
                min_docfreq = 0.005,
                max_docfreq = 0.99,
                docfreq_type = "prop",
                verbose = TRUE)

In [None]:
# Check result: display the trimmed document-feature matrix created above
dfm

# 3. Deciding on Model Parameters, here: K number of topics


In [10]:
# Convert the quanteda dfm into the input format of the stm package.
# Later cells read `dfm_stm$documents` and `dfm_stm$vocab` from the result.
dfm_stm <- quanteda::convert(dfm, to = "stm")

## Statistical Fit


In [None]:
# Candidate numbers of topics to compare
K <- c(4, 6)

# Fit one model per candidate K and collect fit diagnostics.
# heldout.seed fixes the random held-out split so that the diagnostics are
# reproducible when the cell is re-run.
fit <- searchK(dfm_stm$documents, dfm_stm$vocab, K = K,
               heldout.seed = 1234, verbose = TRUE)

# Collect the diagnostics in one table (one row per K).
# NOTE(review): the "Perplexity" column holds `fit$results$heldout`; the name
# is kept to preserve the existing plot labels -- confirm it is the intended label.
fit_stats <- data.frame("K" = K,
                        "Coherence" = unlist(fit$results$semcoh),
                        "Perplexity" = unlist(fit$results$heldout))

# Reshape to long format: one row per (K, diagnostic) pair
fit_stats <- melt(fit_stats, id.vars = "K")

# Plot result: one panel per diagnostic, x-axis ticks at the tested K values
ggplot(fit_stats, aes(K, value, color = variable)) +
  geom_line(linewidth = 1.5, show.legend = FALSE) +
  scale_x_continuous(breaks = K) +
  facet_wrap(~ variable, scales = "free_y") +
  labs(x = "Number of topics K",
       title = "Statistical fit of models with different K")

## Interpretability


In [None]:
# Fit one candidate model per topic count so both can be inspected side by side
model_4K <- stm(dfm_stm$documents, dfm_stm$vocab, K = 4)
model_6K <- stm(dfm_stm$documents, dfm_stm$vocab, K = 6)

#### Top Words


In [None]:
# K = 4: table of the 10 top FREX terms, one column per topic
frex_4 <- labelTopics(model_4K, n = 10)$frex
topics_4 <- data.frame(features = t(frex_4))
names(topics_4) <- paste("Topics", 1:4)
topics_4

In [None]:
# K = 6: table of the 10 top FREX terms, one column per topic
frex_6 <- labelTopics(model_6K, n = 10)$frex
topics_6 <- data.frame(features = t(frex_6))
names(topics_6) <- paste("Topics", 1:6)
topics_6

#### Top Documents


In [None]:
# Retrieve one example text (n = 1) for topic 2 of the 4-topic model.
# NOTE(review): presumably the number of texts must equal the number of
# documents the model was fitted on -- confirm no documents were dropped
# during trimming/conversion.
findThoughts(model_4K, texts = data$Description, topics = 2, n = 1)

# 4. Running the final model


In [None]:
# Fit the final model with the number of topics you decided on (here: K = 4)
model <- stm(dfm_stm$documents, dfm_stm$vocab, K = 4)

## Check top words

In [None]:
# Save top 15 features across topics and forms of weighting
labels <- labelTopics(model, n = 15)

# Only keep FREX weighting; transpose so that each topic becomes a column
topwords <- data.frame("features" = t(labels$frex))

# Assign topic number as column name. The number of topics is taken from the
# table itself, so this keeps working if the final model uses a different K.
colnames(topwords) <- paste("Topics", seq_len(ncol(topwords)))

# Return the result (all topic columns)
topwords

## Check top documents per topic

In [None]:
# Build a table from the final model with make.dt(); it is expected to contain
# a `Topic1` column, which is used for sorting below.
theta <- make.dt(model)

# Sort by Topic1 in descending order and show the first rows, i.e. the
# documents ranked highest for topic 1
theta_by_topic1 <- arrange(theta, desc(Topic1))
head(theta_by_topic1)

In [None]:
# Check a related document: print the description stored in row 345 of the data
# NOTE(review): 345 is hard-coded -- presumably taken from the ranking in the
# previous cell; confirm it still matches after re-running the model and that
# row numbers in `data` line up with the model's document numbers.
data$Description[345]

## Visualize topic proportions

In [None]:
# Summary plot of the final model ("summary" is the default plot type for stm models)
plot(model, type = "summary")