# 1. Install and load all necessary packages

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import gensim
from gensim import corpora
from gensim import matutils
from gensim.models import LdaModel
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# Ensure the NLTK resources used below are available locally.
# 'punkt' and 'punkt_tab' both back word_tokenize(): recent NLTK releases load
# the tokenizer from 'punkt_tab', older ones from the pickled 'punkt' models,
# so both are fetched to keep the notebook working across versions.
nltk.download('punkt')
nltk.download('punkt_tab')
# English stop-word list used during preprocessing
nltk.download('stopwords')

# 2. Load & Preprocess data


In [2]:
# Load the data (a semicolon-separated csv file with ratings and content of
# TV series) directly from the GitHub repository
url = (
    "https://raw.githubusercontent.com/valeriehase/Salamanca-CSS-SummerSchool/main/"
    "Processing%20text%20and%20text%20as%20data/data_tvseries.csv"
)
data = pd.read_csv(url, sep=";")

In [None]:
# Check the data by inspecting the first rows via head()
data.head()

In [4]:
# Porter stemmer and English stop-word set shared by the preprocessing step below
stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))

#Preprocess
def clean_description_dfm(description):
    """Turn one description into a single space-separated string of stems.

    The text is tokenized; tokens that are not purely alphabetic are dropped,
    the rest are lower-cased, filtered against the module-level `stop_words`
    set and stemmed with the module-level `stemmer`.
    """
    stems = []
    for token in word_tokenize(description):
        # Skip punctuation, numbers and other non-alphabetic tokens
        if not token.isalpha():
            continue
        lowered = token.lower()
        # Skip stop words (compared in lower case)
        if lowered in stop_words:
            continue
        stems.append(stemmer.stem(lowered))
    # Re-join as one string, the input format CountVectorizer works on
    return " ".join(stems)

# Clean every description: one space-joined string of stems per document
tokens_dfm = [clean_description_dfm(description) for description in data["Description"]]

# Create a document-feature matrix, with relative pruning: as floats, min_df
# and max_df are document proportions, so very rare and near-ubiquitous terms
# are excluded from the vocabulary
vectorizer = CountVectorizer(min_df = 0.004, max_df = .99)
dfm = vectorizer.fit_transform(tokens_dfm)

# Check result: slice the first five rows *before* densifying, so the full
# sparse matrix is never expanded into a dense one just for a preview
pd.DataFrame(dfm[:5].todense(), columns = vectorizer.get_feature_names_out())

# 3. Deciding on Model Parameters, here: K number of topics


## Statistical Fit


In [None]:
# Convert the scikit-learn document-feature matrix (documents in rows) into a
# gensim corpus, and build the id -> term mapping from the vectorizer vocabulary
corpus = matutils.Sparse2Corpus(dfm, documents_columns = False)
dictionary = dict(enumerate(vectorizer.get_feature_names_out()))

# Candidate numbers of topics. Defined once so that the models fitted below
# and the x-axis ticks of the plot cannot drift apart.
candidate_ks = [4, 6]

result = []
for k in candidate_ks:
    m = LdaModel(
        corpus,
        num_topics = k,
        id2word = dictionary,
        random_state = 2024,
    )
    # NOTE: per the gensim documentation, log_perplexity() returns the per-word
    # likelihood bound (perplexity itself would be 2 ** (-bound)), so for this
    # column higher values indicate a better fit.
    perplexity = m.log_perplexity(corpus)
    coherence = CoherenceModel(
        model = m, corpus = corpus, coherence = "u_mass"
    ).get_coherence()
    result.append(dict(k = k, perplexity = perplexity, coherence = coherence))

# One row per candidate K, plotted as two lines over K
result = pd.DataFrame(result)
result.plot(x = "k", y=["perplexity", "coherence"])
plt.xticks(candidate_ks)
plt.show()

## Interpretability


In [None]:
# Fit one candidate model per K (4 first, then 6), with the same corpus,
# vocabulary and seed for both
model_4K, model_6K = [
    LdaModel(corpus, num_topics = n_topics, id2word = dictionary, random_state = 2024)
    for n_topics in (4, 6)
]

#### Top Words


In [None]:
# Top words for K = 4: one column per topic, weights discarded
pd.DataFrame(
    {
        f"Topic {n}": [term for term, _ in topic_terms]
        for n, topic_terms in model_4K.show_topics(formatted=False)
    }
)

In [None]:
# Top words for K = 6: one column per topic, weights discarded
pd.DataFrame(
    {
        f"Topic {n}": [term for term, _ in topic_terms]
        for n, topic_terms in model_6K.show_topics(formatted=False)
    }
)

#### Top Documents


In [None]:
def get_representative_docs_for_topic(model, corpus, documents, topic_id, top_n = 5):
    """
    Extract the most representative documents for a specific topic in an LDA model.

    Parameters:
    - model: The trained LdaModel object (anything exposing
      get_document_topics(bow, minimum_probability=...)).
    - corpus: The corpus used for training the LDA model.
    - documents: The original documents corresponding to the corpus, in the same
      order. May be a list or a pandas Series/DataFrame column; documents are
      matched to corpus rows by position, not by index label.
    - topic_id: The topic ID for which to extract the most representative documents.
    - top_n: The number of most representative documents to extract for the topic.

    Returns:
    - representative_docs: A list of the most representative documents for the
      specified topic, ordered from highest to lowest topic probability. Empty
      if the topic never appears in any document's topic distribution.
    """
    scored_docs = []

    # Collect (document position, probability of the requested topic) pairs
    for doc_id, bow in enumerate(corpus):
        topic_distribution = model.get_document_topics(bow, minimum_probability=0)

        # A document whose distribution does not list the topic is skipped
        for tid, prob in topic_distribution:
            if tid == topic_id:
                scored_docs.append((doc_id, prob))

    # Highest probability first (stable sort: ties keep corpus order)
    scored_docs.sort(key=lambda x: x[1], reverse=True)
    top_docs = scored_docs[:top_n]

    # doc_id is a position in the corpus. Plain documents[doc_id] on a pandas
    # object is a *label* lookup, which fails or returns the wrong row once the
    # index is not 0..n-1 (e.g. after filtering), so use .iloc when available.
    positional = getattr(documents, "iloc", None)
    lookup = documents if positional is None else positional

    return [lookup[doc_id] for doc_id, _ in top_docs]

# Most representative document for the second topic of the K = 4 model
# (topic ids are zero-based, hence topic_id = 1)
representative_docs_for_topic = get_representative_docs_for_topic(
    model = model_4K,
    corpus = corpus,
    documents = data["Description"],
    topic_id = 1,
    top_n = 1,
)

# Show the representative document(s) for that topic
representative_docs_for_topic

# 4. Running the final model


In [None]:
# Final model: K = 4 topics, fitted with a fixed seed
model = LdaModel(
    corpus,
    id2word = dictionary,
    num_topics = 4,
    random_state = 2024,
)

## Check top words

In [None]:
# Check top words of the final model (`model`, not the exploratory `model_4K`,
# so this cell keeps describing the final model if its settings change)
pd.DataFrame(
    {
        f"Topic {n}": [w for (w, tw) in words]
        for (n, words) in model.show_topics(formatted=False)
    }
)

## Check top documents per topic

In [None]:
# Get the most representative document for the first topic (topic_id = 0) of
# the final model (`model`, not the exploratory `model_4K`)
get_representative_docs_for_topic(model = model, corpus = corpus,
                                  documents = data["Description"], topic_id = 0, top_n = 1)

## Visualize topic proportions

In [None]:
# Per-document topic distributions under the final model; minimum_probability = 0
# is passed so that low-probability topics are not filtered out
topic_distributions = [
    model.get_document_topics(bow, minimum_probability = 0)
    for bow in corpus
]

# Sum every topic's probability over all documents ...
num_topics = model.num_topics
topic_proportions = np.zeros(num_topics)
for doc_topics in topic_distributions:
    for topic_id, prop in doc_topics:
        topic_proportions[topic_id] += prop

# ... then divide by the number of documents to get the average share per topic
topic_proportions /= len(corpus)

# Bar chart with one bar per topic
plt.figure(figsize=(10, 6))
plt.bar(range(num_topics), topic_proportions, color='skyblue')
plt.title('Expected Topic Proportions Across the Corpus')
plt.xlabel('Topic ID')
plt.ylabel('Proportion')
plt.xticks(range(num_topics))
plt.show()