# References
- [Jordan Boyd-Graber Course](https://www.youtube.com/watch?v=fCmIceNqVog)
- [Gensim LDA Tutorial](https://radimrehurek.com/gensim/auto_examples/tutorials/run_lda.html#sphx-glr-auto-examples-tutorials-run-lda-py)

# Dataset

In [None]:
import pandas as pd

# Load the dataset, using the first CSV column as the index, and discard the
# "tag" and "id" columns.
df = pd.read_csv("reddit_data.csv", index_col=[0])
df = df.drop(columns=["tag", "id"])

# Keep only the rows whose "text" value is present.
df = df[df["text"].notna()]
df

# LDA with NLTK and Gensim

In [None]:
from gensim.parsing.preprocessing import STOPWORDS

# Placeholder tokens, lower-cased, that are added to the stop-word list.
CUSTOM_SW = list(map(str.lower, ["_NUMBER_", "_URL_", "_USER_", "_EMOJI_"]))

In [None]:
import nltk

# The WordNet lemmatizer used below needs the WordNet corpus itself in
# addition to the Open Multilingual Wordnet data, so fetch both to keep the
# notebook runnable on a fresh machine.
nltk.download("wordnet")
nltk.download("omw-1.4")

In [None]:
# Raw documents: one string per row of the "text" column.
docs = df["text"].to_list()

# Stop words = gensim's built-in list plus the custom placeholder tokens.
# A frozenset keeps the per-token membership test below O(1) instead of
# scanning a list for every token.
stoplist = frozenset(STOPWORDS).union(CUSTOM_SW)

In [None]:
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer

# Set membership is O(1); `stoplist` may be a plain list.
stopword_set = set(stoplist)
tokenizer = RegexpTokenizer(r"\w+")

# Lower-case each document, split it into word tokens and, in a single pass,
# drop:
#   * purely numeric tokens (words that merely contain digits are kept),
#   * stop words,
#   * tokens that are only one character long.
docs = [
    [
        token
        for token in tokenizer.tokenize(doc.lower())
        if not token.isnumeric() and token not in stopword_set and len(token) > 1
    ]
    for doc in docs
]

In [None]:
# Reduce every token to its WordNet lemma.
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
docs = [list(map(lemmatizer.lemmatize, doc)) for doc in docs]

In [None]:
# Detect frequent bigrams.
from gensim.models import Phrases

# Train the phrase model with a minimum count of 20.
bigram = Phrases(docs, min_count=20)
for doc in docs:
    # Tokens containing "_" are treated as detected bigrams; they are appended
    # to the document alongside the original unigrams.
    detected = [token for token in bigram[doc] if "_" in token]
    doc.extend(detected)

In [None]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents (token <-> integer id).
dictionary = Dictionary(docs)

# Filter out tokens that occur in fewer than 10 documents, or in more than 50%
# of the documents.
dictionary.filter_extremes(no_below=10, no_above=0.5)

In [None]:
# Convert every tokenized document into its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, docs))

In [None]:
# Report the vocabulary size and the number of documents in the corpus.
print(f"Number of unique tokens: {len(dictionary):d}")
print(f"Number of documents: {len(corpus):d}")

In [None]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters (each one is forwarded to LdaModel below).
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make an index-to-word dictionary.
# NOTE(review): the lookup on the next line is kept only for its side effect;
# it appears to be required so that `dictionary.id2token` is populated before
# it is read — confirm against the gensim Dictionary documentation.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

# Fit the model on the bag-of-words corpus. alpha and eta are both set to
# "auto" rather than to fixed values.
model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha="auto",
    eta="auto",
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
)

In [None]:
from pprint import pprint

# Each entry of `top_topics` is indexed below as entry[1] for its coherence.
top_topics = model.top_topics(corpus)

# Average topic coherence is the sum of topic coherences of all topics, divided
# by the number of topics. The count is taken from the result itself rather
# than from the global `num_topics`, which other cells rebind to a different
# value.
avg_topic_coherence = sum(topic[1] for topic in top_topics) / len(top_topics)
print("Average topic coherence: %.4f." % avg_topic_coherence)

pprint(top_topics)

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models

# Build the pyLDAvis data for the trained model from the model, the
# bag-of-words corpus and the dictionary, then render it inline.
vis_data = pyLDAvis.gensim_models.prepare(model, corpus, dictionary)
pyLDAvis.display(vis_data)

# Gensim (EnsembleLDA) + NLTK

In [None]:
from gensim.models import Phrases
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

docs = df.text.to_list()
stoplist = list(STOPWORDS) + CUSTOM_SW
# Set membership is O(1), unlike scanning the list for every token.
stopword_set = frozenset(stoplist)

tokenizer = RegexpTokenizer(r"\w+")
lemmatizer = WordNetLemmatizer()

# Lower-case and tokenize each document, then drop purely numeric tokens
# (words that merely contain digits are kept), stop words and one-character
# tokens. Every surviving token is lemmatized exactly once: WordNet
# lemmatization is not guaranteed to be idempotent, so repeating the step
# could alter tokens a second time.
docs = [
    [
        lemmatizer.lemmatize(token)
        for token in tokenizer.tokenize(doc.lower())
        if not token.isnumeric() and token not in stopword_set and len(token) > 1
    ]
    for doc in docs
]

# Detect frequent bigrams (minimum count of 20) and append them to each
# document alongside the original unigrams.
bigram = Phrases(docs, min_count=20)
for doc in docs:
    # Tokens containing "_" are treated as detected bigrams.
    doc.extend([token for token in bigram[doc] if "_" in token])

In [None]:
from gensim.corpora import Dictionary
from nltk.stem.wordnet import WordNetLemmatizer

# `docs` is already lemmatized by the preprocessing cell. Build the dictionary
# from exactly the tokens that are later passed to doc2bow: lemmatizing again
# only for the dictionary could produce vocabulary entries that differ from
# the tokens in `docs`, and doc2bow ignores tokens it does not know.
dictionary = Dictionary(docs)

# Drop tokens that occur in fewer than 20 documents or in more than 50% of
# the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [None]:
from gensim.models import EnsembleLda
from gensim.models import LdaModel

# Ensemble configuration. Every value below is forwarded to EnsembleLda under
# the keyword of the same name.
topic_model_class = LdaModel
ensemble_workers = 4
num_models = 4
distance_workers = 4
num_topics = 20
passes = 8

ensemble = EnsembleLda(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    passes=passes,
    num_models=num_models,
    # Use the configured class so that changing `topic_model_class` above
    # actually takes effect.
    topic_model_class=topic_model_class,
    ensemble_workers=ensemble_workers,
    distance_workers=distance_workers,
)

# Compare the size of `ensemble.ttda` with the number of topics returned by
# `get_topics()`.
print(len(ensemble.ttda))
print(len(ensemble.get_topics()))

In [None]:
from pathlib import Path

# Hard-coded, date-stamped output folder for the model.
path = Path("../models/topics/lda_10topics/2022/03/10/")
# Create the folder (and any missing parents); do nothing if it already exists.
path.mkdir(parents=True, exist_ok=True)
# Saves whatever `model` currently refers to in the notebook session.
model.save(str(path / "model"))

In [None]:
# Shell escape: create the directory below, including missing parents.
# NOTE(review): this is `topics/lda/...`, not the `topics/lda_10topics/...`
# folder used by the save and load cells — confirm it is still needed.
!mkdir -p ../models/topics/lda/2022/03/10/

In [None]:
from gensim.models.ldamodel import LdaModel

# Load the model back from `path`, the folder set in the save cell, as a
# check that the saved files can be read.
model_loaded = LdaModel.load(str(path / "model"))

# Create multiple versions of the LDA model

In [None]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import Phrases
from gensim.models import LdaModel

from gensim.parsing.preprocessing import STOPWORDS

# Extra stop words, lower-cased: the placeholder tokens plus a few additional
# words to exclude.
CUSTOM_SW = list(
    map(
        str.lower,
        ["_NUMBER_", "_URL_", "_USER_", "_EMOJI_", "ve", "use", "like", "work"],
    )
)


def get_model_name(num_topics, use_bigrams, bigrams_min_count, no_below, no_above):
    """Return the name that identifies one LDA training configuration.

    The name has the form ``lda_<num_topics>topics_<no_below>_<no_above>_<tag>``
    where ``<tag>`` is ``bi<bigrams_min_count>`` when ``use_bigrams`` is truthy
    and ``nobi`` otherwise (``bigrams_min_count`` is then not part of the name).
    """
    if use_bigrams:
        bigram_tag = "bi" + str(bigrams_min_count)
    else:
        bigram_tag = "nobi"
    return f"lda_{num_topics}topics_{no_below}_{no_above}_{bigram_tag}"


def train_lda_model(
    num_topics=10, use_bigrams=False, bigrams_min_count=20, no_below=10, no_above=0.5
):
    """Preprocess the global ``df.text`` column and train an LDA model on it.

    Each document is lower-cased and split into ``\\w+`` tokens. Purely numeric
    tokens, stop words (gensim's ``STOPWORDS`` plus ``CUSTOM_SW``) and
    one-character tokens are removed, and the remaining tokens are lemmatized
    with WordNet. Optionally, detected bigrams are appended to each document.

    Args:
        num_topics: Number of topics requested from ``LdaModel``.
        use_bigrams: When true, train a ``Phrases`` model and append the
            detected bigrams to every document.
        bigrams_min_count: ``min_count`` passed to ``Phrases``; ignored when
            ``use_bigrams`` is false.
        no_below: ``no_below`` passed to ``Dictionary.filter_extremes``.
        no_above: ``no_above`` passed to ``Dictionary.filter_extremes``.

    Returns:
        A ``(model, corpus, dictionary)`` tuple: the trained ``LdaModel``, the
        bag-of-words corpus it was trained on, and the filtered dictionary.
    """
    # A set gives O(1) membership tests instead of scanning a list per token.
    stopwords = frozenset(STOPWORDS).union(CUSTOM_SW)
    tokenizer = RegexpTokenizer(r"\w+")
    lemmatizer = WordNetLemmatizer()

    # Tokenize, filter and lemmatize in a single pass. The filters are applied
    # to the raw token, before lemmatization.
    docs = [
        [
            lemmatizer.lemmatize(token)
            for token in tokenizer.tokenize(text.lower())
            if not token.isnumeric() and token not in stopwords and len(token) > 1
        ]
        for text in df.text.to_list()
    ]

    if use_bigrams:
        bigram = Phrases(docs, min_count=bigrams_min_count)
        for doc in docs:
            # Tokens containing "_" are treated as detected bigrams and are
            # appended alongside the original unigrams.
            doc.extend([token for token in bigram[doc] if "_" in token])

    # Create a dictionary representation of the documents and prune it
    # according to the requested document-frequency bounds.
    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)

    # Bag-of-words representation of the documents.
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    # Fixed training parameters.
    chunksize = 2000
    passes = 20
    iterations = 400
    eval_every = None  # Don't evaluate model perplexity, takes too much time.

    model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        chunksize=chunksize,
        alpha="auto",
        eta="auto",
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
        eval_every=eval_every,
    )
    return model, corpus, dictionary


import datetime
from pathlib import Path  # imported here so this cell does not rely on an earlier one

# Base directory; save_model and load_model build their paths under
# ROOT_DIR / "models".
ROOT_DIR = Path("..")


def save_model(name, model, corpus, dictionary):
    """Write a model, its dictionary and its corpus to a dated folder.

    The files ``model``, ``dictionary`` and ``corpus`` are written to
    ``ROOT_DIR/models/topics/<name>/<YYYY>/<MM>/<DD>`` for today's date; the
    folder is created when missing.
    """
    today = datetime.date.today()
    out_dir = ROOT_DIR.joinpath("models", "topics", name, today.strftime("%Y/%m/%d"))
    out_dir.mkdir(parents=True, exist_ok=True)

    model_path = str(out_dir / "model")
    dictionary_path = str(out_dir / "dictionary")
    corpus_path = str(out_dir / "corpus")

    model.save(model_path)
    dictionary.save(dictionary_path)
    # The corpus is serialized through MmCorpus.
    MmCorpus.serialize(corpus_path, corpus)


def load_model(name):
    """Load the most recently saved model for the configuration ``name``.

    Looks under ``ROOT_DIR/models/topics/<name>`` and descends one level at a
    time (year, then month, then day), each time into the sub-folder with the
    greatest name. The ``%Y/%m/%d`` folder names written by ``save_model`` are
    zero-padded, so the greatest name is the latest date.

    Args:
        name: Configuration name, as produced by ``get_model_name``.

    Returns:
        A ``(model, dictionary, corpus)`` tuple. Note that this order differs
        from the ``(model, corpus, dictionary)`` order returned by
        ``train_lda_model``.

    Raises:
        FileNotFoundError: If the configuration folder, or any date level
            inside it, does not exist.
    """
    model_folder = ROOT_DIR / "models" / "topics" / name

    # Re-read the directory at every level. Listing only the top folder would
    # pick the year three times, and joining a path that starts with "/" would
    # make pathlib discard ROOT_DIR.
    for level in ("year", "month", "day"):
        subfolders = [entry.name for entry in model_folder.iterdir() if entry.is_dir()]
        if not subfolders:
            raise FileNotFoundError(f"No {level} folder found in {model_folder}")
        model_folder = model_folder / max(subfolders)

    model = LdaModel.load(str(model_folder / "model"))
    dictionary = Dictionary.load(str(model_folder / "dictionary"))
    corpus = MmCorpus(str(model_folder / "corpus"))
    return (
        model,
        dictionary,
        corpus,
    )

In [None]:
from itertools import product

# Hyper-parameter grid: every combination below is trained and saved.
num_topicss = [5, 10, 20, 50]
use_bigramss = [True, False]
bigrams_min_counts = [5, 20]
no_belows = [5, 10, 20, 50]
no_aboves = [0.5, 0.99]

# Names already trained in this run. When use_bigrams is False the value of
# bigrams_min_count changes neither the model name nor the preprocessing, so
# those combinations would otherwise be trained twice and saved to the same
# folder.
trained_names = set()

for num_topics, use_bigrams, bigrams_min_count, no_below, no_above in product(
    num_topicss, use_bigramss, bigrams_min_counts, no_belows, no_aboves
):
    name = get_model_name(
        num_topics, use_bigrams, bigrams_min_count, no_below, no_above
    )
    if name in trained_names:
        continue
    trained_names.add(name)

    print(name)
    model, corpus, dictionary = train_lda_model(
        num_topics, use_bigrams, bigrams_min_count, no_below, no_above
    )
    save_model(name, model, corpus, dictionary)

In [None]:
# Infer the topic distribution of a small, hand-written query document.
query_tokens = ["looking", "dataset", "pets", "features", "columns"]

# The training documents were lemmatized before the dictionary was built, so
# the query must be lemmatized the same way. Otherwise forms such as "pets" or
# "columns" are not in the dictionary and doc2bow leaves them out of the
# bag-of-words.
query_lemmatizer = WordNetLemmatizer()
bow = dictionary.doc2bow([query_lemmatizer.lemmatize(token) for token in query_tokens])
model.get_document_topics(bow)