# Topic modeling notebook

This notebook details the steps taken to clean the data and run topic modeling.

In [None]:
import pickle
from datetime import datetime

import numpy as np
import pandas as pd
import textstat
from gensim.utils import simple_preprocess
from lexical_diversity import lex_div as ld
from nltk.corpus import stopwords
from nltk.stem.snowball import SwedishStemmer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV

Import data

In [None]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df_full = pd.read_csv("../dataset/lawline_data.csv")

Select the question rows, then make a new column with the texts lowercased and every run of whitespace (including tabs and newlines) collapsed to a single space

In [None]:
# Questions are the even rows (::2); answers would be the odd rows (1::2).
# .copy() makes df an independent frame, so the column assignments in the
# following cells do not write into a slice of df_full (which triggers
# SettingWithCopyWarning and can silently lose the write).
df = df_full.iloc[::2, :].copy()

In [None]:
# Lowercase every text, then split on any whitespace (spaces, tabs, newlines)
# and re-join with single spaces. str() keeps non-string cells from raising.
df["text_clean"] = [
    " ".join(str(text).split()) for text in df["text"].str.lower()
]

Remove punctuation

In [None]:
# Strip everything that is neither a word character nor whitespace.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the text unchanged. The raw
# string avoids Python's invalid-escape warning for \w and \s.
df["text_clean"] = df["text_clean"].str.replace(r"[^\w\s]", "", regex=True)

Remove numbers and replace more than one whitespace with single whitespace

In [None]:
# Drop digits, then collapse the gaps they leave behind into single spaces.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the text unchanged. Raw strings
# avoid Python's invalid-escape warning for \s.
df["text_clean"] = df["text_clean"].str.replace(r"[0-9]", "", regex=True)
df["text_clean"] = df["text_clean"].str.replace(r"\s{2,}", " ", regex=True)

In [None]:
def remove_stopwords(texts, stop_words):
    """Tokenize each document and drop its stop words.

    Args:
        texts: Iterable of documents. Each one is converted with ``str`` and
            tokenized with gensim's ``simple_preprocess``.
        stop_words: Iterable of tokens to discard.

    Returns:
        A list with one list of kept tokens per document, in input order.
    """
    # Build the lookup once: set membership is O(1), whereas testing against
    # the caller's list is a linear scan for every single token.
    stop_set = frozenset(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

Tokenize the texts into lists of words and remove stop words. The tokens are later stemmed and converted into a document-term count matrix.

In [None]:
# Plain Python list of the cleaned texts, one string per document.
list_texts = df["text_clean"].to_list()

In [None]:
# NLTK's Swedish stop words plus corpus-specific additions. The same list is
# handed to the CountVectorizer after stemming.
# NOTE(review): entries such as "gäll" and "måst" look like stemmed forms meant
# for that second pass — confirm.
stop_words = stopwords.words("swedish") + [
    "hej",
    "ska",
    "in",
    "vill",
    "alltså",
    "lawline",
    "även",
    "kommer",
    "fråga",
    "finns",
    "gör",
    "får",
    "få",
    "gäll",
    "tack",
    "dock",
    "kap",
    "fall",
    "säg",
    "svar",
    "rätt",
    "enl",
    "måst",
]

# Tokenize every document and filter out the stop words.
text_words = remove_stopwords(list_texts, stop_words)

Stem words

In [None]:
stemmer = SwedishStemmer()

# Reduce every token of every document to its Snowball stem.
text_stemmed = [list(map(stemmer.stem, doc)) for doc in text_words]

In [None]:
# Re-join each document's stems into one space-separated string for the vectorizer.
docs_stemmed = list(map(" ".join, text_stemmed))

In [None]:
# Bag-of-words counts over the stemmed documents. Float min_df/max_df are
# document-frequency proportions: terms in fewer than 5% or more than 70% of
# the documents are dropped, as are the (partly stemmed) stop words.
cv = CountVectorizer(min_df=0.05, max_df=0.7, stop_words=stop_words)

# Learn the vocabulary and build the document-term count matrix in one pass.
trunc_texts = cv.fit_transform(docs_stemmed)

Train LDA model on corpus with differing number of topics

In [None]:
# Alternative, wider sweep over topic counts (currently disabled):
# n_components = [3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 20]
n_components = [6]
# 5 x 5 x 3 = 75 hyper-parameter combinations are searched per topic count.
# NOTE(review): recent scikit-learn releases validate LDA's prior parameters;
# confirm that values above 1 (5, 10) are accepted by the installed version,
# otherwise those fits fail and are scored as NaN — TODO confirm.
grid = {
    "doc_topic_prior": [0.1, 0.5, 1, 5, 10],
    "topic_word_prior": [0.1, 0.5, 1, 5, 10],
    "learning_decay": [0.5, 0.7, 1],
}

# One fitted GridSearchCV object per entry of n_components, in the same order.
lda_list = []
for n_topics in n_components:

    # Base estimator; random_state is fixed so repeated runs are reproducible.
    mod = LDA(
        max_iter=10,
        learning_method="online",
        random_state=42,
        batch_size=128,
        evaluate_every=-1,
        n_components=n_topics,
    )

    # No `scoring` is given, so candidates are ranked with the estimator's own
    # score method; n_jobs=6 runs up to six fits in parallel.
    clf = GridSearchCV(mod, param_grid=grid, n_jobs=6, verbose=1)

    clf.fit(trunc_texts)

    lda_list.append(clf)

In [None]:
# n_components holds a single value, so lda_list[0] is the only grid search;
# best_estimator_ is the estimator built from its best-scoring parameters.
model = lda_list[0].best_estimator_

In [None]:
# GridSearchCV refits the best parameter combination on the full input by
# default (refit=True), so `model` is already trained on trunc_texts. Fitting
# it again would only repeat the same seeded training run, so just display it.
model

In [None]:
def show_topics(vectorizer, lda_model, n_words=10):
    """Return the highest-weighted vocabulary terms of every topic.

    Args:
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.
        lda_model: Fitted model exposing ``components_`` with one row of term
            weights per topic.
        n_words: Number of top terms to return per topic.

    Returns:
        A list with one array of terms per topic, ordered by descending weight.
    """
    vocab = np.array(vectorizer.get_feature_names_out())
    # Negating the weights makes the ascending argsort yield descending order.
    return [
        vocab[np.argsort(-weights)[:n_words]] for weights in lda_model.components_
    ]

In [None]:
# Ten strongest terms for each topic of the selected model.
topic_keywords = show_topics(cv, model, n_words=10)

In [None]:
# One row per topic, one column per keyword rank; labels are built from the
# default integer positions ("Word 0", "Topic 0", ...).
df_topic_keywords = (
    pd.DataFrame(topic_keywords)
    .add_prefix("Word ")
    .rename(index="Topic {}".format)
)
# Show topics as columns so each keyword list reads top to bottom.
df_topic_keywords.transpose()

Save model with pickle

In [None]:
# Persist the fitted LDA model so it can be reloaded without retraining.
model_name = "../models/lda_q_6.pkl"
with open(model_name, mode="wb") as model_file:
    pickle.dump(model, model_file)

In [None]:
# Reload the model saved above. pickle.load can execute arbitrary code embedded
# in the file, so only load pickles produced by this project — never a file
# from an untrusted source.
model_name = "../models/lda_q_6.pkl"
with open(model_name, "rb") as f:
    model = pickle.load(f)

Get topic distribution, downsample to monthly average and save

In [None]:
# Topic distribution of every document: one row per document, one column per topic.
topic_dists = model.transform(trunc_texts)

In [None]:
# Sanity check: display the topic distribution of the first document.
topic_dists[0]

In [None]:
# Name the columns after however many topics the model actually produced,
# instead of hard-coding 6; a different topic count would otherwise raise a
# column-length mismatch here.
df_topic_q = pd.DataFrame(
    topic_dists, columns=[f"topic_{i}" for i in range(topic_dists.shape[1])]
)

In [None]:
# Parse the day/month/year strings in one vectorized call. The raw array is
# used (not the Series) because df keeps its original every-other-row index,
# while df_topic_q has a fresh 0..n-1 index: the dates must be attached by
# position, not by label.
df_topic_q["date"] = pd.to_datetime(df["date"].to_numpy(), format="%d/%m/%Y")

In [None]:
# Make the date the index (and drop it as a column) so the frame can be resampled.
df_topic_q = df_topic_q.set_index("date")

In [None]:
# Average each topic's weight per calendar month (bins labelled by month end).
# NOTE(review): pandas 2.2 deprecates the "M" alias in favour of "ME"; switch
# once the minimum supported pandas version allows it — TODO confirm version.
df_topic_q_down = df_topic_q.resample("M").mean()

In [None]:
# Save the monthly averages; the date index is written as the first CSV column.
df_topic_q_down.to_csv("../dataset/topic_q_downsampled.csv")