# Topic modeling notebook

This notebook details the steps taken to clean the data and run topic modeling

In [10]:
import pickle
from datetime import datetime

import numpy as np
import pandas as pd
import textstat
from gensim.utils import simple_preprocess
from lexical_diversity import lex_div as ld
from nltk.corpus import stopwords
from nltk.stem.snowball import SwedishStemmer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV

Import data

In [2]:
df_full = pd.read_csv("../dataset/lawline_data.csv")

Make new column with lowercase texts and remove all whitespace plus tabs/newlines

In [3]:
# Get only questions(::2)/answers(1::2)
df = df_full.iloc[::2, :]

In [4]:
df["text_clean"] = df["text"].str.lower()
df["text_clean"] = [" ".join(str(item).split()) for item in df["text_clean"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text_clean"] = df["text"].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text_clean"] = [" ".join(str(item).split()) for item in df["text_clean"]]


Remove punctuation

In [5]:
df["text_clean"] = df["text_clean"].str.replace("[^\w\s]", "")

  df["text_clean"] = df["text_clean"].str.replace("[^\w\s]", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text_clean"] = df["text_clean"].str.replace("[^\w\s]", "")


Remove numbers and replace more than one whitespace with single whitespace

In [6]:
df["text_clean"] = df["text_clean"].str.replace("[0-9]", "")
df["text_clean"] = df["text_clean"].str.replace("\s{2,}", " ")

  df["text_clean"] = df["text_clean"].str.replace("[0-9]", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text_clean"] = df["text_clean"].str.replace("[0-9]", "")
  df["text_clean"] = df["text_clean"].str.replace("\s{2,}", " ")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text_clean"] = df["text_clean"].str.replace("\s{2,}", " ")


In [7]:
# Removing stopwords function
def remove_stopwords(texts, stop_words):
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_words]
        for doc in texts
    ]

Turn texts to list of words and remove stop words. Then turn texts to term document frequency corpus.

In [8]:
# Turn texts to list
list_texts = df["text_clean"].values.tolist()

In [9]:
stop_words = stopwords.words("swedish")
stop_words.extend(
    [
        "hej",
        "ska",
        "in",
        "vill",
        "alltså",
        "lawline",
        "även",
        "kommer",
        "fråga",
        "finns",
        "gör",
        "får",
        "få",
        "gäll",
        "tack",
        "dock",
        "kap",
        "fall",
        "säg",
        "svar",
        "rätt",
        "enl",
        "måst",
    ]
)

text_words = remove_stopwords(list_texts, stop_words)

Stem words

In [11]:
stemmer = SwedishStemmer()

text_stemmed = [[stemmer.stem(word) for word in doc] for doc in text_words]

In [12]:
docs_stemmed = [" ".join(doc) for doc in text_stemmed]

In [13]:
cv = CountVectorizer(min_df=0.05, max_df=0.7, stop_words=stop_words)

trunc_texts = cv.fit_transform(docs_stemmed)

Train LDA model on corpus with differing number of topics

In [15]:
# n_components = [3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 20]
n_components = [6]
grid = {
    "doc_topic_prior": [0.1, 0.5, 1, 5, 10],
    "topic_word_prior": [0.1, 0.5, 1, 5, 10],
    "learning_decay": [0.5, 0.7, 1],
}

lda_list = []
for n_topics in n_components:

    mod = LDA(
        max_iter=10,
        learning_method="online",
        random_state=42,
        batch_size=128,
        evaluate_every=-1,
        n_components=n_topics,
    )

    clf = GridSearchCV(mod, param_grid=grid, n_jobs=6, verbose=1)

    clf.fit(trunc_texts)

    lda_list.append(clf)

Fitting 5 folds for each of 75 candidates, totalling 375 fits


In [16]:
model = lda_list[0].best_estimator_

In [17]:
model.fit(trunc_texts)

In [40]:
def show_topics(vectorizer, lda_model, n_words=10):
    keywords = np.array(vectorizer.get_feature_names_out())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

In [63]:
topic_keywords = show_topics(vectorizer=cv, lda_model=model, n_words=10)

In [68]:
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = [
    "Word " + str(i) for i in range(df_topic_keywords.shape[1])
]
df_topic_keywords.index = ["Topic " + str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords.transpose()

Unnamed: 0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5
Word 0,person,fast,avtal,barn,betal,lägen
Word 1,brott,sälj,lag,egendom,arbetsgiv,del
Word 2,beslut,köp,vänd,mak,år,se
Word 3,domstol,bil,innebär,bodelning,peng,använd
Word 4,dom,fel,avtalet,barnet,skuld,skad
Word 5,uppgift,huset,vän,del,tid,bost
Word 6,brottet,köpt,kräv,testament,kr,andr
Word 7,vänd,gåv,tid,arv,tillbak,ta
Word 8,polis,kräv,hälsning,ärv,dag,flytt
Word 9,andr,kunn,möj,enskild,månad,ekonomisk


Save model with pickle

In [18]:
model_name = "../models/lda_q_6.pkl"
with open(model_name, "wb") as f:
    pickle.dump(model, f)

In [19]:
model_name = "../models/lda_q_6.pkl"
with open(model_name, "rb") as f:
    model = pickle.load(f)

Get topic distribution, downsample to monthly average and save

In [20]:
topic_dists = model.transform(trunc_texts)

In [21]:
topic_dists[0]

array([0.0079386 , 0.00793745, 0.00793973, 0.00793797, 0.49679505,
       0.4714512 ])

In [88]:
df_topic_q = pd.DataFrame(topic_dists, columns=[f"topic_{i}" for i in range(6)])

In [89]:
from datetime import datetime

df_topic_q["date"] = [
    datetime.strptime(date_str, "%d/%m/%Y") for date_str in df["date"].values
]

In [90]:
df_topic_q.set_index("date", drop=True, inplace=True)

In [91]:
df_topic_q_down = df_topic_q.resample("M").mean()

In [93]:
df_topic_q_down.to_csv("../dataset/topic_q_downsampled.csv")