# Topic modeling notebook

This notebook details the steps taken to clean the data and run topic modeling

In [30]:
import gensim.corpora as corpora
import numpy as np
import pandas as pd
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.stem.snowball import SwedishStemmer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

Import data

In [31]:
df_full = pd.read_csv("../dataset/lawline_data.csv")

Make new column with lowercase texts and remove all whitespace plus tabs/newlines

In [32]:
# Get only questions/answers
df = df_full.iloc[::2, :]

In [33]:
df["text_clean"] = df["text"].str.lower()
df["text_clean"] = [" ".join(str(item).split()) for item in df["text_clean"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text_clean'] = df['text'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text_clean'] = [' '.join(str(item).split()) for item in df['text_clean']]


Remove punctuation

In [34]:
df["text_clean"] = df["text_clean"].str.replace("[^\w\s]", "")

  df['text_clean'] = df['text_clean'].str.replace('[^\w\s]', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text_clean'] = df['text_clean'].str.replace('[^\w\s]', '')


Remove numbers and replace more than one whitespace with single whitespace

In [35]:
df["text_clean"] = df["text_clean"].str.replace("[0-9]", "")
df["text_clean"] = df["text_clean"].str.replace("\s{2,}", " ")

  df['text_clean'] = df['text_clean'].str.replace('[0-9]', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text_clean'] = df['text_clean'].str.replace('[0-9]', '')
  df['text_clean'] = df['text_clean'].str.replace('\s{2,}', ' ')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text_clean'] = df['text_clean'].str.replace('\s{2,}', ' ')


In [36]:
# Removing stopwords function
def remove_stopwords(texts, stop_words):
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_words]
        for doc in texts
    ]

Turn texts to list of words and remove stop words. Then turn texts to term document frequency corpus.

In [37]:
# Turn texts to list
list_texts = df["text_clean"].values.tolist()

In [38]:
stop_words = stopwords.words("swedish")
stop_words.extend(["hej", "ska", "in", "vill", "alltså", "lawline", "även"])

text_words = remove_stopwords(list_texts, stop_words)

Stem words

In [39]:
stemmer = SwedishStemmer()

text_stemmed = [[stemmer.stem(word) for word in doc] for doc in text_words]

In [40]:
docs_stemmed = [" ".join(doc) for doc in text_stemmed]

In [41]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(min_df=0.15, max_df=0.85, stop_words=stopwords.words("swedish"))

trunc_texts = cv.fit_transform(docs_stemmed)

Train LDA model on corpus with differing number of topics

In [42]:
# First take Grid Search approach
grid = {
    #'doc_topic_prior': [.05, .1, .5, 1, 5, 10],
    #'topic_word_prior': [.05, .1, .5, 1, 5, 10],
    "n_components": [3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 20],
    #'learning_decay': [.5, .7, 1]
}

mod = LDA(
    max_iter=10,
    learning_method="online",
    random_state=42,
    batch_size=128,
    evaluate_every=-1,
)

# clf = RandomizedSearchCV(mod, param_distributions=grid, n_jobs=-1,
#                          n_iter=10, verbose=1)

clf = GridSearchCV(mod, param_grid=grid, n_jobs=-1, verbose=1)

clf.fit(trunc_texts)

Fitting 5 folds for each of 11 candidates, totalling 55 fits


KeyboardInterrupt: 

In [23]:
model = clf.best_estimator_

In [12]:
lda_list = []
num_topics = [3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 20]

for num_topic in num_topics:
    mod = LDA(
        n_components=num_topic,
        max_iter=10,
        learning_method="online",
        random_state=42,
        batch_size=128,
        evaluate_every=-1,
        n_jobs=-1,
    )

    mod.fit(trunc_texts)

    lda_list.append(mod)

KeyboardInterrupt: 

In [None]:
perp_list = [mod.perplexity(trunc_texts) for mod in lda_list]

In [None]:
perp_list

In [None]:
model = lda_list[1]

In [24]:
model.fit_transform(trunc_texts)

array([[0.25756578, 0.06250098, 0.23794881, ..., 0.1274021 , 0.06429844,
        0.12503265],
       [0.02619643, 0.11613501, 0.04569761, ..., 0.10614123, 0.12610509,
        0.35032662],
       [0.06019153, 0.05267255, 0.36153796, ..., 0.15968172, 0.0563115 ,
        0.10621732],
       ...,
       [0.03644769, 0.20400431, 0.07238093, ..., 0.34555503, 0.06999842,
        0.08215634],
       [0.10021088, 0.20131982, 0.10002629, ..., 0.10033306, 0.195307  ,
        0.10034966],
       [0.07692671, 0.08426497, 0.07692308, ..., 0.17045307, 0.22731104,
        0.08578097]])

In [25]:
topicnames = ["Topic" + str(i) for i in range(model.n_components)]
df_topic_keywords = pd.DataFrame(model.components_)
df_topic_keywords.index = topicnames

In [26]:
df_topic_keywords

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,36,37,38,39,40,41,42,43,44,45
Topic0,154.187145,1.510541,0.100016,11.215323,0.100017,0.100019,19.361503,590.095924,0.100022,5.318551,...,3.327713,0.100021,30.095829,66158.5576,40855.9507,0.449685,2.557207,0.22881,15159.559355,69071.072458
Topic1,10224.87187,983.917846,0.10001,822.665069,0.100016,0.100017,37458.633086,5179.528547,0.750411,72425.57974,...,328.957354,0.100021,367.533959,0.103667,0.10002,0.100019,30.563796,314.032476,29058.007533,0.100014
Topic2,61.804112,0.424129,0.100013,35211.559621,60797.470774,0.100019,10.506632,268.111232,0.10002,0.108266,...,56446.29768,41283.548772,20.037134,0.100052,0.10002,80919.916001,0.104183,0.289335,1906.759769,0.100016
Topic3,15359.187962,6783.401664,0.100014,726.276421,0.100015,0.10002,7340.134007,243.438632,13.20241,5193.692136,...,2.079978,0.11044,2893.18771,0.100018,0.100021,0.100019,1732.667639,3587.297099,11389.679934,0.100016
Topic4,8000.120016,59313.700731,86051.223148,42.645277,0.100015,5.163783,1415.506107,264.452796,0.100021,74.800024,...,0.13324,0.10881,319.247714,0.100019,398.722536,44.485155,12.834929,47.341063,4887.889734,0.100016
Topic5,8114.172843,251.543519,0.100018,1352.34931,0.100018,78124.674068,2856.892611,42149.828511,0.100021,249.106843,...,0.100558,15.111134,180.589356,0.100019,1.125788,0.179454,6.601214,72.116352,4703.670986,0.100015
Topic6,3018.957502,0.1036,0.100011,55.889715,0.100014,0.100016,114.682931,65.912295,0.10002,5.568021,...,0.100019,0.100019,59326.013907,0.100017,0.100017,0.100016,62510.29852,59660.653014,101.808016,0.100012
Topic7,22142.931607,105.851756,0.100016,123.407842,0.100017,0.100022,24083.96823,922.121789,107841.20721,1251.643189,...,22.927837,0.115184,1117.022938,0.139208,0.100026,0.100021,124.109183,497.924494,8769.992107,0.100015


In [27]:
def show_topics(vectorizer, lda_model, n_words=20):
    keywords = np.array(vectorizer.get_feature_names_out())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords


topic_keywords = show_topics(vectorizer=cv, lda_model=model, n_words=15)

In [28]:
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = [
    "Word " + str(i) for i in range(df_topic_keywords.shape[1])
]
df_topic_keywords.index = ["Topic " + str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,gör,år,tid,två,även,fick,finn,fått,måst,eftersom,får,fråg,svar,möj,alltså
Topic 1,gäll,fall,lag,kräv,finn,kunn,dock,måst,även,innebär,alltså,eftersom,möj,fråg,andr
Topic 2,vill,betal,säg,ta,behöv,fått,måst,fick,får,även,svar,eftersom,finn,gör,få
Topic 3,kap,person,se,finn,innebär,alltså,även,genom,dock,andr,fall,måst,får,fråg,vänd
Topic 4,får,barn,andr,först,går,finn,alltså,även,innebär,dock,fråg,fått,svar,två,tack
Topic 5,komm,få,del,eftersom,finn,innebär,alltså,även,dock,fått,möj,behöv,svar,fråg,genom
Topic 6,fråg,svar,vän,vänd,tack,lawlin,hälsning,hopp,fick,fått,alltså,finn,innebär,dock,även
Topic 7,rätt,enl,genom,möj,dock,alltså,innebär,även,finn,se,får,fall,tack,eftersom,fråg
