# Topic modeling notebook

This notebook details the steps taken to clean the data and run topic modeling

In [1]:
import gensim.corpora as corpora
import numpy as np
import pandas as pd
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.stem.snowball import SwedishStemmer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

Import data

In [2]:
df_full = pd.read_csv("../dataset/lawline_data.csv")

Make new column with lowercase texts and remove all whitespace plus tabs/newlines

In [3]:
# Get only questions(::2)/answers(1::2)
df = df_full.iloc[::2, :]

In [4]:
df["text_clean"] = df["text"].str.lower()
df["text_clean"] = [" ".join(str(item).split()) for item in df["text_clean"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text_clean"] = df["text"].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text_clean"] = [" ".join(str(item).split()) for item in df["text_clean"]]


Remove punctuation

In [5]:
df["text_clean"] = df["text_clean"].str.replace("[^\w\s]", "")

  df["text_clean"] = df["text_clean"].str.replace("[^\w\s]", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text_clean"] = df["text_clean"].str.replace("[^\w\s]", "")


Remove numbers and replace more than one whitespace with single whitespace

In [6]:
df["text_clean"] = df["text_clean"].str.replace("[0-9]", "")
df["text_clean"] = df["text_clean"].str.replace("\s{2,}", " ")

  df["text_clean"] = df["text_clean"].str.replace("[0-9]", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text_clean"] = df["text_clean"].str.replace("[0-9]", "")
  df["text_clean"] = df["text_clean"].str.replace("\s{2,}", " ")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text_clean"] = df["text_clean"].str.replace("\s{2,}", " ")


In [7]:
# Removing stopwords function
def remove_stopwords(texts, stop_words):
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_words]
        for doc in texts
    ]

Turn texts to list of words and remove stop words. Then turn texts to term document frequency corpus.

In [8]:
# Turn texts to list
list_texts = df["text_clean"].values.tolist()

In [9]:
stop_words = stopwords.words("swedish")
stop_words.extend([
    "hej", "ska", "in", "vill", "alltså", "lawline", "även",
    "kommer", "fråga", "finns", 'gör', 'får', 'få', 'gälla', 'gäller'
    ])

text_words = remove_stopwords(list_texts, stop_words)

Stem words

In [10]:
stemmer = SwedishStemmer()

text_stemmed = [[stemmer.stem(word) for word in doc] for doc in text_words]

In [11]:
docs_stemmed = [" ".join(doc) for doc in text_stemmed]

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(min_df=0.2, max_df=0.8,
                     stop_words=stopwords.words("swedish"))

trunc_texts = cv.fit_transform(docs_stemmed)

Train LDA model on corpus with differing number of topics

In [13]:
# n_components = [3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 20]
n_components = [3, 6, 10, 20]

grid = {
    'doc_topic_prior': [.1, .5, 1, 5, 10],
    'topic_word_prior': [.1, .5, 1, 5, 10],
    'learning_decay': [.5, .7, 1]
}

lda_list = []
for n_topics in n_components:

    mod = LDA(
        max_iter=10,
        learning_method="online",
        random_state=42,
        batch_size=1024,
        evaluate_every=-1,
        n_components=n_topics,
    )

    clf = GridSearchCV(mod, param_grid=grid, n_jobs=4, verbose=1)

    clf.fit(trunc_texts)

    lda_list.append(clf)

Fitting 5 folds for each of 75 candidates, totalling 375 fits
Fitting 5 folds for each of 75 candidates, totalling 375 fits
Fitting 5 folds for each of 75 candidates, totalling 375 fits
Fitting 5 folds for each of 75 candidates, totalling 375 fits


In [26]:
model = lda_list[2].best_estimator_

In [27]:
model.fit(trunc_texts)

In [None]:
perp_list = [mod.best_estimator_.fit(trunc_texts).perplexity(trunc_texts) for mod in lda_list]

In [None]:
perp_list

In [None]:
model = lda_list[1]

In [35]:
model.fit_transform(trunc_texts)

array([[0.53279297, 0.22178771, 0.24541932],
       [0.32000544, 0.31194701, 0.36804755],
       [0.09642113, 0.64713961, 0.25643926],
       ...,
       [0.22011287, 0.71418488, 0.06570225],
       [0.33333333, 0.33333333, 0.33333333],
       [0.1666755 , 0.66656293, 0.16676158]])

In [28]:
topicnames = ["Topic" + str(i) for i in range(model.n_components)]
df_topic_keywords = pd.DataFrame(model.components_)
df_topic_keywords.index = topicnames

In [29]:
df_topic_keywords

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
Topic0,0.100019,0.100015,104.236758,12.726619,19.940615,1.943137,0.100018,0.105222,0.100015,0.10002,...,23196.189178,0.100011,0.100015,0.100045,1.397951,156.277246,33565.346451,0.100015,72.403448,33803.009452
Topic1,34.186603,0.100016,61.099744,14.085373,188.368019,27.463599,0.100014,1.271485,0.100014,0.100017,...,0.101685,0.100009,69098.553843,0.100114,0.101828,398.795511,0.100013,0.100014,973.367142,0.100011
Topic2,5614.953713,0.100014,1786.826582,86.480079,27.874178,38430.969475,0.100014,0.101899,0.100015,0.100017,...,4.903953,36403.127333,0.100013,4.367957,51.85748,3411.97379,0.100013,0.100015,3895.642625,0.100013
Topic3,0.100017,0.100013,0.100034,5.237498,0.100016,0.100012,18962.058194,0.100106,0.100012,0.100017,...,0.100012,0.100009,0.100012,42441.261124,0.100016,14072.341981,0.100012,0.100012,12174.533678,0.100011
Topic4,0.100017,0.100013,0.146352,0.158322,0.100031,0.100124,0.100012,0.100032,0.100013,24191.28123,...,0.100012,0.100009,0.100012,0.264527,0.100015,19106.611366,0.100012,25927.791093,18539.523307,0.100011
Topic5,157.27937,40277.197819,32576.939362,26579.872937,30.453648,0.402921,0.100014,2.831043,0.100014,0.100018,...,1.866486,0.100009,0.100014,0.100017,0.100021,68.783417,0.100012,0.100014,54.132683,0.100011
Topic6,2405.74383,0.100014,183.903821,1.203633,49716.573335,5.650129,0.100013,0.100223,0.100014,0.100015,...,2.502446,0.10001,0.100013,0.551088,0.100015,885.574308,0.100011,0.100014,3855.657273,0.100011
Topic7,0.177818,0.100014,27.577439,9.248042,0.130667,0.977154,0.100014,0.127325,0.100014,0.100018,...,0.156108,0.100011,0.100014,0.100016,0.100021,90.631398,0.100014,0.100014,210.47152,0.100012
Topic8,236.217996,0.100017,26.223709,13.432666,0.207886,0.145873,0.100016,45901.309441,50431.697687,0.10003,...,0.100221,0.100011,0.100016,0.122565,5.285947,279.256508,0.100014,0.100014,396.317876,0.100013
Topic9,26007.967317,0.100013,29.138674,0.319391,131.597994,0.103249,0.100013,0.100013,0.100014,0.100016,...,0.100013,0.10001,0.100013,0.100015,27834.573886,42.041469,0.100014,0.100014,30.265686,0.100011


In [30]:
def show_topics(vectorizer, lda_model, n_words=20):
    keywords = np.array(vectorizer.get_feature_names_out())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords


topic_keywords = show_topics(vectorizer=cv, lda_model=model, n_words=15)

In [31]:
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = [
    "Word " + str(i) for i in range(df_topic_keywords.shape[1])
]
df_topic_keywords.index = ["Topic " + str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,år,tid,möj,tack,dock,vänd,gäll,enl,eftersom,fall,säg,få,måst,svar,fått
Topic 1,rätt,innebär,vänd,tack,enl,dock,andr,fall,gäll,eftersom,få,hopp,säg,möj,genom
Topic 2,fall,person,andr,vänd,tack,dock,eftersom,säg,innebär,enl,måst,hopp,möj,svar,genom
Topic 3,svar,hopp,fick,tack,vänd,eftersom,få,gäll,dock,fått,andr,enl,säg,gör,genom
Topic 4,vän,fått,hälsning,tack,vänd,hopp,svar,eftersom,dock,fall,gäll,få,enl,andr,säg
Topic 5,del,dock,genom,eftersom,andr,tack,vänd,enl,få,möj,innebär,fall,hopp,gäll,säg
Topic 6,kap,enl,vänd,andr,tack,dock,genom,gäll,hopp,innebär,fall,möj,eftersom,svar,måst
Topic 7,gör,måst,kräv,vänd,tack,dock,eftersom,fall,genom,andr,möj,enl,få,gäll,säg
Topic 8,får,få,vänd,tack,andr,dock,eftersom,säg,måst,gäll,gör,enl,fall,svar,innebär
Topic 9,gäll,lag,säg,andr,enl,tack,vänd,dock,eftersom,måst,fall,fått,svar,gör,hopp
