# Topic modeling notebook

This notebook details the steps taken to clean the data and run topic modeling

In [20]:
import gensim.corpora as corpora
import numpy as np
import pandas as pd
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.stem.snowball import SwedishStemmer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

Import data

In [21]:
df_full = pd.read_csv("../dataset/lawline_data.csv")

Make new column with lowercase texts and remove all whitespace plus tabs/newlines

In [22]:
# Get only questions/answers
df = df_full.iloc[::2, :]

In [23]:
df["text_clean"] = df["text"].str.lower()
df["text_clean"] = [" ".join(str(item).split()) for item in df["text_clean"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text_clean"] = df["text"].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text_clean"] = [" ".join(str(item).split()) for item in df["text_clean"]]


Remove punctuation

In [24]:
df["text_clean"] = df["text_clean"].str.replace("[^\w\s]", "")

  df["text_clean"] = df["text_clean"].str.replace("[^\w\s]", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text_clean"] = df["text_clean"].str.replace("[^\w\s]", "")


Remove numbers and replace more than one whitespace with single whitespace

In [25]:
df["text_clean"] = df["text_clean"].str.replace("[0-9]", "")
df["text_clean"] = df["text_clean"].str.replace("\s{2,}", " ")

  df["text_clean"] = df["text_clean"].str.replace("[0-9]", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text_clean"] = df["text_clean"].str.replace("[0-9]", "")
  df["text_clean"] = df["text_clean"].str.replace("\s{2,}", " ")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text_clean"] = df["text_clean"].str.replace("\s{2,}", " ")


In [26]:
# Removing stopwords function
def remove_stopwords(texts, stop_words):
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_words]
        for doc in texts
    ]

Turn texts to list of words and remove stop words. Then turn texts to term document frequency corpus.

In [27]:
# Turn texts to list
list_texts = df["text_clean"].values.tolist()

In [28]:
stop_words = stopwords.words("swedish")
stop_words.extend([
    "hej", "ska", "in", "vill", "alltså", "lawline", "även",
    "kommer", "fråga", "finns"
    ])

text_words = remove_stopwords(list_texts, stop_words)

Stem words

In [29]:
stemmer = SwedishStemmer()

text_stemmed = [[stemmer.stem(word) for word in doc] for doc in text_words]

In [30]:
docs_stemmed = [" ".join(doc) for doc in text_stemmed]

In [31]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(min_df=0.2, max_df=0.8,
                     stop_words=stopwords.words("swedish"))

trunc_texts = cv.fit_transform(docs_stemmed)

Train LDA model on corpus with differing number of topics

In [32]:
# First take Grid Search approach
grid = {
    #'doc_topic_prior': [.05, .1, .5, 1, 5, 10],
    #'topic_word_prior': [.05, .1, .5, 1, 5, 10],
    "n_components": [3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 20],
    'learning_decay': [.5, .7, 1]
}

mod = LDA(
    max_iter=10,
    learning_method="online",
    random_state=42,
    batch_size=128,
    evaluate_every=-1,
)

# clf = RandomizedSearchCV(mod, param_distributions=grid, n_jobs=-1,
#                          n_iter=10, verbose=1)

clf = GridSearchCV(mod, param_grid=grid, n_jobs=-1, verbose=1)

clf.fit(trunc_texts)

Fitting 5 folds for each of 33 candidates, totalling 165 fits


In [34]:
model = clf.best_estimator_

In [12]:
lda_list = []
num_topics = [3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 20]

for num_topic in num_topics:
    mod = LDA(
        n_components=num_topic,
        max_iter=10,
        learning_method="online",
        random_state=42,
        batch_size=128,
        evaluate_every=-1,
        n_jobs=-1,
    )

    mod.fit(trunc_texts)

    lda_list.append(mod)

KeyboardInterrupt: 

In [None]:
perp_list = [mod.perplexity(trunc_texts) for mod in lda_list]

In [None]:
perp_list

In [None]:
model = lda_list[1]

In [35]:
model.fit_transform(trunc_texts)

array([[0.53279297, 0.22178771, 0.24541932],
       [0.32000544, 0.31194701, 0.36804755],
       [0.09642113, 0.64713961, 0.25643926],
       ...,
       [0.22011287, 0.71418488, 0.06570225],
       [0.33333333, 0.33333333, 0.33333333],
       [0.1666755 , 0.66656293, 0.16676158]])

In [36]:
topicnames = ["Topic" + str(i) for i in range(model.n_components)]
df_topic_keywords = pd.DataFrame(model.components_)
df_topic_keywords.index = topicnames

In [37]:
df_topic_keywords

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
Topic0,3221.641275,13.90055,252.072197,4561.217738,0.689225,2313.429881,7755.278838,50925.19452,22438.780692,0.503091,...,12138.675401,0.372382,149.696333,5574.545398,13910.776112,943.56491,38311.523138,0.390434,514.747026,35790.979774
Topic1,31520.905197,46902.174793,37671.072798,20616.913519,59121.459016,44313.344518,1192.785688,3322.214359,27226.062268,6.978639,...,16199.395535,0.377844,83087.187066,13733.858202,15621.449898,18898.726067,33.30602,0.427644,24638.608282,0.365689
Topic2,7122.242666,0.387045,5205.421969,7255.800658,0.60576,1160.768565,12867.949549,81.000347,10877.195454,28156.206564,...,84.356646,44424.899357,1.401618,33332.16514,2621.400201,26354.500437,0.372112,32224.182522,28034.024374,0.365734


In [38]:
def show_topics(vectorizer, lda_model, n_words=20):
    keywords = np.array(vectorizer.get_feature_names_out())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords


topic_keywords = show_topics(vectorizer=cv, lda_model=model, n_words=15)

In [39]:
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = [
    "Word " + str(i) for i in range(df_topic_keywords.shape[1])
]
df_topic_keywords.index = ["Topic " + str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,få,tid,gör,år,får,måst,säg,möj,fick,svar,eftersom,kräv,andr,fall,tack
Topic 1,kap,rätt,enl,gäll,del,fall,innebär,lag,dock,genom,andr,kräv,får,vänd,måst
Topic 2,person,svar,vän,hälsning,fått,vänd,tack,gör,hopp,fick,får,eftersom,andr,dock,säg
