# Topic modeling notebook

This notebook details the steps taken to clean the data and run topic modeling

In [45]:
import gensim.corpora as corpora
import numpy as np
import pandas as pd
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.stem.snowball import SwedishStemmer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

Import data

In [46]:
df_full = pd.read_csv("../dataset/lawline_data.csv")

Make new column with lowercase texts and remove all whitespace plus tabs/newlines

In [47]:
# Get only questions(::2)/answers(1::2)
df = df_full.iloc[::2, :]

In [48]:
df["text_clean"] = df["text"].str.lower()
df["text_clean"] = [" ".join(str(item).split()) for item in df["text_clean"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text_clean"] = df["text"].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text_clean"] = [" ".join(str(item).split()) for item in df["text_clean"]]


Remove punctuation

In [49]:
df["text_clean"] = df["text_clean"].str.replace("[^\w\s]", "")

  df["text_clean"] = df["text_clean"].str.replace("[^\w\s]", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text_clean"] = df["text_clean"].str.replace("[^\w\s]", "")


Remove numbers and replace more than one whitespace with single whitespace

In [50]:
df["text_clean"] = df["text_clean"].str.replace("[0-9]", "")
df["text_clean"] = df["text_clean"].str.replace("\s{2,}", " ")

  df["text_clean"] = df["text_clean"].str.replace("[0-9]", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text_clean"] = df["text_clean"].str.replace("[0-9]", "")
  df["text_clean"] = df["text_clean"].str.replace("\s{2,}", " ")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text_clean"] = df["text_clean"].str.replace("\s{2,}", " ")


In [51]:
# Removing stopwords function
def remove_stopwords(texts, stop_words):
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_words]
        for doc in texts
    ]

Turn texts to list of words and remove stop words. Then turn texts to term document frequency corpus.

In [52]:
# Turn texts to list
list_texts = df["text_clean"].values.tolist()

In [126]:
stop_words = stopwords.words("swedish")
stop_words.extend([
    "hej", "ska", "in", "vill", "alltså", "lawline", "även",
    "kommer", "fråga", "finns", 'gör', 'får', 'få', 'gäll', 'tack',
    'dock', 'kap'
    ])

text_words = remove_stopwords(list_texts, stop_words)

Stem words

In [96]:
stemmer = SwedishStemmer()

text_stemmed = [[stemmer.stem(word) for word in doc] for doc in text_words]

In [127]:
docs_stemmed = [" ".join(doc) for doc in text_stemmed]

In [128]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(min_df=0.05, max_df=0.7,
                     stop_words=stop_words)

trunc_texts = cv.fit_transform(docs_stemmed)

Train LDA model on corpus with differing number of topics

In [129]:
# n_components = [3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 20]
n_components = [3, 4, 5, 6]

grid = {
    'doc_topic_prior': [.1, .5, 1, 5, 10],
    'topic_word_prior': [.1, .5, 1, 5, 10],
    'learning_decay': [.5, .7, 1]
}

lda_list = []
for n_topics in n_components:

    mod = LDA(
        max_iter=10,
        learning_method="online",
        random_state=42,
        batch_size=128,
        evaluate_every=-1,
        n_components=n_topics,
    )

    clf = GridSearchCV(mod, param_grid=grid, n_jobs=-1, verbose=1)

    clf.fit(trunc_texts)

    lda_list.append(clf)

Fitting 5 folds for each of 75 candidates, totalling 375 fits
Fitting 5 folds for each of 75 candidates, totalling 375 fits


In [122]:
model = lda_list[4].best_estimator_

In [123]:
model.fit(trunc_texts)

In [None]:
perp_list = [mod.best_estimator_.fit(trunc_texts).perplexity(trunc_texts) for mod in lda_list]

In [None]:
perp_list

In [None]:
model = lda_list[1]

In [35]:
model.fit_transform(trunc_texts)

array([[0.53279297, 0.22178771, 0.24541932],
       [0.32000544, 0.31194701, 0.36804755],
       [0.09642113, 0.64713961, 0.25643926],
       ...,
       [0.22011287, 0.71418488, 0.06570225],
       [0.33333333, 0.33333333, 0.33333333],
       [0.1666755 , 0.66656293, 0.16676158]])

In [114]:
topicnames = ["Topic" + str(i) for i in range(model.n_components)]
df_topic_keywords = pd.DataFrame(model.components_)
df_topic_keywords.index = topicnames

In [109]:
df_topic_keywords

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,261,262,263,264,265,266,267,268,269,270
Topic0,974.502029,2.74628,357.513824,735.799788,1014.209603,851.369281,2591.168847,1600.11336,753.179773,4785.990668,...,959.177653,1817.220477,785.68947,1.024256,1495.902653,1662.054941,1.07214,1.039218,3004.114444,724.660161
Topic1,660.930155,1.577175,1013.919953,5577.830733,30.584184,35.546267,51.941883,1396.325853,21.865761,12.259241,...,621.744963,10.356907,28.466301,1.087586,2640.487589,894.097232,239.676559,1.105247,17024.501757,159.023412
Topic2,1720.6696,9525.406639,5166.061231,25872.88432,5916.331441,6085.9453,14639.206932,15173.879441,4975.699913,20328.993597,...,2271.46792,1147.643647,4729.134523,1.062531,688.329004,3148.645345,1.039709,1.101836,15319.971674,4625.443772
Topic3,4504.077371,230.72813,2635.686287,9681.274296,1581.65153,614.515467,3774.008613,2385.46576,1301.985113,1975.482077,...,1760.220742,5946.673809,1201.536211,16942.225886,4649.365067,932.567511,20811.486421,12403.438164,446.123321,1766.062687


In [115]:
def show_topics(vectorizer, lda_model, n_words=20):
    keywords = np.array(vectorizer.get_feature_names_out())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

In [124]:
topic_keywords = show_topics(vectorizer=cv, lda_model=model, n_words=15)

In [125]:
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = [
    "Word " + str(i) for i in range(df_topic_keywords.shape[1])
]
df_topic_keywords.index = ["Topic " + str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,sälj,köp,bil,fel,köpt,huset,kräv,rätt,kunn,ans,bord,fall,enl,eftersom,skick
Topic 1,fast,skuld,egendom,gåv,kap,kr,betal,del,tillgång,skall,genom,innebär,äger,räkn,värd
Topic 2,betal,år,lägen,tid,peng,sambo,tillbak,säg,rätt,flytt,fått,fick,dag,månad,sen
Topic 3,person,kap,brott,fall,uppgift,brottet,polis,enl,vänd,svar,domstol,döm,andr,år,ans
Topic 4,barnet,barn,sver,föräldr,kap,beslut,rätt,bäst,svensk,domstol,båd,bor,andr,hos,vänd
Topic 5,mak,egendom,bodelning,barn,kap,rätt,del,testament,arv,ärv,enskild,äb,arvet,ärvdabalk,enl
Topic 6,rätt,lag,avtal,enl,vänd,arbetsgiv,fall,svar,kap,avtalet,innebär,måst,kräv,säg,vän
