# Topic modeling notebook

This notebook details the steps taken to clean the data and run topic modeling

In [6]:
import pandas as pd
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
from nltk.stem.snowball import SwedishStemmer
import gensim.corpora as corpora
from sklearn.decomposition import LatentDirichletAllocation as LDA
import numpy as np

Import data

In [7]:
df = pd.read_csv('../dataset/lawline_data.csv')

Make new column with lowercase texts and remove all whitespace plus tabs/newlines

In [8]:
df['text_clean'] = df['text'].str.lower()
df['text_clean'] = [' '.join(str(item).split()) for item in df['text_clean']]

Remove punctuation

In [9]:
df['text_clean'] = df['text_clean'].str.replace('[^\w\s]', '')

  df['text_clean'] = df['text_clean'].str.replace('[^\w\s]', '')


Remove numbers and replace more than one whitespace with single whitespace

In [10]:
df['text_clean'] = df['text_clean'].str.replace('[0-9]', '')
df['text_clean'] = df['text_clean'].str.replace('\s{2,}', ' ')

  df['text_clean'] = df['text_clean'].str.replace('[0-9]', '')
  df['text_clean'] = df['text_clean'].str.replace('\s{2,}', ' ')


In [11]:
# Removing stopwords function
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc))
             if word not in stopwords.words('swedish')] for doc in texts]

Turn texts to list of words and remove stop words. Then turn texts to term document frequency corpus.

In [12]:
# Turn texts to list
list_texts = df['text_clean'].values.tolist()

In [13]:
# Almost 1h runtime, keep in mind FUTURE SAM
text_words = remove_stopwords(list_texts)

Stem words

In [14]:
stemmer = SwedishStemmer()

text_stemmed = [[stemmer.stem(word) for word in doc] for doc in text_words]

In [15]:
docs_stemmed = [' '.join(doc) for doc in text_stemmed]

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(min_df=0.15,
                     max_df=0.85,
                     stop_words=stopwords.words('swedish')) 

trunc_texts = cv.fit_transform(list_texts)

In [17]:
# Create Dictionary
id2word = corpora.Dictionary(text_words)

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in text_words]

Train LDA model on corpus with differing number of topics

In [18]:
lda_list = []
num_topics = [3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 20, 25, 30, 40]

# num_topics = [3, 5]

for num_topic in num_topics:
    mod = LDA(
    n_components=num_topic,
    max_iter=10,
    learning_method='online',
    random_state=42,
    batch_size=128,
    evaluate_every = -1,
    n_jobs = -1,
    )

    lda_output = mod.fit_transform(trunc_texts)

    lda_list.append(lda_output)

In [19]:
lda_output = lda_list[8].transform(trunc_texts)

AttributeError: 'numpy.ndarray' object has no attribute 'transform'

In [1]:
topicnames = ["Topic" + str(i) for i in range(lda_list[8].n_components)]
df_topic_keywords = pd.DataFrame(lda_list[8].components_)
df_topic_keywords.index = topicnames

NameError: name 'lda_list' is not defined

In [129]:
df_topic_keywords

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,64
Topic0,5871.816794,13542.107684,220.594619,19.643467,0.20256,2713.866676,64624.916144,4.468873,0.201876,0.203849,...,3265.936858,0.204972,0.203498,10161.768328,11699.246787,0.202985,0.202952,0.204174,4648.204633,0.202069
Topic1,708.454248,12.386855,3116.771145,2154.724238,0.202525,5644.484185,0.20232,0.203862,0.203026,0.203675,...,0.204499,4109.170175,7190.209509,21785.947751,4201.598161,0.202438,0.203081,0.203212,5857.835979,69071.164685
Topic2,38713.459305,35898.068099,21283.668229,25749.805481,32136.265248,4555.406223,0.201502,11065.217986,0.2032,24971.19459,...,11073.5194,32901.269576,19757.106105,9308.367518,11375.06958,31998.10761,0.203442,18151.01624,46047.400258,0.202043
Topic3,2844.037298,6745.410717,4517.462624,2838.700465,0.202505,11895.662489,0.201756,12261.841313,36801.035477,2858.729635,...,24097.68102,3028.596682,5257.5533,0.211401,47532.637815,0.203085,0.202873,0.203091,12850.682085,0.202142
Topic4,18937.636358,3954.529264,3065.453865,3986.311807,0.203946,661.583632,0.202257,2770.731396,0.202631,0.204175,...,2862.152624,22925.617532,0.203366,0.204129,115.516448,20380.07298,25108.848906,3229.011275,6572.790542,0.201624


In [130]:

def show_topics(vectorizer, lda_model, n_words=20):
    keywords = np.array(vectorizer.get_feature_names_out())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

topic_keywords = show_topics(vectorizer=cv, lda_model=lda_list[1], n_words=15)

In [131]:
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,rätt,barn,få,del,får,finns,andra,går,kommer,vill,två,första,alltså,ska,eftersom
Topic 1,år,hej,två,gäller,fick,står,fråga,fått,finns,innan,får,tid,svar,samt,även
Topic 2,ska,enligt,kap,fall,finns,fråga,dock,se,gäller,även,får,måste,alltså,innebär,andra
Topic 3,ska,vill,in,betala,göra,kommer,få,hej,ta,får,måste,hos,gå,går,gör
Topic 4,kap,egendom,ska,fråga,vänlig,hälsning,kommer,lawline,hej,tack,enligt,genom,vänder,svar,innebär


In [105]:
topicnames

['Topic0', 'Topic1', 'Topic2', 'Topic3']

In [73]:
lda_list = []
num_topics = [3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 20, 25, 30, 40]

for num_topic in num_topics:
    mod = LdaMulticore(corpus=corpus, id2word=id2word, num_topics=num_topic)
    lda_list.append(mod)

In [75]:
mod_viz = lda_list[7]

topics = mod_viz.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.018*"ska" + 0.013*"kap" + 0.008*"finns" + 0.008*"fråga" + 0.007*"rätt" + 0.006*"kommer" + 0.006*"vänder" + 0.006*"få" + 0.005*"även" + 0.005*"tack"')
(1, '0.018*"ska" + 0.011*"kap" + 0.009*"fråga" + 0.008*"finns" + 0.007*"rätt" + 0.006*"får" + 0.006*"hej" + 0.006*"svar" + 0.005*"år" + 0.005*"enligt"')
(2, '0.017*"ska" + 0.010*"kap" + 0.009*"kommer" + 0.009*"fråga" + 0.009*"rätt" + 0.008*"egendom" + 0.008*"finns" + 0.008*"får" + 0.008*"få" + 0.007*"barn"')
(3, '0.015*"ska" + 0.009*"får" + 0.008*"kap" + 0.008*"rätt" + 0.007*"fråga" + 0.006*"finns" + 0.005*"hej" + 0.005*"lawline" + 0.005*"innebär" + 0.005*"fel"')
(4, '0.012*"ska" + 0.010*"kap" + 0.008*"finns" + 0.007*"får" + 0.007*"fråga" + 0.006*"rätt" + 0.005*"kommer" + 0.005*"lawline" + 0.005*"hej" + 0.005*"även"')
(5, '0.017*"ska" + 0.009*"kap" + 0.008*"fråga" + 0.007*"rätt" + 0.006*"finns" + 0.006*"lawline" + 0.006*"vill" + 0.005*"enligt" + 0.005*"fall" + 0.005*"avtalet"')
(6, '0.010*"ska" + 0.008*"kommer" + 0.008*"fråga" + 0.