# Topic modeling using Latent Dirichlet Allocation (LDA)

In [6]:
import numpy as np
from gensim.corpora import Dictionary
from tqdm.notebook import tqdm
import os
import json
import warnings

In [7]:
warnings.filterwarnings('ignore')

## I. Corpora

In [8]:
class JSONCorpus:
    
    def __init__(self, dpath):
        self.dpath = dpath
        self.dictionary = Dictionary(self._gen_documents())
        
    def _gen_documents(self):
        # An auxiliary generator
        for fname in os.listdir(self.dpath):
            with open(os.path.join(self.dpath, fname), 'r', encoding='utf-8') as file:
                tokenized_doc = json.load(file)   
                yield tokenized_doc
        
    def __iter__(self):
        for doc in self._gen_documents():
            yield doc
            
class BoWCorpus:
    
    def __init__(self, corpus, dictionary):
        self.corpus = corpus
        self.dictionary = dictionary
        
    def __iter__(self):
        for doc in self.corpus:
            yield self.dictionary.doc2bow(doc)

In [9]:
DIR_PREPROCESSED_DATA = 'preprocessed_data'

In [10]:
corpus = JSONCorpus(DIR_PREPROCESSED_DATA)

In [11]:
MIN_DF = 5
MAX_DF_RATIO = 0.5

num_orig = len(corpus.dictionary)
corpus.dictionary.filter_extremes(no_below=MIN_DF, no_above=MAX_DF_RATIO, keep_n=None)

print(f'Number of tokens before filtering: {num_orig}')
print(f'Total number of filtered tokens: {num_orig - len(corpus.dictionary)}')
print(f'Number of tokens after filtering: {len(corpus.dictionary)}')

Number of tokens before filtering: 85940
Total number of filtered tokens: 56915
Number of tokens after filtering: 29025


In [12]:
bow_corpus = BoWCorpus(corpus, corpus.dictionary)

## II. LDA model

In [13]:
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt
import gensim

In [9]:
# LDA model parameters
UPDATE_EVERY = 1 # Online learning
NUM_PASSES = 5 # Sufficient - selected during convergence monitoring
NUM_ITERATIONS = 200 # Sufficient - selected during convergence monitoring
CHUNK_SIZE = 2000
RANDOM_STATE = 42
LIST_NUM_TOPICS = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# Coherence model parameters
COH_METRIC = 'c_v'
COH_NUM_PROCESSES = 16

In [None]:
coherence_scores = {}

for num_topics in tqdm(LIST_NUM_TOPICS):
    lda_model = LdaModel(corpus=bow_corpus,
                         id2word=corpus.dictionary, 
                         num_topics=num_topics, 
                         passes=NUM_PASSES, 
                         iterations=NUM_ITERATIONS,
                         chunksize=CHUNK_SIZE, 
                         random_state=RANDOM_STATE, 
                         update_every=UPDATE_EVERY)
    coherence_model = CoherenceModel(lda_model, texts=corpus, dictionary=corpus.dictionary, coherence=COH_METRIC, processes=COH_NUM_PROCESSES)
    coherence_scores[num_topics] = coherence_model.get_coherence()
    print(f'Num topics: {num_topics} | Coherence: {coherence_scores[num_topics]}')

In [10]:
NUM_TOPICS = 30
#NUM_TOPICS = max(coherence_scores, key=coherence_scores.get)
print(f'Recommended number of topics (based on Coherence score): {NUM_TOPICS}')

Recommended number of topics (based on Coherence score): 30


In [None]:
x = list(coherence_scores.keys())
y = list(coherence_scores.values())


plt.figure(figsize=(12, 8))
plt.plot(x, y)
plt.title('Coherence score vs. number of topics')
plt.xlabel('Number of topics')
plt.ylabel('Coherence score')
plt.show()

In [11]:
final_model = LdaModel(corpus=bow_corpus,
                       id2word=corpus.dictionary,  
                       num_topics=NUM_TOPICS,
                       passes=NUM_PASSES,
                       iterations=NUM_ITERATIONS,
                       chunksize=CHUNK_SIZE,
                       random_state=RANDOM_STATE,
                       update_every=UPDATE_EVERY)

In [14]:
final_model = gensim.models.ldamodel.LdaModel.load('models/gensim/lda/lda_30/lda_30.model')

In [15]:
#vypsat témata
for idx, topic in final_model.show_topics(formatted=False, num_words= 10, num_topics=50):
    print('Topic: {} \nWords: {}'.format(idx, '|'.join([w[0] for w in topic])))

Topic: 0 
Words: obviněný|čin|trestný|trestní|jednání|skutek|znak|zákoník|pachatel|podstata
Topic: 1 
Words: společnost|obchodní|člen|ob|družstvo|valný|hromada|společník|jednatel|představenstvo
Topic: 2 
Words: pohledávka|věřitel|dlužník|zástavní|závazek|dluh|smlouva|úkon|postoupení|výše
Topic: 3 
Words: advokát|žalobce|zástupce|poplatek|nedostatek|podání|zastoupení|právnický|vzdělání|osvobození
Topic: 4 
Words: přípustnost|dovolatelka|uveřejněný|praxe|řešení|rozhodovací|procesní|ustálený|číslo|stanovisko
Topic: 5 
Words: pozemek|nemovitost|žalobce|vlastník|vlastnický|vlastnictví|katastrální|předmětný|parc|stavba
Topic: 6 
Words: smlouva|kupní|smluvní|cena|uzavřený|úkon|platný|strana|platnost|dohoda
Topic: 7 
Words: služba|vojenský|pravomoc|mezinárodní|český|článek|republika|bývalý|svoboda|povinnost
Topic: 8 
Words: žalobkyně|částka|nárok|žaloba|výše|zaplacení|prodlení|plnění|úrok|doba
Topic: 9 
Words: stát|majetek|správní|orgán|nárok|vydání|úřad|státní|veřejný|dražba
Topic: 10 
Words:

In [23]:
#vypsat téma dokumentu, který se převede na doc2bow
rozhodnuti_path = "C:/Users/novotte5/Disk Google/PrF MUNI/Dr/textanalysis/caselawanalysis/03_muni_research/preprocessed_data/rozhodnuti-29_Cdo_3050_2015.json"
with open(rozhodnuti_path, 'r', encoding='utf-8') as file:
    rozhodnuti = json.load(file)
    rozhodnuti_bow = bow_corpus.dictionary.doc2bow(rozhodnuti)
    list_topics = final_model.get_document_topics(rozhodnuti_bow)
    #print(list_topics)
    #print(final_model[rozhodnuti_bow]) #jiné způsoby vypsání
    #print(final_model.show_topic(29))
    
#seřadit od nejpravděpodobnějších    
def Sort_Tuple(list_topics, reverse = True):
    list_topics.sort(key = lambda x: x[1])  
    return list_topics
print(Sort_Tuple(list_topics))    

[(7, 0.015429493), (23, 0.017726526), (28, 0.048715953), (24, 0.05144313), (0, 0.056889653), (2, 0.06900414), (22, 0.06902706), (29, 0.0909741), (6, 0.09938011), (16, 0.10015463), (13, 0.10404118), (12, 0.11528685), (26, 0.15590431)]


## III. Model evaluation

In [16]:
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from wordcloud import WordCloud
import pyLDAvis
import pyLDAvis.gensim

In [17]:
pyLDAvis.enable_notebook()

In [18]:
def get_dominant_topic(doc_bow):
    """Returns a list of the most dominant topics based on topic probability, one per each document from the input corpus. key=lambda x: x[1]  
    """
    return sorted(final_model.get_document_topics(doc_bow, minimum_probability=0.0), reverse=True)

def create_df(corpus, num_documents):
    """Creates an auxiliary data frame which holds data for further visualizations.
    """
    words = []
    word_counts = []
    dominant_topics = []
    
    for doc in tqdm(corpus, total=num_documents):
        doc_bow = corpus.dictionary.doc2bow(doc)
        words.append(doc)
        word_counts.append(len(doc))
        dominant_topics.append(get_dominant_topic(doc_bow))
        
    return pd.DataFrame({'words': words, 
                         'word_count': word_counts, 
                         'dominant_topic': dominant_topics})

In [19]:
df_documents = create_df(corpus=corpus, num_documents=corpus.dictionary.num_docs)

HBox(children=(FloatProgress(value=0.0, max=111187.0), HTML(value='')))




In [20]:
df_documents.head(5)

Unnamed: 0,words,word_count,dominant_topic
0,"[den, policejní, orgán, policie, útvar, odhalo...",272,"[(29, 0.11127229), (28, 0.0001544528), (27, 0...."
1,"[vztahující, skutkův, zjištění, skutkový, věta...",18199,"[(29, 0.12763977), (28, 0.02659887), (27, 0.01..."
2,"[návrh, soud, porušení, podmínka, výkon, trest...",18218,"[(29, 0.051871605), (28, 0.008504843), (27, 0...."
3,"[dohoda, směnečný, vyplňovací, prohlášení, spo...",1109,"[(29, 0.052658044), (28, 0.021583317), (27, 0...."
4,"[rozhodnutí, soud, insolvenční, řízení, vedený...",669,"[(29, 0.09750162), (28, 8.6947766e-05), (27, 0..."


In [22]:
df_documents.to_csv('dominant_topics.csv', encoding = 'utf-8')

### Distribution of document word counts

In [None]:
plt.figure(figsize=(14, 7))
sns.distplot(df_documents['word_count'])
plt.title('Distribution of document word counts')
plt.xlabel('Word count')
plt.show()

### Distribution of topic sizes

In [None]:
plt.figure(figsize=(14, 7))
sns.countplot(df_documents['dominant_topic'])
plt.title('Number of documents per topic')
plt.xlabel('Topic ID')
plt.show()

In [None]:
counter = Counter(df_documents['dominant_topic'])

In [None]:
MC_6_TOPICS = [x[0] for x in counter.most_common(6)]
LC_6_TOPICS = [x[0] for x in counter.most_common()[-1:-7:-1]]

### Distributions of word counts across 6 most frequent topics

In [None]:
nrow = 2
ncol = 3

fig, ax = plt.subplots(nrow, ncol, figsize=(23, 10), sharex='row')

for i in range(nrow):
    for j in range(ncol):
        topic_id = MC_6_TOPICS[(i+1) * j]
        ax[i, j].set_title(f'Topic: {topic_id}')
        sns.distplot(df_documents.query(f'dominant_topic == {topic_id}')['word_count'], ax=ax[i, j])
        ax[i, j].set_xlabel('Word count')

### Distributions of word counts across 6 least frequent topics

In [None]:
nrow = 2
ncol = 3

fig, ax = plt.subplots(nrow, ncol, figsize=(23, 10), sharex='row')

for i in range(nrow):
    for j in range(ncol):
        topic_id = LC_6_TOPICS[(i+1) * j]
        ax[i, j].set_title(f'Topic: {topic_id}')
        sns.distplot(df_documents.query(f'dominant_topic == {topic_id}')['word_count'], ax=ax[i, j])
        ax[i, j].set_xlabel('Word count')

### Word clouds for 6 most frequent topics

In [None]:
nrow = 2
ncol = 3
fig, ax = plt.subplots(nrow, ncol, figsize=(25, 10), sharex='row')
aux_id = 0

for i in range(nrow):
    for j in range(ncol):
        topic_id = MC_6_TOPICS[(aux_id)]
        aux_id += 1
        wordcloud = WordCloud(background_color='white', collocations=False, max_words=20).generate_from_frequencies(dict(final_model.show_topic(topic_id, topn=20)))
        ax[i, j].set_title(f'Topic: {topic_id}')
        ax[i, j].imshow(wordcloud)
        ax[i, j].axis('off')

### Word clouds for 6 least frequent topics

In [None]:
nrow = 2
ncol = 3
fig, ax = plt.subplots(nrow, ncol, figsize=(25, 10), sharex='row')
aux_id = 0

for i in range(nrow):
    for j in range(ncol):
        topic_id = LC_6_TOPICS[(aux_id)]
        aux_id += 1
        wordcloud = WordCloud(background_color='white', collocations=False, max_words=20).generate_from_frequencies(dict(final_model.show_topic(topic_id, topn=20)))
        ax[i, j].set_title(f'Topic: {topic_id}')
        ax[i, j].imshow(wordcloud)
        ax[i, j].axis('off')

### Word clouds for all topics

In [None]:
# 30 topics

nrow = 5
ncol = 6
fig, ax = plt.subplots(nrow, ncol, figsize=(60, 40), sharex='row', constrained_layout=True)
aux_id = 0

topic_ids = range(30)

for i in range(nrow):
    for j in range(ncol):
        topic_id = topic_ids[(aux_id)]
        aux_id += 1
        wordcloud = WordCloud(background_color='white', collocations=False, max_words=20).generate_from_frequencies(dict(final_model.show_topic(topic_id, topn=20)))
        ax[i, j].set_title(f'Topic: {topic_id}', fontsize=45)
        ax[i, j].imshow(wordcloud)
        ax[i, j].axis('off')
plt.savefig(f'word_clouds_lda_{NUM_TOPICS}.png') # Save into a file