# Topic modelling

We start with a classic LDA model as a baseline, and then test the guided LDA approach.

In [23]:
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append("..")
import numpy as np

from sklearn.decomposition import LatentDirichletAllocation as lda
from gensim.test.utils import common_dictionary, common_corpus
from gensim.models import LsiModel

# import pyLDAvis.sklearn
# pyLDAvis.enable_notebook()

from src.dataset import Dataset
from src.vectorizers import TokenTfidfVectorizer

In [3]:
# Open the already-tokenized corpus. The path is a plain string literal: the
# original carried an `f` prefix with no placeholders, which has no effect.
dataset = Dataset(
    dataset_path="",
    save_path="../data/processed/tokenized_processed.json",
)

# Read the "tokens" field of the documents.
# NOTE(review): size=-1 presumably means "load every document" — confirm in src.dataset.
tokens = dataset.load_text_list(field_name="tokens", size=-1)

### Vectorize the documents
The vectorizer is a TF-IDF one; we use its output to fit the LDA model.

In [5]:
# Build the project's TF-IDF vectorizer over the tokenized documents.
dv = TokenTfidfVectorizer(tokens)

# Compute the document vectors once and hand them to the vectorizer's save helper.
# NOTE(review): presumably this persists both the vectors and the fitted
# vectorizer to disk — confirm in src.vectorizers.
vectors = dv.vectors()
dv.save_vectors_vectorizer(vectors)

# Report how many distinct terms the fitted vocabulary holds.
vocabulary_size = len(dv.vectorizer.vocabulary_)
print(f"Vocabulary length: {vocabulary_size}")

Vocabulary length: 71597


## Classic LDA model

In [None]:
# Model hyper-parameters: topic count plus the two Dirichlet priors.
numTopics = 3
# Document-topic prior.
# NOTE(review): looks like the common 50/K heuristic; it yields a large prior
# (~16.7 for K=3), which may explain why the topics recorded below are nearly
# identical — confirm this is intended.
alpha = 50 / numTopics
# Topic-word prior.
beta = 0.1

lda_model = lda(
    n_components=numTopics,
    doc_topic_prior=alpha,
    topic_word_prior=beta,
    random_state=0,  # fixed seed so reruns are reproducible
)

# Fit on the TF-IDF matrix and keep the transformed output for the documents.
lda_output = lda_model.fit_transform(vectors)

### Topics relevant words

In [32]:
# Number of highest-weighted words to show for each topic.
n_top_words = 10

# Column index -> term lookup for the fitted vectorizer.
# scikit-learn deprecated `get_feature_names()` in 1.0 and removed it in 1.2 in
# favour of `get_feature_names_out()`; prefer the new accessor and fall back to
# the old one so the cell runs on either version.
# NOTE(review): assumes dv.vectorizer is a scikit-learn vectorizer — confirm.
if hasattr(dv.vectorizer, "get_feature_names_out"):
    vocab = dv.vectorizer.get_feature_names_out()
else:
    vocab = dv.vectorizer.get_feature_names()

# For every topic, sort the word weights in descending order and keep the
# terms belonging to the first `n_top_words` positions.
topic_words = {
    topic: [vocab[i] for i in np.argsort(comp)[::-1][:n_top_words]]
    for topic, comp in enumerate(lda_model.components_)
}

for topic, words in topic_words.items():
    print(f"Topic: {topic}")
    print(f"  {', '.join(words)}")

Topic: 0
  court, defendant, plaintiff, illinois, state, evidence, judgment, trial, opinion, case
Topic: 1
  court, defendant, illinois, plaintiff, trial, case, appellant, justice, evidence, mr.
Topic: 2
  defendant, court, plaintiff, illinois, opinion, mr., case, trial, deliver, make


## LSI

In [14]:
# Invert the vectorizer's term -> column-index mapping into column-index -> term,
# which is what the LSI model below receives as `id2word`.
reverse_vocabulary = {
    index: term for term, index in dv.vectorizer.vocabulary_.items()
}

In [15]:
# Fit LSI with the same number of topics as the LDA model above.
# NOTE(review): the matrix is transposed before being passed in, presumably
# because `vectors` is documents x terms while gensim expects terms x documents
# for a sparse corpus — confirm.
model = LsiModel(
    vectors.transpose(),
    id2word=reverse_vocabulary,
    num_topics=numTopics,
)

# Topic matrix as returned by the fitted model.
topics = model.get_topics()

In [16]:
# Collect, per topic, the 30 terms returned by the LSI model and print them
# together with their weights rounded to two decimals.
topWords = []
for topicno in range(numTopics):
    # Query the model once per topic and reuse the result for both the
    # printout and the word list (the original called show_topic twice).
    weighted_terms = model.show_topic(topicno, topn=30)
    print('Topic {}'.format(topicno))
    print([(term, round(weight, 2)) for term, weight in weighted_terms], '\n')
    topWords.append([term for term, _weight in weighted_terms])

# Terms that appear in the top-30 list of every topic. `set.intersection(*...)`
# raises TypeError when given no sets, so fall back to an empty set when no
# topics were collected.
print(set.intersection(*map(set, topWords)) if topWords else set())

Topic 0
[('defendant', 0.33), ('court', 0.28), ('plaintiff', 0.21), ('illinois', 0.18), ('trial', 0.14), ('case', 0.12), ('evidence', 0.11), ('state', 0.11), ('judgment', 0.1), ('appellant', 0.1), ('say', 0.1), ('n.e.2d', 0.09), ('jury', 0.09), ('make', 0.09), ('people', 0.08), ('would', 0.08), ('motion', 0.08), ('act', 0.08), ('such', 0.08), ('error', 0.08), ('app', 0.08), ('appellee', 0.07), ('order', 0.07), ('section', 0.07), ('may', 0.07), ('time', 0.07), ('property', 0.07), ('file', 0.07), ('3d', 0.07), ('contract', 0.07)] 

Topic 1
[('mr.', 0.41), ('justice', 0.38), ('opinion', 0.38), ('presiding', 0.36), ('deliver', 0.32), ('court', 0.27), ('publish', 0.26), ('full', 0.16), ('defendant', -0.13), ('mcsurely', 0.12), ('o’connor', 0.11), ('barnes', 0.11), ('matchett', 0.1), ('gridley', 0.09), ('scanlan', 0.06), ('friend', 0.06), ('plaintiff', -0.05), ('thomson', 0.04), ('illinois', -0.04), ('taylor', 0.04), ('wilson', 0.04), ('n.e.2d', -0.04), ('trial', -0.04), ('sullivan', 0.04), 