# About

## Author

    Thiago Luis Rodrigues Pinho @AILAB 

## Start Date

    30/04/2020

## Objectives

    Fit an LDA (Latent Dirichlet Allocation) topic model on text documents and
    then evaluate the similarity between documents using the Jensen-Shannon distance

## Summary


# [ALEI] IAgrupador LDA with Jensen-Shannon distance

# About

## Author

    Thiago Luis Rodrigues Pinho @AILAB 

## Start Date

    30/04/2020

## Objectives

    Fit an LDA (Latent Dirichlet Allocation) topic model on text documents and
    then evaluate the similarity between documents using the Jensen-Shannon distance

## Summary


# [ALEI] IAgrupador LDA with Jensen-Shannon distance

In [19]:
from time import time

import gensim
from gensim import corpora, models
from fastparquet import write
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns

%matplotlib inline
# Select matplotlib's 'fivethirtyeight' style sheet for the figures in this notebook.
plt.style.use('fivethirtyeight')
# NOTE(review): sns.set() installs seaborn's default theme, which also rewrites
# matplotlib rcParams and may override the style chosen on the previous line —
# confirm that this call order is intended.
sns.set()

## 1. Loading data

In [4]:
%%time 
# Folder (relative to the notebook's working directory) and base name of the
# preprocessed dataset.
RELATIVE_FOLDER_PATH = "./data/"
# NOTE(review): not referenced by any cell visible in this part of the
# notebook — presumably a spaCy model name; confirm it is still needed.
VECTOR_MODEL_NAME = "pt_core_news_sm"

filename = "data_preprocessed"

parquet_filename = RELATIVE_FOLDER_PATH + filename + ".parquet.gzip"
# Read only the two columns the rest of the notebook uses.
ailab_df = pd.read_parquet(parquet_filename, columns=['text', 'process_id'])
# DataFrame.info() writes its report to stdout and returns None, so wrapping it
# in print() appended a stray "None" line to the cell output.
ailab_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2036 entries, 0 to 2035
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   text        2036 non-null   object
 1   process_id  2036 non-null   object
dtypes: object(2)
memory usage: 31.9+ KB
None
CPU times: user 3.17 s, sys: 147 ms, total: 3.31 s
Wall time: 3.93 s


## 2. Treating Data

In [5]:
num_samples = 100
# Fixed seed so the same rows are drawn on every Restart & Run All; without it
# the sample (and every result derived from it) changes between runs.
RANDOM_SEED = 42

# Draw num_samples random rows, then keep a single row per process_id. The
# result can hold fewer than num_samples rows when the draw repeats an id.
sample_df = (
    ailab_df
    .sample(n=num_samples, random_state=RANDOM_SEED)
    .drop_duplicates(subset='process_id')
)
# DataFrame.info() prints its own report and returns None; calling it directly
# avoids the stray "None" that print(...) added to the output.
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 87 entries, 2027 to 334
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   text        87 non-null     object
 1   process_id  87 non-null     object
dtypes: object(2)
memory usage: 2.0+ KB
None


In [8]:
# Split every sampled document on whitespace, giving one token list per document.
tokenized_docs = list(map(lambda text: text.split(), sample_df['text'].to_list()))

[['flavio',
  'roberto',
  'ferreira',
  'lima',
  'juiz',
  'federal',
  'relatoria',
  'entrou',
  'vigor',
  'ST_LEI_978499',
  'ST_ARTIGO_54',
  'ficou',
  'induvidoso',
  'revisao',
  'atos',
  'administrativos',
  'estariam',
  'limitados',
  'prazo',
  '5cinco',
  'anos',
  'salvo',
  'comprovada',
  'mafe',
  'certo',
  'stfmandado',
  'seguranca',
  '25409df',
  'decidiu',
  'administrativo',
  'concessao',
  'aposentadoria',
  'uniao',
  'completa',
  'registro',
  'tratar',
  'complexo',
  'sentido',
  'expediu',
  'sumula',
  'seguinte',
  'teor',
  'atos',
  'aposentadoria',
  'reforma',
  'pensao',
  'natureza',
  'juridica',
  'atos',
  'complexos',
  'razao',
  'prazos',
  'decadenciais',
  'referem',
  'ST_ARTIGO_260',
  'regimento',
  'interno',
  'ST_ARTIGO_54',
  'ST_LEI_978499',
  'comecam',
  'fluir',
  'aperfeicoam',
  'decisao',
  'considera',
  'legais',
  'ilegais',
  'respectivamente',
  'vejo',
  'exercicio',
  'ST_ARTIGO_71',
  'fiscalizar',
  'administrati

## 3. Generating LDA Component Analysis

In [9]:
# Build the vocabulary of the corpus: every distinct token gets an integer id.
dictionary = corpora.Dictionary(tokenized_docs)
# Prune the vocabulary: discard tokens that occur in fewer than 15 documents or
# in more than half of them, then cap the vocabulary at 100000 tokens.
dictionary.filter_extremes(keep_n=100000, no_above=0.5, no_below=15)
# Encode each tokenized document as a bag-of-words vector over that vocabulary;
# together these vectors form the document-term matrix.
doc_term_matrix = list(map(dictionary.doc2bow, tokenized_docs))

In [12]:
# Re-weight the bag-of-words counts with TF-IDF before fitting the topic model.
tfidf = models.TfidfModel(doc_term_matrix)
corpus_tfidf = tfidf[doc_term_matrix]

# LDA training is stochastic; seeding it keeps the learned topics comparable
# between runs. NOTE(review): with several workers the training may still show
# small run-to-run differences — confirm if exact reproducibility is required.
LDA_RANDOM_STATE = 42
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=4,
    random_state=LDA_RANDOM_STATE,
)
# Show the top words of every topic (-1 requests all topics).
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.001*"bento" + 0.000*"aliquotas" + 0.000*"expedidaicertificada" + 0.000*"portaria" + 0.000*"artur" + 0.000*"fies" + 0.000*"fechado" + 0.000*"erechim" + 0.000*"paulo" + 0.000*"status"
Topic: 1 Word: 0.001*"fcvs" + 0.001*"vargas" + 0.000*"eletrica" + 0.000*"energia" + 0.000*"gdaj" + 0.000*"uniao" + 0.000*"execucao" + 0.000*"receitas" + 0.000*"cujus" + 0.000*"heranca"
Topic: 2 Word: 0.001*"capital" + 0.001*"adicional" + 0.000*"abono" + 0.000*"gratificacao" + 0.000*"andresa" + 0.000*"santa" + 0.000*"catarina" + 0.000*"bernardo" + 0.000*"httpesaj" + 0.000*"remuneracao"
Topic: 3 Word: 0.000*"uniao" + 0.000*"postal" + 0.000*"informado" + 0.000*"receita" + 0.000*"teto" + 0.000*"pensao" + 0.000*"carnes" + 0.000*"icms" + 0.000*"estavel" + 0.000*"prescricao"
Topic: 4 Word: 0.003*"evento" + 0.002*"usuario" + 0.002*"gerada" + 0.001*"separacao" + 0.001*"automaticamente" + 0.001*"pagina" + 0.001*"refer" + 0.001*"foiram" + 0.001*"assinados" + 0.001*"juntados"
Topic: 5 Word: 0.001*"faiz

In [18]:
# NOTE(review): the recorded output gives every topic the same score for this
# sentence, which suggests none of its tokens are in the dictionary — consider
# testing with a document written and preprocessed like the training corpus.
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(unseen_document.split())
# The model was fitted on TF-IDF-weighted vectors, so the query must receive
# the same weighting; feeding it raw counts puts it on a different scale from
# the training data.
tfidf_vector = tfidf[bow_vector]
# Print the score of each topic for this document, with the topic's top 5 words.
for index, score in lda_model_tfidf[tfidf_vector]:
    print("Score: {}\t Topic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))

Score: 0.10000000149011612	 Topic: 0.001*"bento" + 0.000*"aliquotas" + 0.000*"expedidaicertificada" + 0.000*"portaria" + 0.000*"artur"
Score: 0.10000000149011612	 Topic: 0.001*"fcvs" + 0.001*"vargas" + 0.000*"eletrica" + 0.000*"energia" + 0.000*"gdaj"
Score: 0.10000000149011612	 Topic: 0.001*"capital" + 0.001*"adicional" + 0.000*"abono" + 0.000*"gratificacao" + 0.000*"andresa"
Score: 0.10000000149011612	 Topic: 0.000*"uniao" + 0.000*"postal" + 0.000*"informado" + 0.000*"receita" + 0.000*"teto"
Score: 0.10000000149011612	 Topic: 0.003*"evento" + 0.002*"usuario" + 0.002*"gerada" + 0.001*"separacao" + 0.001*"automaticamente"
Score: 0.10000000149011612	 Topic: 0.001*"faiz" + 0.001*"remessa" + 0.001*"moura" + 0.001*"transmissao" + 0.001*"externa"
Score: 0.10000000149011612	 Topic: 0.000*"aposentadoria" + 0.000*"fonseca" + 0.000*"importacao" + 0.000*"cofins" + 0.000*"invalidez"
Score: 0.10000000149011612	 Topic: 0.001*"evento" + 0.001*"usuario" + 0.000*"pagina" + 0.000*"secjf" + 0.000*"defen