In [23]:
import gensim
import pandas as pd
import spacy
import nltk
import pyLDAvis
import pyLDAvis.gensim
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords

In [24]:
nlp = spacy.load('pt_core_news_sm')

nltk.download('stopwords')
stop_words = stopwords.words('portuguese')  

stop_words.extend(['ir', 'aqui', 'ter', 'todo', 'fazer', 'dizer', 'falar', 'estar', 'hoje', 'algum', 'outro', 'ser',
                   'querer', 'qualquer', 'nado', 'porque', 'vir', 'partir', 'governar', 'deputar', 'parlamentar', 'sr',
                   'presidente', 'vice', 'discursar', 'parecer', 'vez', 'dar', 'ex', 'sim', 'levar', 'quase', 'chance',
                   'ano', 'além', 'sob', 'termo', 'sempre', 'nenhum', 'coisa', 'frase', 'diverso', 'olhar', 'exas',
                   'aliás', 'ficar', 'tanto', 'saber', 'colocar', 'tão', 'dia', 'senhor', 'então', 'tipo', 'lado',
                   'palavra', 'gente', 'apresentar', 'continuar', 'lá', 'nº', 'nome', 'exª', 'ali', 'câmara',
                   'comissão'])

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/tsukasa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [33]:
def tokenization(texts_list):
    for text in texts_list:
        yield (gensim.utils.simple_preprocess(str(text), deacc=False))


def remove_stopwords(matrix):
    return [[word for word in simple_preprocess(str(line)) if word not in stop_words] for line in matrix]


def lemmatization(matrix):
    matrix_out = []
    for line in matrix:
        doc = nlp(" ".join(line))
        matrix_out.append([word.lemma_ for word in doc])
    return matrix_out


def n_grams(matrix):
    n_grams_model = gensim.models.Phrases(matrix, min_count=2, threshold=10)
    matrix_out = gensim.models.phrases.Phraser(n_grams_model)
    return [matrix_out[line] for line in matrix]


def create_dictionary(matrix):
    return Dictionary(matrix)


def create_corpus(id2word, matrix):
    return [id2word.doc2bow(line) for line in matrix]


def show_keywords(dictionary, corpus):
    return [[(dictionary[word], frequency) for word, frequency in document] for document in corpus]

In [35]:
df = pd.read_json('database/Joice Hasselmann Plenario 2019.json', encoding="utf8")
database = df.discursos.values.tolist()

data_processing = list(tokenization(database))

data_processing = remove_stopwords(data_processing)

data_processing = lemmatization(data_processing)

data_processing = remove_stopwords(data_processing)

data_processing = n_grams(data_processing)

data_processing = n_grams(data_processing)

data_processing = remove_stopwords(data_processing)

dictionary = create_dictionary(data_processing) 

dictionary.filter_extremes(no_below=2)

corpus = create_corpus(dictionary, data_processing)

keywords = show_keywords(dictionary, corpus)

keywords[0][:5] # Show 5 words and frequency of the first document

[('abrir', 1),
 ('absolutamente', 1),
 ('acordar', 6),
 ('ainda', 2),
 ('andar', 1)]

In [27]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, passes=200,
                                            random_state=100, chunksize=5)

lda_model.show_topics(num_words=10, formatted=False)

[(0,
  [('previdência', 0.05273116),
   ('pobre', 0.03654498),
   ('novo', 0.035415255),
   ('chegar', 0.023218049),
   ('texto', 0.021028133),
   ('contar', 0.02099908),
   ('receber', 0.019363593),
   ('haver', 0.015624786),
   ('pagar', 0.013982807),
   ('fato', 0.013751401)]),
 (1,
  [('jovem', 0.03749265),
   ('criança', 0.035909295),
   ('oportunidade', 0.018858256),
   ('menino', 0.018857466),
   ('errar', 0.018857466),
   ('prazo', 0.016572405),
   ('cumprir', 0.015104181),
   ('joice', 0.015074832),
   ('trabalhar', 0.01466136),
   ('único', 0.013966279)]),
 (2,
  [('pedir', 0.038542446),
   ('oposição', 0.027283832),
   ('acordar', 0.025710754),
   ('líder', 0.02259771),
   ('importante', 0.02137309),
   ('pautar', 0.020666208),
   ('discussão', 0.020285219),
   ('caminhar', 0.016633602),
   ('pagar', 0.015073292),
   ('pavimentar', 0.014475642)]),
 (3,
  [('quebrar', 0.046635587),
   ('previdência', 0.016254699),
   ('melhor', 0.016033946),
   ('idear', 0.015991375),
   ('jo