# 1. Load the packages

In [21]:
import numpy as np
import pandas as pd
import re, nltk, spacy, gensim
import stopwordsiso

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt

# 2. Import Newsgroups Text Data

In [6]:
# Import Dataset
df = pd.read_csv('./all_together.csv')
df.head()

Unnamed: 0,label,file,link,category,emotiveness,errors,pausatility,text,tokens,words in upper case,average sentence length,nouns,verbs,adverbs,adjectives,pronouns,date
0,fake,1.txt,ceticismopolitico.com,politica,0.263158,0.0,2.0,"katia abreu diz vai colocar expulsao moldura, ...",211.0,6.0,14.2308,46.0,30.0,13.0,7.0,26.0,2017-11-30
1,fake,10.txt,ceticismopolitico.com,politica,0.241667,0.007874,2.5,"dr. ray peita bolsonaro, chama-o conservador f...",289.0,0.0,18.1429,64.0,56.0,18.0,11.0,20.0,2017-11-24
2,fake,100.txt,afolhabrasil.com.br,politica,0.12782,0.003636,1.8125,reinaldo azevedo desmascarado policia federal....,304.0,0.0,17.1875,88.0,45.0,8.0,9.0,18.0,2017-05-23
3,fake,1000.txt,diariodobrasil.org,politica,0.229008,0.001748,2.68,relatorio assustador bndes mostra dinheiro pub...,639.0,14.0,22.88,175.0,87.0,21.0,39.0,34.0,2017-07-24
4,fake,1001.txt,diariodobrasil.org,politica,0.269231,0.0,0.894737,"radialista americano fala sobre pt: ""eles vend...",128.0,1.0,5.84211,31.0,21.0,8.0,6.0,12.0,2017-07-25


In [8]:
data = df["text"]
data

0       katia abreu diz vai colocar expulsao moldura, ...
1       dr. ray peita bolsonaro, chama-o conservador f...
2       reinaldo azevedo desmascarado policia federal....
3       relatorio assustador bndes mostra dinheiro pub...
4       radialista americano fala sobre pt: "eles vend...
                              ...                        
7195    para jornal britanico, acao contra lula lava j...
7196    temer diz acionou pf cade investigar aumentos ...
7197    os obstaculos politicos temer 2017. especialis...
7198    sexta-feira, 15 setembro 2017. boa noite! aqui...
7199    'nao envolvo politica', diz brasileiro preso v...
Name: text, Length: 7200, dtype: object

# 3. Tokenize and Clean-up using gensim’s simple_preprocess()

In [9]:
def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  

data_words = list(sent_to_words(data))
data_words[0]

[['katia',
  'abreu',
  'diz',
  'vai',
  'colocar',
  'expulsao',
  'moldura',
  'reclamar',
  'senadora',
  'katia',
  'abreu',
  'sem',
  'partido',
  'to',
  'disse',
  'expulsao',
  'pmdb',
  'resultado',
  'acao',
  'cupula',
  'atual',
  'legenda',
  'que',
  'segundo',
  'ela',
  'oportunista',
  'amanha',
  'vou',
  'botar',
  'moldura',
  'dourada',
  'expulsao',
  'porque',
  'maos',
  'onde',
  'veio',
  'atestado',
  'boa',
  'conduta',
  'curriculo',
  'essas',
  'pessoas',
  'expulsaram',
  'servem',
  'pais',
  'eles',
  'servem',
  'pais',
  'beneficios',
  'proprios',
  'disse',
  'katia',
  'abreu',
  'ue',
  'expulsao',
  'algo',
  'tao',
  'bom',
  'curriculo',
  'tanta',
  'choradeira',
  'katia',
  'sabemos',
  'motivo',
  'provavelmente',
  'katia',
  'valor',
  'pt',
  'partido',
  'deveria',
  'te',
  'la',
  'absorvido',
  'ao',
  'parece',
  'pt',
  'gostava',
  'katia',
  'somente',
  'ficasse',
  'entrincheirada',
  'dentro',
  'pmdb',
  'ou',
  'seja',
  

# 4. Lemmatization

Lemmatization is a process where we convert words to its root word.

For example: ‘Studying’ becomes ‘Study’, ‘Meeting becomes ‘Meet’, ‘Better’ and ‘Best’ becomes ‘Good’.

The advantage of this is, we get to reduce the total number of unique words in the dictionary. As a result, the number of columns in the document-word matrix (created by CountVectorizer in the next step) will be denser with lesser columns.

## If you're unable to run the cell below, make sure you have pt_core_news_sm from spacy.

### To download it, run `python -m spacy download pt_core_news_sm`

In [28]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out

# Initialize spacy 'pt_core_news_sm' model, keeping only tagger component (for efficiency)
# Run in terminal: python3 -m spacy download pt_core_news_sm
nlp = spacy.load('pt_core_news_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only Noun, Adj, Verb, Adverb
data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
data_lemmatized[0]


KeyboardInterrupt: 

## Remove stopwords

In [None]:
stops = set(stopwordsiso.stopwords("pt"))
def remove_stopwords(text: str) -> str:
    # print(text)
    return " ".join([t for t in text.split(" ") if t not in stops])

print(data_lemmatized)
data_lemmatized = [remove_stopwords(text) for text in data_lemmatized[:2]]
# data_lemmatized[0]

# 5. Create the Document-Word matrix

The LDA topic model algorithm requires a document word matrix as the main input.

You can create one using CountVectorizer. 

We configured the CountVectorizer to consider words that has occurred at least 10 times (min_df), remove built-in portuguese stopwords, convert all words to lowercase, and a word can contain numbers and alphabets of at least length 2 in order to be qualified as a word.

So, to create the doc-word matrix, you need to first initialise the CountVectorizer class with the required configuration and then apply fit_transform to actually create the matrix.

Since most cells contain zeros, the result will be in the form of a sparse matrix to save memory.

If you want to materialize it in a 2D array format, call the todense() method of the sparse matrix like its done in the next step.

In [14]:
data_lemmatized = " ".join

vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,                        # minimum reqd occurences of a word 
                             stop_words='portuguese',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{2,}',  # num chars > 2
                             # max_features=50000,             # max number of uniq words
                            )
data_vectorized = vectorizer.fit_transform(data_lemmatized)

ValueError: not a built-in stop list: portuguese