# Treat data

In order to get a better model, we will restrict our data to:
- only political news
- from 2017

This notebook selects those documents from the full dataset and pre-processes their text for topics-over-time discovery.

In [106]:
import pandas as pd

In [107]:
# Relative path to the combined dataset CSV (kept in the exploratory-analysis folder).
file_csv = "../02 - Analise exploratoria/all_together.csv"
# Load the CSV into a DataFrame; every later filtering step starts from `df`.
df = pd.read_csv(file_csv)

# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,label,file,link,category,emotiveness,errors,pausatility,text,tokens,words in upper case,average sentence length,nouns,verbs,adverbs,adjectives,pronouns,date
0,fake,1.txt,ceticismopolitico.com,politica,0.263158,0.0,2.0,"katia abreu diz vai colocar expulsao moldura, ...",211.0,6.0,14.2308,46.0,30.0,13.0,7.0,26.0,2017-11-30
1,fake,10.txt,ceticismopolitico.com,politica,0.241667,0.007874,2.5,"dr. ray peita bolsonaro, chama-o conservador f...",289.0,0.0,18.1429,64.0,56.0,18.0,11.0,20.0,2017-11-24
2,fake,100.txt,afolhabrasil.com.br,politica,0.12782,0.003636,1.8125,reinaldo azevedo desmascarado policia federal....,304.0,0.0,17.1875,88.0,45.0,8.0,9.0,18.0,2017-05-23
3,fake,1000.txt,diariodobrasil.org,politica,0.229008,0.001748,2.68,relatorio assustador bndes mostra dinheiro pub...,639.0,14.0,22.88,175.0,87.0,21.0,39.0,34.0,2017-07-24
4,fake,1001.txt,diariodobrasil.org,politica,0.269231,0.0,0.894737,"radialista americano fala sobre pt: ""eles vend...",128.0,1.0,5.84211,31.0,21.0,8.0,6.0,12.0,2017-07-25


In [108]:
# List every distinct category so the filter value below can be checked by eye.
print("All categories:", set(df["category"]))

# Keep an independent copy of the rows categorised as politics.
df_politica = df[df["category"].eq("politica")].copy()
# Report how many documents survive the category filter.
print(f"Ratio: {len(df_politica)}/{len(df)}")

df_politica.head()

All categories: {'sociedade_cotidiano', 'ciencia_tecnologia', 'politica', 'economia', 'tv_celebridades', 'religiao'}
Ratio: 4180/7200


Unnamed: 0,label,file,link,category,emotiveness,errors,pausatility,text,tokens,words in upper case,average sentence length,nouns,verbs,adverbs,adjectives,pronouns,date
0,fake,1.txt,ceticismopolitico.com,politica,0.263158,0.0,2.0,"katia abreu diz vai colocar expulsao moldura, ...",211.0,6.0,14.2308,46.0,30.0,13.0,7.0,26.0,2017-11-30
1,fake,10.txt,ceticismopolitico.com,politica,0.241667,0.007874,2.5,"dr. ray peita bolsonaro, chama-o conservador f...",289.0,0.0,18.1429,64.0,56.0,18.0,11.0,20.0,2017-11-24
2,fake,100.txt,afolhabrasil.com.br,politica,0.12782,0.003636,1.8125,reinaldo azevedo desmascarado policia federal....,304.0,0.0,17.1875,88.0,45.0,8.0,9.0,18.0,2017-05-23
3,fake,1000.txt,diariodobrasil.org,politica,0.229008,0.001748,2.68,relatorio assustador bndes mostra dinheiro pub...,639.0,14.0,22.88,175.0,87.0,21.0,39.0,34.0,2017-07-24
4,fake,1001.txt,diariodobrasil.org,politica,0.269231,0.0,0.894737,"radialista americano fala sobre pt: ""eles vend...",128.0,1.0,5.84211,31.0,21.0,8.0,6.0,12.0,2017-07-25


In [109]:
# Discard political articles that have no publication date at all.
df_use = df_politica.dropna(subset=["date"]).copy()
# Dates are ISO-formatted strings (YYYY-MM-DD), so a prefix match on the year
# is simpler than parsing them. The mask is built from `df_use` itself so the
# filter never depends on index alignment with the unfiltered `df`.
df_2017 = df_use.loc[df_use["date"].str.startswith("2017", na=False)].copy()

# The ratio is reported against the full, unfiltered dataset on purpose.
print(f"Ratio: {len(df_2017)}/{len(df)} ({len(df_2017)/len(df):.2f})")
# Class balance of the retained documents (masks built from `df_2017` itself).
print(f"True: {len(df_2017.loc[df_2017['label'] == 'true'])}")
print(f"Fake: {len(df_2017.loc[df_2017['label'] == 'fake'])}")
df_2017.head()

Ratio: 1718/7200 (0.24)
True: 936
Fake: 782


Unnamed: 0,label,file,link,category,emotiveness,errors,pausatility,text,tokens,words in upper case,average sentence length,nouns,verbs,adverbs,adjectives,pronouns,date
0,fake,1.txt,ceticismopolitico.com,politica,0.263158,0.0,2.0,"katia abreu diz vai colocar expulsao moldura, ...",211.0,6.0,14.2308,46.0,30.0,13.0,7.0,26.0,2017-11-30
1,fake,10.txt,ceticismopolitico.com,politica,0.241667,0.007874,2.5,"dr. ray peita bolsonaro, chama-o conservador f...",289.0,0.0,18.1429,64.0,56.0,18.0,11.0,20.0,2017-11-24
2,fake,100.txt,afolhabrasil.com.br,politica,0.12782,0.003636,1.8125,reinaldo azevedo desmascarado policia federal....,304.0,0.0,17.1875,88.0,45.0,8.0,9.0,18.0,2017-05-23
3,fake,1000.txt,diariodobrasil.org,politica,0.229008,0.001748,2.68,relatorio assustador bndes mostra dinheiro pub...,639.0,14.0,22.88,175.0,87.0,21.0,39.0,34.0,2017-07-24
4,fake,1001.txt,diariodobrasil.org,politica,0.269231,0.0,0.894737,"radialista americano fala sobre pt: ""eles vend...",128.0,1.0,5.84211,31.0,21.0,8.0,6.0,12.0,2017-07-25


# Pre-process text

Before running topic discovery, we must pre-process the documents' texts.
To do that, we apply the same process used for LDA.

In [110]:
import gensim
import spacy
import stopwordsiso

# Work on a copy: the "text" column of df_filtered is overwritten further down
# with the pre-processed text, and df_2017 should keep the original text.
df_filtered = df_2017.copy()

# 1. Tokenize and Clean-up using gensim’s simple_preprocess()

In [111]:
def sent_to_words(sentences):
    """Lazily tokenize each document into a list of cleaned tokens.

    Every item of ``sentences`` is cast to ``str`` and handed to
    ``gensim.utils.simple_preprocess``. ``deacc=True`` asks gensim to strip
    accent marks from the tokens.

    Yields:
        One list of tokens per input item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(document), deacc=True)
        for document in sentences
    )


# Pull the article texts out of the frame as a plain Python list ...
all_documents = df_filtered["text"].tolist()
# ... and tokenize all of them up front (sent_to_words itself is lazy).
data_words = [tokens for tokens in sent_to_words(all_documents)]
# Peek at the tokens of the first article.
data_words[0]

['katia',
 'abreu',
 'diz',
 'vai',
 'colocar',
 'expulsao',
 'moldura',
 'reclamar',
 'senadora',
 'katia',
 'abreu',
 'sem',
 'partido',
 'to',
 'disse',
 'expulsao',
 'pmdb',
 'resultado',
 'acao',
 'cupula',
 'atual',
 'legenda',
 'que',
 'segundo',
 'ela',
 'oportunista',
 'amanha',
 'vou',
 'botar',
 'moldura',
 'dourada',
 'expulsao',
 'porque',
 'maos',
 'onde',
 'veio',
 'atestado',
 'boa',
 'conduta',
 'curriculo',
 'essas',
 'pessoas',
 'expulsaram',
 'servem',
 'pais',
 'eles',
 'servem',
 'pais',
 'beneficios',
 'proprios',
 'disse',
 'katia',
 'abreu',
 'ue',
 'expulsao',
 'algo',
 'tao',
 'bom',
 'curriculo',
 'tanta',
 'choradeira',
 'katia',
 'sabemos',
 'motivo',
 'provavelmente',
 'katia',
 'valor',
 'pt',
 'partido',
 'deveria',
 'te',
 'la',
 'absorvido',
 'ao',
 'parece',
 'pt',
 'gostava',
 'katia',
 'somente',
 'ficasse',
 'entrincheirada',
 'dentro',
 'pmdb',
 'ou',
 'seja',
 'rebaixar',
 'demais',
 'resta',
 'katia',
 'ficar',
 'chorando',
 'pitangas',
 'todos

# 2. Lemmatization

Lemmatization is the process of converting each word to its root form (its lemma).

For example: ‘Studying’ becomes ‘Study’, ‘Meeting’ becomes ‘Meet’, and ‘Better’ and ‘Best’ become ‘Good’.

The advantage of this is that it reduces the total number of unique words in the dictionary. As a result, the document-word matrix (created by CountVectorizer in the next step) will be denser, with fewer columns.

## If you're unable to run the cell below, make sure you have pt_core_news_sm from spacy.

### To download it, run `python -m spacy download pt_core_news_sm`

In [112]:
def lemmatization(texts, allowed_postags=("NOUN", "ADJ", "VERB", "ADV")):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Relies on the module-level spaCy pipeline ``nlp``.
    POS tag reference: https://spacy.io/api/annotation

    Args:
        texts: Iterable of documents, each one an iterable of token strings.
        allowed_postags: Coarse-grained POS tags (``token.pos_``) to keep.

    Returns:
        A list with one string per input document, in input order: the lemmas
        of the retained tokens joined by single spaces.
    """
    # spaCy expects running text, so glue each document's tokens back together.
    joined_docs = (" ".join(tokens) for tokens in texts)
    # nlp.pipe processes the documents in batches and yields them in order,
    # which is considerably faster than calling nlp() once per document.
    return [
        " ".join(
            token.lemma_
            for token in doc
            # "-PRON-" is the placeholder lemma older spaCy releases give to
            # pronouns. Skip such tokens entirely instead of emitting an empty
            # string, which would leave double spaces in the joined text.
            if token.pos_ in allowed_postags and token.lemma_ != "-PRON-"
        )
        for doc in nlp.pipe(joined_docs)
    ]


# Load the Portuguese spaCy pipeline. Dependency parsing and NER are not used
# in this notebook, so both components are disabled to save time.
# If the model is missing, install it with: python3 -m spacy download pt_core_news_sm
nlp = spacy.load("pt_core_news_sm", disable=["parser", "ner"])

# Lemmatize every tokenized document, retaining nouns, adjectives, verbs and adverbs.
data_lemmatized = lemmatization(data_words, ["NOUN", "ADJ", "VERB", "ADV"])
# Show the first lemmatized document as a sanity check.
data_lemmatized[0]

'katia abreu dizer colocar expulsao moldurar reclamar abreu partir dizer expulsao pmdb resultar cupula atual legendar oportunista amanhar botar moldurar dourar expulsao maos vir atestar bom conduta curriculo pessoa expulsar servir pai servir pai beneficios proprios dizer katia abreu expulsao bom curriculo tanto choradeira katia saber motivar provavelmente katia valor pt partir dever absorver parecer gostar katia somente ficar entrincheirar dentro pmdb rebaixar demais restar katia ficar chorar pitanga canto momento cadastrar abreu fileira situacao patetica ministro agricultura'

## Remove stopwords

In [113]:
# Portuguese ("pt") stopword list from stopwordsiso, held as a set so that
# remove_stopwords can test membership quickly.
stops = set(stopwordsiso.stopwords("pt"))


def remove_stopwords(text: str) -> str:
    """Drop stopwords from a whitespace-separated string of tokens.

    Tokens are compared against the module-level ``stops`` set.

    Args:
        text: Tokens separated by whitespace.

    Returns:
        The tokens that are not stopwords, joined by single spaces.
    """
    # split() with no argument splits on any run of whitespace, so consecutive
    # spaces never produce empty tokens that would survive as double spaces.
    return " ".join(token for token in text.split() if token not in stops)


# Strip stopwords from every lemmatized document.
data_no_stopwords = list(map(remove_stopwords, data_lemmatized))
# Display the first and the last cleaned document side by side.
data_no_stopwords[0], data_no_stopwords[-1]

('katia abreu colocar expulsao moldurar reclamar abreu expulsao pmdb resultar cupula atual legendar oportunista amanhar botar moldurar dourar expulsao maos vir atestar conduta curriculo pessoa expulsar servir pai servir pai beneficios proprios katia abreu expulsao curriculo choradeira katia motivar provavelmente katia pt dever absorver parecer gostar katia ficar entrincheirar pmdb rebaixar restar katia ficar chorar pitanga canto cadastrar abreu fileira situacao patetica ministro agricultura',
 'senado aprovar turno foro privilegiar politicos autoridade proposto foro caso autoridade cometer crime comum roubar corrupcao texto preciso passar votacao senado antar camara senado aprovar feirar turno proposto emendar constituicao pec acabar prerrogativa foro autoridade caso praticar crime comum roubar corrupcao pec aprovar voto votar contrariar tratar alteracao texto constitucional proposto preciso passar segundar turno votacao senado antar seguir camara deputar precisar analisar votacoes lei

# 3. Update DataFrame

In [114]:
import copy

# data_no_stopwords was built from df_filtered["text"] in row order, so the two
# must have the same length; fail loudly if an earlier cell was re-run out of order.
assert len(data_no_stopwords) == len(df_filtered), (
    "pre-processed texts are out of sync with df_filtered"
)
# Assigning a list sets the column positionally, row by row, in a single
# vectorized step (no per-row iloc writes, no column lookup on another frame).
df_filtered["text"] = data_no_stopwords
df_filtered

Unnamed: 0,label,file,link,category,emotiveness,errors,pausatility,text,tokens,words in upper case,average sentence length,nouns,verbs,adverbs,adjectives,pronouns,date
0,fake,1.txt,ceticismopolitico.com,politica,0.263158,0.000000,2.000000,katia abreu colocar expulsao moldurar reclamar...,211.0,6.0,14.23080,46.0,30.0,13.0,7.0,26.0,2017-11-30
1,fake,10.txt,ceticismopolitico.com,politica,0.241667,0.007874,2.500000,ray peitar bolsonaro chamar conservador entrev...,289.0,0.0,18.14290,64.0,56.0,18.0,11.0,20.0,2017-11-24
2,fake,100.txt,afolhabrasil.com.br,politica,0.127820,0.003636,1.812500,desmascarar policiar federal ferrenho criticar...,304.0,0.0,17.18750,88.0,45.0,8.0,9.0,18.0,2017-05-23
3,fake,1000.txt,diariodobrasil.org,politica,0.229008,0.001748,2.680000,relatorio assustador mostrar dinheiro publicar...,639.0,14.0,22.88000,175.0,87.0,21.0,39.0,34.0,2017-07-24
4,fake,1001.txt,diariodobrasil.org,politica,0.269231,0.000000,0.894737,radialista americano falir vender ilusao brasi...,128.0,1.0,5.84211,31.0,21.0,8.0,6.0,12.0,2017-07-25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7173,true,975.txt,g1.globo.com,politica,0.316493,0.001933,2.670210,gracas internet facilitar odiar karnal histori...,1803.0,11.0,16.51060,405.0,268.0,99.0,114.0,147.0,2017-01-27
7182,true,983.txt,g1.globo.com,politica,0.281106,0.005096,2.963300,mpf querer ouvir dilma mantega contar joesley ...,1893.0,9.0,14.40370,324.0,327.0,145.0,38.0,187.0,2017-06-30
7183,true,984.txt,g1.globo.com,politica,0.253707,0.001462,3.014080,temer falir reducao desmatamento amazonia refo...,1582.0,13.0,19.26760,391.0,216.0,55.0,99.0,73.0,2017-09-19
7186,true,987.txt,g1.globo.com,politica,0.208092,0.001287,2.951220,cortar verba pf afeta lavar jato afirmar coord...,1796.0,29.0,18.95120,435.0,257.0,79.0,65.0,70.0,2017-07-28


In [115]:
# Persist the filtered, pre-processed frame as CSV (path relative to the
# current working directory).
# NOTE(review): to_csv writes the index as an extra unnamed column by default,
# in addition to the existing "Unnamed: 0" column — confirm downstream readers
# expect that, otherwise consider index=False.
df_filtered.to_csv("./processed_data.csv")