In [30]:
# Imports section
import json
import nltk
import pandas as pd

In [31]:
def get_data():
    '''Open and read dataset files, saving contents as raw data.'''
    with open('dataset/jairbolsonaro.json','r') as file_01:
        data_01=json.load(file_01)
    with open('dataset/LulaOficial.json','r') as file_02:
        data_02=json.load(file_02)
        
    return data_01,data_02

In [32]:
def extract_tweets():
    '''Store tweets in lists without metadata'''
    data_01,data_02 = get_data()
    bolso_tweets = []
    for el in data_01:
        bolso_tweets.append(el['full_text'])

    lula_tweets = []
    for el in data_02:
        lula_tweets.append(el['full_text'])
        
    return bolso_tweets, lula_tweets

In [33]:
# Function to compare tweets based on a subject
def compare(subject):
    '''Compare tweets from both sides based on a subject'''
    bolso_tweets,lula_tweets = extract_tweets()
    # Look into Bolsonaro's tweets for the subject and store tweets in a list
    b_selected = []
    for tweet in bolso_tweets:
        if subject.lower() in tweet.lower():
            b_selected.append(tweet)

    # Look into Lula's tweets for the subject and store tweets in a list
    l_selected = []
    for tweet in lula_tweets:
        if subject.lower() in tweet.lower():
            l_selected.append(tweet)

    # Create two series based on the selected tweets lists
    b = pd.Series(b_selected,name='Bolsonaro')
    l = pd.Series(l_selected,name='Lula')

    # Create dataframe concatenating the two series
    pd.set_option('display.max_colwidth', None)
    df=pd.concat([b,l],axis=1)
    
    return df

In [37]:
def clean(tweet):
    '''Remove stopwords from tweet'''
    # nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('portuguese')
    for word in tweet:
        if word in stopwords:
            tweet.remove(word)
    return tweet

In [44]:
def count_words(tweets,min=0):
    '''Count the ocurrence of key words in the tweets'''
    words={}
    selected_words=[]
    for tweet in tweets:
        for word in clean(tweet.split()):
            if word in words.keys():
                words[word] += 1
            else:
                words[word] = 1
    # Filter the words into a list based on a minimum number of ocurrences
    for word, ocurrence in words.items():
        if ocurrence > min:
            selected_words.append(word)

    return selected_words

In [45]:
bolso_tweets, lula_tweets = extract_tweets()
count_words(bolso_tweets)

['EM',
 '"DITADURA"',
 'SEM',
 'PAREDÃO,',
 'ATÉ',
 'CHICO',
 'ALENCAR',
 'É',
 'VALENTÃO.',
 'Mostrem',
 'povo',
 'marcas',
 'tortura.',
 'Vcs',
 'd',
 'bolso',
 'cheio',
 'Bolsa-Ditadura.',
 'Bom',
 'dia!',
 '🇧🇷',
 '#tbt',
 'o',
 'amigo',
 '"Canguru",',
 'já',
 'deixou',
 'plano.',
 'Um',
 'forte',
 'abraço',
 'todos!',
 'https://t.co/fNPQUTBnfi',
 '-',
 'Para',
 'descontrair.',
 'Proibido',
 'queimar',
 'ovo.',
 '(Kkkk)',
 'https://t.co/dmqM8DT8Fz',
 'Trecho',
 'entrevista',
 'vivo',
 'Jornal',
 'Nacional',
 '(08/10/2018):',
 'https://t.co/GFRZznKrZS',
 'Querem',
 'criar',
 'fundão',
 'bilionário',
 'Reforma',
 'Política',
 'alegam',
 'ter',
 'dinheiro',
 'aplicabilidade',
 'plena',
 'voto',
 'impresso!',
 'Contem',
 'outra!',
 'Por',
 '@Rconstantino',
 ':',
 'O',
 'maior',
 'investimento',
 'financeiro',
 'governo',
 'Petista',
 'em...',
 'Cuba.',
 'E',
 'c/',
 'sigilo.',
 'http://t.co/HHHaCOKd7r',
 '@guilhermesousa',
 'Kkkkkkk...',
 'tá',
 'ok,',
 'roteirista',
 'tv',
 'globo.',
 