In [None]:
from warnings import simplefilter
simplefilter('ignore')
import pandas as pd, numpy as np
from matplotlib import cm
import matplotlib.pyplot as plt
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
import re
from wordcloud import WordCloud
from wordcloud import ImageColorGenerator
from collections import Counter
import string
from nltk.util import ngrams
from unicodedata import normalize
from datetime import datetime
import json
from pickle import load

In [None]:
# Load the exported tweets: the file must contain a JSON array of tweet objects,
# each providing at least the keys listed in `tweet_fields` below.
with open('jairbolsonaro.json', 'r', encoding='utf8') as json_file:
    words = json.load(json_file)

# Keys of the first tweet, kept under this name for any later cell that uses it.
# An empty export yields an empty list instead of raising IndexError.
columns_name = list(words[0]) if words else []

# Build the frame in a single step instead of creating an empty scaffold and
# overwriting it column by column. The column order matches the original frame;
# 'geo' and 'coordinates' are deliberately not loaded.
tweet_fields = ['full_text', 'created_at', 'source', 'place', 'retweet_count', 'entities']
DF = pd.DataFrame({field: [tweet[field] for tweet in words] for field in tweet_fields})

# Parse the timestamps once (previously parsed four times) and slice their text
# form, which looks like 'YYYY-MM-DD HH:MM:SS...'. All four columns hold strings.
created_at_text = [str(stamp) for stamp in pd.to_datetime(DF['created_at'])]
DF['day'] = [stamp[8:10].strip() for stamp in created_at_text]
DF['month'] = [stamp[5:7].strip() for stamp in created_at_text]
DF['year'] = [stamp[0:4].strip() for stamp in created_at_text]
DF['hour'] = [stamp[10:19].strip() for stamp in created_at_text]  # 'HH:MM:SS'

In [None]:
# Return the cleaned text as a list of tokens.
def return_text_clean_split(DF):
    """Tokenize and clean every text in ``DF`` and return all tokens as one flat list.

    Parameters
    ----------
    DF : iterable
        Iterable of texts (each item is converted with ``str``). Note that
        iterating a DataFrame yields its column labels, so pass the text
        column itself.

    Returns
    -------
    list of str
        Lower-cased tokens with links, non-word characters, Portuguese stop
        words and the single letters listed below removed. Tokens that become
        empty after cleaning (e.g. pure punctuation) are returned as ``''``;
        callers are expected to discard them.
    """
    # A set gives O(1) membership tests. The single letters mirror the original
    # list, which does not contain k, w and y.
    ignored = set(stopwords.words('portuguese'))
    ignored.update('abcdefghijlmnopqrstuvxz')

    url_pattern = re.compile(r'https?://\S+')
    non_word = re.compile(r'[^\w]')

    words_list = []
    for text in DF:
        # Links must be stripped from the raw text: word_tokenize splits
        # "https://t.co/x" into "https", ":" and "//t.co/x", so a per-token
        # "^https?://" pattern never matches and the URL path leaks in as a word.
        without_links = url_pattern.sub('', str(text))
        for token in word_tokenize(without_links):
            # Residual "http" fragments inside a token are still removed.
            cleaned = token.replace('http', '').replace('https', '')
            cleaned = non_word.sub('', cleaned).lower()
            if cleaned not in ignored:
                words_list.append(cleaned)

    return words_list



# Return the cleaned text as a single string.
def return_text_clean(text, stop_words=True):
    """Clean ``text`` and return the remaining words joined by single spaces.

    Parameters
    ----------
    text : object
        Text to clean; converted with ``str``.
    stop_words : bool, default True
        When True, Portuguese stop words and the single letters listed below
        are dropped. When False, only the single letters are dropped.

    Returns
    -------
    str
        Lower-cased words containing letters only (accented letters are kept),
        without links, digits, punctuation or empty entries.
    """
    # Single letters mirror the original list, which does not contain k, w and y.
    ignored = set('abcdefghijlmnopqrstuvxz')
    if stop_words:
        # The corpus is only loaded when it is actually needed.
        ignored.update(stopwords.words('portuguese'))

    # Links must be stripped before tokenizing: word_tokenize splits
    # "https://t.co/x" into "https", ":" and "//t.co/x", so a per-token
    # "^https?://" pattern never matches.
    without_links = re.sub(r'https?://\S+', '', str(text))

    words_list = []
    for token in word_tokenize(without_links):
        # Residual "http" fragments inside a token are still removed.
        cleaned = token.replace('http', '').replace('https', '')
        cleaned = re.sub(r'[^\w]', '', cleaned).lower()
        # Keep letters only. str.isalpha accepts accented letters, so words
        # such as "não" stay intact (an ASCII-only class would turn them into
        # "n o" and bypass the stop-word check). Runs of digits/underscores
        # collapse into one space and the result is trimmed.
        cleaned = ' '.join(
            ''.join(char if char.isalpha() else ' ' for char in cleaned).split()
        )
        if cleaned and cleaned not in ignored:
            words_list.append(cleaned)

    return " ".join(words_list)


# Convert a clock time into the part of the day it belongs to.
def get_range_hour(hour):
    """Classify an ``'HH:MM:SS'`` string into one of four six-hour shifts.

    Returns
    -------
    int
        1 for morning   (06:00:00 - 11:59:59),
        2 for afternoon (12:00:00 - 17:59:59),
        3 for night     (18:00:00 - 23:59:59),
        4 for dawn      (00:00:00 - 05:59:59).

    Raises
    ------
    ValueError
        If ``hour`` does not match the ``%H:%M:%S`` format.
    """
    parsed = datetime.strptime(hour, '%H:%M:%S').time()

    # The four shifts are exactly the four six-hour quarters of the day, so the
    # quarter index (0..3) selects the shift code: dawn, morning, afternoon, night.
    code_by_quarter = (4, 1, 2, 3)
    return code_by_quarter[parsed.hour // 6]



# Print the most frequent words.
def freq(DF, qtd):
    """Print the ``qtd`` most frequent cleaned words found in ``DF``.

    Parameters
    ----------
    DF : iterable
        Texts to analyse; forwarded unchanged to ``return_text_clean_split``.
    qtd : int
        Number of top entries to print.

    Returns
    -------
    None
        The ranking is written to stdout, followed by a blank line.
    """
    counts = Counter(return_text_clean_split(DF))
    # Discard empty tokens. The default avoids a KeyError when the cleaned
    # text happens to contain none.
    counts.pop('', None)

    for word, count in counts.most_common(qtd):
        print('Palavras :'+word, '  Frequência :'+str(count))
    print('\n')


# Build the most used hashtags for each part of the day.
def generate_hashtags_per_day(DF, show_qtd):
    """Return the ``show_qtd`` most frequent hashtags of each shift of the day.

    Parameters
    ----------
    DF : pandas.DataFrame
        Must provide the columns ``'hour'`` (``'HH:MM:SS'`` strings accepted
        by ``get_range_hour``) and ``'full_text'`` (strings).
    show_qtd : int
        Maximum number of hashtags reported per shift.

    Returns
    -------
    pandas.DataFrame
        Columns ``'turno'``, ``'hashtags'`` and ``'freq'``. Rows are grouped
        by shift in the order manha, tarde, noite, madrugada; the index
        restarts at 0 inside each group.
    """
    # Shift code returned by get_range_hour -> label stored in 'turno'.
    # The insertion order defines the order of the groups in the result.
    shift_labels = {1: 'manha', 2: 'tarde', 3: 'noite', 4: 'madrugada'}
    hashtags_by_shift = {code: Counter() for code in shift_labels}

    # Iterate over the column values. DF[col][row] with row from range() is a
    # label lookup and fails (or reads the wrong row) once the frame has been
    # filtered or re-indexed.
    for hour, full_text in zip(DF['hour'], DF['full_text']):
        # A word counts as a hashtag when it starts with '#'.
        hashtags = [word.lower() for word in full_text.split() if word.startswith('#')]
        if not hashtags:
            continue
        # The shift depends only on the row, so it is computed once per row.
        shift = get_range_hour(hour)
        if shift in hashtags_by_shift:
            hashtags_by_shift[shift].update(hashtags)

    columns = ['turno', 'hashtags', 'freq']
    frames = [
        pd.DataFrame(
            [(label, tag, count)
             for tag, count in hashtags_by_shift[code].most_common(show_qtd)],
            columns=columns,
        )
        for code, label in shift_labels.items()
    ]
    return pd.concat(frames)




def untokenize(ngram):
    """Join the tokens of an n-gram back into one readable string.

    Each token is preceded by a space unless it starts with an apostrophe,
    is found inside ``string.punctuation`` (substring test), or is the
    contraction ``"n't"``; those are glued to the previous token. Leading
    and trailing whitespace is stripped from the result.
    """
    pieces = []
    for token in ngram:
        glued = (
            token.startswith("'")
            or token in string.punctuation
            or token == "n't"
        )
        pieces.append(token if glued else " " + token)
    return "".join(pieces).strip()


def generate_sentences(DF, length_sentence, string_search):
    """Print the 20 most frequent word sequences in tweets matching a pattern.

    Parameters
    ----------
    DF : pandas.DataFrame
        Must provide a ``'full_text'`` column of strings.
    length_sentence : int
        Number of words in each sequence (n-gram size).
    string_search : str
        Regular expression searched in the cleaned, accent-free text of each
        tweet; only matching tweets contribute sequences.

    Returns
    -------
    None
        The ranking is written to stdout using ANSI bold escape codes.
    """
    # ANSI escape codes used to print values in bold.
    BOLD = '\033[1m'
    END = '\033[0m'
    # Matches a leading "Name: " prefix and captures what follows it.
    only_valid_words = re.compile('[A-Za-z]+: (.*)')

    # Iterate over the column values. DF['full_text'][row] with row from
    # range() is a label lookup and fails on a filtered or re-indexed frame.
    matching_texts = []
    for full_text in DF['full_text']:
        str_text = ' '.join(return_text_clean(w, False) for w in full_text.split())
        # Search on an accent-free copy so the pattern can be typed without
        # accents; the text that is kept is the cleaned one.
        unaccented = normalize('NFKD', str_text).encode('ASCII', 'ignore').decode('ASCII')
        if re.search(string_search, unaccented):
            matching_texts.append(str_text)

    sentence_list = []
    for text in matching_texts:
        for sent in nltk.sent_tokenize(text):
            strip_speaker = only_valid_words.match(sent)
            if strip_speaker is not None:
                sent = strip_speaker.group(1)
            for phrase in ngrams(nltk.word_tokenize(sent), length_sentence):
                # Skip sequences that contain a punctuation token.
                if all(token not in string.punctuation for token in phrase):
                    phrase = untokenize(phrase).replace('.', '').replace("''", '')
                    # Substring test: drops '', ' ' and '  '.
                    if phrase not in '  ':
                        sentence_list.append(phrase)

    for phrase, count in Counter(sentence_list).most_common(20):
        print(f'Frequencia: {BOLD}{str(count)}{END} vezes', 
              f'   Tamanho sequencia: {BOLD}{str(length_sentence)}{END} palavras', 
              f'   Sentenças: {BOLD}{phrase}{END}')
        
        
# Count the devices the posts were sent from.
def count_post_devices(DF):
    """Count posts per device (iphone, android, others) and their share.

    Parameters
    ----------
    DF : pandas.DataFrame
        Must provide a ``'source'`` column. The device is taken from the last
        whitespace-separated token of the lower-cased value, after removing
        ``'</a>'`` and ``'rel="nofollow">'``.

    Returns
    -------
    pandas.DataFrame
        One row per device, in the order iphone, android, others, with the
        columns ``'list_devices'``, ``'count'`` and ``'percentage'``
        (0-100, rounded to two decimals; 0.0 for every device when ``DF``
        has no rows).
    """
    devices = ['iphone', 'android', 'others']
    tally = Counter()

    # Iterate over the column values. DF['source'][row] with row from range()
    # is a label lookup and fails on a filtered or re-indexed frame.
    for source in DF['source']:
        tokens = str(source).lower().split()
        # An empty source has no tokens and is counted under 'others'.
        last = tokens[-1].replace('</a>', '').replace('rel="nofollow">', '') if tokens else ''
        tally[last if last in ('iphone', 'android') else 'others'] += 1

    list_count = [tally[device] for device in devices]
    total = sum(list_count)

    # Scale before rounding: round(x, 2) * 100 leaves float noise such as
    # 56.99999999999999. The guard avoids 0/0 on an empty frame.
    percentage = [round(100.0 * count / total, 2) if total else 0.0
                  for count in list_count]

    return pd.DataFrame({'list_devices': devices,
                         'count': list_count,
                         'percentage': percentage})


# Plot the share of posts sent from each device.
def plot_count_posts_devices(DF):    
    """Draw a bar chart of the percentage of posts per device.

    ``DF`` is forwarded to ``count_post_devices``; its ``'list_devices'``
    values label the bars and its ``'percentage'`` values set their heights.
    The figure is shown with ``plt.show()`` and nothing is returned.
    """
    shares = count_post_devices(DF)[['list_devices', 'percentage']]
    bar_colors = ['blue', 'red', 'green']

    fig, axes = plt.subplots(figsize=(18, 5))
    axes.bar(shares['list_devices'], shares['percentage'],
             color=bar_colors, label=bar_colors)

    plt.xticks(fontsize=11, rotation=45)
    axes.set_ylabel('Percentage')
    axes.set_xlabel('list_devices')
    plt.show()


# Count the devices used for tweets that mention bolsonaro, lula or dilma.
def count_post_devices_per_candidate(DF):
    """Count, per candidate, the devices used by the tweets that mention them.

    A tweet mentions a candidate when the candidate's name occurs in the
    lower-cased, accent-free ``'full_text'``. The device is taken from the
    last whitespace-separated token of the lower-cased ``'source'`` value,
    after removing ``'</a>'`` and ``'rel="nofollow">'``.

    Parameters
    ----------
    DF : pandas.DataFrame
        Must provide the columns ``'full_text'`` and ``'source'``.

    Returns
    -------
    pandas.DataFrame
        Nine rows (device-major: iphone, android, others; inside each device
        the candidates bolsonaro, lula, dilma) with the columns
        ``'candidate'``, ``'list_devices'``, ``'count'`` and ``'percetage'``.
        ``'percetage'`` is the device share among the candidate's tweets as a
        string such as ``'33.33%'`` (``'0.0%'`` when the candidate has no
        tweets). The misspelled column name is kept for existing callers.
    """
    candidates = ['bolsonaro', 'lula', 'dilma']
    devices = ['iphone', 'android', 'others']

    # Normalise each tweet once instead of once per candidate. Iterating over
    # the column values also avoids DF[col][row] label lookups, which fail on
    # a filtered or re-indexed frame.
    prepared = []
    for full_text, source in zip(DF['full_text'], DF['source']):
        text = normalize('NFKD', str(full_text).lower()).encode('ASCII', 'ignore').decode('ASCII')
        tokens = str(source).lower().split()
        # An empty source has no tokens and is counted under 'others'.
        last = tokens[-1].replace('</a>', '').replace('rel="nofollow">', '') if tokens else ''
        prepared.append((text, last if last in ('iphone', 'android') else 'others'))

    tallies = {
        candidate: Counter(device for text, device in prepared if candidate in text)
        for candidate in candidates
    }

    records = []
    for device in devices:
        for candidate in candidates:
            tally = tallies[candidate]
            total = sum(tally.values())
            count = tally[device]
            # Scale before rounding to avoid float noise such as
            # '56.99999999999999%'. The guard avoids 'nan%' for a candidate
            # with no tweets.
            share = round(100.0 * count / total, 2) if total else 0.0
            records.append((candidate, device, count, str(share) + '%'))

    return pd.DataFrame(records,
                        columns=['candidate', 'list_devices', 'count', 'percetage'])


In [None]:
# Draw the bar chart of the percentage of posts per device for the loaded tweets.
plot_count_posts_devices(DF)