# Membros do Grupo
* Daniele Montenegro da Silva Barros
* Rodrigo Dantas da Silva
* Thiago Bruschi Martins


# Pacote de dados


*   Abra o link do pacote de dados: https://tinyurl.com/bd10-tweets
*   Selecione a opção "Adicionar Atalho ao Drive"

In [1]:
# Install the job's dependencies into the kernel's environment.
# -q keeps the download progress bars out of the saved notebook; mrjob is
# pinned to the version this notebook was originally run with.
%pip install -q mrjob==0.7.4 nltk twython

Collecting mrjob
  Downloading mrjob-0.7.4-py2.py3-none-any.whl (439 kB)
[?25l[K     |▊                               | 10 kB 24.3 MB/s eta 0:00:01[K     |█▌                              | 20 kB 30.0 MB/s eta 0:00:01[K     |██▎                             | 30 kB 16.4 MB/s eta 0:00:01[K     |███                             | 40 kB 11.0 MB/s eta 0:00:01[K     |███▊                            | 51 kB 4.5 MB/s eta 0:00:01[K     |████▌                           | 61 kB 4.7 MB/s eta 0:00:01[K     |█████▏                          | 71 kB 4.5 MB/s eta 0:00:01[K     |██████                          | 81 kB 5.1 MB/s eta 0:00:01[K     |██████▊                         | 92 kB 4.9 MB/s eta 0:00:01[K     |███████▌                        | 102 kB 4.2 MB/s eta 0:00:01[K     |████████▏                       | 112 kB 4.2 MB/s eta 0:00:01[K     |█████████                       | 122 kB 4.2 MB/s eta 0:00:01[K     |█████████▊                      | 133 kB 4.2 MB/s eta 0:00:01[

In [2]:
# Mount Google Drive so the dataset shortcut is reachable under /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [201]:
%%file trab2.py

from mrjob.job import MRJob
from mrjob.step import MRStep
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from mrjob.protocol import TextProtocol
import re
import logging
import sys

class Sentiment(MRJob):
  """Two-step MapReduce job that ranks brands by mean sentence sentiment.

  Input lines are split on commas. Field 1 is used as the brand, field 2 is a
  flag that must equal 'False' for the line to be processed (the tweet was
  sent by the brand), and field 4 is the tweet text.

  Step 1 scores every sentence of every selected tweet with NLTK's
  SentimentIntensityAnalyzer ('compound' score) and averages the scores per
  brand. Step 2 emits ``brand, mean`` pairs ordered by ascending mean.

  NOTE(review): the text is taken from a plain ``split(',')``, so a tweet that
  still contains commas would be truncated at the first one - confirm that the
  "clean" input files have no commas inside the text field.
  """

  # Declared on the subclass rather than assigned onto MRJob, so the setting
  # does not leak into any other job defined in the same process.
  SORT_VALUES = True

  # The mapper reads fields 1, 2 and 4, so a usable line has at least 5 fields.
  MIN_CAMPOS = 5

  def steps(self):
    """Return the job's steps: per-brand averaging, then global ordering."""
    return [
        MRStep(mapper_init=self.mapper_init,
               mapper=self.mapper,
               combiner=self.combiner,
               reducer=self.reducer),
        # Extra step whose only purpose is to order the results.
        MRStep(reducer=self.reducer_ordenado),
    ]

  def mapper_init(self):
    """Download the NLTK resources and build the analyzer once per mapper task."""
    nltk.download('vader_lexicon', quiet=True)
    nltk.download('punkt', quiet=True)
    self.sentiment = SentimentIntensityAnalyzer()

  def mapper(self, _, line):
    """Yield ``brand, [compound_score, 1]`` for each sentence of a brand tweet.

    Lines with fewer than MIN_CAMPOS fields (blank or truncated lines) are
    skipped instead of raising IndexError.
    """
    campos = line.split(',')  # split the input line into its fields

    if len(campos) < self.MIN_CAMPOS:
      return

    if campos[2] == 'False':  # only tweets sent by the brand itself
      for frase in nltk.sent_tokenize(campos[4]):  # one score per sentence
        # Drop everything except ASCII letters and spaces (punctuation, emojis, digits).
        filtrada = re.sub('[^A-Za-z ]+', '', frase)
        nota = self.sentiment.polarity_scores(filtrada)['compound']
        yield campos[1], [nota, 1]

  @staticmethod
  def _somar(values):
    """Add up an iterable of ``[score_sum, sentence_count]`` pairs."""
    pontuacao = 0
    numero_frases = 0
    for nota, frases in values:
      pontuacao += nota
      numero_frases += frases
    return pontuacao, numero_frases

  def combiner(self, key, values):
    """Pre-aggregate one brand's partial sums before the shuffle.

    Yields ``brand, [score_sum, sentence_count]``.
    """
    pontuacao, numero_frases = self._somar(values)
    yield key, [pontuacao, numero_frases]

  def reducer(self, key, values):
    """Compute one brand's mean score, rounded to 3 decimal places.

    Every result is emitted under the single key ``None`` so that the next
    step receives all brands in one call and can order them numerically. This
    replaces relying on the shuffle's ordering of serialized float keys, which
    is only global when there is exactly one reducer.
    """
    pontuacao, numero_frases = self._somar(values)

    # "+ 0.0" turns a rounded -0.0 into 0.0 so it is never printed as "-0.0".
    media = round(pontuacao / numero_frases, 3) + 0.0

    yield None, [media, key]

  def reducer_ordenado(self, key, values):
    """Yield ``brand, mean`` in ascending order of mean (ties ordered by brand)."""
    for media, marca in sorted(values):
      yield marca, media

def main():
    """Route log records to log.txt, then launch the Sentiment job."""
    logging.basicConfig(filename="log.txt")
    Sentiment.run()


if __name__ == '__main__':
    main()

Overwriting trab2.py


In [202]:
# Start from a clean slate: remove the previous log and any earlier job output.
!rm -f log.txt temp1/*
# Run the job on the sample file; the results are written to temp1/.
!python trab2.py "/content/drive/My Drive/Unicamp/MDC/BigData/Trabalho2/twitter_cs/sample_clean.csv" --output-dir=temp1

No configs found; falling back on auto-configuration
No configs specified for inline runner
Running step 1 of 2...
Creating temp directory /tmp/trab2.root.20211107.174504.509617
Running step 2 of 2...
job output is in temp1
Removing temp directory /tmp/trab2.root.20211107.174504.509617...


In [204]:
# Print the job output written to temp1/ (one "brand<TAB>mean score" line per brand).
!cat temp1/*

"Ask_Spectrum"	-0.411
"comcastcares"	-0.061
"British_Airways"	0.063
"sprintcare"	0.063
"ChaseSupport"	0.106
"UPSHelp"	0.113
"VirginTrains"	0.133
"SouthwestAir"	0.151
"Tesco"	0.167
"AppleSupport"	0.192
"SpotifyCares"	0.243
"HPSupport"	0.253
"O2"	0.33


In [203]:
# Print what the job logged to log.txt.
!cat log.txt

INFO:mrjob.conf:No configs found; falling back on auto-configuration
INFO:mrjob.sim:Running step 1 of 2...
INFO:mrjob.runner:Creating temp directory /tmp/trab2.root.20211107.174504.509617
INFO:mrjob.sim:Running step 2 of 2...
INFO:mrjob.runner:job output is in temp1
INFO:mrjob.runner:Removing temp directory /tmp/trab2.root.20211107.174504.509617...


In [205]:
# Run the job on the complete dataset.
# Clear the log and the previous output first, as the sample run does, so that
# nothing left over from an earlier run can show up in temp1/ or log.txt.
!rm -f log.txt temp1/*
!python trab2.py "/content/drive/My Drive/Unicamp/MDC/BigData/Trabalho2/twitter_cs/twcs_clean.csv.gz" --output-dir=temp1

No configs found; falling back on auto-configuration
No configs specified for inline runner
Running step 1 of 2...
Creating temp directory /tmp/trab2.root.20211107.174813.039569
Running step 2 of 2...
job output is in temp1
Removing temp directory /tmp/trab2.root.20211107.174813.039569...


In [206]:
# Print the job output written to temp1/ (one "brand<TAB>mean score" line per brand).
!cat temp1/*

"KFC_UKI_Help"	-0.018
"LondonMidland"	0.003
"SW_Help"	0.01
"GWRHelp"	0.012
"nationalrailenq"	0.013
"TacoBellTeam"	0.036
"askvisa"	0.054
"JackBox"	0.058
"MTNC_Care"	0.065
"USCellularCares"	0.065
"VirginTrains"	0.079
"VerizonSupport"	0.083
"AskSeagate"	0.086
"sainsburys"	0.087
"ChipotleTweets"	0.088
"AmazonHelp"	0.093
"VirginAtlantic"	0.094
"YahooCare"	0.095
"ArgosHelpers"	0.098
"NikeSupport"	0.098
"VirginAmerica"	0.101
"NortonSupport"	0.102
"GreggsOfficial"	0.108
"BoostCare"	0.109
"Safaricom_Care"	0.109
"British_Airways"	0.11
"hulu_support"	0.11
"GoDaddyHelp"	0.113
"Postmates_Help"	0.115
"ATVIAssist"	0.119
"CoxHelp"	0.123
"PearsonSupport"	0.125
"UPSHelp"	0.125
"idea_cares"	0.126
"PandoraSupport"	0.129
"AskPlayStation"	0.13
"NeweggService"	0.131
"TfL"	0.131
"AskLyft"	0.132
"JetBlue"	0.133
"XboxSupport"	0.134
"AzureSupport"	0.136
"AskeBay"	0.138
"AsurionCares"	0.14
"Morrisons"	0.141
"sprintcare"	0.141
"CarlsJr"	0.142
"O2"	0.142
"OfficeSupport"	0.142
"asksalesforce"	0.143
"DunkinDonuts"	0.

In [None]:
# Run the job with mrjob's local runner (-r local); no --output-dir is given here.
# NOTE(review): this dataset path lacks the "Unicamp/MDC/BigData/Trabalho2" prefix
# used by the runs above - confirm which Drive location is the intended one.
!python trab2.py -r local "/content/drive/My Drive/twitter_cs/sample_clean_larger.csv.gz" 