<a href="https://colab.research.google.com/github/userddssilva/ESTCMP246-mineracao-de-dados/blob/main/mineracao_de_dados_trabalho_1_extra%C3%A7%C3%A3o_de_aspectos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Implementação de regras para Extração de Aspectos em Sentenças

Universidade do Estado do Amazonas

Escola Superior de Tecnologia 

Disciplina de Mineração de dados


- Dayvson dos Santos Silva - ddss.snf19@uea.edu.br
- Bonifacio Leite De Oliveira Filho - bldof.eng16@uea.edu.br


## Instalandos as *libs* usadas no processo de extração

In [1]:
# installing https://spacy.io/
!pip install --upgrade spacy 
!python -m spacy
!python -m spacy download pt_core_news_lg
!pip install senticnet
!python setup.py install
!python -m spacy download en_core_web_lg

Collecting spacy
[?25l  Downloading https://files.pythonhosted.org/packages/1b/d8/0361bbaf7a1ff56b44dca04dace54c82d63dad7475b7d25ea1baefafafb2/spacy-3.0.6-cp37-cp37m-manylinux2014_x86_64.whl (12.8MB)
[K     |████████████████████████████████| 12.8MB 5.2MB/s 
[?25hCollecting srsly<3.0.0,>=2.4.1
[?25l  Downloading https://files.pythonhosted.org/packages/c3/84/dfdfc9f6f04f6b88207d96d9520b911e5fec0c67ff47a0dea31ab5429a1e/srsly-2.4.1-cp37-cp37m-manylinux2014_x86_64.whl (456kB)
[K     |████████████████████████████████| 460kB 34.2MB/s 
Collecting spacy-legacy<3.1.0,>=3.0.4
  Downloading https://files.pythonhosted.org/packages/8d/67/d4002a18e26bf29b17ab563ddb55232b445ab6a02f97bf17d1345ff34d3f/spacy_legacy-3.0.5-py2.py3-none-any.whl
Collecting thinc<8.1.0,>=8.0.3
[?25l  Downloading https://files.pythonhosted.org/packages/61/87/decceba68a0c6ca356ddcb6aea8b2500e71d9bc187f148aae19b747b7d3c/thinc-8.0.3-cp37-cp37m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB

## Importando as *libs* para a realização do trabalho 

In [2]:
import re
import pathlib
import glob

import pt_core_news_lg
import en_core_web_lg
import spacy

import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET

from spacy import displacy
from senticnet.senticnet import SenticNet
from senticnet.babelsenticnet import BabelSenticNet

# Import PyDrive and associated libraries.
# This only needs to be done once per notebook.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Hide warnings
import warnings
warnings.filterwarnings('ignore')

## Baixando o *dataset* do Google Drive

In [3]:
# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Download a file based on its file ID.
# 
# Link file: https://drive.google.com/file/d/1xZ71zKdaflWO0mXNTzQEhXNdCcOEbZh3/view?usp=sharing
# Id file: 1xZ71zKdaflWO0mXNTzQEhXNdCcOEbZh3

file_id = '1xZ71zKdaflWO0mXNTzQEhXNdCcOEbZh3'
file_name = 'dataset.xml'
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile(filename=file_name)
print('Downloaded file "{}"'.format(file_name))

# Second alternative
# !gdown --id link_id_file

Downloaded file "dataset.xml"


## Carregando o pacote de núcleo do spacy em português

In [4]:
nlp = pt_core_news_lg.load()

## Exemplo de *plot* de dependências

In [None]:
sentence = "A câmera é boa."
doc = nlp(sentence)
displacy.render(doc, style='dep', jupyter=True)

In [None]:
sentence = "o pobre manual"
doc = nlp(sentence)
displacy.render(doc, style='dep', jupyter=True)

## Importando o dataset do XML

Usando a [lib de xml](https://docs.python.org/3/library/xml.etree.elementtree.html) para trabalhar com o *dataset* que por sua vez está em xml.

In [5]:
tree = ET.parse('dataset.xml')
dataset = tree.getroot()

Visualizando as tags que o arquivo contém

In [6]:
dataset.tag

'reviews'

Visualizando os atributos que o arquivo possui

In [None]:
len(dataset[0])

6

In [None]:
len(dataset)

350

In [None]:
print(dataset[0][0].tag)
print(dataset[0][1].tag)
print(dataset[0][2].tag)

sentence
opinion
opinion


## Extraíndo do dataset todas as sentenças não factuais e seus respectivos aspectos

**Passo 1:** Cria-se uma estrutura de dados contento o número da sentença e seus respectos aspectos em formato de lista, caso a sentença seja factual então a lista de aspectos é vazia

In [17]:
# Get all sentences and yours respective aspects
new_dataset = {}
review_id = 0
k = 0
while review_id < len(dataset):
    review_line = 0
    while review_line < len(dataset[review_id]):
        review = dataset[review_id][review_line]
        # Check if review is a sentence
        if review.tag == 'sentence':
            sentence_text = review.text.replace('\"', '')
            new_dataset[k] = (sentence_text, []) # tuple -> (sentence, [aspects...])
            k += 1
        else:
            # get aspect from review
            aspect = re.search(r'\"([\w\s]+)\"', review.text).group(1)
            if aspect is not None:
                new_dataset[k-1][1].append(aspect) # add aspects of preview sentence 
        review_line += 1
    review_id += 1 

**Passo 2:** Remove-se todas as sentenças factuais, ou seja, as que possuem a lista de aspectos vazios

In [37]:
# Number sentences
len(new_dataset)

1049

In [None]:
sentences = []
aspects = set()

for number,(sentence, aspect_list) in new_dataset.items():
    if len(aspect_list) > 0:
        sentences.append(sentence)
        aspects.update(aspect_list)

In [42]:
# Number sentences, before drop factuals sentences
len(sentences)

591

In [None]:
new_dataset = {}
for id_review in range(len(dataset)):
    i = 0
    k = 1
    size_lines = len(dataset[id_review])
    new_dataset[id_review] = {}

    while i < size_lines:
        sentence = dataset[id_review][i]

        if sentence.tag == 'sentence':
            new_dataset[id_review][k] = []
            new_dataset[id_review][k].append(sentence.text.replace('\"', ""))
            k += 1
        else:
            aspect = re.search(r'\"([\w\s]+)\"', sentence.text).group(1)
            new_dataset[id_review][k-1].append(aspect)
        i += 1
    print(id_review, new_dataset[id_review])

0 {1: ['Ambiente agradável, bons pratos, comida saborosa , bom serviço, vale a pena conferir, bom tempero, recomendo', 'Ambiente', 'pratos', 'comida', 'serviço', 'tempero']}
1 {1: ['Lugar muito agradável, comida excelente é importante o bom atendimento dos garçons.', 'comida', 'Lugar', 'atendimento dos garçons'], 2: ['Recomendo.'], 3: ['Pedimos um pirarucu ao molho de tucupi e não arrependemos']}
2 {1: ['O Coqueiro verde serve uma deliciosa carne de sol, também contam com serviço de entrega o que é uma boa pedida para quem não quer sair de casa.. O serviço no restaurante é simples, a comida também, mas muito gostosa..', 'carne de sol', 'comida', 'serviço', 'serviço de entrega']}
3 {1: ['Ambiente confortável, ótimo atendimento com um garçon simpático, ele nos contou a história do empreendimento e sempre bem solícito.', 'Ambiente', 'atendimento', 'garçon', 'garçon'], 2: ['Comida Boa por um bom preço.', 'Comida', 'preço'], 3: ['O restaurante fica dentro do hotel Vila Amazônia']}
4 {1: ['B

## Implementação das regras 

Carregando o core de português

In [None]:
npl = pt_core_news_lg.load()

A primeira regra escolhida, foi a regra 7 **amod(NN,OP)** da tabela 3

In [None]:
def rule_7(doc):
    aspects = []
    for i in doc:
        if i.dep_ == 'amod':
            if i.head.pos_ == 'NOUN':
                aspects.append(str(i.head))
    return aspects


sentence = 'Lugar especialmente agradável, atendimento excepcional e a comida deliciosa.'
doc = nlp(sentence)
rule_7(doc)

['Lugar', 'atendimento', 'comida']

In [None]:
displacy.render(doc, style='dep', jupyter=True)

A próxima regra escolhida foi a 23 da tabela 4

In [None]:
def rule_23(doc):
    aspects = []
    for index in range(len(doc)-1):
        if doc[index].pos_ == 'NOUN' and doc[index+1].pos_ == 'ADJ':
            aspects.append(str(doc[index]))
    return aspects

sentence = 'Comida simples e barata, para quem esta de passagem pelo centro de João Pessoa, eu recomendo.'
doc = nlp(sentence)
rule_23(doc)

['Comida']

In [None]:
displacy.render(doc, style='dep', jupyter=True)

Seguido pela regra 60 tabela 5

In [None]:
def rule_60(doc):
    aspects = []
    for index in range(len(doc)-3):
        if (doc[index].pos_ == 'DET') and (doc[index+1].pos_ == 'NOUN') and (doc[index+2].pos_ != 'NOUN' )and (doc[index+3].pos_ == 'ADJ'):
            aspects.append(str(doc[index+1]))
    return aspects

nlp = en_core_web_lg.load()
# their customer service is very poor.
sentence = 'this camera is closest to perfect'
doc = nlp(sentence)
rule_60(doc)

['camera']

In [None]:
displacy.render(doc, style='dep', jupyter=True)

In [None]:
def extract_sentences_and_aspects():
    aspects = []
    sentences = []
    for row in new_dataset.values():
        for sentence_dict in row.values():
            if len(sentence_dict) > 1:
                sentences.append(sentence_dict[0])
                aspects.append(sorted(sentence_dict[1:]))
            else:
                sentences.append(sentence_dict[0])
                aspects.append([])
    return sentences, aspects

sentences, real_aspects = extract_sentences_and_aspects()

In [None]:
def apply_rule(sentences, rule):
    predicted_aspects = []
    for sentence in sentences:
        predicted_aspects.append(sorted(rule(nlp(sentence))))
    return predicted_aspects

In [None]:
nlp = pt_core_news_lg.load()
rule_7_aspects = apply_rule(sentences, rule_7)
rule_23_aspects = apply_rule(sentences, rule_23)
rule_60_aspects = apply_rule(sentences, rule_60)

In [None]:
def confusion_matrix_generator(predicted_aspects, real_aspects):
    TN=FP=FN=TP=0
    for predicted, real in zip(predicted_aspects, real_aspects):
      if predicted == real:
        TP += len(predicted)
      elif predicted > real:
        FP += len([false_positive for false_positive in predicted if false_positive not in real])
      elif predicted < real:
        FN += len([false_negative for false_negative in real if false_negative not in predicted])
    return [[TN, FP],[FN, TP]]

In [None]:
rule_7_confusion_matrix = confusion_matrix_generator(rule_7_aspects, real_aspects)
rule_23_confusion_matrix = confusion_matrix_generator(rule_23_aspects, real_aspects)
rule_60_confusion_matrix = confusion_matrix_generator(rule_60_aspects, real_aspects)

In [None]:
def precision(confusion_matrix):
    """
    It calculates the precision metric
    :param list confusion_matrix: confusion matrix, needs to be at following pattern
    [[True Negative, False Positive],
    [False Negativem, True positive]]
    """
    return confusion_matrix[1][1] / (confusion_matrix[1][1] + confusion_matrix[0][1])

In [None]:
def recall(confusion_matrix):
    """
    It calculates the recall metric
    :param list confusion_matrix: confusion matrix, needs to be at following pattern
    [[True Negative, False Positive],
     [False Negativem, True positive]]
    """
    return confusion_matrix[1][1] / (confusion_matrix[1][1] + confusion_matrix[1][0])

In [None]:
def f1_score(confusion_matrix):
    """
    It calculates the F1-Score metric
    :param list confusion_matrix: confusion matrix, needs to be at following pattern
    [[True Negative, False Positive],
     [False Negativem, True positive]]
    """
    preci = precision(confusion_matrix)
    rec = recall(confusion_matrix)
    return 2*((preci * rec)/(preci + rec))

# Metrics for Rule 7

In [None]:
print('Precision: ', precision(rule_7_confusion_matrix))
print('Recall: ', recall(rule_7_confusion_matrix))
print('F1-Score: ', f1_score(rule_7_confusion_matrix))

Precision:  0.2970873786407767
Recall:  0.27079646017699116
F1-Score:  0.2833333333333334


# Metrics for Rule 23

In [None]:
print('Precision: ', precision(rule_23_confusion_matrix))
print('Recall: ', recall(rule_23_confusion_matrix))
print('F1-Score: ', f1_score(rule_23_confusion_matrix))

Precision:  0.22813688212927757
Recall:  0.08862629246676514
F1-Score:  0.12765957446808512


# Metrics for Rule 60

In [None]:
print('Precision: ', precision(rule_60_confusion_matrix))
print('Recall: ', recall(rule_60_confusion_matrix))
print('F1-Score: ', f1_score(rule_60_confusion_matrix))

Precision:  0.5081967213114754
Recall:  0.03471444568868981
F1-Score:  0.06498951781970651
