## LOAD PACKAGES

In [None]:
# Environment setup: upgrade spaCy, install SenticNet and download the large
# Portuguese and English spaCy models that the import cell loads.
!pip install --upgrade spacy
# NOTE(review): spaCy CLI invoked without a sub-command — looks like a leftover; confirm it is needed.
!python -m spacy
!python -m spacy download pt_core_news_lg
!pip install senticnet
# NOTE(review): expects a setup.py in the working directory; which package it installs is not visible here.
!python setup.py install
!python -m spacy download en_core_web_lg

Collecting spacy
  Downloading spacy-3.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.4 MB)
[K     |████████████████████████████████| 6.4 MB 4.7 MB/s 
[?25hCollecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4
  Downloading pydantic-1.8.2-cp37-cp37m-manylinux2014_x86_64.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 26.7 MB/s 
[?25hCollecting typer<0.4.0,>=0.3.0
  Downloading typer-0.3.2-py3-none-any.whl (21 kB)
Collecting pathy>=0.3.5
  Downloading pathy-0.6.0-py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 1.2 MB/s 
Collecting srsly<3.0.0,>=2.4.1
  Downloading srsly-2.4.1-cp37-cp37m-manylinux2014_x86_64.whl (456 kB)
[K     |████████████████████████████████| 456 kB 65.0 MB/s 
Collecting spacy-legacy<3.1.0,>=3.0.7
  Downloading spacy_legacy-3.0.8-py2.py3-none-any.whl (14 kB)
Collecting catalogue<2.1.0,>=2.0.4
  Downloading catalogue-2.0.5-py3-none-any.whl (17 kB)
Collecting thinc<8.1.0,>=8.0.8
  Downloading thinc-8.0.8-cp37-cp

## IMPORTING LIBRARIES

In [None]:
import pt_core_news_lg
import en_core_web_lg
import spacy
from spacy import displacy
from senticnet.senticnet import SenticNet
from senticnet.babelsenticnet import BabelSenticNet
import xml.etree.ElementTree as ET
import pandas as pd
import pathlib
import re
import glob
import pandas as pd
import numpy as np

## Example Dependency Parser

In [None]:
# Load the Portuguese spaCy pipeline (also used later to parse every dataset
# sentence) and parse a sample sentence for the visualisation cell.
nlp = pt_core_news_lg.load()
sentence = "O manual escasso."
doc = nlp(sentence)
# Notebook-level scratch list; the rule functions receive their own `aspects` argument.
aspects = []
# Portuguese BabelSenticNet lexicon; read as a global by rule0, rule2_3 and rule4.
bsn = BabelSenticNet('pt') 
# for token in doc:
#     print(token.text, token.tag_, token.head.text, token.dep_, token.pos_)

In [None]:
# Draw the dependency tree of the parsed example sentence inline in the notebook.
displacy.render(doc, style='dep', jupyter=True)

## Auxiliary Functions

References: [3].

* these functions modularize the implementation of common code snippets for some of the following rules
* the OP function was implemented to extract opinion words from a file [3].



In [None]:
def relation_nsubj(doc):
    """Return ``(subject_text, head_text)`` for the first ``nsubj`` token of ``doc``.

    Both elements are the one-space string ``" "`` when no token carries the
    ``nsubj`` dependency label.
    """
    subject = next((tok for tok in doc if tok.dep_ == "nsubj"), None)
    if subject is None:
        return " ", " "
    return subject.text, subject.head.text

def is_auxiliar(doc):
    """Return 1 when at least one token of ``doc`` is tagged ``AUX``, otherwise 0."""
    return 1 if any(tok.pos_ == "AUX" for tok in doc) else 0

def OP(path="palavras.txt"):
    """Load the opinion-word lexicon as a set of stripped lines.

    Args:
        path: text file with one opinion word per line. Defaults to
            ``"palavras.txt"`` in the working directory, as before.

    Returns:
        set[str]: every line of the file with surrounding whitespace removed.

    Raises:
        OSError: if the file cannot be opened.
    """
    # The context manager closes the handle even if reading fails part-way.
    with open(path) as arq:
        return {line.strip() for line in arq}
# Opinion-word lexicon, loaded once when this cell runs; read as a global by several `*_2` rules.
op = OP()

## Rule 0

References: [1].

0, example: O **livro** é *muito estranho*.

> Trigger: when the active token is found to be the syntactic subject of a token. 
Behavior: if an active token h is in a subject noun relationship with a word t then:
1. if t has any adverbial or adjective modifier and the modifier exists in SenticNet, then t is extracted
as an aspect.

In [None]:

def rule0(doc, aspects):
    """Rule 0 of [1]: extract the head ``t`` of the subject when ``t`` has a
    modifier that is known to SenticNet.

    The first ``nsubj``/``nsubj:pass`` token selects ``t`` (its head). For every
    adverbial or adjectival modifier of ``t`` found in the ``bsn`` lexicon, the
    text of ``t`` is appended to ``aspects``.

    Args:
        doc: parsed spaCy document (iterable of tokens).
        aspects: list that receives the extracted aspect strings (mutated).

    Returns:
        The same ``aspects`` list, always (the previous version returned
        ``None`` after a failed lexicon lookup).
    """
    token_t = None
    for tok in doc:
        if tok.dep_ in ("nsubj", "nsubj:pass"):
            token_t = tok.head.text
            break
    if token_t is None:
        return aspects

    # "adjmod" is kept for backward compatibility; the Universal Dependencies
    # label for an adjectival modifier is "amod".
    modifier_deps = ("advmod", "amod", "adjmod")
    for tok in doc:
        if tok.head.text != token_t or tok.dep_ not in modifier_deps:
            continue
        try:
            # The lexicon signals an unknown concept by raising.
            bsn.concept(tok.text)
        except Exception:
            # Modifier not in SenticNet: the rule does not fire for it.
            continue
        aspects.append(token_t)
    return aspects

## Rule 1
References: [1].

1, example: A **bateria** *dura pouco*.

> if the sentence does not have auxiliary verb, i.e., is, was, would, should, could, then:
• if the verb t is modified by an adjective or an adverb or it is in adverbial clause modifier relation
with another token, then both h and t are extracted as aspects. In (1), battery is in a subject
relation with lasts and lasts is modified by the adjective modifier little, hence both the aspects
last and battery are extracted.
(1)
The battery lasts little.

In [None]:
def rule1(doc, aspects):
    """Rule 1 of [1]: in sentences without an auxiliary, extract both the
    subject ``h`` and its head ``t`` for every modifier attached to ``t``.

    A pair ``h, t`` is appended once per token whose head text equals ``t`` and
    whose dependency is ``advmod``, ``adjmod``, ``advcl`` or ``amod``.
    Returns the ``aspects`` list.
    """
    token_h, token_t = relation_nsubj(doc)
    if is_auxiliar(doc):
        return aspects
    modifier_deps = {"advmod", "adjmod", "advcl", "amod"}
    for tok in doc:
        if tok.dep_ in modifier_deps and tok.head.text == token_t:
            aspects.extend((token_h, token_t))
    return aspects

## Rules 2 and 3

References: [1].

2 and 3, examples: Eu gosto das **lentes** da câmera, eu gosto da **beleza** da tela.

> If t has any direct object relation with a token n and the POS of the token is Noun and n is not
in SenticNet, then n is extracted as an aspect. In (2), like is in direct object relation with lens
so the aspect lens is extracted.
(2)
I like the lens of this camera.
• if t has any direct object relation with a token n and the POS of the token n is Noun and n exists
in SenticNet, then the token n extracted as aspect term. In the dependency parse tree of the
sentence, if another token n 1 is connected to n using any dependency relation and the POS of
n 1 is Noun, then n 1 is extracted as an aspect. In (3), like is in direct object relation with beauty
which is connected to screen via a preposition relation. So the aspects screen and beauty are
extracted.
(3)
I like the beauty of the screen.

In [None]:
def rule2_3(doc, aspects):
    """Rules 2 and 3 of [1]: extract the noun direct object ``n`` of ``t``.

    Applies only when the sentence has no auxiliary. For every noun that is a
    direct object (``obj``/``obj:pass``) of the subject's head ``t``:

    * rule 2 - ``n`` is not in SenticNet: ``n`` is extracted;
    * rule 3 - ``n`` is in SenticNet: ``n`` is extracted and so is the first
      noun ``n1`` attached to ``n``.

    The previous version appended ``(aspect, sentiment)`` tuples - including an
    empty ``("", "")`` pair when nothing matched - which polluted the aspect
    list and could never match a gold aspect. Only plain strings are appended
    now.

    Args:
        doc: parsed spaCy document.
        aspects: list that receives the extracted aspect strings (mutated).

    Returns:
        The same ``aspects`` list.
    """
    _, token_t = relation_nsubj(doc)
    if is_auxiliar(doc):
        return aspects

    for tok in doc:
        is_object = tok.dep_ in ("obj", "obj:pass") and tok.head.text == token_t
        if not is_object or tok.pos_ != "NOUN":
            continue
        token_n = tok.text
        # Both rules extract n itself.
        aspects.append(token_n)
        try:
            # The lexicon signals an unknown concept by raising.
            bsn.concept(token_n)
        except Exception:
            continue  # rule 2: n unknown to SenticNet, nothing more to add
        # Rule 3: also extract the first noun connected to n.
        for other in doc:
            if other.head.text == token_n and other.pos_ == "NOUN":
                aspects.append(other.text)
                break
    return aspects

## Rule 4
References: [1].

4, example: Eu gostaria de comentar sobre a **câmera** do celular.

> If t is in open clausal complement relation with a token t1 , then the aspect t-t1 is extracted if t-t1
exists in the opinion lexicon. If t1 is connected with a
token t2 whose POS is Noun, then t2 is
extracted as an aspect. In (4), like and comment is in clausal complement relation and comment
is connected to camera using a preposition relation. Here, the POS of camera is Noun and,
hence, camera is extracted as an aspect.
(4)
I would like to comment on the camera of this phone.


In [None]:
def rule4(doc, aspects):
    """Rule 4 of [1]: follow an open clausal complement (``xcomp``) of ``t``.

    Applies only when the sentence has no auxiliary. For every ``xcomp``
    dependent ``t1`` of the subject's head ``t`` that is known to SenticNet,
    the last token attached to ``t1`` is inspected: if it is a noun its text is
    extracted, otherwise the compound ``"t t1"`` is extracted.

    Fixes over the previous version:
    * ``aspects`` is returned on every path (it was ``None`` when the sentence
      contained an auxiliary);
    * the lexicon lookup now takes effect - a ``finally`` clause used to
      overwrite the flag and the lookup was duplicated;
    * the aspect is appended as text, not as a spaCy ``Token`` object.

    Args:
        doc: parsed spaCy document.
        aspects: list that receives the extracted aspect strings (mutated).

    Returns:
        The same ``aspects`` list.
    """
    _, token_t = relation_nsubj(doc)
    if is_auxiliar(doc):
        return aspects

    for tok in doc:
        if tok.dep_ != "xcomp" or tok.head.text != token_t:
            continue
        token_t1 = tok.text
        try:
            # The lexicon signals an unknown concept by raising.
            bsn.concept(token_t1)
        except Exception:
            continue

        # Keep the original choice: the last token whose head is t1.
        dependent = None
        for other in doc:
            if other.head.text == token_t1:
                dependent = other
        if dependent is None:
            continue

        if dependent.pos_ == "NOUN":
            aspects.append(dependent.text)
        else:
            aspects.append(' '.join([token_t, token_t1]))
    return aspects

## Rule 6

References: [1].

6, example: A **câmera** é *boa*.

> If the token t is in copula relation with a copular verb and the POS of h is Noun, then h is extracted
as an explicit aspect. In (6), camera is extracted as an aspect.
(6) The camera is nice.


In [None]:
def rule6(doc, aspects):
    """Rule 6 of [1]: extract the subject ``h`` of a copular sentence when it is a noun.

    For every ``cop`` token in ``doc`` the subject text is appended once per
    token that has that text and is tagged ``NOUN``. Returns ``aspects``.
    """
    token_h, _ = relation_nsubj(doc)
    # One entry per noun token spelled like the subject.
    noun_hits = [token_h for tok in doc if tok.text == token_h and tok.pos_ == "NOUN"]
    for tok in doc:
        if tok.dep_ == "cop":
            aspects.extend(noun_hits)
    return aspects

## Rule 9
References: [1].

9, example: Amo a **suavidade do jogador**.


> if a token h is connected to a noun t using a prepositional relation, then both h and t are extracted as
aspects. In (9) sleekness is extracted as an aspect.
(9) Love the sleekness of the player.
>
> Additional Rules
>
> A noun compound modifier of an NP is any noun that serves to modify the head noun. If t is
extracted as an aspect and t has noun compound modifier h, then the aspect h-t is extracted and t
is removed from the aspect list. In (12), as chicken and casserole are in noun compound modifier
relation, only chicken casserole is extracted as an aspect.
(12)
We ordered the chicken casserole, but what we got were a few small pieces of chicken, all
dark meat and on the bone.


In [None]:
def rule9(doc, aspects):
    """Rule 9 of [1]: extract a noun together with its prepositional modifier.

    Only the first token that modifies a noun through ``nmod``, ``amod``,
    ``acl`` or ``acl:relcl`` is considered:

    * ``nmod`` - appends ``"<noun> <preposition> <modifier>"`` when a ``case``
      token is attached to the modifier, else ``"<noun> <modifier>"``;
    * any other listed relation - appends the noun alone.

    Returns the ``aspects`` list.
    """
    noun_modifier_deps = ("nmod", "amod", "acl", "acl:relcl")
    for dependent in doc:
        if dependent.head.pos_ != "NOUN":
            continue
        if dependent.dep_ not in noun_modifier_deps:
            continue

        modifier, noun = dependent.text, dependent.head.text
        if dependent.dep_ != "nmod":
            aspects.append(noun)
            return aspects

        # First preposition (case marker) attached to the modifier, if any.
        preposition = next(
            (tok.text for tok in doc if tok.dep_ == "case" and tok.head.text == modifier),
            None,
        )
        parts = [noun, modifier] if preposition is None else [noun, preposition, modifier]
        aspects.append(' '.join(parts))
        return aspects
    return aspects

## Rule 10
References: [1].

10, example: Ana achou o **livro** *maravilhoso*.


> if a token h is in a direct object relation with a token t, t is extracted as aspect. In (10), mention is in
a direct object relation with price, hence price is extracted as an aspect.
(10)
Not to mention the price of the phone.


In [None]:
def rule10(doc, aspects):
    """Rule 10 of [1]: extract the first direct object (or passive subject).

    Appends the text of the first token labelled ``obj`` or ``nsubj:pass`` to
    ``aspects`` and returns the list; nothing is appended when there is none.
    """
    target = next((tok for tok in doc if tok.dep_ in ("obj", "nsubj:pass")), None)
    if target is not None:
        aspects.append(target.text)
    return aspects

## Rule 11

References: [1].

11, example: A **câmera** é *incrível* e *fácil de usar*.


> For each aspect term extracted above, if an aspect term h is in co-ordination or conjunct relation
with another token t, then t is also extracted as an aspect. In (11), amazing is firstly extracted as an
aspect term. As amazing is in conjunct relation with easy, then use is also extracted as an aspect.
(11)
The camera is amazing and easy to use.

In [None]:
def rule11(doc, aspects):
    """Rule 11 of [1]: follow a conjunction from the predicate adjective.

    Steps (all matching is done on token text):
    1. take the head ``t`` of the subject; if a token spelled like ``t`` is an
       adjective, remember the text of that token's head;
    2. find a ``conj`` dependent of that word;
    3. append every ``xcomp`` dependent of the conjunct to ``aspects``.

    Returns the ``aspects`` list.
    """
    _, token_t = relation_nsubj(doc)

    sentiment1 = ""
    for tok in doc:
        if tok.text == token_t and tok.pos_ == "ADJ":
            sentiment1 = tok.head.text

    sentiment2 = ""
    for tok in doc:
        if tok.dep_ == "conj" and tok.head.text == sentiment1:
            sentiment2 = tok.text

    aspects.extend(
        tok.text for tok in doc
        if tok.head.text == sentiment2 and tok.dep_ == "xcomp"
    )
    return aspects

# Rule 1
References: [2].

1, example: **vídeo** era *ruim*

> nsubj(JJ/OP,NN) 

In [None]:
def rule1_2(doc, aspects):
    """Rule 1 of [2] - ``nsubj(JJ, NN)``: extract the first subject whose head is an adjective.

    Appends the subject text to ``aspects`` (at most one per call) and returns the list.
    """
    for candidate in doc:
        if candidate.dep_ != 'nsubj':
            continue
        if candidate.head.pos_ != 'ADJ':
            continue
        aspects.append(candidate.text)
        break
    return aspects

# Rule 2
References: [2].

2, example: a **qualidade** e a **lente** da câmera estão *aprovadas*.

> adoro a qualidade e a lente
ReL1(H1,NN1) and ReL2(H1,NN2)
such that ReL1 and ReL2 any dependency relation from
[‘nsubj’, ‘amod’, ‘prep’, ‘csubj’, ‘xsubj’, ‘dobj’, ‘iobj’]


In [None]:
def rule2_2(doc, aspects):
    """Rule 2 of [2]: extract nouns that share a head through the listed relations.

    First pass: the first noun that is a ``conj`` of another noun whose own
    relation is in the list yields both nouns (conjunct first) and ends the rule.
    Fallback: over all children of all tokens, keep the last noun seen for each
    listed relation and append those nouns in order of first appearance of the
    relation.

    Returns the ``aspects`` list.
    """
    rel = {"nsubj", "amod", "prep", "csubj", "xsubj", "dobj", "iobj", "obj"}

    for tok in doc:
        if tok.dep_ != "conj" or tok.head.dep_ not in rel:
            continue
        if tok.pos_ == "NOUN" and tok.head.pos_ == "NOUN":
            aspects.extend([tok.text, tok.head.text])
            return aspects

    # dep label -> text of the last noun child carrying that label
    last_noun_by_dep = {}
    for tok in doc:
        for child in tok.children:
            if child.dep_ in rel and child.pos_ == "NOUN":
                last_noun_by_dep[child.dep_] = child.text
    aspects.extend(last_noun_by_dep.values())
    return aspects

# Rule 3

References: [2].

3, example: Honestamente, eu amo esse **jogador**.

> nsubj(VB1,H1) and dobj(VB1,NN)

In [None]:
def rule3_2(doc, aspects):
    """Rule 3 of [2] - ``nsubj(VB1, H1) and dobj(VB1, NN)``.

    When the head of the subject is a verb, the first noun labelled ``obj`` or
    ``dobj`` in the sentence is appended to ``aspects``.

    Fixes over the previous version: the second label test read ``j.dep`` (the
    integer id) instead of ``j.dep_`` and so could never match, and the function
    returned ``None`` when the rule did not fire.

    NOTE(review): the object is not required to depend on VB1 itself; this
    mirrors the original behaviour - confirm whether that is intended.

    Args:
        doc: parsed spaCy document.
        aspects: list that receives the extracted aspect strings (mutated).

    Returns:
        The same ``aspects`` list.
    """
    # Head of the subject, expected to be the main verb.
    VB1 = relation_nsubj(doc)[1]
    if not any(tok.text == VB1 and tok.pos_ == "VERB" for tok in doc):
        return aspects
    for tok in doc:
        if tok.dep_ in ("obj", "dobj") and tok.pos_ == "NOUN":
            aspects.append(tok.text)
            break
    return aspects

# Rule 4

References: [2].

4, example: seu **tamanho** também faz dele *ideal* para viajar

> in english: 
>
> nsubj(H1,NN) and xcomp(H1,JJ/OP) 
>
> in portuguese:
>
> nsubj(H1, NN) and obj(H1, JJ/OP)
> nsubj(H1, NN) and ccomp(H1, JJ/OP)

In [None]:
def rule4_2(doc, aspects):
    """Rule 4 of [2] (Portuguese variant) - ``nsubj(H1, NN) and obj(H1, JJ/OP)``.

    For the first token that has both an ``nsubj`` child that is a noun and an
    ``obj`` child that is an adjective or an opinion word (``op``), the subject
    text is appended to ``aspects``.

    Args:
        doc: parsed spaCy document.
        aspects: list that receives the extracted aspect strings (mutated).

    Returns:
        The same ``aspects`` list, on every path (it used to be ``None`` when
        the rule did not fire).
    """
    for tok in doc:
        # Children of tok, indexed by dependency label: {"obj": ("ideal", "ADJ"), ...}
        children = {child.dep_: (child.text, child.pos_) for child in tok.children}
        if "nsubj" not in children or "obj" not in children:
            continue
        subj_text, subj_pos = children["nsubj"]
        obj_text, obj_pos = children["obj"]
        if subj_pos != "NOUN":
            continue
        if obj_pos == "ADJ" or obj_text in op:
            aspects.append(subj_text)
            return aspects
    return aspects

# Rule 5

References: [2].

5, example: toca **dvds** e **cds** *originais*.

> amod(NN1,OP/JJ) and conj(NN1,NN2)


In [None]:
def rule5_2(doc, aspects):
    """Rule 5 of [2] - ``amod(NN1, OP/JJ) and conj(NN1, NN2)``.

    For the first adjectival modifier (adjective or opinion word from ``op``)
    whose head is a noun that is itself a ``conj`` of another noun, both nouns
    are appended to ``aspects`` (modified noun first).

    Args:
        doc: parsed spaCy document.
        aspects: list that receives the extracted aspect strings (mutated).

    Returns:
        The same ``aspects`` list, on every path (it used to be ``None`` when
        the rule did not fire).
    """
    for tok in doc:
        if tok.dep_ != "amod" or not (tok.pos_ == "ADJ" or tok.text in op):
            continue
        noun = tok.head
        if noun.pos_ == "NOUN" and noun.dep_ == "conj" and noun.head.pos_ == "NOUN":
            aspects.append(noun.text)
            aspects.append(noun.head.text)
            return aspects
    return aspects

# Rule 6

References: [2].

6, example: acho a falta de **jogos** de entretenimento nesse celular um pouco *incômoda*

> nmod(OP/JJ,NNS)

In [None]:
def rule6_2(doc, aspects):
    """Rule 6 of [2] - ``nmod(OP/JJ, NNS)``: plural noun modifying an opinion word.

    A token qualifies when it is an ``nmod`` noun, its lemma differs from its
    surface form, it ends in "s" (plural heuristic) and its head is an adjective
    or an opinion word from ``op``. The first such token is extracted.

    Fixes over the previous version: the aspect is appended as text (a spaCy
    ``Token`` object used to be appended) and the list is returned on every
    path (it used to be ``None`` when the rule did not fire).

    Args:
        doc: parsed spaCy document.
        aspects: list that receives the extracted aspect strings (mutated).

    Returns:
        The same ``aspects`` list.
    """
    for tok in doc:
        text = tok.text
        looks_plural = tok.lemma_ != text and text[-1:].lower() == "s"
        if tok.dep_ != "nmod" or tok.pos_ != "NOUN" or not looks_plural:
            continue
        if tok.head.pos_ == "ADJ" or tok.head.text in op:
            aspects.append(text)
            return aspects
    return aspects

# Rule 7

References: [2].

7, example: o **manual** *escasso*

> amod(**NN**,OP)

In [None]:
# Parse a short example for the render call at the end of this cell.
# NOTE: this rebinds the notebook-level `sentence` and `doc` variables.
sentence = "o manual facil"
doc = nlp(sentence)
def rule7_2(doc, aspects):
    """Rule 7 of [2] - ``amod(NN, OP)``: noun modified by an opinion word.

    The head noun of the first ``amod`` token found in the opinion lexicon
    ``op`` is appended to ``aspects``.

    Args:
        doc: parsed spaCy document.
        aspects: list that receives the extracted aspect strings (mutated).

    Returns:
        The same ``aspects`` list, on every path (it used to be ``None`` when
        the rule did not fire).
    """
    for tok in doc:
        if tok.dep_ == "amod" and tok.head.pos_ == "NOUN" and tok.text in op:
            aspects.append(tok.head.text)
            return aspects
    return aspects
# Visualise the parse of the example sentence defined at the top of this cell.
displacy.render(doc, style='dep', jupyter=True)

# Rule 8

References: [2].

8, example: esta **câmera** tem uma grande falha de design


> Rel1(H1,NN) and Rel2(H1, OP/JJ)
such that ReL1 and ReL2 any dependency relation from
[‘nsubj’, ‘csubj’, ‘xsubj’, ‘dobj’, ‘iobj’]

In [None]:
def rule8_2(doc, aspects):
    """Rule 8 of [2] - ``Rel1(H1, NN) and Rel2(H1, OP/JJ)``.

    For each token, its children carrying a subject/object relation are
    collected (last child wins per relation). When at least two relations are
    present and one of the children is an adjective or an opinion word from
    ``op``, the first noun among those children is appended to ``aspects``.

    Args:
        doc: parsed spaCy document.
        aspects: list that receives the extracted aspect strings (mutated).

    Returns:
        The same ``aspects`` list, on every path (it used to be ``None`` when
        the rule did not fire).
    """
    dep_set = {"nsubj", "csubj", "xsubj", "dobj", "iobj", "obj"}
    for tok in doc:
        # relation label -> (text, POS) of the child holding it
        arguments = {}
        for child in tok.children:
            if child.dep_ in dep_set:
                arguments[child.dep_] = (child.text, child.pos_)
        if len(arguments) < 2:
            continue
        has_opinion = any(pos == "ADJ" or text in op for text, pos in arguments.values())
        if not has_opinion:
            continue
        for text, pos in arguments.values():
            if pos == "NOUN":
                aspects.append(text)
                return aspects
    return aspects

# Rule 9
References: [2].

9, example: minha única reclamação sobre o hardware são os **botões**


> nsubj(NN,OP/JJ)

In [None]:
def rule9_2(doc, aspects):
    """Rule 9 of [2] - ``nsubj(NN, OP/JJ)``: noun whose subject is an opinion word.

    For the first ``nsubj`` token that is an adjective or an opinion word from
    ``op`` and whose head is a noun, the head noun is appended to ``aspects``.

    Args:
        doc: parsed spaCy document.
        aspects: list that receives the extracted aspect strings (mutated).

    Returns:
        The same ``aspects`` list, on every path (it used to be ``None`` when
        the rule did not fire).
    """
    for tok in doc:
        if tok.dep_ != "nsubj" or tok.head.pos_ != "NOUN":
            continue
        if tok.pos_ == "ADJ" or tok.text in op:
            aspects.append(tok.head.text)
            return aspects
    return aspects

# Rule 10

References: [2].

10, example: Eu gostei especialmente dos **botões** mais comumente usados

> dobj(OP/JJ,NN) 
> 
> the POS of "gosto de " isn't being tagged well

In [None]:
def rule10_2(doc, aspects):
    """Rule 10 of [2] - ``dobj(OP/JJ, NN)``: noun object of an opinion word.

    The first noun that is either
    * an ``obj``/``dobj``/``iobj`` of an opinion word (``op``) or adjective, or
    * an ``nmod`` of an opinion word
    is appended to ``aspects``.

    Args:
        doc: parsed spaCy document.
        aspects: list that receives the extracted aspect strings (mutated).

    Returns:
        The same ``aspects`` list, on every path (it used to be ``None`` when
        the rule did not fire).
    """
    for tok in doc:
        if tok.pos_ != "NOUN":
            continue
        if tok.dep_ in ("obj", "dobj", "iobj"):
            if tok.head.text in op or tok.head.pos_ == "ADJ":
                aspects.append(tok.text)
                return aspects
        elif tok.dep_ == "nmod":
            if tok.head.text in op:
                aspects.append(tok.text)
                return aspects
    return aspects

# Rule 11

References: [2].

11, example: eu acho **exibições na tela** *irritantes*
> obj(OP/JJ,NN1) and nmod(NN1,NN2)


In [None]:
def rule11_2(doc, aspects):
    """Rule 11 of [2] - ``obj(OP/JJ, NN1) and nmod(NN1, NN2)``.

    For the first noun ``NN2`` that is an ``nmod`` of an object noun ``NN1``
    immediately followed by an adposition, the phrase
    ``"NN1 <adposition> NN2"`` is appended to ``aspects``.

    Fixes over the previous version: ``Token.nbor()`` raises ``IndexError`` when
    the head is the last token of the document - such candidates are now
    skipped - and the list is returned on every path (it used to be ``None``
    when the rule did not fire).

    Args:
        doc: parsed spaCy document.
        aspects: list that receives the extracted aspect strings (mutated).

    Returns:
        The same ``aspects`` list.
    """
    for tok in doc:
        if tok.dep_ != "nmod" or tok.pos_ != "NOUN":
            continue
        head = tok.head
        if head.dep_ != "obj" or head.pos_ != "NOUN":
            continue
        try:
            following = head.nbor()
        except IndexError:
            # The head noun closes the sentence: there is no following token.
            continue
        if following.pos_ == "ADP":
            aspects.append(head.text + ' ' + following.text + ' ' + tok.text)
            return aspects
    return aspects

# Rule 12

References: [2].

12, example: **qualidade** e **lentes** da câmera *comprovadas*

> conj(NN1,NN2) 


In [None]:
def rule12_2(doc, aspects):
    """Rule 12 of [2] - ``conj(NN1, NN2)``: two coordinated nouns.

    For the first noun that is a ``conj`` of another noun, both are appended to
    ``aspects`` (head noun first, then the conjunct).

    Args:
        doc: parsed spaCy document.
        aspects: list that receives the extracted aspect strings (mutated).

    Returns:
        The same ``aspects`` list, on every path (it used to be ``None`` when
        the rule did not fire).
    """
    for tok in doc:
        if tok.dep_ == "conj" and tok.pos_ == "NOUN" and tok.head.pos_ == "NOUN":
            aspects.append(tok.head.text)
            aspects.append(tok.text)
            return aspects
    return aspects

# Rule 13

References: [2].

13, example: No geral, o g3 entrega o que deve ser considerado a *melhor* **qualidade de imagem**.

> amod(NN1,OP/JJ) and nmod(NN1,NN2)


In [None]:
def rule13_2(doc, aspects):
    """Rule 13 of [2] - ``amod(NN1, OP/JJ) and nmod(NN1, NN2)``.

    Starting from a ``case`` token (preposition) attached to a noun ``NN2``
    that is an ``nmod`` of ``NN1``: when ``NN1`` has any ``amod`` child, the
    phrase ``"NN1 <preposition> NN2"`` is appended to ``aspects``. Only the
    first match is extracted.

    Args:
        doc: parsed spaCy document.
        aspects: list that receives the extracted aspect strings (mutated).

    Returns:
        The same ``aspects`` list, on every path (it used to be ``None`` when
        the rule did not fire).
    """
    for tok in doc:
        if tok.dep_ != "case":
            continue
        modifier = tok.head
        if modifier.dep_ != "nmod" or modifier.pos_ != "NOUN":
            continue
        noun = modifier.head
        if any(child.dep_ == "amod" for child in noun.children):
            aspects.append(noun.text + " " + tok.text + " " + modifier.text)
            return aspects
    return aspects

# Rule 14

References: [2].

14, example: As **cores** na tela não são tão *nítidas* quanto eu gostaria que fossem.

> neg(OP/JJ, H1) and nsubj(OP/JJ,NN)


In [None]:
def rule14_2(doc, aspects):
    """Rule 14 of [2] - ``neg(OP/JJ, H1) and nsubj(OP/JJ, NN)``.

    For the first adjective or opinion word (``op``) that has both an
    ``advmod``/``neg`` child and an ``nsubj``/``nsubj:pass`` child, its first
    noun child labelled ``nsubj`` is appended to ``aspects``.

    Args:
        doc: parsed spaCy document.
        aspects: list that receives the extracted aspect strings (mutated).

    Returns:
        The same ``aspects`` list, on every path (it used to be ``None`` when
        the rule did not fire).
    """
    for tok in doc:
        children = list(tok.children)
        child_deps = {child.dep_ for child in children}
        has_modifier = "advmod" in child_deps or "neg" in child_deps
        has_subject = "nsubj" in child_deps or "nsubj:pass" in child_deps
        if not (has_modifier and has_subject and (tok.pos_ == "ADJ" or tok.text in op)):
            continue
        for child in children:
            if child.pos_ == "NOUN" and child.dep_ == "nsubj":
                aspects.append(child.text)
                return aspects
    return aspects

# Rule 15

References: [2].

15, example: definitivamente, uma *boa* **câmera**

> ReL(NN, OP/JJ)
such that ReL any dependency relation from
[‘nsubj’, ‘amod’, ‘prep’, ‘csubj’, ‘xsubj’, ‘dobj’, ‘iobj’]


In [None]:
def rule15_2(doc, aspects):
    """Rule 15 of [2] - ``Rel(NN, OP/JJ)`` for subject/object/modifier relations.

    For the first token whose relation is in the list:
    * if it is a noun whose head is an opinion word (``op``) or adjective, the
      token is extracted;
    * otherwise, if it is an adjective or opinion word whose head is a noun,
      the head is extracted.

    Args:
        doc: parsed spaCy document.
        aspects: list that receives the extracted aspect strings (mutated).

    Returns:
        The same ``aspects`` list, on every path (it used to be ``None`` when
        the rule did not fire).
    """
    rel = {"nsubj", "amod", "prep", "csubj", "xsubj", "dobj", "iobj", "obj"}
    for tok in doc:
        if tok.dep_ not in rel:
            continue
        head = tok.head
        if (head.text in op or head.pos_ == "ADJ") and tok.pos_ == "NOUN":
            aspects.append(tok.text)
            return aspects
        if (tok.pos_ == "ADJ" or tok.text in op) and head.pos_ == "NOUN":
            aspects.append(head.text)
            return aspects
    return aspects

# Rule 17

References: [2].

17, example: Os **cardápios** são *fáceis* de navegar.


> nsubj(OP1/JJ1,NN) and cop (OP1/JJ1,H1) 


In [None]:
def rule17_2(doc, aspects):
    """Rule 17 of [2] - ``nsubj(OP1/JJ1, NN) and cop(OP1/JJ1, H1)``.

    For the first token that has both an ``nsubj`` and a ``cop`` child and at
    least one noun child, the first noun child is appended to ``aspects``.

    NOTE(review): the noun child is not required to be the ``nsubj`` itself;
    this mirrors the original behaviour - confirm whether that is intended.

    Args:
        doc: parsed spaCy document.
        aspects: list that receives the extracted aspect strings (mutated).

    Returns:
        The same ``aspects`` list, on every path (it used to be ``None`` when
        the rule did not fire).
    """
    for tok in doc:
        children = list(tok.children)
        child_deps = {child.dep_ for child in children}
        if "nsubj" not in child_deps or "cop" not in child_deps:
            continue
        for child in children:
            if child.pos_ == "NOUN":
                aspects.append(child.text)
                return aspects
    return aspects

# Golden Standard
* generates the set of (sentence, aspect) pairs from the dataset

In [None]:
def get_golden_standard(file, set_of_rules=None):
    """Read the annotated dataset and return its ``(sentence, aspect)`` pairs.

    Two formats are supported, chosen by the file extension:

    * ``.xml`` - ``<review>`` elements containing ``<sentence>`` elements (the
      sentence is the element text) followed by ``<opinion aspect="...">``
      elements;
    * anything else - a JSON file readable by ``pandas.read_json`` with a
      ``"reviews"`` column of records holding ``"review"`` and
      ``"explicit aspects"``.

    Sentences without any annotated aspect are returned as ``(sentence, "")``
    so that they are still parsed by the extraction step; ``get_metrics``
    ignores pairs whose aspect is empty. The previous version computed that
    pair but never stored it, and picked the format by comparing ``file``
    with the XML files of the working directory.

    Args:
        file: list whose first element is the dataset path (as produced by
            ``glob.glob``); a single path string is accepted as well.
        set_of_rules: unused, kept for backward compatibility.

    Returns:
        set[tuple[str, str]]: the ``(sentence, aspect)`` pairs.

    Raises:
        FileNotFoundError: if ``file`` is empty.
    """
    if not file:
        raise FileNotFoundError("no dataset file given (empty file list)")
    path = file if isinstance(file, str) else file[0]

    pair_sentence_aspect = set()
    if str(path).lower().endswith('.xml'):
        root = ET.parse(path).getroot()
        for review in root.iter('review'):
            phrase = None          # text of the sentence currently being read
            has_opinion = False    # whether that sentence already has an aspect
            for x in review.iter():
                if x.tag == 'sentence':
                    if phrase is not None and not has_opinion:
                        pair_sentence_aspect.add((phrase, ""))
                    phrase = x.text
                    has_opinion = False
                elif x.tag == 'opinion' and phrase is not None:
                    has_opinion = True
                    pair_sentence_aspect.add((phrase, x.attrib['aspect']))
            # Flush the last sentence of the review.
            if phrase is not None and not has_opinion:
                pair_sentence_aspect.add((phrase, ""))
    else:
        data = pd.read_json(path)
        for tag in data["reviews"]:
            explicit_aspects = tag["explicit aspects"]
            if not explicit_aspects:
                pair_sentence_aspect.add((tag["review"], ""))
            for aspect in explicit_aspects:
                pair_sentence_aspect.add((tag["review"], aspect))
    return pair_sentence_aspect






# Method Of Extracting Aspects
* extracting aspects with implemented rules


In [None]:
def get_aspects_from_rules(file, set_of_rules):
    """Run every rule over every dataset sentence and collect the aspects.

    The sentences come from ``get_golden_standard``; each distinct sentence is
    parsed once with the global ``nlp`` pipeline and passed to every rule.

    Fixes over the previous version: only XML input was handled (any other
    file produced an empty list, which made ``get_metrics`` divide by zero),
    and a sentence with several gold aspects was re-parsed once per aspect.

    Args:
        file: dataset file list, as accepted by ``get_golden_standard``.
        set_of_rules: iterable of rule functions ``rule(doc, aspects)``.

    Returns:
        set: the distinct aspects appended by the rules.
    """
    aspects = []
    # A sentence appears once per gold aspect in the pairs: deduplicate first.
    sentences = {pair[0] for pair in get_golden_standard(file, set_of_rules)}
    for sentence in sentences:
        if sentence is None:
            continue
        doc = nlp(sentence)
        for rule in set_of_rules:
            rule(doc, aspects)
    return set(aspects)

# def get_aspects_from_rules(file, set_of_rules):
#     aspects = []
#     sentences = []
#     set_aspects = []
#     if file == glob.glob('*.xml'):
#         tree = ET.parse(file[0])
#         root = tree.getroot()
#         for phrase in root.iter('sentence'):
#             dic = (phrase.text)
#             sentences.append(dic)
#         sentences_spacy = nlp.pipe(sentences, n_process=10)
#         for doc in sentences_spacy:
#             for rule in set_of_rules:
#                 rule(doc,aspects)
#         set_aspects = set(aspects)
#     else:
#         data = pd.read_json(file[0])
#         tags = data["reviews"]
#         for tag in tags:
#             sentences.append(tag["review"])
#         for sentence in sentences:
#             doc = nlp(sentence)
#             for rule in set_of_rules:
#                 rule(doc, aspects)
#         set_aspects = set(aspects)

#     return (set_aspects)

# Generating Metrics
* comparing training set and extracted aspects from rules
* applying the evaluation metrics 

In [None]:
def get_metrics(file, set_of_rules, verbose=False):
    """Evaluate a set of rules against the annotated dataset.

    * precision = extracted aspects that occur in the gold standard /
      number of extracted aspects;
    * recall = gold aspect occurrences that were extracted /
      number of gold aspect occurrences;
    * F1 = harmonic mean of both.

    Comparison is done on lower-cased strings. A metric whose denominator is
    zero is reported as ``0.0`` instead of raising ``ZeroDivisionError``.

    Args:
        file: dataset file list, as accepted by ``get_golden_standard``.
        set_of_rules: iterable of rule functions ``rule(doc, aspects)``.
        verbose: when true, print every extracted/gold comparison (the debug
            trace that used to be printed unconditionally).

    Returns:
        list[float]: ``[precision, recall, f1]``.
    """
    extracted = [str(a).lower() for a in get_aspects_from_rules(file, set_of_rules)]
    dataset = [
        pair[1].lower()
        for pair in get_golden_standard(file, set_of_rules)
        if pair[1] != ''
    ]
    gold_set = set(dataset)
    extracted_set = set(extracted)

    if verbose:
        for r in extracted:
            for g in gold_set:
                print(f'r={r}')
                print(f'g={g}')
                if r == g:
                    print('+')
                print(" ")

    # Precision: share of extracted aspects found in the gold standard.
    relevant_extracted = sum(1 for r in extracted if r in gold_set)
    precision = relevant_extracted / len(extracted) if extracted else 0.0

    # Recall: share of gold aspect occurrences that were extracted.
    relevant_gold = sum(1 for g in dataset if g in extracted_set)
    recall = relevant_gold / len(dataset) if dataset else 0.0

    # F1-score
    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0

    return [precision, recall, f1]


# Set of Rules

In [None]:
# Hard-coded per-rule scores. Each row is [rule function, precision, recall, F1];
# the selection code below reads index 1 as precision and index 3 as F1.
# NOTE(review): the values match the `data_xml` table further down, so these are
# presumably the scores on the XML dataset — confirm. rule4 is not listed here.
metrics = [
            [rule0, 0.05, 0.048, 0.047],
            [rule1, 0.18, 0.26, 0.21],
            [rule2_3, 0.12,0.10,0.11],
            [rule6, 0.6, 0.61, 0.61],
            [rule9, 0.28, 0.7, 0.4],
            [rule10, 0.25, 0.58, 0.35],
            [rule1_2, 0.65, 0.56, 0.6],
            [rule2_2, 0.27, 0.72, 0.39],
            [rule3_2, 0.31, 0.43, 0.36],
            [rule4_2, 0.62, 0.28, 0.39],
            [rule5_2, 0.63, 0.53, 0.58],
            [rule6_2, 0.4, 0.014, 0.02],
            [rule7_2, 0.61, 0.56, 0.59],
            [rule8_2, 0.5, 0.42, 0.46],
            [rule9_2, 0.5, 0.05,0.1],
            [rule10_2, 0.26, 0.27, 0.26],
            [rule11_2, 0.06,0.01, 0.019],
            [rule12_2, 0.4, 0.579, 0.47],
            [rule13_2, 0.09,0.01, 0.03],
            [rule14_2, 0.91, 0.46, 0.61],
            [rule15_2, 0.42, 0.68, 0.52],
            [rule17_2, 0.52, 0.60, 0.56],
           ]
# Same layout; the values match the `data_json` table further down.
# Not referenced by the selection code in this cell.
metrics_json = [
             [rule0, 0.07, 0.3, 0.11],
             [rule1, 0.27, 0.64, 0.38],
             [rule2_3, 0.21, 0.16, 0.18],
             [rule4, 0.1, 0.11, 0.11],
             [rule6, 0.58, 0.70, 0.64],
             [rule9, 0.3, 0.81, 0.43],
             [rule10, 0.28, 0.76, 0.41],
             [rule1_2, 0.55, 0.73, 0.63],
             [rule2_2, 0.31, 0.79, 0.44],
             [rule3_2, 0.35, 0.70, 0.46],
             [rule4_2, 0.64, 0.46, 0.53],
             [rule5_2, 0.68, 0.73, 0.70],
             [rule6_2, 0.25, 0.005, 0.01],
             [rule7_2, 0.62, 0.73, 0.67],
             [rule8_2, 0.38, 0.479, 0.42],
             [rule9_2, 0.61, 0.19, 0.29],
             [rule10_2, 0.391, 0.65, 0.48],
             [rule11_2, 0.11, 0.018, 0.03],
             [rule12_2, 0.42, 0.77, 0.54],
             [rule13_2, 0.06, 0.01, 0.01],
             [rule14_2,0.65, 0.68, 0.66],
             [rule15_2, 0.479, 0.76, 0.588],
             [rule17_2, 0.539, 0.72, 0.618],
]

# Greedy forward selection of rules over the `metrics` table.
# Step 1: seed the set with the rule that has the highest stored precision;
# its stored F1 becomes the score to beat.
best = []
set_of_rules = []
best_precision = -1
best_f1 = -1
for i in metrics:
    if i[1] > best_precision:
        best_precision = i[1];
        best = i
        best_f1 = best[3]
set_of_rules.append(best[0])
print("melhor regra para a1: ", best[0], best[1], best[2], best[3], "\n")
# The seed rule is no longer a candidate.
metrics.remove(best)
results = []

# Step 2: in each round, try every remaining rule together with the current
# set, re-evaluate on the XML dataset and keep the one giving the highest F1
# above the current best. Stop when no candidate improves F1.
while (True):
    optimal_rule = []
    for i in metrics:
        # Temporarily add the candidate, score the combination, then remove it.
        set_of_rules.append(i[0])
        results = get_metrics(glob.glob('*.xml'), set_of_rules)
        print("combinação de", len(set_of_rules)-1, "regras com", i[0])
        print("precisão, revocação e f1:",results[0], results[1], results[2],"\n")
        if results[2] > best_f1:
            best_f1 = results[2]
            optimal_rule = i
        set_of_rules.remove(i[0])
    if len(optimal_rule) != 0:
        print("\n\nregra adicionada: ", optimal_rule[0], "\n\n")
        set_of_rules.append(optimal_rule[0])
        metrics.remove(optimal_rule)
    else: break
# Report the selected rules and the metrics of the final combination.
print("\n\nregras que pertencem ao conjunto:")
for i in set_of_rules:
    print(i)
print("\n\n")

print(get_metrics(glob.glob('*.xml'), set_of_rules))



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
r=coqueiro
g=cardapio regional
 
r=coqueiro
g=buffet
 
r=coqueiro
g=self service
 
r=coqueiro
g=sabores regionais
 
r=coqueiro
g=sorvete de creme de cupuaçu
 
r=coqueiro
g=ambiente
 
r=coqueiro
g=vista
 
r=coqueiro
g=entrada
 
r=coqueiro
g=peixaria
 
r=coqueiro
g=coco bambu
 
r=coqueiro
g=pixzzas
 
r=coqueiro
g=comidas tipicas
 
r=coqueiro
g=acompanhamentos
 
r=coqueiro
g=casa
 
r=coqueiro
g=tambaqui
 
r=coqueiro
g=carne sol
 
r=coqueiro
g=atendimento
 
r=coqueiro
g=cafezinho
 
r=coqueiro
g=vinho
 
r=coqueiro
g=garçom
 
r=coqueiro
g=carnes
 
r=coqueiro
g=serviço
 
r=coqueiro
g=som ao vivo
 
r=coqueiro
g=chopp
 
r=coqueiro
g=serviço de entrega
 
r=coqueiro
g=preços
 
r=coqueiro
g=peixe
 
r=coqueiro
g=variedade de sucos
 
r=coqueiro
g=estacionamento
 
r=coqueiro
g=situado
 
r=coqueiro
g=pratos
 
r=coqueiro
g=cachaçaria do dedé
 
r=coqueiro
g=cervejas
 
r=coqueiro
g=preço dos pratos
 
r=coqueiro
g=decoração
 
r=coqueiro
g=te

# Running 

In [None]:
# %%time
# def main():
#     json_file = glob.glob('*.json')
#     xml_file = glob.glob('*.xml')
#     file = xml_file
#     if xml_file:
#         set_of_rules = [rule5_2, rule14_2]
#         results = get_metrics(xml_file, set_of_rules)
#         print(results[0], results[1], results[2])
        
#     if json_file:
#         get_metrics(json_file)
# main()
# get_golden_standard(glob.glob('*.xml'),set_of_rules)

# General results

* for xml:

 Precision: 0.17142857142857143

 Recall: 0.8475836431226765

 F1: 0.2851782363977486

* for json:

    Precision: 0.16750178954903364

    Recall: 0.8882534095908491

    F1: 0.2818532853083961

In [None]:
# Per-rule precision / recall / F1 table, hard-coded (not recomputed here).
# NOTE(review): presumably the scores on the XML dataset, given the variable name — confirm.
# Rules "2" and "3" carry identical values; in the `metrics` list they are the single rule2_3 entry.
data_xml = pd.DataFrame([
    {"Rule" : "0", "Precision" : 0.05, "Recall" : 0.048, "F1" : 0.047},
    {"Rule" : "1", "Precision" : 0.18, "Recall" : 0.26, "F1" : 0.21},
    {"Rule" : "2", "Precision" : 0.12, "Recall" : 0.10, "F1" : 0.11},
    {"Rule" : "3", "Precision" : 0.12, "Recall" : 0.10, "F1" : 0.11},
    {"Rule" : "6", "Precision" : 0.60, "Recall" : 0.61, "F1" : 0.61},
    {"Rule" : "9", "Precision" : 0.28, "Recall" : 0.70, "F1" : 0.40},
    {"Rule" : "10", "Precision" : 0.25, "Recall" : 0.58, "F1" : 0.35}, 
    {"Rule" : "1_2", "Precision" : 0.65, "Recall" : 0.56, "F1" : 0.60},
    {"Rule" : "2_2", "Precision" : 0.27, "Recall": 0.72, "F1" : 0.39},
    {"Rule" : "3_2", "Precision" : 0.31, "Recall": 0.43, "F1" : 0.36},
    {"Rule" : "4_2", "Precision" : 0.62, "Recall": 0.28, "F1" : 0.39},
    {"Rule" : "5_2", "Precision" : 0.63, "Recall": 0.53, "F1" : 0.58},
    {"Rule" : "6_2", "Precision" : 0.4, "Recall": 0.014, "F1" : 0.02},
    {"Rule" : "7_2", "Precision" : 0.61, "Recall": 0.56, "F1" : 0.59},
    {"Rule" : "8_2", "Precision" : 0.5, "Recall": 0.42, "F1" : 0.46},
    {"Rule" : "9_2", "Precision" : 0.5, "Recall" : 0.05, "F1": 0.1},
    {"Rule" : "10_2", "Precision" : 0.26, "Recall" : 0.27, "F1": 0.26},
    {"Rule" : "11_2", "Precision" : 0.06, "Recall": 0.01, "F1" : 0.019},
    {"Rule" : "12_2", "Precision" : 0.4, "Recall": 0.579, "F1" : 0.47},
    {"Rule" : "13_2", "Precision" : 0.09, "Recall": 0.01, "F1" : 0.03},
    {"Rule" : "14_2", "Precision" : 0.91, "Recall" : 0.46, "F1" : 0.61},
    {"Rule" : "15_2", "Precision" : 0.42, "Recall" : 0.68, "F1" : 0.52},
    {"Rule" : "17_2", "Precision" : 0.52, "Recall" : 0.60, "F1" : 0.56},
])
print(data_xml)



    Rule  Precision  Recall     F1
0      0       0.05   0.048  0.047
1      1       0.18   0.260  0.210
2      2       0.12   0.100  0.110
3      3       0.12   0.100  0.110
4      6       0.60   0.610  0.610
5      9       0.28   0.700  0.400
6     10       0.25   0.580  0.350
7    1_2       0.65   0.560  0.600
8    2_2       0.27   0.720  0.390
9    3_2       0.31   0.430  0.360
10   4_2       0.62   0.280  0.390
11   5_2       0.63   0.530  0.580
12   6_2       0.40   0.014  0.020
13   7_2       0.61   0.560  0.590
14   8_2       0.50   0.420  0.460
15   9_2       0.50   0.050  0.100
16  10_2       0.26   0.270  0.260
17  11_2       0.06   0.010  0.019
18  12_2       0.40   0.579  0.470
19  13_2       0.09   0.010  0.030
20  14_2       0.91   0.460  0.610
21  15_2       0.42   0.680  0.520
22  17_2       0.52   0.600  0.560


In [None]:

# Per-rule precision / recall / F1 table, hard-coded (not recomputed here).
# NOTE(review): presumably the scores on the JSON dataset, given the variable name — confirm.
# Rules "2" and "3" carry identical values; in the `metrics_json` list they are the single rule2_3 entry.
data_json = pd.DataFrame([
    {"Rule" : "0", "Precision" : 0.07 , "Recall" : 0.3, "F1" : 0.11},
    {"Rule" : "1", "Precision" : 0.27, "Recall" : 0.64, "F1" : 0.38},
    {"Rule" : "2", "Precision" : 0.21, "Recall" : 0.16, "F1" : 0.18},
    {"Rule" : "3", "Precision" : 0.21, "Recall" : 0.16, "F1" : 0.18},
    {"Rule" : "4", "Precision" : 0.1, "Recall" : 0.11, "F1" : 0.11},
    {"Rule" : "6", "Precision" : 0.58, "Recall" : 0.70, "F1" : 0.64},
    {"Rule" : "9", "Precision" : 0.3, "Recall" : 0.81, "F1" : 0.43},
    {"Rule" : "10", "Precision" : 0.28, "Recall" : 0.76, "F1" : 0.41},
    {"Rule" : "1_2", "Precision" :0.55 , "Recall" : 0.73, "F1" :0.63},
    {"Rule" : "2_2", "Precision" : 0.31, "Recall": 0.79, "F1" : 0.44 },
    {"Rule" : "3_2", "Precision" : 0.35, "Recall": 0.70, "F1" : 0.46},
    {"Rule" : "4_2", "Precision" : 0.64, "Recall": 0.46, "F1" : 0.53},
    {"Rule" : "5_2", "Precision" : 0.68, "Recall": 0.73, "F1" : 0.70},
    {"Rule" : "6_2", "Precision" : 0.25, "Recall": 0.005, "F1" :0.01},
    {"Rule" : "7_2", "Precision" : 0.62, "Recall": 0.73, "F1" : 0.67},
    {"Rule" : "8_2", "Precision" : 0.38, "Recall": 0.479, "F1" : 0.42},
    {"Rule" : "9_2", "Precision" : 0.61, "Recall" :0.19, "F1": 0.29},
    {"Rule" : "10_2", "Precision" :0.391, "Recall" :0.65, "F1": 0.48},
    {"Rule" : "11_2", "Precision" :0.11, "Recall": 0.018, "F1" :0.03},
    {"Rule" : "12_2", "Precision" :0.42, "Recall": 0.77, "F1" :0.54},
    {"Rule" : "13_2", "Precision" :0.06, "Recall": 0.01, "F1" : 0.01},
    {"Rule" : "14_2", "Precision" : 0.65, "Recall" :0.68, "F1" :0.66},
    {"Rule" : "15_2", "Precision" :0.479, "Recall" :0.76, "F1" :0.588},
    {"Rule" : "17_2", "Precision" :0.539, "Recall" :0.72, "F1" :0.618},
])
print(data_json)


    Rule  Precision  Recall     F1
0      0      0.070   0.300  0.110
1      1      0.270   0.640  0.380
2      2      0.210   0.160  0.180
3      3      0.210   0.160  0.180
4      4      0.100   0.110  0.110
5      6      0.580   0.700  0.640
6      9      0.300   0.810  0.430
7     10      0.280   0.760  0.410
8    1_2      0.550   0.730  0.630
9    2_2      0.310   0.790  0.440
10   3_2      0.350   0.700  0.460
11   4_2      0.640   0.460  0.530
12   5_2      0.680   0.730  0.700
13   6_2      0.250   0.005  0.010
14   7_2      0.620   0.730  0.670
15   8_2      0.380   0.479  0.420
16   9_2      0.610   0.190  0.290
17  10_2      0.391   0.650  0.480
18  11_2      0.110   0.018  0.030
19  12_2      0.420   0.770  0.540
20  13_2      0.060   0.010  0.010
21  14_2      0.650   0.680  0.660
22  15_2      0.479   0.760  0.588
23  17_2      0.539   0.720  0.618


# REFERENCES

[1] PORIA, Soujanya et al. A rule-based approach to aspect extraction from product reviews. In: Proceedings of the second workshop on natural language processing for social media (SocialNLP). 2014. p. 28-37.

[2] M. Tubishat, N. Idris, and M. Abushariah, ‘‘Explicit aspects extraction in sentiment analysis using optimal rules combination,’’ Future Gener. Comput. Syst., vol. 114, pp. 448–480, Jan. 2021

[3] Minqing Hu and Bing Liu. "Mining and Summarizing Customer Reviews." Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD-2004), Aug 22-25, 2004, Seattle, Washington, USA.