<a href="https://colab.research.google.com/github/valmirf/mineracao_textual/blob/main/NER/08b_ExtracaoInformacao_CRF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Clone the course repository; the dataset read later lives under mineracao_textual/Dados/.
!git clone https://github.com/valmirf/mineracao_textual.git

# Extraindo Entidades Nomeadas

In [None]:
import pandas as pd
import numpy as np

# Lendo arquivo de entrada

In [None]:
# Load the NER corpus from the cloned repository (the file is read as ISO-8859-1).
df = pd.read_csv('mineracao_textual/Dados/ner_dataset.csv', encoding="ISO-8859-1")
# Keep only the first 10,000 rows so the rest of the notebook runs faster.
df = df.iloc[:10000]
df.head()

Existem 457 sentenças contendo 2.746 palavras diferentes e 17 tags.

In [None]:
# Forward-fill missing cells so every row carries the last value seen above it.
# `DataFrame.ffill()` replaces `fillna(method='ffill')`, which is deprecated
# in recent pandas releases.
# NOTE(review): presumably only 'Sentence #' has gaps (the id appearing only on
# the first token of each sentence) — confirm against the CSV.
df = df.ffill()
# Number of distinct sentences, words and tags in the truncated corpus.
df['Sentence #'].nunique(), df.Word.nunique(), df.Tag.nunique()

In [None]:
# Number of rows (tokens) carrying each tag, as a two-column table.
(
    df.groupby('Tag')
      .size()
      .reset_index(name='counts')
)

# Transformando dados para vetor e criando treino/teste -> para algoritmos tradicionais!

In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split

# Feature table: every column except the label.
X = df.drop(columns='Tag')
# DictVectorizer maps the per-row dictionaries to column indices (dense output).
v = DictVectorizer(sparse=False)
X = v.fit_transform(X.to_dict(orient='records'))
y = df['Tag'].values

# Sorted list of the distinct tag names.
classes = np.unique(y).tolist()

# Hold out 33% of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=0
)
X_train.shape, y_train.shape

In [None]:
# Inspect the vectorized feature matrix built in the previous cell.
X

# CRF

In [None]:
!pip install sklearn_crfsuite
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from collections import Counter

Função para recuperar sentenças com os POS e as tags

In [None]:
class SentenceGetter(object):
    """Group a token-level DataFrame into sentences of (word, POS, tag) triples."""

    def __init__(self, data):
        """Build the per-sentence grouping.

        Args:
            data: DataFrame with the columns 'Sentence #', 'Word', 'POS'
                and 'Tag', one row per token.
        """
        # 1-based number of the sentence that get_next() will look up next.
        self.n_sent = 1
        self.data = data
        # Not used by this class; kept so existing readers of the attribute still work.
        self.empty = False

        def agg_func(s):
            # Turn one sentence's rows into a list of (word, POS, tag) tuples.
            return list(zip(s['Word'].values.tolist(),
                            s['POS'].values.tolist(),
                            s['Tag'].values.tolist()))

        # Only the three columns read by agg_func are passed to apply(); this
        # avoids the pandas deprecation warning about apply() operating on the
        # grouping column. The result is indexed by the 'Sentence #' label.
        self.grouped = (
            self.data.groupby('Sentence #')[['Word', 'POS', 'Tag']].apply(agg_func)
        )
        # Plain list of sentences, in the order produced by the groupby.
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the next sentence, or None when there is no such sentence.

        Sentences are looked up by the label 'Sentence: <n>', where <n> is the
        internal counter; the counter only advances on a successful lookup.

        Returns:
            The list of (word, POS, tag) tuples for the sentence, or None if
            no sentence with the current label exists.
        """
        key = 'Sentence: {}'.format(self.n_sent)
        try:
            s = self.grouped[key]
        except KeyError:
            # Only a missing label means "no more sentences"; any other error
            # is a real problem and is allowed to propagate.
            return None
        self.n_sent += 1
        return s
# Group the corpus once and keep the resulting list of (word, POS, tag) sentences.
getter = SentenceGetter(df)
sentences = getter.sentences

# Criando o formato de entrada do CRF (extração de características)

In [None]:
def word2features(sent, i):
    """Build the CRF feature dictionary for the token at position ``i``.

    Args:
        sent: sequence of tokens; element 0 of each token is the word and
            element 1 is its POS tag.
        i: index of the token inside ``sent``.

    Returns:
        dict of feature name -> value describing the token itself, plus the
        previous and next tokens when they exist. 'BOS' is set when there is
        no previous token and 'EOS' when there is no next token.
    """
    token, tag = sent[i][0], sent[i][1]

    # Features of the current token.
    feats = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'postag': tag,
        'postag[:2]': tag[:2],
    }

    # Left context, or the beginning-of-sentence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token, prev_tag = sent[i - 1][0], sent[i - 1][1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_tag
        feats['-1:postag[:2]'] = prev_tag[:2]

    # Right context, or the end-of-sentence marker.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token, next_tag = sent[i + 1][0], sent[i + 1][1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_tag
        feats['+1:postag[:2]'] = next_tag[:2]

    return feats

def sent2features(sent):
    """Return the feature dictionary of every token in ``sent``, in order."""
    feature_dicts = []
    for position in range(len(sent)):
        feature_dicts.append(word2features(sent, position))
    return feature_dicts

def sent2labels(sent):
    """Return the label (third element) of each (token, postag, label) triple."""
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels
    
def sent2tokens(sent):
    """Return the token (first element) of each (token, postag, label) triple."""
    tokens = []
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens

# Divisão de treinamento e teste

In [None]:
# One list of feature dicts and one list of labels per sentence.
X = list(map(sent2features, sentences))
y = list(map(sent2labels, sentences))
# Hold out 33% of the sentences for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.33
)

In [None]:
# Preview the feature dicts of the first two sentences instead of dumping
# the features of every sentence into the cell output.
X[:2]

In [None]:
# Preview the label sequences of the first two sentences instead of dumping
# the labels of every sentence into the cell output.
y[:2]

# Treinando o modelo CRF

In [None]:
# Configure the CRF trainer: 'lbfgs' algorithm, at most 100 iterations.
# NOTE(review): c1 and c2 are presumably the L1 and L2 regularisation
# coefficients — confirm against the sklearn-crfsuite documentation.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
# Fit on the sentence-level training split.
crf.fit(X_train, y_train)

# Avaliação

In [None]:
# Predict a label sequence for every held-out sentence.
y_pred = crf.predict(X_test)

# Every entity label except 'O'; used both to filter the report and to name its rows.
entity_labels = ['B-art','B-eve','B-geo','B-gpe','B-nat','B-org','B-per','B-tim','I-art','I-eve','I-geo','I-gpe','I-nat','I-org','I-per','I-tim']

print('Resultados com todas as classes')
print(metrics.flat_classification_report(y_test, y_pred))
print('--------------------------------------------------------------------')
print('Resultados eliminando a classe O')
print(metrics.flat_classification_report(
    y_test,
    y_pred,
    labels=list(entity_labels),
    target_names=list(entity_labels),
))

# Entendendo o funcionamento do algoritmo

In [None]:
def print_transitions(trans_features):
    """Print one aligned 'from -> to weight' line per transition entry.

    Args:
        trans_features: iterable of ((label_from, label_to), weight) pairs.
    """
    for transition, score in trans_features:
        source_label, target_label = transition
        print("%-6s -> %-7s %0.6f" % (source_label, target_label, score))

# Rank the learned transition weights once and reuse the ranking for both lists.
transition_weights = Counter(crf.transition_features_)

print("Top-20 transições mais prováveis:")
print_transitions(transition_weights.most_common(20))

print("Top-20 transições menos prováveis:")
print_transitions(transition_weights.most_common()[-20:])

In [None]:
def print_state_features(state_features):
    """Print one 'weight label attribute' line per state-feature entry.

    Args:
        state_features: iterable of ((attr, label), weight) pairs.
    """
    for feature, score in state_features:
        attribute, tag = feature
        print("%0.6f %-8s %s" % (score, tag, attribute))

# Rank the learned state-feature weights once and reuse the ranking for both lists.
state_weights = Counter(crf.state_features_)

print("Top-20 positivo:")
print_state_features(state_weights.most_common(20))

print("Top-20 negativo:")
print_state_features(state_weights.most_common()[-20:])

In [None]:
!pip install eli5
import eli5
eli5.show_weights(crf, top=10)