In [3]:
import json
from tqdm import tqdm

import spacy
from spacy.util import decaying
from spacy.util import minibatch, compounding
from spacy.training import offsets_to_biluo_tags
from spacy.training.example import Example
from spacy.training import biluo_tags_to_offsets
from spacy.training import offsets_to_biluo_tags
from spacy.scorer import Scorer


import pandas as pd
import random
from sklearn.model_selection import train_test_split

import nltk
from nltk.tokenize import word_tokenize

import srsly
import typer
import warnings
from pathlib import Path

import spacy
from spacy.tokens import DocBin

In [4]:
print(spacy.__version__)

3.7.4


## Converter Formato DataFrame to Spacy

In [11]:
df_train = pd.read_csv("data/df_train_tokens_labeled_iob_bert_format_full.csv", encoding="utf-8")
df_test = pd.read_csv("data/df_test_tokens_labeled_iob_bert_format_full.csv", encoding="utf-8")

In [12]:
def bert_2_spacy_format(df):
    spacy_format_data = []
    for _, row in df.iterrows():
        text = row['text']
        entities = []
        tokens = row['text'].split()
        labels = row['iob_labels'].split()
        start = 0
        entity_label = None
        for token, label in zip(tokens, labels):
            end = start + len(token)
            if label != 'O':
                label = label.replace('B-', '').replace('I-', '').upper()
                if label != entity_label:
                    entity_label = label
                    entities.append((start, end, label))
            start = end + 1  # +1 to account for the space between tokens
        spacy_format_data.append((text, {"entities": entities}))
    return spacy_format_data

In [13]:
train_spacy_format = bert_2_spacy_format(df_train)
test_spacy_format = bert_2_spacy_format(df_test)

In [10]:
#training_data, test_data = train_test_split(spacy_format_data, test_size=0.10, random_state=1234)

## Treinar o modelo

In [14]:
def train_spacy(data, iterations):
    nlp = spacy.blank("pt")  
    ner = nlp.add_pipe("ner")

    for _, annotations in data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]

    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        #dropout = decaying(0.6, 0.2, 1e-4)
        dropout = decaying(0.2, 1e-4)

        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            losses = {}
            random.shuffle(data)
            batches = minibatch(data, size=compounding(4.0, 32.0, 1.001))

            for batch in batches:
                examples = []
                for text, annotation in batch:
                    doc = nlp.make_doc(text)
                    example = Example.from_dict(doc, annotation)
                    examples.append(example)
                nlp.update(examples, losses=losses)
            print("Iteration:", itn, "Loss:", losses)
    return nlp

In [15]:
trained_model = train_spacy(train_spacy_format, 100)

Starting iteration 0
Iteration: 0 Loss: {'ner': 5018.20619669932}
Starting iteration 1
Iteration: 1 Loss: {'ner': 848.4883044567185}
Starting iteration 2
Iteration: 2 Loss: {'ner': 671.3910017149663}
Starting iteration 3
Iteration: 3 Loss: {'ner': 591.3945935883739}
Starting iteration 4
Iteration: 4 Loss: {'ner': 524.2481268858842}
Starting iteration 5
Iteration: 5 Loss: {'ner': 479.453393221378}
Starting iteration 6
Iteration: 6 Loss: {'ner': 415.8615920955134}
Starting iteration 7
Iteration: 7 Loss: {'ner': 376.18495290675406}
Starting iteration 8
Iteration: 8 Loss: {'ner': 355.44691994044007}
Starting iteration 9
Iteration: 9 Loss: {'ner': 345.2691531944241}
Starting iteration 10
Iteration: 10 Loss: {'ner': 346.1447856802871}
Starting iteration 11
Iteration: 11 Loss: {'ner': 314.1096225091985}
Starting iteration 12
Iteration: 12 Loss: {'ner': 300.77310641003055}
Starting iteration 13
Iteration: 13 Loss: {'ner': 268.2014066352953}
Starting iteration 14
Iteration: 14 Loss: {'ner': 274

In [16]:
#modelfile = "spacy_model_cnn"
#trained_model.to_disk(modelfile)

modelfile = "spacy_model_cnn_mod"
trained_model.to_disk(modelfile)

## Avaliar o modelo

In [17]:
modelfile_load = spacy.load('spacy_model_cnn_mod') 

In [18]:
def evaluate(modelfile, ACC):
    nlp = modelfile
    examples = []
    for input_, annot in ACC:
        #print(input)
        doc = nlp.make_doc(input_)
        example = Example.from_dict(doc, annot)
        examples.append(example)
    scorer = nlp.evaluate(examples)
    return scorer

In [19]:
#results = evaluate(trained_model, test_spacy_format)
results = evaluate(modelfile_load, test_spacy_format)

In [9]:
print(results)

{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'ents_p': 0.834375, 'ents_r': 0.8042168674698795, 'ents_f': 0.8190184049079755, 'ents_per_type': {'ACHADO': {'p': 0.8928571428571429, 'r': 0.8823529411764706, 'f': 0.8875739644970414}, 'CALCIFICAÇÃO': {'p': 0.95, 'r': 0.9047619047619048, 'f': 0.9268292682926829}, 'LOCALIZAÇÃO': {'p': 0.8292682926829268, 'r': 0.7816091954022989, 'f': 0.8047337278106508}, 'TAMANHO': {'p': 0.7790697674418605, 'r': 0.7127659574468085, 'f': 0.7444444444444445}, 'ATENUAÇÃO': {'p': 0.7647058823529411, 'r': 0.7222222222222222, 'f': 0.7428571428571428}, 'BORDAS': {'p': 0.5454545454545454, 'r': 1.0, 'f': 0.7058823529411764}}, 'speed': 33381.680957052486}


In [20]:
spacy_results = pd.DataFrame.from_dict(results)

In [11]:
spacy_results.head()

Unnamed: 0,token_acc,token_p,token_r,token_f,ents_p,ents_r,ents_f,ents_per_type,speed
ACHADO,1.0,1.0,1.0,1.0,0.834375,0.804217,0.819018,"{'p': 0.8928571428571429, 'r': 0.8823529411764...",33381.680957
ATENUAÇÃO,1.0,1.0,1.0,1.0,0.834375,0.804217,0.819018,"{'p': 0.7647058823529411, 'r': 0.7222222222222...",33381.680957
BORDAS,1.0,1.0,1.0,1.0,0.834375,0.804217,0.819018,"{'p': 0.5454545454545454, 'r': 1.0, 'f': 0.705...",33381.680957
CALCIFICAÇÃO,1.0,1.0,1.0,1.0,0.834375,0.804217,0.819018,"{'p': 0.95, 'r': 0.9047619047619048, 'f': 0.92...",33381.680957
LOCALIZAÇÃO,1.0,1.0,1.0,1.0,0.834375,0.804217,0.819018,"{'p': 0.8292682926829268, 'r': 0.7816091954022...",33381.680957


In [21]:
#spacy_results.to_csv("results_spacy_model.csv")
spacy_results.to_csv("results_spacy_model_mod.csv")

## Teste de Código

In [None]:
text, entities = training_data_v2[2]

In [None]:
entities = entities['entities']

In [None]:
tags = spacy.training.offsets_to_biluo_tags(modelfile_load.make_doc(text), entities)
tokens = text.split()

In [None]:
for i in range (len(tokens)):
    print("TOKEN {}            TAG {}".format(tokens[i], tags[i]))

In [None]:
nlp_test = spacy.blank("pt")  
ner_test = nlp_test.add_pipe("ner")
for _, annotations in training_data_v2:
    for ent in annotations.get("entities"):
        #print(ent[2])
        ner_test.add_label(ent[2])
other_pipes = [pipe for pipe in nlp_test.pipe_names if pipe != "ner"]

In [None]:
with nlp_test.disable_pipes(*other_pipes):
    optimizer = nlp_test.begin_training()
    #dropout = decaying(0.6, 0.2, 1e-4)
    dropout = decaying(0.2, 1e-4)

    for itn in range(1):
        print("Starting iteration " + str(itn))
        losses = {}
        random.shuffle(training_data_v2)
        batches = minibatch(training_data_v2, size=compounding(4.0, 32.0, 1.001))

        for batch in batches:
            examples = []
            for text, annotation in batch:
                doc = nlp_test.make_doc(text)
                annotation = annotation["entities"][0]
                offsets_to_biluo_tags(nlp_test.make_doc(text), annotation)
                example = Example.from_dict(doc, annotation)
                ner_tags = example.get_aligned_ner()
                
                print(annotation)
                

                print(text)
                print(doc)
                print(ner_tags)

                print(len(text.split()))
                #print(doc.length())
                print(len(ner_tags))

                break
                examples.append(example)
            break
            nlp_test.update(examples, losses=losses)
        print("Iteration:", itn, "Loss:", losses)