In [3]:
import json
from tqdm import tqdm

import spacy
from spacy.util import decaying
from spacy.util import minibatch, compounding
from spacy.training import offsets_to_biluo_tags
from spacy.training.example import Example
from spacy.training import biluo_tags_to_offsets
from spacy.training import offsets_to_biluo_tags
from spacy.scorer import Scorer


import pandas as pd
import random
from sklearn.model_selection import train_test_split

import nltk
from nltk.tokenize import word_tokenize

import srsly
import typer
import warnings
from pathlib import Path

import spacy
from spacy.tokens import DocBin

In [4]:
# Print the installed spaCy version.
print(spacy.__version__)

3.7.4


## Converter o DataFrame para o formato spaCy

In [11]:
# Read the train and test token/label tables (UTF-8 CSV files) in one pass.
df_train, df_test = (
    pd.read_csv(csv_path, encoding="utf-8")
    for csv_path in (
        "data/df_train_tokens_labeled_iob_bert_format_full.csv",
        "data/df_test_tokens_labeled_iob_bert_format_full.csv",
    )
)

In [12]:
def bert_2_spacy_format(df):
    """Convert rows of whitespace-tokenized text with IOB tags to spaCy tuples.

    Each row must provide ``row['text']`` (the sentence) and
    ``row['iob_labels']`` (one whitespace-separated tag per token, e.g.
    ``"B-PER I-PER O"``).

    Args:
        df: object exposing ``iterrows()`` that yields ``(index, row)`` pairs.

    Returns:
        A list of ``(text, {"entities": [(start, end, LABEL), ...]})`` tuples
        where ``start``/``end`` are character offsets into ``text`` and
        ``LABEL`` is the upper-cased tag without its ``B-``/``I-`` prefix.
        Consecutive tokens of the same entity are merged into a single span.
    """
    spacy_format_data = []
    for _, row in df.iterrows():
        text = row['text']
        tokens = text.split()
        labels = row['iob_labels'].split()
        entities = []

        # Currently open entity span; span_label is None when nothing is open.
        span_start = span_end = span_label = None
        cursor = 0
        for token, tag in zip(tokens, labels):
            # Locate the token in the raw text instead of assuming exactly one
            # space between tokens (split() collapses any whitespace run).
            start = text.find(token, cursor)
            end = start + len(token)
            cursor = end

            if tag == 'O':
                # An outside token closes whatever entity was open.
                if span_label is not None:
                    entities.append((span_start, span_end, span_label))
                    span_label = None
                continue

            begins_entity = tag.startswith('B-')
            # Strip only a leading prefix so labels containing "B-"/"I-"
            # elsewhere are left intact.
            label = (tag[2:] if tag.startswith(('B-', 'I-')) else tag).upper()

            if span_label is not None and not begins_entity and label == span_label:
                # Continuation token: extend the open span to cover it.
                span_end = end
                continue

            # A new entity starts here; flush the previous one first.
            if span_label is not None:
                entities.append((span_start, span_end, span_label))
            span_start, span_end, span_label = start, end, label

        # Flush an entity that runs to the end of the sentence.
        if span_label is not None:
            entities.append((span_start, span_end, span_label))

        spacy_format_data.append((text, {"entities": entities}))
    return spacy_format_data

In [13]:
# Convert both splits to spaCy's (text, {"entities": [...]}) representation.
train_spacy_format, test_spacy_format = (
    bert_2_spacy_format(frame) for frame in (df_train, df_test)
)

## Treinar o modelo

In [14]:
def train_spacy(data, iterations):
    """Train a blank Portuguese NER pipeline on spaCy-format annotations.

    Args:
        data: sequence of ``(text, {"entities": [(start, end, label), ...]})``
            tuples. The sequence is not modified; shuffling happens on a copy.
        iterations: number of passes over ``data``.

    Returns:
        The trained spaCy ``Language`` object containing a single ``ner`` pipe.
    """
    nlp = spacy.blank("pt")
    ner = nlp.add_pipe("ner")

    # Register every label present in the annotations before initialization.
    for _, annotations in data:
        for ent in annotations.get("entities", []):
            ner.add_label(ent[2])
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]

    # Shuffle a private copy so the caller's list keeps its original order.
    train_data = list(data)

    # select_pipes/initialize are the spaCy v3 names for the deprecated
    # disable_pipes/begin_training.
    with nlp.select_pipes(disable=other_pipes):
        optimizer = nlp.initialize()
        # Dropout schedule starting at 0.2 and decaying with rate 1e-4 per step.
        dropout = decaying(0.2, 1e-4)

        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            losses = {}
            random.shuffle(train_data)
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))

            for batch in batches:
                examples = [
                    Example.from_dict(nlp.make_doc(text), annotation)
                    for text, annotation in batch
                ]
                # Actually apply the dropout schedule; previously it was
                # created but never passed, so training ran with drop=0.0.
                nlp.update(examples, drop=next(dropout), sgd=optimizer, losses=losses)
            print("Iteration:", itn, "Loss:", losses)
    return nlp

In [None]:
# Train on the converted training set for 100 iterations.
trained_model = train_spacy(train_spacy_format, 100)

In [16]:
# Save the trained pipeline to disk at the path held in `modelfile`.
modelfile = "spacy_model_cnn_mod"
trained_model.to_disk(modelfile)

## Avaliar o modelo

In [17]:
# Reload the pipeline from disk so evaluation uses the saved artifact.
modelfile_load = spacy.load('spacy_model_cnn_mod') 

In [18]:
def evaluate(modelfile, ACC):
    """Score a loaded pipeline against gold-standard annotations.

    Args:
        modelfile: the loaded pipeline object (despite the name, not a path);
            it must provide ``make_doc`` and ``evaluate``.
        ACC: iterable of ``(text, annotations)`` pairs used as gold data.

    Returns:
        Whatever ``modelfile.evaluate`` returns for the built examples.
    """
    pipeline = modelfile
    gold_examples = [
        Example.from_dict(pipeline.make_doc(raw_text), gold)
        for raw_text, gold in ACC
    ]
    return pipeline.evaluate(gold_examples)

In [19]:
# Evaluate the reloaded pipeline on the converted test set.
results = evaluate(modelfile_load, test_spacy_format)

In [None]:
# Show the raw evaluation result.
print(results)

In [20]:
# Build a DataFrame from the evaluation result.
# NOTE(review): presumably `results` is a dict mixing scalar scores with a
# nested per-entity-type dict, which would drive the row index — confirm the
# resulting layout is what is wanted.
spacy_results = pd.DataFrame.from_dict(results)

In [None]:
# Preview the first rows of the results table.
spacy_results.head()

In [21]:
# Persist the results table as CSV in the working directory.
spacy_results.to_csv("results_spacy_model_mod.csv")