In [None]:
# Libraries 

import re
import json
import random
import pickle
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import nltk
import spacy
from spacy.tokens import DocBin

# Ask spaCy to run on the GPU. This is called right after the imports, before any
# pipeline object is created further down.
# NOTE(review): per the spaCy docs this raises when no GPU can be used — switch to
# spacy.prefer_gpu() if a CPU fallback is wanted; confirm against the installed version.
spacy.require_gpu()

In [None]:
# Models 

# Create a blank English pipeline (nothing is read from disk here).
# It is used further down only to turn raw text into Doc objects, so that
# character offsets can be converted to spans with doc.char_span.
nlp = spacy.blank("en")

In [3]:
# Functions 

def start_end_finder(string, substring, regex=False):
    """Locate every non-overlapping occurrence of ``substring`` in ``string``.

    Args:
        string: Text to search.
        substring: Text to look for. It is matched literally by default, so
            characters such as ``(``, ``+`` or ``.`` have no special meaning.
        regex: When True, ``substring`` is handed to ``re.finditer`` unchanged
            and interpreted as a regular-expression pattern.

    Returns:
        A list of ``(start, end)`` character offsets (``end`` exclusive), in
        order of occurrence. Empty when nothing matches or when ``substring``
        is empty.
    """
    # An empty needle would otherwise "match" between every pair of characters
    # and produce a list of useless zero-width spans.
    if not substring:
        return []

    # Escape the needle so literal text is never misread as regex syntax.
    pattern = substring if regex else re.escape(substring)

    return [(match.start(), match.end()) for match in re.finditer(pattern, string)]

def get_span(doc, indices):
    """Return the character span that covers the tokens of ``doc`` listed in ``indices``.

    Args:
        doc: Indexable sequence of tokens. Each token must expose ``idx`` (the
            character offset at which the token starts) and support ``len()``.
        indices: Token positions into ``doc``. They may be given in any order.

    Returns:
        ``(start_char, end_char)`` with ``end_char`` exclusive, or
        ``(None, None)`` when ``indices`` is empty or None.
    """
    # Ensure indices are provided
    if not indices:
        return (None, None)

    tokens = [doc[position] for position in indices]

    # Take the earliest start and the latest end over all listed tokens instead
    # of trusting indices[0] / indices[-1]; for sorted input the result is the
    # same, and unsorted input no longer produces a span with start > end.
    start_char = min(token.idx for token in tokens)
    end_char = max(token.idx + len(token) for token in tokens)

    return (start_char, end_char)

#### Custom data 

In [None]:
# Read triples 
#
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so only
# load 'formatted_triples.pickle' when it comes from a trusted source.
#
# The cells below read each triple as a mapping with the keys 'sentence', 'subj',
# 'rel', 'obj' and 'mods'. 'sentence' is presumably a spaCy Doc (its .text is read
# and it is indexed by token position) — TODO confirm against the producer.

with open('formatted_triples.pickle', 'rb') as handle: 
    
    triples  = pickle.load(handle)

In [None]:
# Seed the global RNG so the shuffled order, and therefore the subsets selected
# by the slices further down, is the same on every Restart & Run All.
# The shuffle is in place: re-running this cell alone reshuffles the already
# shuffled list, so reload the pickle first when an identical order is needed.
random.seed(42)
random.shuffle(triples)

In [None]:
# Format data 
#
# Build one training row per triple:
#     (sentence text, {"entities": [(start_char, end_char, label), ...]})
# with the labels 'subj', 'rel', 'obj' and one 'mod' entry per modifier.

MAX_TRIPLES = 30000  # cap on how many triples are converted

custom_rows = []
skipped_triples = []  # (position in the slice, repr of the error) per dropped triple
for position, triple in enumerate(tqdm(triples[:MAX_TRIPLES])): 

    # Malformed triples are still skipped (best effort), but catching Exception
    # instead of using a bare except lets KeyboardInterrupt stop the loop, and
    # the reason for every skip is recorded instead of being discarded.
    try: 
        doc = triple['sentence']
        text = doc.text

        entities = []
        for label in ('subj', 'rel', 'obj'): 
            start_char, end_char = get_span(doc, triple[label])
            if start_char is None: 
                # get_span returns (None, None) for an empty index list. A row
                # with None offsets cannot be turned into a span later, so the
                # triple is dropped here rather than failing downstream.
                raise ValueError(f"triple has no token indices for '{label}'")
            entities.append((start_char, end_char, label))

        for mod_indices in triple['mods']: 
            start_char, end_char = get_span(doc, mod_indices)
            if start_char is None: 
                # An empty modifier carries no span; keep the rest of the row.
                continue
            entities.append((start_char, end_char, 'mod'))

    except Exception as error: 
        skipped_triples.append((position, repr(error)))
        continue

    custom_rows.append((text, {"entities": entities}))

if skipped_triples: 
    print(f"Skipped {len(skipped_triples)} of {min(len(triples), MAX_TRIPLES)} triples; "
          f"first error: {skipped_triples[0][1]}")

random.shuffle(custom_rows)

#### Train Model 

Reference tutorial (custom named entity recognition with spaCy v3): https://www.analyticsvidhya.com/blog/2022/06/custom-named-entity-recognition-using-spacy-v3/

In [None]:
# Setup dataset to train 
#
# Convert (text, {"entities": [(start, end, label), ...]}) rows into spaCy Doc
# objects with entity spans and serialise them to ./train.spacy.

MAX_TRAIN_ROWS = 20000  # cap on how many rows go into the training file


def _ascii_projection(text): 
    """Drop non-ASCII characters from ``text`` and map old offsets to new ones.

    Returns ``(ascii_text, offset_map)``. ``ascii_text`` equals
    ``text.encode('ascii', 'ignore').decode('utf-8')``. ``offset_map[i]`` is the
    position in ``ascii_text`` that corresponds to character offset ``i`` of
    ``text``; it has ``len(text) + 1`` entries so end offsets map as well.
    """
    kept_chars = []
    offset_map = []
    for char in text: 
        offset_map.append(len(kept_chars))
        if ord(char) < 128: 
            kept_chars.append(char)
    offset_map.append(len(kept_chars))
    return "".join(kept_chars), offset_map


db = DocBin()
fails = []  # positions (within the slice) of rows that could not be converted
for index, (text, annotations) in enumerate(tqdm(custom_rows[:MAX_TRAIN_ROWS])): 

    try: 
        # The entity offsets were computed on the original text. Removing
        # non-ASCII characters shifts everything after them, so the offsets are
        # translated through offset_map instead of being reused unchanged.
        text, offset_map = _ascii_projection(text)
        doc = nlp(text)

        ents = []
        for start, end, label in annotations['entities']: 

            span = doc.char_span(offset_map[start], offset_map[end], label = label)
            if span is None: 
                # char_span yields None when the offsets do not line up with
                # token boundaries; treat the whole row as failed, as before.
                raise ValueError(f"offsets ({start}, {end}) do not align with tokens")
            ents.append(span)

        # NOTE(review): assigning doc.ents is expected to reject overlapping
        # spans, in which case the row ends up in `fails` — confirm in spaCy docs.
        doc.ents = ents
        db.add(doc)

    except Exception: 
        # Exception rather than a bare except, so KeyboardInterrupt still stops the loop.
        fails.append(index)

db.to_disk("./train.spacy")

In [5]:
# Expand base_config.cfg into a complete training config and save it as config.cfg.
!python -m spacy init fill-config base_config.cfg config.cfg

✔ Auto-filled config with all values
✔ Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [6]:
# Train with config.cfg on GPU 0 and save the output to ./test_2.
# NOTE(review): --paths.dev points at the same file as --paths.train, so the
# scores reported during training are measured on the training data, not on a
# held-out set. Export a separate dev.spacy to get a meaningful evaluation.
!python -m spacy train config.cfg --output ./test_2 --paths.train ./train.spacy --paths.dev ./train.spacy --gpu-id 0

✔ Created output directory: test_2
ℹ Saving to output directory: test_2
ℹ Using GPU: 0
[1m
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should p