In [None]:
import spacy
from spacy.tokenizer import Tokenizer
from gliner_spacy.pipeline import GlinerSpacy
import re
import os


In [3]:
# Sample inputs used to exercise the pipeline defined in the cells below.
sentences = [
    "Hello my world",
    "good morning my friend",
    "The color dark blue is a shade of standard blue",
]

In [8]:
def preprocess_text(list_texts):
    """
    Run every text through a blank English spaCy pipeline that carries a
    single "gliner_spacy" component.

    input: iterable of strings to process
    output: list of spaCy docs, one per input text, in the same order

    Note: the pipeline (including the GLiNER component) is rebuilt on every
    call.
    """
    # The tokenizer is given no prefix/suffix/infix rules; its only rule is
    # a token_match that accepts any run of non-whitespace characters.
    non_whitespace_run = re.compile(r'\S+')

    pipeline = spacy.blank("en")
    pipeline.tokenizer = Tokenizer(
        pipeline.vocab,
        token_match=non_whitespace_run.match,
    )

    # Settings handed to the "gliner_spacy" component factory.
    pipeline.add_pipe(
        "gliner_spacy",
        config={
            "gliner_model": "urchade/gliner_small-v2.1",
            "labels": ["Greetings", "Colors", "Adjectives"],
            "style": "ent",
            "threshold": 0.3,
        },
    )

    return list(pipeline.pipe(list_texts))

In [9]:
def split_entities(doc):
    """
    Split one processed sentence into entity / word chunks.

    input: doc generated by a spacy pipeline (any iterable of tokens exposing
           `text`, `is_stop`, `ent_type` and `ent_iob_` works)
    output: flat list of strings for that sentence. The tokens of one entity
            are joined with single spaces into one item; every other kept
            token is its own item. Stop words that are not part of an entity
            are dropped.
    """
    pieces = []
    # Entity type id of the previously kept token; 0 means "not in an entity".
    previous_ent_type = 0
    for token in doc:
        in_entity = token.ent_type != 0

        # Stop words are removed unless they belong to an entity.
        if token.is_stop and not in_entity:
            previous_ent_type = 0
            continue

        # A token extends the previous chunk only when it carries the same
        # entity type AND is not tagged as the beginning ("B") of an entity.
        # Without the IOB check, two distinct adjacent entities that share a
        # label would be fused into a single chunk.
        continues_entity = (
            in_entity
            and token.ent_type == previous_ent_type
            and token.ent_iob_ != "B"
        )
        if continues_entity:
            pieces[-1] = f"{pieces[-1]} {token.text}"
        else:
            pieces.append(token.text)
        previous_ent_type = token.ent_type
    return pieces

In [10]:
# Annotate every sample sentence, then break each resulting doc into chunks.
docs = preprocess_text(sentences)
sentences_processed = [split_entities(doc) for doc in docs]


In [11]:
# Display the chunks produced by split_entities for each input sentence.
sentences_processed

[['Hello', 'world'],
 ['good morning', 'friend'],
 ['color', 'dark blue', 'shade', 'standard blue']]