The goal of this notebook is to see how easy it is to train a model to recognize new entity types, following the instructions at https://spacy.io/usage/training. In this notebook we will train two entity types: vehicle and name.
We will use a small dataset that I created myself (I intend to add more data over time). The data was built by taking random sentences from Wikipedia and news articles.

In [2]:
import pandas as pd
import spacy
import random
from spacy.util import minibatch, compounding
import warnings

# Default mapping from CSV column name to the entity label written into the
# training spans. Pass ``entity_columns`` to create_training_data to use
# other columns/labels.
_DEFAULT_ENTITY_COLUMNS = (("vehicle", "VEHICLE"), ("name", "NAME"))


def _split_entities(cell):
    """Split one comma-separated CSV cell into a list of entity strings.

    Missing cells (NaN / None) give an empty list. Surrounding whitespace is
    stripped from every item and empty items (e.g. from a trailing comma)
    are dropped.
    """
    if not isinstance(cell, str):
        return []
    stripped = (part.strip() for part in cell.split(","))
    return [part for part in stripped if part]


def _locate_entities(text, entities, label):
    """Return ``(start, end, label)`` for the first occurrence of each entity.

    Entities that do not occur in ``text`` are skipped with a warning;
    ``str.find`` returns -1 for them, which would otherwise become a
    meaningless span.
    """
    spans = []
    for entity in entities:
        start = text.find(entity)
        if start == -1:
            warnings.warn(
                "Entity %r not found in text %r; skipping it." % (entity, text)
            )
            continue
        spans.append((start, start + len(entity), label))
    return spans


def create_training_data(path, lower=False, both=False, entity_columns=None):
    """Build spaCy-style NER training examples from an annotated CSV.

    The CSV must have a ``text`` column plus one column per entity type.
    Each entity cell holds the entities of that row, comma-separated and
    written exactly as they appear in the text.

    Args:
        path: Path or file-like object handed to ``pandas.read_csv``.
        lower: If true (and ``both`` is false), return the lowercased texts
            instead of the original ones.
        both: If true, return the lowercased examples followed by the
            original ones, doubling the dataset. Takes precedence over
            ``lower``.
        entity_columns: Optional iterable of ``(column_name, label)`` pairs.
            Defaults to the ``vehicle`` -> ``VEHICLE`` and ``name`` ->
            ``NAME`` columns.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})``
        tuples. Spans are listed per row in ``entity_columns`` order.
    """
    if entity_columns is None:
        entity_columns = _DEFAULT_ENTITY_COLUMNS

    train_csv = pd.read_csv(path)
    texts = train_csv["text"].tolist()
    labelled_columns = [
        (train_csv[column].tolist(), label) for column, label in entity_columns
    ]

    annotations = []
    for row, text in enumerate(texts):
        spans = []
        for cells, label in labelled_columns:
            spans.extend(
                _locate_entities(text, _split_entities(cells[row]), label)
            )
        annotations.append({"entities": spans})

    original = list(zip(texts, annotations))
    # Offsets are computed on the original text and reused for the lowercased
    # copy; this assumes lowercasing does not change the string length.
    lowered = [(text.lower(), annotation) for text, annotation in original]

    # Double the data: every text in lowercase plus every original text.
    if both:
        return lowered + original
    return lowered if lower else original


def train_label(path_data, model=None, lower = False, both = False, n_iter=100):
    """Train the NER component of a spaCy pipeline on an annotated CSV.

    Args:
        path_data: Location of the CSV passed on to ``create_training_data``.
        model: Name of an existing spaCy model to load, or ``None`` to start
            from a blank English pipeline.
        lower: Forwarded to ``create_training_data``.
        both: Forwarded to ``create_training_data``.
        n_iter: Number of passes over the training data.

    Returns:
        The pipeline object after training.
    """
    # Start either from a blank English pipeline or from a pretrained one.
    if model is None:
        nlp = spacy.blank("en")
        print("Created blank 'en' model")
    else:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)

    # Reuse the pipeline's entity recognizer when present, else append one.
    if "ner" in nlp.pipe_names:
        ner = nlp.get_pipe("ner")
    else:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)

    examples = create_training_data(path_data, lower=lower, both=both)

    # Register every label that occurs in the annotations.
    for _, annotation in examples:
        for span in annotation.get("entities"):
            ner.add_label(span[2])

    # Every pipe outside this list is disabled while training.
    keep_enabled = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    frozen_pipes = [name for name in nlp.pipe_names if name not in keep_enabled]

    with nlp.disable_pipes(*frozen_pipes), warnings.catch_warnings():
        # Report each spaCy UserWarning only once.
        warnings.filterwarnings("once", category=UserWarning, module='spacy')

        # Weights are (re)initialised only when training from scratch.
        if model is None:
            nlp.begin_training()

        for _ in range(n_iter):
            random.shuffle(examples)
            losses = {}
            # A fresh compounding batch-size schedule is built on every pass.
            batch_sizes = compounding(4.0, 32.0, 1.001)
            for batch in minibatch(examples, size=batch_sizes):
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, drop=0.5, losses=losses)
            print("Losses", losses)

    return nlp

Before starting the training, let's look at the data as it appears in the CSV. It is important that the CSV follows this format.

In [7]:
# Load the annotated CSV so its layout can be inspected before training.
train_csv = pd.read_csv("D:/ner/vehicles_ner.csv")

# Preview the first rows of the table.
train_csv.head()

Unnamed: 0,text,vehicle,name
0,"At first, Ford in Germany and Ford in Britain ...","Ford Transit, Ford Escort, Ford Capri",
1,"The operation, branded NedCar, began producing...","Mitsubishi Carisma, Volvo S40/V40",
2,The Volvo T5 petrol engine was used in the For...,"Volvo T5, Ford Focus ST",
3,"In October 2016, Mercedes unveiled the X-Class...",Nissan Navara,
4,The Mercedes-Benz S400 BlueHYBRID was launched...,Mercedes-Benz S400 BlueHYBRID,


In [8]:
# Preview the last rows of the table.
train_csv.tail()

Unnamed: 0,text,vehicle,name
63,Multi award-winning designer Gert-Johan Coetze...,,Gert-Johan Coetzee
64,"The deaths of Chadwick Boseman, Kobe Brant and...",,"Chadwick Boseman, Kobe Brant, Naya Rivera"
65,"In a late June interview, singer August Alsina...",,"August Alsina, Jada Pinkett Smith, Will Smith"
66,The Duchess of Sussex has invested in Clevr Bl...,,Hannah Mendoza
67,When Rita Wilson's Golden Globes hair and make...,,"Rita Wilson, Chrissy Teigen"


As can be seen, the CSV is very simple: it requires one column with the text, which can be simple and short.
In the following columns, add the entities that appear in the text, written EXACTLY as they appear there. If there is more than one entity, separate them with commas.
It is very important that none of the entities is a substring of another. For example, "Will Smith" and "Smith" cannot be in the same row: entity positions are found by searching the text for the first match, so the shorter entity could be located inside the longer one and produce overlapping spans.

We will train four models. Three of them are built on top of the existing model "en_core_web_sm", one for each variant of the training data: the original capitalization, lowercase only, and a doubled dataset containing each row both with its original capitalization and in lowercase. The fourth model is trained from scratch on the doubled dataset.

In [9]:
# Model 1: fine-tune "en_core_web_sm" on the texts with their original casing.
nlp1 = train_label(path_data="D:/ner/vehicles_ner.csv", 
                    model="en_core_web_sm", 
                    lower = False, both = False,
                    n_iter=100)  

# Model 2: fine-tune "en_core_web_sm" on the lowercased texts only.
nlp2 = train_label(path_data="D:/ner/vehicles_ner.csv", 
                    model="en_core_web_sm", 
                    lower = True, both = False,
                    n_iter=100)  

# Model 3: fine-tune "en_core_web_sm" on the doubled dataset (lowercased plus
# original texts). With both=True the value of `lower` has no effect.
nlp3 = train_label(path_data="D:/ner/vehicles_ner.csv", 
                    model="en_core_web_sm", 
                    lower = True, both = True,
                    n_iter=100)  

# Model 4: start from a blank English pipeline (model=None) and train on the
# doubled dataset.
nlp4 = train_label(path_data="D:/ner/vehicles_ner.csv", 
                    model=None, 
                    lower = True, both = True,
                    n_iter=100)  

Loaded model 'en_core_web_sm'
68


  gold = GoldParse(doc, **gold)


Losses {'ner': 1768.6686540842056}
Losses {'ner': 1742.1710683107376}
Losses {'ner': 1502.2416961193085}
Losses {'ner': 1475.2510503530502}
Losses {'ner': 1376.4595583677292}
Losses {'ner': 1388.804090499878}
Losses {'ner': 1368.1998279094696}
Losses {'ner': 1322.5616748332977}
Losses {'ner': 1318.2733652591705}
Losses {'ner': 1283.64140021801}
Losses {'ner': 1292.1034083366394}
Losses {'ner': 1275.2201805114746}
Losses {'ner': 1301.2823269367218}
Losses {'ner': 1263.9467059373856}
Losses {'ner': 1245.7258214950562}
Losses {'ner': 1241.980040192604}
Losses {'ner': 1241.4477925300598}
Losses {'ner': 1273.932140827179}
Losses {'ner': 1275.4104261398315}
Losses {'ner': 1226.0862768888474}
Losses {'ner': 1208.425077199936}
Losses {'ner': 1218.3708946704865}
Losses {'ner': 1224.6185513734818}
Losses {'ner': 1229.113537788391}
Losses {'ner': 1207.2272646427155}
Losses {'ner': 1192.838204741478}
Losses {'ner': 1231.2462066411972}
Losses {'ner': 1191.1409285068512}
Losses {'ner': 1196.11169004

  gold = GoldParse(doc, **gold)


Losses {'ner': 1790.6087880134583}
Losses {'ner': 1714.0673192739487}
Losses {'ner': 1571.4035007953644}
Losses {'ner': 1383.2840592861176}
Losses {'ner': 1376.8488439321518}
Losses {'ner': 1354.6153129339218}
Losses {'ner': 1316.193055152893}
Losses {'ner': 1300.7367210388184}
Losses {'ner': 1294.7037624120712}
Losses {'ner': 1270.8861352205276}
Losses {'ner': 1343.9252434372902}
Losses {'ner': 1312.1342871189117}
Losses {'ner': 1277.5457113981247}
Losses {'ner': 1271.1938561201096}
Losses {'ner': 1262.7533793449402}
Losses {'ner': 1228.3470081090927}
Losses {'ner': 1269.2025427818298}
Losses {'ner': 1269.315601348877}
Losses {'ner': 1225.7581905126572}
Losses {'ner': 1205.9436852931976}
Losses {'ner': 1223.5385637879372}
Losses {'ner': 1265.9525990486145}
Losses {'ner': 1219.2801077365875}
Losses {'ner': 1227.6608184576035}
Losses {'ner': 1230.6984890699387}
Losses {'ner': 1213.0778533220291}
Losses {'ner': 1186.899279475212}
Losses {'ner': 1210.1419492959976}
Losses {'ner': 1209.885

  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)


Losses {'ner': 3464.2047604322433}
Losses {'ner': 2917.609674692154}
Losses {'ner': 2753.6114585399628}
Losses {'ner': 2676.9799662828445}
Losses {'ner': 2669.698263168335}
Losses {'ner': 2596.9201174378395}
Losses {'ner': 2526.5871664881706}
Losses {'ner': 2510.0561252832413}
Losses {'ner': 2497.633808374405}
Losses {'ner': 2530.214335203171}
Losses {'ner': 2491.6720004081726}
Losses {'ner': 2448.249148964882}
Losses {'ner': 2443.9571385383606}
Losses {'ner': 2469.6788225769997}
Losses {'ner': 2421.3908796310425}
Losses {'ner': 2461.7745512723923}
Losses {'ner': 2382.7855271697044}
Losses {'ner': 2450.437362074852}
Losses {'ner': 2436.4655747413635}
Losses {'ner': 2386.881347298622}
Losses {'ner': 2359.5336021780968}
Losses {'ner': 2363.2993045449257}
Losses {'ner': 2364.3559769392014}
Losses {'ner': 2339.355598807335}
Losses {'ner': 2347.350609242916}
Losses {'ner': 2342.3463760614395}
Losses {'ner': 2387.7337332069874}
Losses {'ner': 2334.444981813431}
Losses {'ner': 2371.3231178522

  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)


Losses {'ner': 949.9281453204676}
Losses {'ner': 436.29655079956865}
Losses {'ner': 420.23687518074075}
Losses {'ner': 444.01948833395363}
Losses {'ner': 395.4580296885058}
Losses {'ner': 507.65424433472435}
Losses {'ner': 368.9197546825548}
Losses {'ner': 537.7458909839859}
Losses {'ner': 268.7701371210194}
Losses {'ner': 404.74303240867687}
Losses {'ner': 252.86074340940758}
Losses {'ner': 231.02604607614012}
Losses {'ner': 228.08295443648436}
Losses {'ner': 200.91086739660972}
Losses {'ner': 148.48556680731565}
Losses {'ner': 146.95419746052318}
Losses {'ner': 139.9613335808985}
Losses {'ner': 91.41838542402083}
Losses {'ner': 83.45099807350645}
Losses {'ner': 93.20085436688632}
Losses {'ner': 67.74017521927635}
Losses {'ner': 66.69221076459806}
Losses {'ner': 62.254582596098125}
Losses {'ner': 82.1155483535566}
Losses {'ner': 46.84873331458119}
Losses {'ner': 39.354900051496806}
Losses {'ner': 55.21637533534718}
Losses {'ner': 37.42706473394224}
Losses {'ner': 27.717711873082706}
L

In [10]:
# Probe sentences for a quick qualitative check of each trained model:
# vehicle-like names (including a misspelled lowercase variant), phrases
# ending in "Pizza", and a sentence containing a person name.
test_docs = ["I was driving a Mercedes-Benz P53",
             "I was driving a mercedes-menz P53",
             "I was driving a Aston-Martin P53",
             "I was driving a Aston Martin P53",
             "I was driving a Corolla Pizza",
             "I was eating a Dominos Pizza",
             "I was eating a Pizza",
             "Daniel´s Pizza is the best",
             "My name is Daniel Wegman I was born in Mexico City"] 

In [11]:
# Run model 1 over each probe sentence and print the entities it tags.
for doc in map(nlp1, test_docs):
    found = [(ent.text, ent.label_) for ent in doc.ents]
    print("Entities", found)

Entities [('Mercedes-Benz P53', 'VEHICLE')]
Entities []
Entities [('Aston-Martin P53', 'VEHICLE')]
Entities [('Aston Martin', 'VEHICLE')]
Entities [('Corolla Pizza', 'VEHICLE')]
Entities [('Dominos Pizza', 'VEHICLE')]
Entities []
Entities []
Entities [('Daniel Wegman', 'NAME')]


In [12]:
# Run model 2 over each probe sentence and print the entities it tags.
for doc in map(nlp2, test_docs):
    found = [(ent.text, ent.label_) for ent in doc.ents]
    print("Entities", found)

Entities [('Mercedes-', 'VEHICLE')]
Entities [('mercedes-', 'VEHICLE')]
Entities []
Entities []
Entities [('Corolla Pizza', 'VEHICLE')]
Entities []
Entities []
Entities []
Entities [('Daniel Wegman', 'NAME')]


In [13]:
# Run model 3 over each probe sentence and print the entities it tags.
for doc in map(nlp3, test_docs):
    found = [(ent.text, ent.label_) for ent in doc.ents]
    print("Entities", found)

Entities [('Mercedes-Benz P53', 'VEHICLE')]
Entities [('mercedes-menz P53', 'VEHICLE')]
Entities [('Martin', 'VEHICLE')]
Entities []
Entities [('Corolla', 'VEHICLE')]
Entities []
Entities []
Entities []
Entities [('Daniel Wegman', 'NAME')]


In [14]:
# Run model 4 over each probe sentence and print the entities it tags.
for doc in map(nlp4, test_docs):
    found = [(ent.text, ent.label_) for ent in doc.ents]
    print("Entities", found)

Entities [('Mercedes-Benz', 'VEHICLE')]
Entities [('mercedes-menz', 'VEHICLE')]
Entities []
Entities [('Martin', 'VEHICLE')]
Entities [('Corolla Pizza', 'VEHICLE')]
Entities [('Pizza', 'VEHICLE')]
Entities [('Pizza', 'VEHICLE')]
Entities [('Daniel´s Pizza', 'NAME')]
Entities [('City', 'VEHICLE')]


It is clear that, since there is not enough data at this point, the training was not completely successful, but it can be seen that the model has already started to learn even from this small amount of information. It is also a better idea to train on both the original capitalization and the lowercased text, so that the model does not learn to pick out entities just because they start with a capital letter.