In [2]:
# Importing libraries and creating the shared spaCy objects

import spacy
from spacy.tokens import DocBin # DocBin ---> serializable container for a collection of Doc objects (saved to disk as a .spacy file); it does not tokenize
from tqdm import tqdm           # tqdm ---> progress bars for loops and iterables; a simple way to track long-running loops
import re                       # re ---> regular expressions for text processing/pattern matching (not referenced in the cells visible here)

nlp = spacy.blank("en") # create a blank English pipeline; the cells below use nlp.make_doc() to tokenize text
db = DocBin()           # create an empty DocBin that collects Doc objects before they are written to disk

In [5]:
import json

# Load the training data (json file)

# Decode explicitly as UTF-8: the default encoding of open() is platform
# dependent, and the entity annotations are character offsets into the decoded
# text, so a wrong decoding would shift every offset after a non-ASCII character.
with open("../data/train_annotations.json", encoding="utf-8") as fp:
    train_data = json.load(fp)

# Last expression of the cell: displays the loaded dict
# (keys seen in the output: 'classes' and 'annotations').
train_data

{'classes': ['ORG_YEAR', 'SPORTS_NAME', 'WINNING_POSITION'],
 'annotations': [['Raleigh Flynn, your outstanding performance in the Equestrian Dressage during the 2013 competition has earned you a well-deserved 1st Position. Your sportsmanship and zeal for the game are appreciated. Congratulations!\r',
   {'entities': [[51, 70, 'SPORTS_NAME'],
     [82, 86, 'ORG_YEAR'],
     [130, 133, 'WINNING_POSITION']]}],
  ['Clifford Hall, your outstanding performance in the Boxing during the 2013 competition has earned you a well-deserved 2nd Position. Your sportsmanship and zeal for the game are appreciated. Congratulations!\r',
   {'entities': [[51, 57, 'SPORTS_NAME'],
     [69, 73, 'ORG_YEAR'],
     [117, 120, 'WINNING_POSITION']]}],
  ['In recognition of their exceptional performance in the Rowing, Clifton Hamilton has been awarded the 2nd Position in the 2009 competition. We extend our hearty congratulations and best wishes for all future endeavors!\r',
   {'entities': [[55, 61, 'SPORTS_NAME'

In [6]:
type(train_data)

dict

In [7]:
# Load the Validation data (json file)

# Decode explicitly as UTF-8: the default encoding of open() is platform
# dependent, and the entity annotations are character offsets into the decoded
# text, so a wrong decoding would shift every offset after a non-ASCII character.
with open('../data/test_annotations.json', encoding="utf-8") as fp:
    validation_data = json.load(fp)

# Last expression of the cell: displays the loaded dict
# (keys seen in the output: 'classes' and 'annotations').
validation_data

{'classes': ['ORG_YEAR', 'SPORTS_NAME', 'WINNING_POSITION'],
 'annotations': [['We take immense pleasure in presenting this certificate to Dexter Steele, who has made a remarkable achievement by securing the 1st Position in the Baseball competition, 2012. May you continue to rise and shine!\r',
   {'entities': [[128, 131, 'WINNING_POSITION'],
     [148, 156, 'SPORTS_NAME'],
     [170, 175, 'ORG_YEAR']]}],
  ['In recognition of their exceptional performance in the Beach Volleyball, Marshall Barnes has been awarded the 2nd Position in the 2009 competition. We extend our hearty congratulations and best wishes for all future endeavors!\r',
   {'entities': [[61, 71, 'SPORTS_NAME'],
     [110, 113, 'WINNING_POSITION'],
     [130, 134, 'ORG_YEAR']]}],
  ['Clifford Lamb, your outstanding performance in the Table Tennis during the 2013 competition has earned you a well-deserved 3rd Position. Your sportsmanship and zeal for the game are appreciated. Congratulations!\r',
   {'entities': [[51, 63,

In [9]:
# Converting train data into .spacy format

# Start from an empty DocBin every time this cell runs. Appending to a
# container created in another cell makes the cell non-idempotent: running it
# twice would store every training document twice.
db = DocBin()

for text, annot in tqdm(train_data['annotations']):
    # make_doc() only tokenizes the text; no pipeline components are run.
    doc = nlp.make_doc(text)
    ents = []

    for start, end, label in annot["entities"]:
        # Map the character offsets onto tokens. With alignment_mode="contract"
        # offsets that fall inside a token are snapped inward to token
        # boundaries; char_span() returns None when no token fits.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)

    # Attach the gold entity spans to the doc and queue it for serialization.
    doc.ents = ents
    db.add(doc)

# save the docbin object
db.to_disk("training_data.spacy")

100%|██████████| 268/268 [00:02<00:00, 123.76it/s]


In [10]:
# Converting test data into .spacy format

# Use a dedicated DocBin for the validation split. The shared `db` already
# holds every training document, so appending to it would write the training
# examples into validation_data.spacy as well and inflate the scores that
# `spacy train` reports on the dev set.
validation_db = DocBin()

for text, annot in tqdm(validation_data['annotations']):
    # make_doc() only tokenizes the text; no pipeline components are run.
    doc = nlp.make_doc(text)
    ents = []

    for start, end, label in annot["entities"]:
        # Map the character offsets onto tokens. With alignment_mode="contract"
        # offsets that fall inside a token are snapped inward to token
        # boundaries; char_span() returns None when no token fits.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)

    # Attach the gold entity spans to the doc and queue it for serialization.
    doc.ents = ents
    validation_db.add(doc)

# save the docbin object
validation_db.to_disk("validation_data.spacy")

100%|██████████| 30/30 [00:00<00:00, 365.87it/s]


In [12]:
# Getting the config file for training
# Generates config.cfg for an English ("en") pipeline with a single `ner`
# component, using the accuracy-optimised defaults.
# NOTE: when config.cfg already exists, spaCy does not overwrite it and only
# reports an error; pass --force (or -F) if the file should be regenerated.

!python -m spacy init config config.cfg --lang en --pipeline ner --optimize accuracy


[38;5;1m✘ The provided output file already exists. To force overwriting the
config file, set the --force or -F flag.[0m



In [13]:
# Training
# Runs `spacy train` with config.cfg through the IPython shell escape.
#   --paths.train / --paths.dev : the .spacy files written by the conversion cells
#   --output ./                 : output directory (the following cell loads "model-best" from the working directory)
# Alternative without the `!` shell magic, kept for reference:
# import os 

# os.system("python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./validation_data.spacy")

!python -m spacy train ./config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./validation_data.spacy

^C


In [14]:
# Loading best model
# Loads a pipeline from the "model-best" directory, resolved relative to the working directory.
# NOTE(review): presumably this is the directory written by `spacy train --output ./` — confirm the training run finished before relying on it.

nlp_ner = spacy.load("model-best") 

In [15]:
# Sample certificate sentence used to try out the loaded NER pipeline.
text = "In recognition of their exceptional performance in the Artistic Swimming, Andrew Peterson has been awarded the 2nd Position in the 2009 competition. We extend our hearty congratulations and best wishes for all future endeavors!"

In [16]:
# Run the loaded pipeline on the sample text; the resulting Doc is rendered in the next cell.
doc = nlp_ner(text)

In [17]:
# Visualise the entities of `doc` with displaCy's "ent" style; jupyter=True requests inline notebook rendering.
spacy.displacy.render(doc, style="ent", jupyter=True)