<img src="http://cfs22.simplicdn.net/ice9/new_logo.svgz "/>

# PG AI - Natural Language Processing and Speech Recognition
# Assisted Practice: Extract City and Person Name from Text

DESCRIPTION

In this demo, we will show you how to create a Custom Named Entity Recognition (NER) using spaCy.<br>
NER  is a subtask of information extraction (IE) that seeks and categorises specified entities in a body or bodies of texts. NER is also simply known as entity identification, entity chunking, and entity extraction.<br>
On the other hand, spaCy is an open-source library for advanced NLP  in Python. It is designed specifically for production and use and helps build applications that process and “understand” large volumes of text. It can be used to build information extraction or natural language understanding systems, or to preprocess text for deep learning.<br>

By Edson Teixeira<br>
teixeiraedson252@gmail.com <br>
December 29th 2021

In [1]:
# Step 1: Import required libraries
import plac
import logging
import argparse
import sys
import json
import os
import json
import pickle

In [2]:
!python -m spacy download en_core_web_md

  File "/usr/local/lib/python3.7/site.py", line 177
    file=sys.stderr)
        ^
SyntaxError: invalid syntax


In [3]:
# Step 2: Data Preprocessing
def tsv_to_json_format(input_path,output_path,unknown_label):
    try:
        f=open(input_path,'r', encoding='cp1252') # input file
        fp=open(output_path, 'w', encoding='cp1252') # output file
        data_dict={}
        annotations =[]
        label_dict={}
        s=''
        start=0
        for line in f:
            if line[0:len(line)-1]!='.\tO':
                word,entity=line.split('\t')
                s+=word+" "
                entity=entity[:len(entity)-1]
                if entity!=unknown_label:
                    if len(entity) != 1:
                        d={}
                        d['text']=word
                        d['start']=start
                        d['end']=start+len(word)-1  
                        try:
                            label_dict[entity].append(d)
                        except:
                            label_dict[entity]=[]
                            label_dict[entity].append(d) 
                start+=len(word)+1
            else:
                data_dict['content']=s
                s=''
                label_list=[]
                for ents in list(label_dict.keys()):
                    for i in range(len(label_dict[ents])):
                        if(label_dict[ents][i]['text']!=''):
                            l=[ents,label_dict[ents][i]]
                            for j in range(i+1,len(label_dict[ents])): 
                                if(label_dict[ents][i]['text']==label_dict[ents][j]['text']):  
                                    di={}
                                    di['start']=label_dict[ents][j]['start']
                                    di['end']=label_dict[ents][j]['end']
                                    di['text']=label_dict[ents][i]['text']
                                    l.append(di)
                                    label_dict[ents][j]['text']=''
                            label_list.append(l)                          
                            
                for entities in label_list:
                    label={}
                    label['label']=[entities[0]]
                    label['points']=entities[1:]
                    annotations.append(label)
                data_dict['annotation']=annotations
                annotations=[]
                json.dump(data_dict, fp)
                fp.write('\n')
                data_dict={}
                start=0
                label_dict={}
    except Exception as e:
        logging.exception("Unable to process file" + "\n" + "error = " + str(e))
        return None

tsv_to_json_format("ner_dataset.tsv",'ner_corpus_260_training.json','abc')

In [4]:
# Convert the above data into format needed by spaCy
def main(input_file=None, output_file=None):
        training_data = []
        lines=[]
        with open(input_file, 'r') as f:
            lines = f.readlines()

        for line in lines:
            data = json.loads(line)
            text = data['content']
            entities = []
            for annotation in data['annotation']:
                point = annotation['points'][0]
                labels = annotation['label']
                if not isinstance(labels, list):
                    labels = [labels]

                for label in labels:
                    entities.append((point['start'], point['end'] + 1 ,label))


            training_data.append((text, {"entities" : entities}))

        #print(training_data)

        with open(output_file, 'w') as fp:
            json.dump(training_data, fp)

main("ner_corpus_260.json","ner_corpus_260_training.json")

In [5]:
# Step 5: Training spaCy NER with Custom Entities
from __future__ import unicode_literals, print_function
import pickle
import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
import en_core_web_sm
import spacy
from spacy.cli.download import download
#download(model="en_core_web_md")
spacy.load("en_core_web_md")

# New entity labels
# Specify the new entity labels which you want to add here
LABEL = ['I-geo', 'B-geo', 'I-art', 'B-art', 'B-tim', 'B-nat', 'B-eve', 'O', 'I-per', 'I-tim', 'I-nat', 'I-eve', 'B-per', 'I-org', 'B-gpe', 'B-org', 'I-gpe']

"""
geo = Geographical Entity
org = Organization
per = Person
gpe = Geopolitical Entity
tim = Time indicator
art = Artifact
eve = Event
nat = Natural Phenomenon
"""
# Loading training data 
with open ('ner_corpus_260_training.json', 'rb') as fp:
    TRAIN_DATA = json.load(fp)


def main(model, new_model_name, output_dir, n_iter=10):
    """Setting up the pipeline and entity recognizer, and training the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spacy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    reset_weights = False    
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
        reset_weights = True
    else:
        ner = nlp.get_pipe('ner')

    for i in LABEL:
        ner.add_label(i)   # Add new entity labels to entity recognizer

    if model is None or reset_weights:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.entity.create_optimizer()

    # Get names of other pipes to disable them during training to train only NER
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
            
    # Test the trained model
    test_text = 'Gianni Infantino is the president of FIFA.'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # Save model 
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # Test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)


main('en_core_web_md',"new_model","",10)

Loaded model 'en_core_web_md'
Entities in 'Gianni Infantino is the president of FIFA.'
PERSON Gianni Infantino
ORG FIFA
Saved model to .
Loading from .
PERSON Gianni Infantino
ORG FIFA
