<a href="https://colab.research.google.com/github/vasudevmalusare/AWS_ML_Projects/blob/main/Custom_NER_with_Spacy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install -U spacy -q 

In [2]:
# Print the installed spaCy version, install location, platform and the
# pipelines that are available, so the run can be reproduced later.
!python -m spacy info

2023-06-01 03:14:17.969006: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[1m

spaCy version    3.5.3                         
Location         /usr/local/lib/python3.10/dist-packages/spacy
Platform         Linux-5.15.107+-x86_64-with-glibc2.31
Python version   3.10.11                       
Pipelines        en_core_web_sm (3.5.0)        



In [3]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

# Blank English pipeline (nothing trained is loaded here); it is used below
# only to turn raw text into Doc objects via `nlp.make_doc`.
nlp = spacy.blank("en") # load a new spacy model
# DocBin collects Doc objects so they can be written to disk in one file.
db = DocBin() # create a DocBin object

In [4]:
# Locations of the JSON files holding the training and testing annotations.
# NOTE(review): absolute /content paths assume a Colab runtime — confirm
# before running elsewhere.
train_Path = "/content/training data.json"
test_path = "/content/testing data.json"

In [5]:
import json
# Parse both annotation files. The context managers close each file handle as
# soon as parsing is done instead of leaving it open for the kernel's
# lifetime; the encoding is stated explicitly because JSON text is UTF-8.
with open(train_Path, encoding="utf-8") as f:
    TRAIN_DATA = json.load(f)

with open(test_path, encoding="utf-8") as f2:
    TEST_DATA = json.load(f2)

In [6]:
def build_docbin(annotations, output_path):
    """Convert character-offset entity annotations into a DocBin saved on disk.

    Args:
        annotations: iterable of ``(text, annot)`` pairs, where
            ``annot["entities"]`` is an iterable of ``(start, end, label)``
            character offsets into ``text``.
        output_path: file the serialized DocBin is written to.

    Returns:
        The populated DocBin (already written to ``output_path``).
    """
    # A fresh DocBin per call keeps the cell idempotent: re-running it can
    # never append the same documents to a previously filled container.
    doc_bin = DocBin()
    for text, annot in tqdm(annotations):
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annot["entities"]:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                # No span could be built from these offsets; report it and
                # leave the entity out rather than failing the whole document.
                print("Skipping entity")
            else:
                ents.append(span)
        doc.ents = ents
        doc_bin.add(doc)
    doc_bin.to_disk(output_path)
    return doc_bin


# Same conversion for both splits; the names `db` and `db_test` are kept so
# any later cell that refers to them still works.
db = build_docbin(TRAIN_DATA['annotations'], "./training_data.spacy")
db_test = build_docbin(TEST_DATA['annotations'], "./testing_data.spacy")

100%|██████████| 239/239 [00:00<00:00, 2182.82it/s]
100%|██████████| 61/61 [00:00<00:00, 2786.70it/s]


In [7]:
# Generate a complete training config (config.cfg) for an English pipeline
# that contains only an NER component, using the "efficiency" preset.
! python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency 

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [8]:
# Train with the generated config and write the results to the current
# directory. Note that the testing DocBin is passed as the dev set here, so
# the scores printed during training are measured on the testing data.
! python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./testing_data.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[2023-06-01 03:14:50,926] [INFO] Set up nlp object from config
[2023-06-01 03:14:50,945] [INFO] Pipeline: ['tok2vec', 'ner']
[2023-06-01 03:14:50,950] [INFO] Created vocabulary
[2023-06-01 03:14:50,950] [INFO] Finished initializing nlp object
[2023-06-01 03:14:51,248] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     39.39    1.55    1.29    1.94    0.02
  2     200         46.57   1040.54   99.27   99.51   99.03    0.99
  4     400        118.18     82.76   99.76  100.00   99.51    1.00
  7     600         29.28     48.06   99.76  100.00   99.51    1.00
 11     800         37.91     58.34   99.76  100.00   99.51    

In [9]:
# Load the best checkpoint from the training run. The train command writes to
# `--output ./`, so the model is resolved relative to the working directory
# instead of assuming that directory is /content.
nlp_ner = spacy.load("./model-best")

In [10]:
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer
from string import punctuation

In [11]:
# Read the "Data-Pending" sheet without treating any row as a header, then
# give the first column (labelled 0 by pandas) a readable name.
df = pd.read_excel(
    '/content/NLP Engineer - Question 1.xlsx',
    sheet_name='Data-Pending',
    header=None,
).rename(columns={0: 'pending'})
df.head(3)

Unnamed: 0,pending
0,In recognition of their exceptional performanc...
1,"Eldridge Holloway, your outstanding performanc..."
2,In recognition of their exceptional performanc...


In [33]:
def clean_text(data):
    """Tokenize ``data`` with a regular-expression tokenizer.

    A token is either a run of ASCII letters and commas, or a run of digits
    and lowercase ASCII letters. Returns whatever
    ``RegexpTokenizer.tokenize`` yields for that pattern.
    """
    token_pattern = r'[,a-zA-Z]+|[0-9a-z]+'
    return RegexpTokenizer(token_pattern).tokenize(data)


def clean_data(data):
    """Return the tokens of ``data`` that pass the length/punctuation filter.

    A token is kept when it is not a substring of ``string.punctuation`` and
    is longer than two characters. The original order is preserved.
    """
    kept = []
    for token in data:
        # Same check order as a short-circuiting `and`: punctuation first.
        if token in punctuation:
            continue
        if len(token) <= 2:
            continue
        kept.append(token)
    return kept

def join_data(data):
    """Join the strings in ``data`` into one string separated by single spaces."""
    return ' '.join(data)

In [35]:
# Smoke-test the full clean -> NER chain on the first record only.
# Only that one string is cleaned, instead of tokenizing every row of the
# column and then discarding all but the first result.
first_tokens = clean_data(clean_text(df.pending[0]))
print(first_tokens)
# `clean` ends up as the joined string, exactly as before.
clean = join_data(first_tokens)
print(clean)

doc = nlp_ner(clean)
print(doc.ents)

spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter

['recognition', 'their', 'exceptional', 'performance', 'the', 'Athletics,', 'Erwin', 'Harper', 'has', 'been', 'awarded', 'the', '1st', 'Position', 'the', '2010', 'competition', 'extend', 'our', 'hearty', 'congratulations', 'and', 'best', 'wishes', 'for', 'all', 'future', 'endeavors']
recognition their exceptional performance the Athletics, Erwin Harper has been awarded the 1st Position the 2010 competition extend our hearty congratulations and best wishes for all future endeavors
(Athletics, Erwin Harper, 1st, 2010)


In [39]:
# Clean every pending sentence in one pass over the column:
# tokenize -> filter tokens -> re-join into a single string per row.
clean = (
    df.pending
    .apply(clean_text)
    .apply(clean_data)
    .apply(join_data)
)

In [41]:
# Number of entries currently held in `clean`.
len(clean)

452

In [37]:
from spacy import displacy

In [44]:
# Show every entity the model predicted, one rendering per entity.
# `nlp_ner.pipe` iterates over the texts directly, so the loop no longer
# depends on `clean` having labels 0..n-1 for `clean[i]` lookups.
for doc in nlp_ner.pipe(clean):
    for ent in doc.ents:
        # To restrict the display to one label, guard the call below with
        # e.g. `if ent.label_ == 'SPORT NAME':`.
        # Render the predicted span itself. Re-running the model on
        # `ent.text` alone would classify the fragment without its sentence
        # context and could show a different label, or none at all.
        displacy.render(ent.as_doc(), style="ent", jupyter=True)

In [18]:
# for wr in clean:
#   doc = nlp_ner(wr) # input pending data
#   ent = doc.ents:
#   if ent.label_ == 'SPORT NAME' or ent.label_ == 'GPE':
#         spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter


In [42]:
# Renders whichever `doc` was assigned most recently in the kernel. This cell
# does not create `doc` itself, so it depends on another cell having run first.
spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter

In [45]:
# Run the trained NER pipeline over each cleaned sentence and display the
# highlighted entities for every one of them in turn.
for wr in clean:
    doc = nlp_ner(wr)
    spacy.displacy.render(doc, style="ent", jupyter=True)