# Named entity recognition with spaCy


## Import libraries

In [1]:
import numpy as np
import pandas as pd
import spacy
import json
# References:
# - Tutorial (custom NER with spaCy v3): https://www.newscatcherapi.com/blog/train-custom-named-entity-recognition-ner-model-with-spacy-v3
# - spaCy training quickstart: https://spacy.io/usage/training#quickstart
# - NER Annotator tool: https://tecoholic.github.io/ner-annotator/

## Load training data 

In [2]:
# NOTE(review): a later cell rebinds `nlp` to spacy.blank("en") before this
# pretrained pipeline is used anywhere visible — confirm this load is still needed.
nlp = spacy.load('en_core_web_sm')
# Read the annotation export explicitly as UTF-8. The entity annotations are
# character offsets into the text, so decoding with a platform-default codec
# (e.g. cp1252 on Windows) could garble non-ASCII characters and shift the spans.
with open('NER1.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [3]:
# Keep the label set plus the first annotated example only: element 0 of that
# example is the raw text, element 1 is a mapping holding its 'entities' list.
training_data = {
    'classes': data['classes'],
    'annotations': {
        'text': data['annotations'][0][0],
        'entities': data['annotations'][0][1]['entities'],
    },
}

## Create a blank pipeline and convert the annotations to spaCy format

In [4]:
from spacy.tokens import DocBin
from tqdm import tqdm

nlp = spacy.blank("en") # start from a blank English pipeline (nothing pretrained is loaded here)
doc_bin = DocBin() # container that collects Doc objects so they can be written to disk for training

In [5]:
from spacy.util import filter_spans

In [6]:
# Convert the annotated example into a spaCy Doc carrying gold entity spans,
# then serialize it as the training corpus.
text = training_data['annotations']['text']
labels = training_data['annotations']['entities']
doc = nlp.make_doc(text)
ents = []
for start, end, label in labels:
    # "contract" snaps the character range inward to token boundaries;
    # char_span returns None when no whole token lies inside the range.
    span = doc.char_span(start, end, label=label, alignment_mode="contract")
    if span is None:
        # Say which annotation was dropped so it can be corrected in the source JSON.
        print(f"Skipping entity: {label} [{start}:{end}] {text[start:end]!r}")
    else:
        ents.append(span)
# doc.ents cannot hold overlapping spans, so remove duplicates/overlaps first.
filtered_ents = filter_spans(ents)
doc.ents = filtered_ents
doc_bin.add(doc)
doc_bin.to_disk("training_data.spacy") # save the docbin object

Skipping entity


## Train the model from the command line

In [7]:
# Expand base_config.cfg into a complete training config, saved as config.cfg.
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [9]:
# Train with config.cfg and save the output into the current directory.
# NOTE(review): --paths.dev points at the same file as --paths.train, so the
# ENTS_F / ENTS_P / ENTS_R scores are measured on the training data itself and
# do not indicate how the model generalizes — use a held-out dev set if available.
!python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    644.66    0.00    0.00    0.00    0.00
200     200       5714.89  21975.95   99.53   99.07  100.00    1.00
400     400         30.26     87.75  100.00  100.00  100.00    1.00
600     600         32.40      8.50  100.00  100.00  100.00    1.00
800     800         13.92      4.14  100.00  100.00  100.00    1.00
1000    1000         76.56     15.91  100.00  100.00  100.00    1.00
1200    1200        148.79     19.86  100.00  100.00  100.00    1.00
1400    1400         99.84     10.51  100.00  100.00  100.00    1.00
1600    1600         84.17     17.83  100.00  100.00  100.00    1.00
1800    1800        401.50     36.00  100.00  1

## Test new model 

In [2]:
# Load the pipeline stored under "model-best" (presumably the best checkpoint
# written by the training run into the working directory — confirm the path).
nlp_ner = spacy.load("model-best")

In [3]:
# Read the CSV and keep only its 'Content' column, then show the first entry.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere as-is.
df = pd.read_csv(r"D:\Eclipse\sele-java\blockchain.csv")['Content']
df[0]

'Introduction This blog post will discuss the step from blockchain generation 3.0 to 4.0 and how DA (Data Analytics) and ML (Machine Learning) can augment blockchain-built solutions. The ﬁrst generation of Blockchain was mainly around the use of basic capabilities and cryptocurrencies or Fintech use, whilst in the second generation, application logic was added in the form of code-based smart contracts to widen its applicability. The third generation was much more about scalability, interoperability and creating good user interfaces to level up with existing business applications. Today, we are looking at generation 4.0, focusing on cross-industry adoption and making enterprise blockchain more usable in real-life business[1]. This raises the question of how more value can be added to these blockchain applications to meet business user expectations. With the growing adoption, the volume of transactions is growing rapidly and a whole new data lake of information is created. Leveraging thi

In [4]:
from spacy import displacy
# Pick the entry labelled 3 from the 'Content' series and run the trained NER pipeline on it.
test = df[3]
doc = nlp_ner(test)

tuple

In [5]:
# Group the predicted entities' (start_char, end_char) offsets by label,
# keeping labels in order of first appearance.
ans = {}
for ent in doc.ents:
    ans.setdefault(ent.label_, []).append((ent.start_char, ent.end_char))

# Print a header line per label followed by its "(start, end); " pairs.
for label, offsets in ans.items():
    print("\n" + label + ": ")
    print("".join(f"{pair}; " for pair in offsets), end="")


# Persist the grouped offsets as JSON.
file_name = "Predict_NER3.json"
with open(file_name, 'w') as f:
    json.dump(ans, f)


BLOCKCHAIN_TECH: 
(274, 281); (409, 419); (691, 701); (1292, 1296); (1310, 1321); (2157, 2164); (2493, 2497); (2640, 2650); (2790, 2800); (3072, 3082); (3462, 3472); (3507, 3517); (4367, 4377); (5154, 5164); (5403, 5413); (5563, 5568); (5795, 5800); 
PERSON: 
(349, 369); (1641, 1645); (1907, 1911); (2083, 2092); (3240, 3248); (4846, 4859); 
DATA_AI: 
(1049, 1053); (1554, 1561); (2530, 2544); (2903, 2914); (3610, 3615); (3616, 3623); (3625, 3633); (3690, 3697); (4245, 4254); (6027, 6041); 
ORDINAL: 
(3821, 3826); 