<a href="https://www.kaggle.com/code/wassimchouchen/ner-pos-tag-and-embedding?scriptVersionId=97005607" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt 
import matplotlib.style as stl 
from tqdm import tqdm
from tensorflow import keras 
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
import string
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import Dense, LSTM, Embedding,Dropout,SpatialDropout1D,Conv1D,MaxPooling1D,GRU,BatchNormalization
import warnings
warnings.filterwarnings('ignore')
stl.use('ggplot')

In [2]:
# Load the sample CSV into a DataFrame.
csv_path = "../input/sample/sample-2.csv"
data = pd.read_csv(csv_path)

In [3]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0,document_id,status,document_url,media_type,media_link,relative_media_path,is_media_downloaded,transcription_text,transcription_text_relative_path,from_time,to_time,project_id,Category,Unnamed: 13,Unnamed: 14
0,NMAAHC-007676813_00577,Complete,https://transcription.si.edu/transcribe/32203/...,image,https://ids.si.edu/ids/deliveryService?max_w=&...,/32203/image/NMAAHC-007676813_00577.jpg,1,"8 E encl\r\n\r\nI Anne H. Elliott, widow do so...",/32203/transcriptions/NMAAHC-007676813_00577.txt,\N,\N,32203,Oaths/Pardons,,
1,NMAAHC-007676818_00567,Complete,https://transcription.si.edu/transcribe/32796/...,image,https://ids.si.edu/ids/deliveryService?max_w=&...,/32796/image/NMAAHC-007676818_00567.jpg,1,I E C Whaley solemnly swear that I have carefu...,/32796/transcriptions/NMAAHC-007676818_00567.txt,\N,\N,32796,Oaths/Pardons,,
2,NMAAHC-007676815_00488-000001,Complete,https://transcription.si.edu/transcribe/32218/...,image,https://ids.si.edu/ids/deliveryService?max_w=&...,/32218/image/NMAAHC-007676815_00488-000001.jpg,1,43M\r\nI Mrs E Montgomery solemnly Swear that ...,/32218/transcriptions/NMAAHC-007676815_00488-0...,\N,\N,32218,Oaths/Pardons,,
3,NMAAHC-007677400_00375,Complete,https://transcription.si.edu/transcribe/26149/...,image,https://ids.si.edu/ids/deliveryService?max_w=&...,/26149/image/NMAAHC-007677400_00375.jpg,1,"No. 2324\r\nI do solemnly swear or affirm, in ...",/26149/transcriptions/NMAAHC-007677400_00375.txt,\N,\N,26149,Oaths/Pardons,,
4,NMAAHC-007677378_00480,Complete,https://transcription.si.edu/transcribe/27345/...,image,https://ids.si.edu/ids/deliveryService?max_w=&...,/27345/image/NMAAHC-007677378_00480.jpg,1,August 24. 1865\r\nAPPLICATION AND CERTIFICATE...,/27345/transcriptions/NMAAHC-007677378_00480.txt,\N,\N,27345,Oaths/Pardons,,


In [4]:
# Keep only the transcription column; later cells index into it by row label.
text = data.loc[:, "transcription_text"]

*Extract the `transcription_text` column from the dataset.*

In [5]:
# Preview the first transcriptions.
text.head()

0    8 E encl\r\n\r\nI Anne H. Elliott, widow do so...
1    I E C Whaley solemnly swear that I have carefu...
2    43M\r\nI Mrs E Montgomery solemnly Swear that ...
3    No. 2324\r\nI do solemnly swear or affirm, in ...
4    August 24. 1865\r\nAPPLICATION AND CERTIFICATE...
Name: transcription_text, dtype: object

*Trying NLTK*

In [6]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

Extract the tokens from a single transcription (row 1)

In [7]:
# Split the transcription at row label 1 into a list of tokens.
tokens = list(word_tokenize(text[1]))

In [8]:
# Show the first ten tokens.
tokens[:10]

['I',
 'E',
 'C',
 'Whaley',
 'solemnly',
 'swear',
 'that',
 'I',
 'have',
 'carefully']

**Part-of-speech (POS) tag of each token**

In [9]:
 # Pair every token with its part-of-speech tag -> list of (word, tag) tuples.
 word_tag = pos_tag(tokens)

In [10]:
# Show the first ten (word, POS tag) pairs.
word_tag[:10]

[('I', 'PRP'),
 ('E', 'VBP'),
 ('C', 'NNP'),
 ('Whaley', 'NNP'),
 ('solemnly', 'RB'),
 ('swear', 'VBP'),
 ('that', 'IN'),
 ('I', 'PRP'),
 ('have', 'VBP'),
 ('carefully', 'RB')]

****

**Transforming the tagged tree into CoNLL-style column tags and adding the IOB (inside, outside, begin) tag**

In [11]:
from nltk.chunk import  tree2conlltags
# tree2conlltags expects a chunk Tree. Handing it the flat (word, tag) list
# leaves every token outside any chunk, so every IOB tag came out as "O".
# Run the named-entity chunker first so entity tokens receive B-/I- tags.
iob_tag = tree2conlltags(nltk.ne_chunk(word_tag))
iob_tag[:5]

[('I', 'PRP', 'O'),
 ('E', 'VBP', 'O'),
 ('C', 'NNP', 'O'),
 ('Whaley', 'NNP', 'O'),
 ('solemnly', 'RB', 'O')]

**Creating a DataFrame from the first 20 tokens with their part-of-speech and IOB tags, for later use**

In [12]:
# Split the first 20 (word, pos, iob) triples into three parallel lists.
# Slicing (instead of indexing range(20)) also copes with a document that
# has fewer than 20 tokens, where the index loop raised IndexError.
first_rows = iob_tag[:20]
l1 = [word for word, _, _ in first_rows]
l2 = [pos for _, pos, _ in first_rows]
l3 = [iob for _, _, iob in first_rows]

In [13]:
# Assemble the three parallel lists into one table, one row per token.
token_columns = {
    "entities": l1,
    "pos_tag": l2,
    "iob_tag": l3,
}
df = pd.DataFrame(token_columns)

In [14]:
# Preview the token table.
df.head()

Unnamed: 0,entities,pos_tag,iob_tag
0,I,PRP,O
1,E,VBP,O
2,C,NNP,O
3,Whaley,NNP,O
4,solemnly,RB,O


**Trying another method, `ne_chunk`, which can be more accurate and helpful for identifying PERSON, ORGANIZATION and GPE entities**

In [15]:
# Tokenize and POS-tag the transcription at row label 10, then chunk it into
# named entities and print the first ten nodes of the resulting tree.
tagged_tokens = pos_tag(word_tokenize(text[10]))
ne_tree = nltk.ne_chunk(tagged_tokens)
print(ne_tree[:10])

[('[', 'NN'), ('[', 'NNP'), ('image', 'NN'), ('-', ':'), ('eagle', 'NN'), (']', 'NN'), (']', 'NNP'), ('AMNESTY', 'NNP'), ('OATH', 'NNP'), ('.', '.')]


# spaCy

*SpaCy’s named entity recognition has been trained on the OntoNotes 5 corpus and it supports the following entity types:*

![](https://miro.medium.com/max/1400/1*qQggIPMugLcy-ndJ8X_aAA.png)

importing the necessary libraries

In [16]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the en_core_web_sm pipeline; `nlp` is called by the spaCy cells below.
nlp = en_core_web_sm.load()

**Extracting the useful information about each entity in the text from the `Doc` returned by `nlp(text)`**

In [17]:
L1=[]
def NER (txt, n_docs=20):
    """Collect the named entities of the first ``n_docs`` documents into ``L1``.

    For every processed document one list is appended to the module-level
    ``L1``. Each list holds a ``(text, label, explanation)`` tuple per entity
    found by ``nlp``; the explanation comes from ``spacy.explain``.

    Parameters
    ----------
    txt : indexable collection of str
        Documents, addressed as ``txt[0]``, ``txt[1]``, ...
    n_docs : int, default 20
        Upper bound on the number of documents processed. It is capped at
        ``len(txt)`` so inputs with fewer documents no longer raise.

    Returns
    -------
    None
        Results are appended to ``L1``.
    """
    for i in range(min(n_docs, len(txt))):
        nlp_text=nlp(txt[i])
        L1.append([(word.text, word.label_,spacy.explain(word.label_)) for word in nlp_text.ents])

In [18]:
L2=[]
def NER1 (txt, n_docs=20):
    """Collect per-token entity information of the first ``n_docs`` documents into ``L2``.

    For every processed document one list is appended to the module-level
    ``L2``. Each list holds a ``(token, ent_iob_, ent_type_, pos)`` tuple per
    token, where ``token`` is the spaCy token object itself (not its text).

    Parameters
    ----------
    txt : indexable collection of str
        Documents, addressed as ``txt[0]``, ``txt[1]``, ...
    n_docs : int, default 20
        Upper bound on the number of documents processed. It is capped at
        ``len(txt)`` so inputs with fewer documents no longer raise.

    Returns
    -------
    None
        Results are appended to ``L2``.
    """
    for i in range(min(n_docs, len(txt))):
        nlp_text=nlp(txt[i])
        # NOTE(review): `word.pos` is the integer ID of the POS tag; the
        # readable string is `word.pos_`. Kept as-is so the tuple layout that
        # later cells index into stays unchanged.
        L2.append([(word, word.ent_iob_, word.ent_type_, word.pos) for word in nlp_text])

In [19]:
# NER and NER1 append to the module-level lists, so empty them first:
# re-running this cell would otherwise store every document twice.
L1.clear()
L2.clear()
NER(text)
NER1(text)

In [20]:
# Entities found in the first document: (text, label, explanation) tuples.
L1[0]

[('Anne H. Elliott', 'PERSON', 'People, including fictional'),
 ('Amnesty', 'ORG', 'Companies, agencies, institutions, etc.'),
 ('Andrew Johnson', 'PERSON', 'People, including fictional'),
 ('the United States of America', 'GPE', 'Countries, cities, states'),
 ('May 29th 1865', 'DATE', 'Absolute or relative dates or periods'),
 ('one', 'CARDINAL', 'Numerals that do not fall under another type'),
 ('fourteen', 'CARDINAL', 'Numerals that do not fall under another type'),
 ('1865', 'DATE', 'Absolute or relative dates or periods'),
 ('S. Wilde Durson', 'PERSON', 'People, including fictional'),
 ('Magt', 'GPE', 'Countries, cities, states'),
 ('2 cent', 'MONEY', 'Monetary values, including unit'),
 ('Anne H Elliott', 'PERSON', 'People, including fictional'),
 ('United States of America', 'GPE', 'Countries, cities, states'),
 ('Anne H Elliott', 'PERSON', 'People, including fictional'),
 ('the District of Beaufort', 'GPE', 'Countries, cities, states'),
 ('State', 'ORG', 'Companies, agencies, i

**Structuring the extracted information in a DataFrame for use in the next stage**

In [21]:
# Flatten the per-document entity lists in L1 into three parallel columns.
flat_entities = [entity for doc_entities in L1 for entity in doc_entities]
l0 = [entity[0] for entity in flat_entities]  # entity text
l1 = [entity[1] for entity in flat_entities]  # entity label
l2 = [entity[2] for entity in flat_entities]  # explanation of the label
   

In [22]:
# One row per extracted entity; "frequency" starts as an empty-string
# placeholder that is filled in further down.
entity_table = {
    "entities": l0,
    "label": l1,
    "explain": l2,
    "frequency": "",
}
df1 = pd.DataFrame(entity_table)
df1.head()


Unnamed: 0,entities,label,explain,frequency
0,Anne H. Elliott,PERSON,"People, including fictional",
1,Amnesty,ORG,"Companies, agencies, institutions, etc.",
2,Andrew Johnson,PERSON,"People, including fictional",
3,the United States of America,GPE,"Countries, cities, states",
4,May 29th 1865,DATE,Absolute or relative dates or periods,


In [23]:
# (number of extracted entities, number of columns)
df1.shape

(394, 4)

**Adding the frequency of each entity in the text; it may help improve the model later**

In [24]:
# Try Counter on the first ten entity strings.
Counter(df1["entities"].head(10))

Counter({'Anne H. Elliott': 1,
         'Amnesty': 1,
         'Andrew Johnson': 1,
         'the United States of America': 1,
         'May 29th 1865': 1,
         'one': 1,
         'fourteen': 1,
         '1865': 1,
         'S. Wilde Durson': 1,
         'Magt': 1})

In [25]:
# Occurrence count of every entity string.
# NOTE(review): the name `dict` shadows the builtin for the rest of the
# notebook; it is kept because the frequency cell below looks it up by name.
dict = Counter(df1["entities"].tolist())

In [26]:
# Attach to every row the number of times its entity string occurs.
# A single column assignment replaces the row-by-row
# `df1["frequency"][i] = ...` chained assignment, which pandas may apply to a
# temporary copy and thereby leave df1 unchanged.
entity_counts = df1["entities"].value_counts()
df1["frequency"] = df1["entities"].map(entity_counts)

In [27]:
# Display the entity table with the frequency column filled in.
df1

Unnamed: 0,entities,label,explain,frequency
0,Anne H. Elliott,PERSON,"People, including fictional",1
1,Amnesty,ORG,"Companies, agencies, institutions, etc.",2
2,Andrew Johnson,PERSON,"People, including fictional",2
3,the United States of America,GPE,"Countries, cities, states",7
4,May 29th 1865,DATE,Absolute or relative dates or periods,1
...,...,...,...,...
389,Warren Co Miss,PERSON,"People, including fictional",1
390,1865,DATE,Absolute or relative dates or periods,14
391,Frank E. Miller,PERSON,"People, including fictional",2
392,66th,ORDINAL,"""first"", ""second"", etc.",3


In [28]:
# Look up a single entity string by row label.
df1["entities"].loc[1]

'Amnesty'

In [29]:
# First five (token, IOB flag, entity type, POS id) tuples of the second document.
L2[1][:5]

[(I, 'O', '', 95),
 (E, 'O', '', 92),
 (C, 'O', '', 96),
 (Whaley, 'B', 'PERSON', 96),
 (solemnly, 'O', '', 86)]

In [30]:
# First ten (token, IOB flag, entity type, POS id) tuples of the first document.
L2[0][:10]

[(8, 'O', '', 93),
 (E, 'O', '', 92),
 (encl, 'O', '', 92),
 (
  , 'O', '', 103),
 (I, 'O', '', 100),
 (Anne, 'B', 'PERSON', 96),
 (H., 'I', 'PERSON', 96),
 (Elliott, 'I', 'PERSON', 96),
 (,, 'O', '', 97),
 (widow, 'O', '', 92)]

In [31]:
# Flatten the per-document token lists in L2 into three parallel columns.
# Iterating over L2 itself (rather than a fixed range(20)) keeps this in step
# with however many documents were actually processed and avoids an
# IndexError when there are fewer than 20.
l1=[]
l2=[]
l3=[]
for doc_tokens in L2:
    for token, iob_flag, ent_type, _pos_id in doc_tokens:
        l1.append(token)
        l2.append(iob_flag)
        l3.append(ent_type)

In [32]:
# One row per token; "pos_tag" starts as an empty-string placeholder that is
# filled in by the next step.
token_table = {
    "entities": l1,
    "iob_tag": l2,
    "label": l3,
    "pos_tag": "",
}
df2 = pd.DataFrame(token_table)
df2.head(10)


Unnamed: 0,entities,iob_tag,label,pos_tag
0,8,O,,
1,E,O,,
2,encl,O,,
3,\r\n\r\n,O,,
4,I,O,,
5,Anne,B,PERSON,
6,H.,I,PERSON,
7,Elliott,I,PERSON,
8,",",O,,
9,widow,O,,


**adding pos_tag to the dataframe**

In [33]:
# The "entities" column holds spaCy token objects, so the POS tag string can
# be read straight from each one. Assigning the whole column at once replaces
# the per-row `df2["pos_tag"].loc[i] = ...` chained assignment, which pandas
# may apply to a temporary copy and thereby leave df2 unchanged.
L = [token.pos_ for token in df2["entities"]]
df2["pos_tag"] = L

In [34]:
# Display the token table with the POS tag column filled in.
df2

Unnamed: 0,entities,iob_tag,label,pos_tag
0,8,O,,NUM
1,E,O,,NOUN
2,encl,O,,NOUN
3,\r\n\r\n,O,,SPACE
4,I,O,,VERB
...,...,...,...,...
3761,\r\n\r\n,O,,SPACE
3762,[,O,,X
3763,DUPLICATE,O,,X
3764,],O,,PUNCT


Trying to better understand the use and advantages of `displacy.render` in spaCy

In [35]:
# Run the pipeline on the transcription at row label 1 and keep its sentences.
doc = nlp(text[1])
sentences = list(doc.sents)
print(sentences)

[I E C Whaley solemnly swear that I have carefully read the Amnesty
Proclamation issued by Andrew Johnson, President of the United State of America, on May 29th, 1865, and that I am not excepted from the benefits of the Proclamation by any one of the fourteen exceptions therein made [[Strikethrough]] except [[strikethrough/]] 
E C Whaley
Sworn to and subscribed before me at Charleston SC this 28
day of Sept 1865

Levi Steuben
Maj. 47th P.M.

United States of America
E C Whaley of the city, [[strikethrough]] county [[strikethrough/]] Charleston Sate of South Ca do solemnly swear in the presence of the Almighty God, that I will henceforth faithfully support and defend the Constitution of the United States, and the Union of the States thereunder, and that I will, in like manner, abide by and faithfully support all laws and proclamations which have been made during the existing rebellion with reference to the Emancipation of Slaves-SO HELP ME GOD

E C Whaley
Sworn to and subsc

In [36]:
# str(df1.entities) is the printed repr of the Series (row numbers, possibly
# an elided middle and a name/dtype footer), not the entity strings
# themselves. Join the actual values, one per line, before running the
# pipeline and rendering its entities.
entity_text = "\n".join(df1["entities"].astype(str))
displacy.render(nlp(entity_text), style='ent')

In [37]:
# str(sentences) is the repr of a Python list (surrounding brackets, comma
# separators). Join the sentence texts instead so that only the transcription
# itself is parsed and rendered.
sentence_text = " ".join(sent.text for sent in sentences)
displacy.render(nlp(sentence_text), style='ent')