# NAMED ENTITY RECOGNITION:

1. The named entities are pre-defined categories chosen according to the use case such as names of people, organizations, places, codes, time notations, monetary values, etc.

1. NER aims to assign a class to each token (usually a single word) in a sequence. Because of this, NER is also referred to as token classification.

In [2]:
!pip install simpletransformers





In [1]:
import pandas as pd
data = pd.read_csv("ner_dataset.csv",encoding="latin1" )

In [2]:
data.head(30)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
5,,through,IN,O
6,,London,NNP,B-geo
7,,to,TO,O
8,,protest,VB,O
9,,the,DT,O


In [3]:
data =data.fillna(method ="ffill")

In [4]:
data.head(30)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
5,Sentence: 1,through,IN,O
6,Sentence: 1,London,NNP,B-geo
7,Sentence: 1,to,TO,O
8,Sentence: 1,protest,VB,O
9,Sentence: 1,the,DT,O


In [5]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [6]:
data["Sentence #"] = LabelEncoder().fit_transform(data["Sentence #"] )

In [7]:
data.head(30)

Unnamed: 0,Sentence #,Word,POS,Tag
0,0,Thousands,NNS,O
1,0,of,IN,O
2,0,demonstrators,NNS,O
3,0,have,VBP,O
4,0,marched,VBN,O
5,0,through,IN,O
6,0,London,NNP,B-geo
7,0,to,TO,O
8,0,protest,VB,O
9,0,the,DT,O


In [8]:
data.rename(columns={"Sentence #":"sentence_id","Word":"words","Tag":"labels"}, inplace =True)

In [9]:
data["labels"] = data["labels"].str.upper()

In [10]:
X= data[["sentence_id","words"]]
Y =data["labels"]

In [11]:
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size =0.2)

In [12]:
#building up train data and test data
train_data = pd.DataFrame({"sentence_id":x_train["sentence_id"],"words":x_train["words"],"labels":y_train})
test_data = pd.DataFrame({"sentence_id":x_test["sentence_id"],"words":x_test["words"],"labels":y_test})

In [13]:
train_data

Unnamed: 0,sentence_id,words,labels
979802,38673,entered,O
757591,27365,portion,O
537788,16231,terrorists,O
624549,20609,the,O
220163,92,his,O
...,...,...,...
439641,11205,after,O
928204,36003,China,B-GEO
731602,26051,Kirkuk,B-GEO
628020,20783,on,O


# Model Training


In [14]:
from simpletransformers.ner import NERModel,NERArgs

2024-02-03 17:44:30.224454: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-03 17:44:30.458839: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [15]:
label = data["labels"].unique().tolist()
label

['O',
 'B-GEO',
 'B-GPE',
 'B-PER',
 'I-GEO',
 'B-ORG',
 'I-ORG',
 'B-TIM',
 'B-ART',
 'I-ART',
 'I-PER',
 'I-GPE',
 'I-TIM',
 'B-NAT',
 'B-EVE',
 'I-EVE',
 'I-NAT']

In [16]:
args = NERArgs()
args.num_train_epochs = 1
args.learning_rate = 1e-4
args.overwrite_output_dir =True
args.train_batch_size = 32
args.eval_batch_size = 32

In [17]:
model = NERModel('bert', 'bert-base-cased',labels=label,args =args,use_cuda = False )


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model.train_model(train_data,eval_data = test_data,acc=accuracy_score)

  return [


  0%|          | 0/13 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 1 of 1:   0%|          | 0/1499 [00:00<?, ?it/s]

In [20]:
result, model_outputs, preds_list = model.eval_model(test_data)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=46695.0), HTML(value='')))




HBox(children=(HTML(value='Running Evaluation'), FloatProgress(value=0.0, max=1460.0), HTML(value='')))




In [21]:
result

{'eval_loss': 0.17127630058676005,
 'f1_score': 0.7936661066848524,
 'precision': 0.8275694613063235,
 'recall': 0.7624312923430909}

In [22]:
prediction, model_output = model.predict(["What is the new name of Bangalore"])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Running Prediction'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [23]:
prediction

[[{'What': 'O'},
  {'is': 'O'},
  {'the': 'O'},
  {'new': 'O'},
  {'name': 'O'},
  {'of': 'O'},
  {'Bangalore': 'B-GEO'}]]