# NAMED ENTITY RECOGNITION:

1. The named entities are pre-defined categories chosen according to the use case such as names of people, organizations, places, codes, time notations, monetary values, etc.

1. NER aims to assign a class to each token (usually a single word) in a sequence. Because of this, NER is also referred to as token classification.

In [1]:
!pip3 install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.61.13-py3-none-any.whl (221 kB)
[?25l[K     |█▌                              | 10 kB 36.4 MB/s eta 0:00:01[K     |███                             | 20 kB 19.7 MB/s eta 0:00:01[K     |████▍                           | 30 kB 16.2 MB/s eta 0:00:01[K     |██████                          | 40 kB 13.6 MB/s eta 0:00:01[K     |███████▍                        | 51 kB 7.7 MB/s eta 0:00:01[K     |████████▉                       | 61 kB 7.5 MB/s eta 0:00:01[K     |██████████▍                     | 71 kB 7.9 MB/s eta 0:00:01[K     |███████████▉                    | 81 kB 8.8 MB/s eta 0:00:01[K     |█████████████▎                  | 92 kB 9.2 MB/s eta 0:00:01[K     |██████████████▉                 | 102 kB 7.4 MB/s eta 0:00:01[K     |████████████████▎               | 112 kB 7.4 MB/s eta 0:00:01[K     |█████████████████▊              | 122 kB 7.4 MB/s eta 0:00:01[K     |███████████████████▎            | 133 kB

In [2]:
import pandas as pd
data = pd.read_csv("symptoms_ner.csv")

In [3]:
data.head(10)

Unnamed: 0.1,Unnamed: 0,Sentence_Num,Word,Tag
0,0,Sentence: 0,i,B-o
1,1,Sentence: 0,did,I-o
2,2,Sentence: 0,not,I-o
3,3,Sentence: 0,meet,L-o
4,4,Sentence: 0,jin,U-first-name
5,5,Sentence: 0,hy,U-last-name
6,6,Sentence: 0,yesterday,B-o
7,7,Sentence: 0,because,I-o
8,8,Sentence: 0,i,I-o
9,9,Sentence: 0,have,L-o


In [4]:
data =data.fillna(method ="ffill")

In [5]:
data.head(30)

Unnamed: 0.1,Unnamed: 0,Sentence_Num,Word,Tag
0,0,Sentence: 0,i,B-o
1,1,Sentence: 0,did,I-o
2,2,Sentence: 0,not,I-o
3,3,Sentence: 0,meet,L-o
4,4,Sentence: 0,jin,U-first-name
5,5,Sentence: 0,hy,U-last-name
6,6,Sentence: 0,yesterday,B-o
7,7,Sentence: 0,because,I-o
8,8,Sentence: 0,i,I-o
9,9,Sentence: 0,have,L-o


In [6]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [7]:
data["Sentence_Num"] = LabelEncoder().fit_transform(data["Sentence_Num"] )

In [8]:
data.head(10)

Unnamed: 0.1,Unnamed: 0,Sentence_Num,Word,Tag
0,0,0,i,B-o
1,1,0,did,I-o
2,2,0,not,I-o
3,3,0,meet,L-o
4,4,0,jin,U-first-name
5,5,0,hy,U-last-name
6,6,0,yesterday,B-o
7,7,0,because,I-o
8,8,0,i,I-o
9,9,0,have,L-o


In [9]:
data.rename(columns={"Sentence_Num":"sentence_id","Word":"words","Tag":"labels"}, inplace =True)

In [10]:
data["labels"] = data["labels"].str.upper()

In [11]:
X= data[["sentence_id","words"]]
Y =data["labels"]

In [12]:
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size =0.2)

In [13]:
print (x_test)

       sentence_id      words
37006         2525        and
19313          809       have
6085          3477   vomiting
23029         1168        did
25692         1422   vomiting
...            ...        ...
38914         2720      might
21305         1005        and
6747          3542       have
37195         2545       ache
27930         1641  yesterday

[8971 rows x 2 columns]


In [14]:
#building up train data and test data
train_data = pd.DataFrame({"sentence_id":x_train["sentence_id"],"words":x_train["words"],"labels":y_train})
test_data = pd.DataFrame({"sentence_id":x_test["sentence_id"],"words":x_test["words"],"labels":y_test})

In [15]:
train_data

Unnamed: 0,sentence_id,words,labels
10573,3917,covid,L-O
5261,3398,the,I-O
25461,1399,meet,I-O
27562,1605,since,B-O
4388,3310,chest,B-SYMPTOM
...,...,...,...
4849,3360,rash,L-SYMPTOM
41578,2985,i,I-O
5145,3386,have,I-O
2429,1313,because,I-O


# Model Training


In [16]:
from simpletransformers.ner import NERModel,NERArgs

In [17]:
label = data["labels"].unique().tolist()
label

['B-O',
 'I-O',
 'L-O',
 'U-FIRST-NAME',
 'U-LAST-NAME',
 'U-SYMPTOM',
 'B-SYMPTOM',
 'I-SYMPTOM',
 'L-SYMPTOM',
 'U-O',
 'U-TIMEFRAME',
 'B-TIMEFRAME',
 'L-TIMEFRAME',
 'U-EXCLAMATION',
 'B-FIRST-NAME',
 'L-FIRST-NAME',
 'B-EXCLAMATION',
 'L-EXCLAMATION',
 'I-FIRST-NAME',
 'U-PEOPLE']

In [18]:
args = NERArgs()
args.num_train_epochs = 3
args.learning_rate = 1e-4
args.overwrite_output_dir =True
args.train_batch_size = 32
args.eval_batch_size = 32


In [19]:
#Use the model you want by un-commenting that line

# model = NERModel('bert', 'bert-base-uncased',labels=label,args =args)
# model = NERModel('xlnet', 'xlnet-base-cased',labels=label,args =args)
# model = NERModel('roberta', 'roberta-base',labels=label,args =args)
# model = NERModel('distilbert', 'distilbert-base-uncased',labels=label,args =args)
model = NERModel('albert', 'albert-base-v1',labels=label,args =args)
# model = NERModel('electra', 'google/electra-small-discriminator',labels=label,args =args)


Downloading:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of the model checkpoint at albert-base-v1 were not used when initializing AlbertForTokenClassification: ['predictions.decoder.weight', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.bias', 'predictions.dense.bias', 'predictions.dense.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at albert-base-v1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably

Downloading:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

In [20]:
model.train_model(train_data,eval_data = test_data,acc=accuracy_score)

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/125 [00:00<?, ?it/s]

  model.parameters(), args.max_grad_norm


Running Epoch 1 of 3:   0%|          | 0/125 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/125 [00:00<?, ?it/s]

(375, 0.25780592959622545)

In [21]:
result, model_outputs, preds_list = model.eval_model(test_data)

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/110 [00:00<?, ?it/s]



In [22]:
result

{'eval_loss': 0.39813551184805956,
 'f1_score': 0.9101234567901236,
 'precision': 0.9071369975389664,
 'recall': 0.9131296449215525}

In [33]:
prediction1, model_output1 = model.predict(["I have come down with cold and fatigue since yesterday after meeting Jung Shen"])
prediction2, model_output2 = model.predict(["I have come down with pneumonia and fatigue since yesterday after meeting Jung Shen"])
prediction3, model_output3 = model.predict(["I have come down with building and fatigue since yesterday after meeting Jung Shen"])
prediction4, model_output4 = model.predict(["I have come down with building and food since yesterday after meeting Jung Shen"])

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

In [34]:
print (prediction1)
print (prediction2)
print (prediction3)
print (prediction4)

[[{'I': 'I-FIRST-NAME'}, {'have': 'I-O'}, {'come': 'I-O'}, {'down': 'I-O'}, {'with': 'I-O'}, {'cold': 'U-SYMPTOM'}, {'and': 'U-O'}, {'fatigue': 'U-SYMPTOM'}, {'since': 'U-O'}, {'yesterday': 'U-TIMEFRAME'}, {'after': 'U-O'}, {'meeting': 'I-O'}, {'Jung': 'U-LAST-NAME'}, {'Shen': 'L-FIRST-NAME'}]]
[[{'I': 'I-FIRST-NAME'}, {'have': 'I-O'}, {'come': 'I-O'}, {'down': 'I-O'}, {'with': 'I-O'}, {'pneumonia': 'U-SYMPTOM'}, {'and': 'U-O'}, {'fatigue': 'U-SYMPTOM'}, {'since': 'U-O'}, {'yesterday': 'U-TIMEFRAME'}, {'after': 'U-O'}, {'meeting': 'I-O'}, {'Jung': 'U-LAST-NAME'}, {'Shen': 'B-FIRST-NAME'}]]
[[{'I': 'I-FIRST-NAME'}, {'have': 'I-O'}, {'come': 'I-O'}, {'down': 'I-O'}, {'with': 'I-O'}, {'building': 'L-O'}, {'and': 'U-O'}, {'fatigue': 'U-SYMPTOM'}, {'since': 'U-O'}, {'yesterday': 'U-TIMEFRAME'}, {'after': 'U-O'}, {'meeting': 'I-O'}, {'Jung': 'L-FIRST-NAME'}, {'Shen': 'B-FIRST-NAME'}]]
[[{'I': 'I-FIRST-NAME'}, {'have': 'I-O'}, {'come': 'I-O'}, {'down': 'I-O'}, {'with': 'I-O'}, {'building': 'L