<a href="https://colab.research.google.com/github/truongthuanr/self-project/blob/dev/11_Named_Entity_Recognition/Named_Entity_Recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

%%capture
!pip3 install datasets


In [2]:

import numpy as np
import pandas as pd
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from datasets import load_dataset
import os


In [3]:
from keras.models import Model
from keras.layers import Input,Embedding,TimeDistributed,\
                         Dropout,Conv1D,MaxPooling1D,\
                         Flatten,Bidirectional,LSTM,Dense,\
                         concatenate

from keras.initializers import RandomUniform

from keras.optimizers import Adam





In [4]:
# For vetorization
max_word_tokens = 24000
max_sentence_length = 30
max_word_len = 20


# Dataset

Used dataset:

https://huggingface.co/datasets/conll2003

In [5]:
conll_data = load_dataset("conll2003")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Data preprocessing


## Character processing

In [6]:
char2Idx = {"PADDING":0, "UNKNOWN":1}
for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|":
    char2Idx[c] = len(char2Idx)
len(char2Idx)

95

In [30]:
def character_vectorize(X):
  data_vec = []
  for sentence in X:
    # print(f"Sentence: {sentence}")
    padchar = char2Idx['PADDING']
    sentence_vec=[]
    for word in sentence:
      # print(word)
      chars = []
      if len(word) >= max_word_len:

        chars=[char2Idx[c] for c in word[:max_word_len]]
      else:
        prepad=int((max_word_len-len(word))/2)
        postpad=max_word_len-(len(word)+prepad)
        chars.extend([padchar]*prepad)
        chars.extend([char2Idx[c] for c in word])
        chars.extend([padchar]*postpad)
      sentence_vec.append(chars)

    data_vec.append(sentence_vec)
  data_vec = np.asarray(data_vec, dtype=object)
  return data_vec




# X_train_char = conll_data['train']['tokens']

# X_train_char = character_vectorize(X_train_char)
# X_train_char = pad_sequences(sequences = X_train_char,
#                              maxlen=max_sentence_length,
#                              dtype=object,
#                              padding="post",
#                              truncating="post",
#                              value=0)
# X_train_char = np.asarray(X_train_char,
#                           dtype=np.float32)

In [35]:
def char_preprocessing(char_input):
  X_chars = character_vectorize(char_input)
  X_chars = pad_sequences(sequences = X_chars,
                          maxlen=max_sentence_length,
                          dtype=object,
                          padding="post",
                          truncating="post",
                          value=0)

  X_chars = np.asarray(X_chars,
                       dtype=np.float32)

  return X_chars

In [36]:
X_train_char = conll_data['train']['tokens']
X_train_char = char_preprocessing(X_train_char)

In [39]:
X_train_char.shape

(14041, 30, 20)

## Word preprocessing

In [8]:
# X_train_word = conll_data['train']['tokens']
X_train_word = np.array([ " ".join(row) for row in conll_data['train']['tokens']])
# X_train_word

## Targer preprocessing

In [9]:
label2id = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}
id2label = {v: k for k, v in label2id.items()}


In [10]:
y_train = pad_sequences(conll_data['train']['ner_tags'],
                        dtype='int32',
                        maxlen=max_sentence_length,
                        padding='post',
                        truncating='post',
                        value=0,)

In [11]:
def encoding_ytrain(y_train):
  n=len(label2id)
  nrow=len(y_train)
  encoded = np.zeros(shape=(nrow,max_sentence_length,n))
  for iy, y in enumerate(y_train):
    senvec = np.zeros(shape=(max_sentence_length,n))
    for ichar, char in enumerate(y):
      charvec = np.zeros(n)
      charvec[char] = 1
      # print(f"{char=}")
      # print(f"{charvec=}")
      senvec[ichar]=charvec

    encoded[iy] = senvec
  return encoded

y_train = encoding_ytrain(y_train)


# Model

In [12]:
from keras.layers import TextVectorization

In [14]:
text_vectorizer = TextVectorization(max_tokens=max_word_tokens,   # how many word in the vocab lib
                                    standardize='lower_and_strip_punctuation', # process the tab
                                    split='whitespace', # split the token
                                    ngrams=None,
                                    output_mode='int', # map token to number
                                    output_sequence_length=max_sentence_length,
                                    pad_to_max_tokens=False,
                                    vocabulary=None,)
text_vectorizer.adapt(np.array([ " ".join(row) for row in conll_data['train']['tokens']]))

In [15]:
# Word level
word_input = Input(shape=(1,),dtype=tf.string,name='word_input')
# Tokenize

words_vectorize = text_vectorizer(word_input)

# Embedding
embedding = Embedding(input_dim=max_word_tokens,
                      output_dim=128,
                      embeddings_initializer="uniform",
                      input_length=None)
words = embedding(words_vectorize)


In [16]:
words

<KerasTensor: shape=(None, 30, 128) dtype=float32 (created by layer 'embedding')>

In [17]:
# Character level
character_input=Input(shape=(None,max_word_len,),
                      name='char_input',
                      dtype=tf.int32)
embed_char_out=TimeDistributed(Embedding(input_dim = len(char2Idx),
                                         output_dim=10,
                                         embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5),
                                         input_length=max_sentence_length,),
                                         name='char_embedding')(character_input)
dropout = Dropout(0.5)(embed_char_out)
conv1d_out= TimeDistributed(Conv1D(kernel_size=3,
                                   filters=10,
                                   padding='same',activation='tanh', strides=1))(dropout)
maxpool_out=TimeDistributed(MaxPooling1D(max_word_len))(conv1d_out)
char = TimeDistributed(Flatten())(maxpool_out)
char = Dropout(0.5)(char)

In [18]:
print(character_input.shape)
print(embed_char_out.shape)
print(dropout.shape)
print(conv1d_out.shape)
print(maxpool_out.shape)
print(char.shape)

(None, None, 20)
(None, None, 20, 10)
(None, None, 20, 10)
(None, None, 20, 10)
(None, None, 1, 10)
(None, None, 10)


In [19]:

combined = concatenate([words, char])
# combined = words
# Bi-LSTM
output = Bidirectional(LSTM(200, return_sequences=True, dropout=0.50, recurrent_dropout=0.25))(combined)

# # Output -> Tag Scoring -> Tag
output = TimeDistributed(Dense(len(label2id), activation='softmax'))(output)
model = Model(inputs=[word_input, character_input], outputs=[output])


In [20]:
model.compile(loss="binary_crossentropy",
              optimizer=Adam(),
              metrics=["accuracy"])

In [21]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 char_input (InputLayer)     [(None, None, 20)]           0         []                            
                                                                                                  
 char_embedding (TimeDistri  (None, None, 20, 10)         950       ['char_input[0][0]']          
 buted)                                                                                           
                                                                                                  
 dropout (Dropout)           (None, None, 20, 10)         0         ['char_embedding[0][0]']      
                                                                                                  
 time_distributed (TimeDist  (None, None, 20, 10)         310       ['dropout[0][0]']         

In [22]:
model.fit([X_train_word[:1000],X_train_char[:1000]],
          y_train[:1000],
          epochs=2,
          validation_data=([X_train_word[1000:1100],X_train_char[1000:1100]], y_train[1000:1100]))

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7d4990543d30>

# Predictions

In [60]:
X_train_word[0:2].shape

(2,)

In [67]:
def make_prediction(input_string):
  X_words = np.asarray([input_string])
  X_chars = [input_string.split()]
  X_chars = char_preprocessing(X_chars)
  predictions = model.predict([X_words,X_chars])


  predictions = np.argmax()

  return predictions

In [69]:
predictions = make_prediction("I go to Newyork University by bus tomorrow with Jane !")

X_words=array(['I go to Newyork University by bus tomorrow with Jane !'],
      dtype='<U54')
(1,)


In [74]:
np.squeeze(predictions)

array([[4.99420255e-01, 1.07654929e-01, 8.99580792e-02, 7.00183511e-02,
        3.01276352e-02, 1.19042061e-01, 2.35960502e-02, 3.57021615e-02,
        2.44805217e-02],
       [5.64083457e-01, 9.64860022e-02, 7.90900365e-02, 5.95216677e-02,
        2.67935023e-02, 1.03119478e-01, 1.97756588e-02, 3.03929765e-02,
        2.07372662e-02],
       [6.20477378e-01, 8.49094242e-02, 6.85721859e-02, 5.09579256e-02,
        2.35803016e-02, 9.13395733e-02, 1.64031610e-02, 2.60652509e-02,
        1.76947284e-02],
       [6.62624478e-01, 7.58566409e-02, 6.13858066e-02, 4.42904420e-02,
        2.10682601e-02, 8.24737772e-02, 1.40722850e-02, 2.30091978e-02,
        1.52191790e-02],
       [6.85760200e-01, 7.09251463e-02, 5.77574559e-02, 4.01702970e-02,
        1.97679494e-02, 7.80261979e-02, 1.25833303e-02, 2.12454051e-02,
        1.37639800e-02],
       [6.74552619e-01, 7.21246973e-02, 6.13345727e-02, 4.10474613e-02,
        2.06942502e-02, 8.20617676e-02, 1.26036322e-02, 2.17717309e-02,
        1.3

# References

https://keras.io/examples/nlp/ner_transformers/