<a href="https://colab.research.google.com/github/truongthuanr/self-project/blob/dev/11_Named_Entity_Recognition/Named_Entity_Recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

%%capture
!pip3 install datasets


In [2]:

import numpy as np
import pandas as pd
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from datasets import load_dataset
import os


In [3]:
from keras.models import Model
from keras.layers import Input,Embedding,TimeDistributed,\
                         Dropout,Conv1D,MaxPooling1D,\
                         Flatten,Bidirectional,LSTM,Dense,\
                         concatenate

from keras.initializers import RandomUniform

from keras.optimizers import Adam





In [4]:
# For vetorization
max_word_tokens = 24000
max_sentence_length = 30
max_word_len = 20


# Dataset

Used dataset:

https://huggingface.co/datasets/conll2003

In [5]:
conll_data = load_dataset("conll2003")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/312k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/283k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

# Data preprocessing


## Character processing

In [6]:
char2Idx = {"PADDING":0, "UNKNOWN":1}
for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|":
    char2Idx[c] = len(char2Idx)
len(char2Idx)

95

In [7]:
def character_vectorize(X):
  data_vec = []
  for sentence in X:
    # print(f"Sentence: {sentence}")
    padchar = char2Idx['PADDING']
    sentence_vec=[]
    for word in sentence:
      # print(word)
      chars = []
      if len(word) >= max_word_len:

        chars=[char2Idx[c] for c in word[:max_word_len]]
      else:
        prepad=int((max_word_len-len(word))/2)
        postpad=max_word_len-(len(word)+prepad)
        chars.extend([padchar]*prepad)
        chars.extend([char2Idx[c] for c in word])
        chars.extend([padchar]*postpad)
      sentence_vec.append(chars)

    data_vec.append(sentence_vec)
  data_vec = np.asarray(data_vec, dtype=object)
  return data_vec




# X_train_char = conll_data['train']['tokens']

# X_train_char = character_vectorize(X_train_char)
# X_train_char = pad_sequences(sequences = X_train_char,
#                              maxlen=max_sentence_length,
#                              dtype=object,
#                              padding="post",
#                              truncating="post",
#                              value=0)
# X_train_char = np.asarray(X_train_char,
#                           dtype=np.float32)

In [8]:
def char_preprocessing(char_input):
  '''
  input:: char_input: list of words
  output: vecterized array of character
  with shape(sentences, max_sentence_len, max_word_len)
  '''
  X_chars = character_vectorize(char_input)
  X_chars = pad_sequences(sequences = X_chars,
                          maxlen=max_sentence_length,
                          dtype=object,
                          padding="post",
                          truncating="post",
                          value=0)

  X_chars = np.asarray(X_chars,
                       dtype=np.float32)

  return X_chars

In [9]:
X_train_char = conll_data['train']['tokens']
X_train_char = char_preprocessing(X_train_char)

## Word preprocessing

In [10]:
# X_train_word = conll_data['train']['tokens']
X_train_word = np.array([ " ".join(row) for row in conll_data['train']['tokens']])
# X_train_word

## Caps preprocessing

In [11]:
case2id = {'allcaps':0,
           'upperinitial':1,
           'lower':2,
           'mixedcaps':3,
           'noinfo':4}
id2case = {v:k for k,v in case2id.items()}

In [12]:
def case_vectorize(input):
  '''
  input: array of sentences, sentencs is list of word
  '''
  case_vec = []

  for sentence in input:
    sen_case_type = []
    for word in sentence:
      temp = [0]*len(case2id)
      # if word is Title
      if word.istitle():
        temp[case2id['upperinitial']] =1
        sen_case_type.append(temp)

      # if uper, lower, mixed or else
      else:
        if word.isupper():
          temp[case2id['allcaps']] = 1
          sen_case_type.append(temp)
        elif word.islower():
          temp[case2id['lower']]=1
          sen_case_type.append(temp)
        else:
          if word.lower().islower():
            temp[case2id['mixedcaps']]=1
            sen_case_type.append(temp)
          else:
            temp[case2id['noinfo']]=1
            sen_case_type.append(temp)


    case_vec.append(sen_case_type)
  return case_vec



In [13]:
def case_preprocesing(X_input):
  X_case = case_vectorize(X_input)

  X_case = pad_sequences(sequences = X_case,
                        maxlen=max_sentence_length,
                        dtype=object,
                        padding="post",
                        truncating="post",
                        value=case2id['noinfo'])

  X_case = np.asarray(X_case,
                      dtype=np.float32)

  return X_case

X_train_case = case_preprocesing(conll_data['train']['tokens'])

## Targer preprocessing

In [14]:
label2id = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}
id2label = {v: k for k, v in label2id.items()}


In [15]:
y_train = pad_sequences(conll_data['train']['ner_tags'],
                        dtype='int32',
                        maxlen=max_sentence_length,
                        padding='post',
                        truncating='post',
                        value=0,)

In [16]:
def encoding_ytrain(y_train):
  n=len(label2id)
  nrow=len(y_train)
  encoded = np.zeros(shape=(nrow,max_sentence_length,n))
  for iy, y in enumerate(y_train):
    senvec = np.zeros(shape=(max_sentence_length,n))
    for ichar, char in enumerate(y):
      charvec = np.zeros(n)
      charvec[char] = 1
      # print(f"{char=}")
      # print(f"{charvec=}")
      senvec[ichar]=charvec

    encoded[iy] = senvec
  return encoded

y_train = encoding_ytrain(y_train)


# Model

In [17]:
from keras.layers import TextVectorization

In [18]:
text_vectorizer = TextVectorization(max_tokens=max_word_tokens,   # how many word in the vocab lib
                                    standardize='lower_and_strip_punctuation', # process the tab
                                    split='whitespace', # split the token
                                    ngrams=None,
                                    output_mode='int', # map token to number
                                    output_sequence_length=max_sentence_length,
                                    pad_to_max_tokens=False,
                                    vocabulary=None,)
text_vectorizer.adapt(np.array([ " ".join(row) for row in conll_data['train']['tokens']]))

In [19]:
# Word level
word_input = Input(shape=(1,),dtype=tf.string,name='word_input')
# Tokenize

words_vectorize = text_vectorizer(word_input)

# Embedding
embedding = Embedding(input_dim=max_word_tokens,
                      output_dim=128,
                      embeddings_initializer="uniform",
                      input_length=None)
words = embedding(words_vectorize)


In [20]:
# Character level
character_input=Input(shape=(None,max_word_len,),
                      name='char_input',
                      dtype=tf.int32)
embed_char_out=TimeDistributed(Embedding(input_dim = len(char2Idx),
                                         output_dim=10,
                                         embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5),
                                         input_length=max_sentence_length,),
                                         name='char_embedding')(character_input)
dropout = Dropout(0.5)(embed_char_out)
conv1d_out= TimeDistributed(Conv1D(kernel_size=3,
                                   filters=10,
                                   padding='same',activation='tanh', strides=1))(dropout)
maxpool_out=TimeDistributed(MaxPooling1D(max_word_len))(conv1d_out)
char = TimeDistributed(Flatten())(maxpool_out)
char = Dropout(0.5)(char)

In [21]:
char

<KerasTensor: shape=(None, None, 10) dtype=float32 (created by layer 'dropout_1')>

In [22]:
## Caps feature
casing_input = Input(shape=(max_sentence_length,), dtype='int32', name='casing_input')
casing_embed = Embedding(input_dim=len(case2id),
                        output_dim=len(case2id),
                        input_length=max_sentence_length,
                        trainable=False)

casing = casing_embed(casing_input)

In [23]:
casing

<KerasTensor: shape=(None, 30, 5) dtype=float32 (created by layer 'embedding_2')>

In [24]:

combined = concatenate([words, char, casing])
# combined = words
# Bi-LSTM
output = Bidirectional(LSTM(200, return_sequences=True, dropout=0.50, recurrent_dropout=0.25))(combined)

# # Output -> Tag Scoring -> Tag
output = TimeDistributed(Dense(len(label2id), activation='softmax'))(output)
model = Model(inputs=[word_input, character_input, casing], outputs=[output])




In [25]:
combined

<KerasTensor: shape=(None, 30, 143) dtype=float32 (created by layer 'concatenate')>

In [26]:
model.compile(loss="binary_crossentropy",
              optimizer=Adam(),
              metrics=["accuracy"])

In [27]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 char_input (InputLayer)     [(None, None, 20)]           0         []                            
                                                                                                  
 char_embedding (TimeDistri  (None, None, 20, 10)         950       ['char_input[0][0]']          
 buted)                                                                                           
                                                                                                  
 dropout (Dropout)           (None, None, 20, 10)         0         ['char_embedding[1][0]']      
                                                                                                  
 time_distributed (TimeDist  (None, None, 20, 10)         310       ['dropout[1][0]']         

In [29]:
model.fit([X_train_word[1000:],X_train_char[1000:], X_train_case[1000:]],
          y_train[1000:],
          epochs=50,
          batch_size=512,
          validation_data=([X_train_word[:1000],X_train_char[:1000],X_train_case[:1000]], y_train[:1000]))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7f672152ba90>

# Predictions

In [None]:
X_train_char.shape

In [None]:
X_train_case.shape

In [30]:
def make_prediction(input_string):
  X_words = np.asarray([input_string])
  X_input_list = [input_string.split()]
  X_chars = char_preprocessing(X_input_list)
  X_case = case_preprocesing(X_input_list)
  predictions = model.predict([X_words,X_chars,X_case])


  # predictions = np.argmax()

  return predictions

In [31]:
predictions = make_prediction("I go to USA University by Germany tomorrow with Jane !")



In [32]:
predictions = make_prediction("Germany 's representative to the European Union 's veterinary committee Werner Zwingmann said on Wednesday consumers should buy sheepmeat from countries other than Britain until the scientific advice was clearer .")




In [40]:
temp = np.squeeze(np.argmax(predictions,axis=2))
temp = [id2label[word] for word in temp]

In [41]:
temp

['B-LOC',
 'O',
 'O',
 'O',
 'O',
 'B-ORG',
 'I-ORG',
 'O',
 'O',
 'O',
 'B-ORG',
 'I-ORG',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-LOC',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

# References

https://keras.io/examples/nlp/ner_transformers/