# Part 1. Sequence Tagging: NER

In [1]:
import pandas as pd
import os
import gensim.downloader
from gensim.models import Word2Vec
import tensorflow as tf
import numpy as np

from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.layers import InputLayer, TimeDistributed, SpatialDropout1D, Bidirectional
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Dropout


from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from livelossplot import PlotLossesKeras


## 1.1 Word Embedding

In [2]:
w2v = gensim.downloader.load("word2vec-google-news-300")

### Qn 1.1

In [3]:
words = ["student", "Apple", "apple"]
print("-----------------------------------------------------------------------")
print("Word\t\tMost similar word\tCosine similarity")
print("-----------------------------------------------------------------------")
for word in words:
    most_similar = w2v.most_similar(positive=[word])
    print(f"{word}\t\t{most_similar[0][0]}  \t\t{most_similar[0][1]}")
print("-----------------------------------------------------------------------")

-----------------------------------------------------------------------
Word		Most similar word	Cosine similarity
-----------------------------------------------------------------------
student		students  		0.7294865846633911
Apple		Apple_AAPL  		0.7456987500190735
apple		apples  		0.720359742641449
-----------------------------------------------------------------------


## 1.2 Data

In [4]:
CoNLL2003_dir = '../Datasets/CoNLL2003_dataset'
train_dir = f'{CoNLL2003_dir}/eng.train'
dev_dir =  f'{CoNLL2003_dir}/eng.testa'
test_dir =  f'{CoNLL2003_dir}/eng.testb'

In [51]:
def import_content(path):
    try:
        with open(path, 'r') as file:
            content = file.readlines()
        file.close()
    except Exception as e:
        content = None
        print(e)
    
    return content

def print_items(item):
    for s in item: print(s)

In [52]:
train_content = import_content(train_dir)
dev_content = import_content(dev_dir)
test_content = import_content(test_dir)

### Split data by sentences

In [53]:
def split_sentences(content):
    split_data = [c.split(' ') for c in content] if content != None else []
    sentences = []
    sentence = []
    words = []

    for line in split_data:
        # if end of a sentence
        if line == ['\n']:
            sentences.append(sentence)
            sentence = []
        else:
            s_text  = line[0]
            s_tag = line[-1].replace('\n','')

            sentence.append([s_text, s_tag]) 
            words.append([s_text, s_tag])
    
    sentences.append(sentence) # last item in content not new line so must add previous sentence manually after loop           

    return sentences, words

In [54]:
def split_text_tag(sentences):
    text = []
    tag = []
    combined = []
    sentence_count = 1

    for s in sentences:
        for w in s:
            w_text  = w[0]
            w_tag = w[-1].replace('\n','')

            text.append(w_text)
            tag.append(w_tag)        
            combined.append({
                'sentence': sentence_count,
                'text' : w_text,
                'tag' : w_tag
            })   
        sentence_count+=1       
    return text, tag, combined

In [55]:
train_sentences, train_words = split_sentences(train_content)
dev_sentences, dev_words = split_sentences(dev_content)
test_sentences, test_words = split_sentences(test_content)

train_text, train_tag, train_combined = split_text_tag(train_sentences)
dev_text, dev_tag, dev_combined = split_text_tag(dev_sentences)
test_text, test_tag, test_combined = split_text_tag(test_sentences)

In [56]:
train_voc = np.unique(np.array(train_text))
dev_voc = np.unique(np.array(dev_text))


tag_set = np.unique(np.array(train_tag))

### Qn 1.2 (a)

#### Describe the size (number of sentences) of the training, development and test file for CoNLL2003.

In [57]:
print("Number of sentences (training):", len(train_sentences))
print("Number of sentences (dev):", len(dev_sentences))
print("Number of sentences (test):", len(test_sentences))

Number of sentences (training): 14987
Number of sentences (dev): 3466
Number of sentences (test): 3684


#### Specify the complete set of all possible word labels based on the tagging scheme (IO, BIO, etc.) you chose

In [58]:
print("Tag set (BIO):", tag_set)

Tag set (BIO): ['B-LOC' 'B-MISC' 'B-ORG' 'I-LOC' 'I-MISC' 'I-ORG' 'I-PER' 'O']


### Qn 1.2 (b)

#### Choose an example sentence from the training set of CoNLL2003 that has at least two named entities with more than one word.

In [59]:
def get_multiple_ne_sentence(sentences):
    for sentence in sentences:
        ne_count = 0
        for word_info in sentence:
            if "B-" in word_info[-1]:
                ne_count+=1
        if ne_count == 2:
            return sentence
    return None        

In [60]:
sentence = get_multiple_ne_sentence(train_sentences)
sentence

[['Swiss', 'I-MISC'],
 ['Grand', 'B-MISC'],
 ['Prix', 'I-MISC'],
 ['World', 'B-MISC'],
 ['Cup', 'I-MISC'],
 ['cycling', 'O'],
 ['race', 'O'],
 ['on', 'O'],
 ['Sunday', 'O'],
 [':', 'O']]

#### Explain how to form complete named entities from the label for each word, and list all the named entities in this sentence.

In [61]:
def get_named_entities(sentence):
    inside_tags = ['I-ORG', 'I-LOC', 'I-PER', 'I-MISC'] # Tags that require multiple words to form an entity
    begin_tags = ['B-LOC', 'B-ORG', 'B-MISC'] # Tags that are single word entities
    outside_tags = ['O']
    entities = [] # all entities gotten from search
    entity = [] # word group of current entity if any group tags encountered
    
    for c in sentence:
        if (c['tag'] in begin_tags or c['tag'] in outside_tags or c['tag'] == '\n') and len(entity) != 0:
            entities.append(' '.join(entity))
            entity = []
        if c['tag'] in begin_tags or c['tag'] in inside_tags: 
            entity.append(c['text'])

    return entities

In [62]:
_,_,sentence_text_tag = split_text_tag([sentence])
print("Complete named entities in the sentence:", get_named_entities(sentence_text_tag))

Complete named entities in the sentence: ['Swiss', 'Grand Prix', 'World Cup']


#### Tag-text dataset

In [63]:
train_df = pd.DataFrame(train_combined)
dev_df = pd.DataFrame(dev_combined)
test_df = pd.DataFrame(test_combined)

# path = '../Datasets/Processed/'
# file_name = 'CoNLL2003_processed'
# # Export DataFrame to a CSV file
# df.to_csv(f'{path}{file_name}.csv', index=False)

## 1.3 Model

### Create vocabulary index

In [64]:
#Load w2v models for train and dev

path = '../Pretrained_Models/'

train_w2v = Word2Vec.load('../Pretrained_Models/CONLL2003_pretrain.model')

train_pretrained_weights = train_w2v.wv.vectors
train_num_tokens, train_embedding_dim = train_pretrained_weights.shape

word2idx = train_w2v.wv.key_to_index
word2idx['<UNK>'] = word2idx[list(word2idx.keys())[-1]]+1
word2idx['<PAD>'] = word2idx[list(word2idx.keys())[-1]]+1
voc = train_voc
voc = np.append(voc,'<UNK>')
voc = np.append(voc,'<PAD>')

tag2idx = {k: v for v, k in enumerate(tag_set)}

### Gensim pre-trained word embeddings

In [65]:
w2v_word2idx = dict(w2v.key_to_index)
w2v_voc = w2v.index_to_key
w2v_word2idx['<UNK>'] = w2v_word2idx[list(w2v_word2idx.keys())[-1]]+1
w2v_word2idx['<PAD>'] = w2v_word2idx[list(w2v_word2idx.keys())[-1]]+1

### Create embedding matrix

In [66]:
embeddings_index = {}
for v in w2v_voc:
  embeddings_index[v] =  w2v[v] 


In [67]:
num_tokens = len(voc) + 2
embedding_dim = 50
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word2idx.items():    
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector[0:50]
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 17724 words (5902 misses)


## LSTM

In [68]:
def get_x_sequence(sentences):
  sequence = []
  sent_seq = []
  for s in sentences:
    for w in s:
      if w[0] in word2idx.keys():
        sent_seq.append(word2idx[w[0]])
      else:
        sent_seq.append(word2idx['<UNK>'])
    sequence.append(sent_seq)
    sent_seq = []
  
  return sequence

In [69]:
x_train = get_x_sequence(train_sentences)
#x_train = pad_sequences(maxlen=embedding_dim, sequences=x_train, padding="post", value=len(word2idx)-1)
x_train = pad_sequences(maxlen=embedding_dim, sequences=x_train, padding="post")

y_train = [[tag2idx[w[1]] for w in s] for s in train_sentences]
y_train = pad_sequences(maxlen=embedding_dim, sequences=y_train, padding="post", value=tag2idx['O'])
#y_train = pad_sequences(maxlen=embedding_dim, sequences=y_train, padding="post")

x_dev = get_x_sequence(dev_sentences)
#x_dev = pad_sequences(maxlen=embedding_dim, sequences=x_dev, padding="post", value=len(word2idx)-1)
x_dev = pad_sequences(maxlen=embedding_dim, sequences=x_dev, padding="post")

y_dev = [[tag2idx[w[1]] for w in s] for s in dev_sentences]
y_dev = pad_sequences(maxlen=embedding_dim, sequences=y_dev, padding="post", value=tag2idx['O'])
#y_dev = pad_sequences(maxlen=embedding_dim, sequences=y_dev, padding="post")

x_test = get_x_sequence(test_sentences)
#x_dev = pad_sequences(maxlen=embedding_dim, sequences=x_dev, padding="post", value=len(word2idx)-1)
x_test = pad_sequences(maxlen=embedding_dim, sequences=x_test, padding="post")

y_test = [[tag2idx[w[1]] for w in s] for s in test_sentences]
y_test = pad_sequences(maxlen=embedding_dim, sequences=y_test, padding="post", value=tag2idx['O'])
#y_test = pad_sequences(maxlen=embedding_dim, sequences=y_test, padding="post")

y_train = to_categorical(y_train)
y_dev = to_categorical(y_dev)
y_test = to_categorical(y_test)

In [70]:
num_classes = len(tag_set)
sequence_length = 50
output_shape=(sequence_length,num_classes)

model = keras.Sequential()
model.add(InputLayer(embedding_dim))
#model.add(Embedding(input_dim=num_tokens, output_dim=embedding_dim,  embeddings_initializer=keras.initializers.Constant(embedding_matrix),
#    trainable=False,))
#model.add(SpatialDropout1D(0.01))
#model.add(Bidirectional(LSTM(units=embedding_dim, return_sequences=True, recurrent_dropout=0.1)))
model.add(Embedding(input_dim=num_tokens, output_dim=embedding_dim, input_length = sequence_length, trainable=False,))
model.add(LSTM(units=sequence_length))
model.add(Dense(units=256, activation= "relu", kernel_regularizer=regularizers.L1L2(l1=0.025, l2=0.025)))
model.add(Dropout(0.01))  
#model.add(Dense(units=256, activation= "sigmoid", kernel_regularizer=regularizers.L1L2(l1=0.0025, l2=0.0025)))
#model.add(Dropout(0.01))  
model.add(Dense(sequence_length * num_classes, activation='softmax'))
model.add(tf.keras.layers.Reshape(output_shape))

model.compile(optimizer="adam",
              loss="categorical_crossentropy",
              metrics=["accuracy"])
early_stopping = EarlyStopping(patience=10)
model.summary()


Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 50, 50)            1181400   
                                                                 
 lstm_7 (LSTM)               (None, 50)                20200     
                                                                 
 dense_10 (Dense)            (None, 256)               13056     
                                                                 
 dropout_7 (Dropout)         (None, 256)               0         
                                                                 
 dense_11 (Dense)            (None, 400)               102800    
                                                                 
 reshape_3 (Reshape)         (None, 50, 8)             0         
                                                                 
Total params: 1317456 (5.03 MB)
Trainable params: 1360

In [71]:
num_epochs = 1000
batch_size = 1000

model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs, shuffle=True, validation_data=(x_dev, y_dev),
         callbacks = early_stopping, workers = 4)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000


<keras.src.callbacks.History at 0x1308bff5950>

In [72]:
predicted_labels = model.predict(x_test)
print(predicted_labels)

[[[4.74817881e-07 5.43705880e-07 4.42381435e-07 ... 1.65823312e-03
   8.44051945e-04 6.20870385e-03]
  [3.74827687e-06 4.34062804e-06 5.36041227e-07 ... 7.28061073e-04
   9.50305432e-04 6.44530309e-03]
  [1.32989055e-06 5.43600152e-07 5.83198869e-07 ... 6.76089840e-04
   1.02641259e-03 9.10586957e-03]
  ...
  [1.41236126e-06 1.37939014e-06 1.27317981e-06 ... 1.16482151e-05
   1.69183404e-05 3.88799310e-02]
  [1.37621294e-06 1.54346822e-06 1.34178413e-06 ... 7.47707236e-06
   1.35809605e-05 4.80026565e-02]
  [1.83785505e-06 1.54996440e-06 1.68671818e-06 ... 1.77996026e-06
   1.41832459e-06 9.29219872e-02]]

 [[4.74817881e-07 5.43705880e-07 4.42381435e-07 ... 1.65823312e-03
   8.44051945e-04 6.20870385e-03]
  [3.74827687e-06 4.34062804e-06 5.36041227e-07 ... 7.28061073e-04
   9.50305432e-04 6.44530309e-03]
  [1.32989055e-06 5.43600152e-07 5.83198869e-07 ... 6.76089840e-04
   1.02641259e-03 9.10586957e-03]
  ...
  [1.41236126e-06 1.37939014e-06 1.27317981e-06 ... 1.16482151e-05
   1.69183

In [73]:
loss, accuracy = model.evaluate(x_test, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

  6/116 [>.............................] - ETA: 1s - loss: 0.2554 - accuracy: 0.9474

Test Loss: 0.23149216175079346, Test Accuracy: 0.9565255045890808


In [75]:

for i in range(len(x_test)):
  pred_tags = []
  for x, y, z in zip(x_test[i], y_test[i], predicted_labels[i]):
    word = list(word2idx.keys())[list(word2idx.values()).index(x)]
    true_tag = list(tag2idx.keys())[list(tag2idx.values()).index(np.where(y == 1)[0][0])]
    pred_tag = list(tag2idx.keys())[list(tag2idx.values()).index(np.where(predicted_labels[0][0] == max(predicted_labels[0][0]))[0][0])]
    print(word, true_tag, pred_tag)
    pred_tags.append(pred_tag)
  # print(pred_tags)
  print()

SOCCER O O
- O O
<UNK> I-LOC O
<UNK> O O
<UNK> O O
WIN O O
, O O
CHINA I-PER O
IN O O
<UNK> O O
DEFEAT O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O

<UNK> I-PER O
<UNK> I-PER O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O

<UNK> I-LOC O
, O O
United I-LOC O
Arab I-LOC O
Emirates I-LOC O
<UNK> O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O
. O O

J

KeyboardInterrupt: 