In [29]:
# !git clone https://github.com/taslimamindia/NERC.git

# Importation

In [30]:
import pandas as pd

import numpy as np

from nltk import word_tokenize, sent_tokenize, download
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from gensim.models import Word2Vec

import tensorflow as tf

from keras.layers import Embedding
from keras.utils import to_categorical, pad_sequences

In [31]:
# download('wordnet') # for google colab

# Class define form data.

In [32]:
class Data(object):
    unique_words = {"<PAD>":0}
    unique_ner_tags = {}
    MAX_LENGTH = 200
    VOCAB_SIZE = 100
    def __init__(self):
        self.sentences = []
        self.sentences_num = None
        self.ner_tags = []
        self.ner_tags_num = None
        self.chunk_tags = []
        self.pos_tags = []
        self.x, self.y = None, None
    def word2vec(self, vector_size=100):
        VOCAB_SIZE = vector_size
        word2vec_model = Word2Vec(self.sentences, vector_size=vector_size, window=5, min_count=1, workers=4)
        return word2vec_model   
    def word2idx(self, word:str):
        return Data.unique_words.get(word, None)
    def idx2word(self, index:int):
        for word, value in Data.unique_words.items():
            if index is value: return word
        return None    
    def tag2idx(self, tag):
        return Data.unique_ner_tags.get(tag, None)
    def idx2tag(self, index):
        for tag, value in Data.unique_ner_tags.items():
            if index == value: return tag
        return None
    def unicity(self):
        unique_sent, unique_tag = set(), set()
        [unique_tag.update(tags) for tags in self.ner_tags_num]
        [unique_sent.update(tags) for tags in self.sentences_num]
        max_tags = len(Data.unique_ner_tags)
        max_words = len(Data.unique_words)
        for word in list(unique_sent):
            if Data.unique_words.get(word, None) == None:
                Data.unique_words[word] = max_words
                max_words += 1
        for tag in list(unique_tag):
            if Data.unique_ner_tags.get(tag, None) == None:
                Data.unique_ner_tags[tag] = max_tags
                max_tags += 1


# Loading data

In [33]:
class Loading():
    def __init__(self, data: Data, file):
        self.data = data
        self.load_sentences(file)
        print("Loading successfully")
    def load_sentences(self, filepath):
        tokens, pos_tags, chunk_tags, ner_tags = [], [], [], []
        with open(filepath, 'r') as f:
            for line in f.readlines():
                if (line == ('-DOCSTART- -X- -X- O\n') or line == '\n'):
                    if len(tokens) > 0:
                        self.data.sentences.append(tokens)
                        self.data.pos_tags.append(pos_tags)
                        self.data.chunk_tags.append(chunk_tags)
                        self.data.ner_tags.append(ner_tags)
                        tokens, pos_tags, chunk_tags, ner_tags = [], [], [], []
                else:
                    l = line.split(' ')
                    tokens.append(l[0])
                    pos_tags.append(l[1])
                    chunk_tags.append(l[2])
                    ner_tags.append(l[3].strip('\n'))

# Preprocessing

In [34]:
class Preprocessing():
    def __init__(self, data:Data, text=None, lang="english"):
        self.data = data
        self.text = text
        self.lang = lang
        if text == None:
          self.data.sentences_num = self.data.sentences
          self.data.ner_tags_num = self.data.ner_tags
    def tokenize(self):
        if self.text != None:
            sentenses = [word_tokenize(sentence, language=self.lang) for sentence in sent_tokenize(self.text, language=self.lang)]
            self.data.sentences = [[token for token in sentence if token not in stopwords.words(self.lang)] for sentence in sentenses]
            self.data.sentences_num = self.data.sentences
    def lowercasing(self):
        self.data.sentences_num = [[word.lower() for word in sentence] for sentence in self.data.sentences_num]
    def lemmatize(self):
        lemmatizer = WordNetLemmatizer()
        self.data.sentences_num = [[lemmatizer.lemmatize(word) for word in sentence] for sentence in self.data.sentences_num]

# Vectorization

In [35]:
class Vectorization():
    def __init__(self, data:Data):
        self.data = data
    def word2vec(self, min_count=1, window=5):
        word2vec_model = Word2Vec(self.data.sentences_num, min_count=min_count, vector_size=Data.VOCAB_SIZE, window=window)
        self.data.sentences_num = [[word2vec_model.wv[word] for word in sentence] for sentence in self.data.sentences_num]
    def padding_x(self):
        if len(self.data.sentences_num) > 0:
            self.data.x = pad_sequences(
                sequences=self.data.sentences_num, 
                maxlen=self.data.MAX_LENGTH, 
                dtype="float32", 
                padding="post", 
                value=np.zeros((Data.VOCAB_SIZE,), dtype="float32")
            )
    def vectorized_x(self):
        self.word2vec()
        self.padding_x()
        
    def padding_y(self):
        if len(self.data.ner_tags_num) > 0:
            self.data.ner_tags_num = [[Data.unique_ner_tags.get(tag) for tag in tags] for tags in self.data.ner_tags_num]            
            self.data.ner_tags_num = pad_sequences(
                sequences=self.data.ner_tags_num, 
                maxlen=self.data.MAX_LENGTH,
                padding="post", 
                dtype="str",
                value=Data.unique_ner_tags.get("O")
            )
    def word2num(self):
        x, y = self.data.ner_tags_num.shape
        NUM_CLASSES = len(Data.unique_ner_tags)
        self.data.y = np.zeros((x, y, NUM_CLASSES))
        for i in range(x):
            for j in range(y):
                self.data.y[i][j] = to_categorical(self.data.ner_tags_num[i][j], num_classes=NUM_CLASSES)
    def vectorized_y(self):
        self.padding_y()
        self.word2num()        

# Main

## Pretraining for CONLL2003

In [36]:
def pretraining_CoNLL3(path: str):
    data = Data()
    # base_file = "../Data/conll2003_english/"
    base_file = "/content/NERC/Data/conll2003_english/"
    Loading(data = data, file=base_file + path)
    return data

def load(data:Data):
    preprocessing = Preprocessing(data=data)
    preprocessing.lowercasing()
    preprocessing.lemmatize()
    data.unicity()
    
def vectorize(data:Data):
    vector = Vectorization(data=data)
    vector.vectorized_x()
    vector.vectorized_y()

## Define Dataset

In [37]:
def loadData():
    train = pretraining_CoNLL3("train.txt")
    test = pretraining_CoNLL3("test.txt")
    valid = pretraining_CoNLL3("valid.txt")
    
    load(train)
    load(test)
    load(valid)
    
    vectorize(train)
    vectorize(test)
    vectorize(valid)
    
    print(train.sentences[0])
    print(train.ner_tags[0], "\n")
    print("X_train", train.x.shape)
    print("y_train", train.y.shape, "\n")
    print(type(train.x))
    
    print("X_test", test.x.shape)
    print("y_test", test.y.shape)
    
    print("X_valid", valid.x.shape)
    print("y_valid", valid.y.shape)
    return train, test, valid

train, test, valid = loadData()

Loading successfully
Loading successfully
Loading successfully
['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O'] 

X_train (14041, 200, 100)
y_train (14041, 200, 9) 

<class 'numpy.ndarray'>
X_test (3453, 200, 100)
y_test (3453, 200, 9)
X_valid (3250, 200, 100)
y_valid (3250, 200, 9)


## New input text

In [38]:
# test_text = Data()

# preprocessing = Preprocessing(data = test_text, text = "Obama is the president of the United States. I am from Guinea, nice to meet you.")
# preprocessing.tokenize()
# preprocessing.lowercasing()
# preprocessing.lemmatize()
# print(test_text.sentences)

# vector = Vectorization(test_text)
# vector.vectorized_x()
# print(test_text.x.shape)

# Trainning

### CNN model

In [39]:
NUM_WORDS = len(Data.unique_words)
NUM_CLASSES = len(Data.unique_ner_tags)
MAX_LENGTH = Data.MAX_LENGTH
# Hyperparameters
EMBEDDING_DIM = 100
NUM_FILTERS = 256
KERNEL_SIZE = 3
DROPOUT_RATE = 0.5
BATCH_SIZE = 32
EPOCHS = 10
print(MAX_LENGTH, NUM_CLASSES, NUM_WORDS)

200 9 25128


In [40]:
# # from sklearn.feature_extraction.text import TfidfVectorizer
# from keras.models import Sequential
# from keras.layers import Dense, Dropout, Conv1D, MaxPooling1D


# # Build CNN model
# model = Sequential()
# model.add(Conv1D(64, KERNEL_SIZE, activation='relu', input_shape=(MAX_LENGTH, EMBEDDING_DIM), padding='same'))
# # model.add(MaxPooling1D(2, padding='same'))
# model.add(Dropout(DROPOUT_RATE))
# model.add(Conv1D(32, KERNEL_SIZE, activation='relu', padding='same'))
# # model.add(MaxPooling1D(2))
# # model.add(Dropout(DROPOUT_RATE))
# # model.add(Dense(HIDDEN_DIM, activation='relu'))
# model.add(Dropout(DROPOUT_RATE))
# model.add(Dense(NUM_CLASSES, activation='softmax'))
# model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.summary()

In [41]:
# !pip install tf2crf

In [42]:
# # from sklearn.feature_extraction.text import TfidfVectorizer
# from keras.models import Model
# from keras.layers import Dense, Conv1D
# from tf2crf import CRF, ModelWithCRFLoss
# from keras import Input

# # Build CNN model
# # model = Sequential()
# inputs = Input(shape=(MAX_LENGTH, EMBEDDING_DIM))
# outputs = Conv1D(64, KERNEL_SIZE, activation='relu', padding='same')(inputs)
# # model.add(MaxPooling1D(2, padding='same'))
# # outputs = Dropout(DROPOUT_RATE)(inputs)
# outputs = Conv1D(32, KERNEL_SIZE, activation='relu', padding='same')(inputs)
# # model.add(MaxPooling1D(2))
# # model.add(Dropout(DROPOUT_RATE))
# # model.add(Dense(HIDDEN_DIM, activation='relu'))
# # outputs = Dropout(DROPOUT_RATE)(outputs)
# outputs = Dense(NUM_CLASSES, activation='relu')(outputs)
# # outputs.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# # outputs.summary()
# crf = CRF(units=9)
# # cnn_model.add(crf)
# output = crf(outputs)
# cnn_crf_model = Model(inputs, output)
# cnn_crf_model.summary()
# # cnn_crf_model = ModelWithCRFLoss(base_model, sparse_target=True)
# # cnn_crf_model.summary()

In [43]:
# cnn_crf_model.compile(optimizer='adam')
# cnn_crf_model.fit(train.x, train.y, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=(valid.x, valid.y))

In [44]:
# # Evaluation
# loss, accuracy = cnn_crf_model.evaluate(test.x, test.y, batch_size=BATCH_SIZE)

# print('Test Loss:', loss)
# print('Test Accuracy:', accuracy)

In [45]:
# y_predict_cnn_crf = cnn_crf_model.predict(test.x)

In [46]:
# y_predict_cnn_crf.shape
# print(len(valid.unique_ner_tags))

In [47]:
# cnn_crf_model.export("../data/model_cnn.keras")

In [48]:
# cnn_model = tf.keras.models.load_model("model_cnn.keras")

## Model LSTM

In [65]:
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Define the model architecture
model_LSTM = Sequential()
model_LSTM.add(LSTM(128, input_shape=(MAX_LENGTH, EMBEDDING_DIM), return_sequences=True))
model_LSTM.add(LSTM(64, return_sequences=True))
model_LSTM.add(Dense(9, activation='softmax'))

model_LSTM.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_6 (LSTM)               (None, 200, 128)          117248    
                                                                 
 lstm_7 (LSTM)               (None, 200, 64)           49408     
                                                                 
 dense_3 (Dense)             (None, 200, 9)            585       
                                                                 
Total params: 167,241
Trainable params: 167,241
Non-trainable params: 0
_________________________________________________________________


In [66]:
# !pip install tensorflow-addons

In [67]:
# Compile the model
import tensorflow_addons as tfa

def loss(y_true, y_pred):
  # print(y_true)
  # print(y_pred)
  # b = tf.keras.losses.BinaryCrossentropy()
  # b = tf.keras.losses.BinaryFocalCrossentropy(apply_class_balancing=True, gamma=3)
  b = tfa.losses.SigmoidFocalCrossEntropy()
  return b(y_true, y_pred)
# 'categorical_crossentropy'
model_LSTM.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

# Train the model
model_LSTM.fit(train.x, train.y, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=(valid.x, valid.y))


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f1d0bb25cf0>

In [51]:
# model_LSTM.save("../Data/model_lstm.keras")

In [52]:
# model_LSTM = tf.keras.models.load_model("../Data/model_lstm.keras")

In [68]:
# Evaluation
loss, accuracy = model_LSTM.evaluate(test.x, test.y, batch_size=32)

print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

Test Loss: 0.012116237543523312
Test Accuracy: 0.9872516393661499


In [69]:
y_predict_lstm = model_LSTM.predict(test.x, batch_size=BATCH_SIZE)
y_predict_lstm.shape



(3453, 200, 9)

In [55]:
Data.unique_ner_tags

{'I-ORG': 0,
 'I-LOC': 1,
 'I-PER': 2,
 'B-LOC': 3,
 'I-MISC': 4,
 'B-PER': 5,
 'B-ORG': 6,
 'O': 7,
 'B-MISC': 8}

In [70]:
true = 0
false = 0
total = 0 
x, y, z = test.y.shape
for i in range(x):
    for j in range(y):
        real_tag = np.argmax(test.y[i][j]) 
        predict_tag = np.argmax(y_predict_lstm[i][j])
        if test.idx2tag(real_tag) != "O":
          total += 1
          if real_tag == predict_tag: true += 1
          else: false += 1
print(true, false, total, true/total, false/total)

814 7298 8112 0.10034516765285996 0.89965483234714
