In [43]:
# !git clone https://github.com/taslimamindia/NERC.git

# Importation

In [44]:
import pandas as pd

import numpy as np

from nltk import word_tokenize, sent_tokenize, download
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from gensim.models import Word2Vec

import tensorflow as tf

from keras.layers import Embedding
from keras.utils import to_categorical, pad_sequences

In [45]:
# download('wordnet') # for google colab

# Class define form data.

In [46]:
class Data(object):
    unique_words = {"<PAD>":0}
    unique_ner_tags = {"O":0}
    MAX_LENGTH = 50
    VOCAB_SIZE = 100
    
    def __init__(self):
        self.sentences = []
        self.sentences_num = None
        self.ner_tags = []
        self.ner_tags_num = None
        self.chunk_tags = []
        self.pos_tags = []
        self.x, self.y = None, None
    def word2vec(self, vector_size=100):
        VOCAB_SIZE = vector_size
        word2vec_model = Word2Vec(self.sentences, vector_size=vector_size, window=5, min_count=1, workers=4)
        return word2vec_model   
    def word2idx(self, word:str):
        return Data.unique_words.get(word, None)
    def idx2word(self, index:int):
        for word, value in Data.unique_words.items():
            if index is value: return word
        return None    
    def tag2idx(self, tag):
        return Data.unique_ner_tags.get(tag, None)
    def idx2tag(self, index):
        for tag, value in Data.unique_ner_tags.items():
            if index == value: return tag
        return None
    def unicity(self):
        unique_sent, unique_tag = set(), set()
        [unique_tag.update(tags) for tags in self.ner_tags_num]
        [unique_sent.update(tags) for tags in self.sentences_num]
        max_tags = len(Data.unique_ner_tags)
        max_words = len(Data.unique_words)
        for word in list(unique_sent):
            if Data.unique_words.get(word, None) == None:
                Data.unique_words[word] = max_words
                max_words += 1
        for tag in list(unique_tag):
            if Data.unique_ner_tags.get(tag, None) == None:
                Data.unique_ner_tags[tag] = max_tags
                max_tags += 1


# Loading data

In [47]:
class Loading():
    def __init__(self, data: Data, file):
        self.data = data
        self.load_sentences(file)
    def load_sentences(self, filepath):
        tokens, pos_tags, chunk_tags, ner_tags = [], [], [], []
        with open(filepath, 'r') as f:
            for line in f.readlines():
                if (line == ('-DOCSTART- -X- -X- O\n') or line == '\n'):
                    if len(tokens) > 0:
                        self.data.sentences.append(tokens)
                        self.data.pos_tags.append(pos_tags)
                        self.data.chunk_tags.append(chunk_tags)
                        self.data.ner_tags.append(ner_tags)
                        tokens, pos_tags, chunk_tags, ner_tags = [], [], [], []
                else:
                    l = line.split(' ')
                    tokens.append(l[0])
                    pos_tags.append(l[1])
                    chunk_tags.append(l[2])
                    ner_tags.append(l[3].strip('\n'))

# Preprocessing

In [48]:
class Preprocessing():
    def __init__(self, data:Data, text=None, lang="english"):
        self.data = data
        self.text = text
        self.lang = lang
        if text == None:
          self.data.sentences_num = self.data.sentences
          self.data.ner_tags_num = self.data.ner_tags
    def tokenize(self):
        if self.text != None:
            sentenses = [word_tokenize(sentence, language=self.lang) for sentence in sent_tokenize(self.text, language=self.lang)]
            self.data.sentences = [[token for token in sentence if token not in stopwords.words(self.lang)] for sentence in sentenses]
            self.data.sentences_num = self.data.sentences
    def remove_stopword(self):
        sentences = []
        for i in range(len(self.data.sentences)):
            sentence = []
            for j in range(len(self.data.sentences[i])):
                if self.data.sentences[i][j] not in stopwords.words(self.lang):
                    sentence.append((self.data.sentences[i][j], self.data.ner_tags[i][j]))
            sentences.append(sentence)
        self.data.sentences = [[token[0] for token in sentence ] for sentence in sentences]
        self.data.ner_tags = [[tag[1] for tag in tags ] for tags in sentences]
        self.data.sentences_num = self.data.sentences
        self.data.ner_tags_num = self.data.ner_tags
    def lowercasing(self):
        self.data.sentences_num = [[word.lower() for word in sentence] for sentence in self.data.sentences_num]
    def lemmatize(self):
        lemmatizer = WordNetLemmatizer()
        self.data.sentences_num = [[lemmatizer.lemmatize(word) for word in sentence] for sentence in self.data.sentences_num]

# Vectorization

In [49]:
class Vectorization():
    def __init__(self, data:Data):
        self.data = data
    def word2vec(self, min_count=1, window=5):
        word2vec_model = Word2Vec(self.data.sentences_num, min_count=min_count, vector_size=Data.VOCAB_SIZE, window=window)
        self.data.sentences_num = [[word2vec_model.wv[word] for word in sentence] for sentence in self.data.sentences_num]
    def padding_x(self):
        if len(self.data.sentences_num) > 0:
            self.data.x = pad_sequences(
                sequences=self.data.sentences_num, 
                maxlen=self.data.MAX_LENGTH, 
                dtype="float32", 
                padding="post", 
                value=np.zeros((Data.VOCAB_SIZE,), dtype="float32")
            )
    def vectorized_x(self):
        self.word2vec()
        self.padding_x()
        
    def padding_y(self):
        if len(self.data.ner_tags_num) > 0:
            self.data.ner_tags_num = [[Data.unique_ner_tags.get(tag) for tag in tags] for tags in self.data.ner_tags_num]            
            self.data.ner_tags_num = pad_sequences(
                sequences=self.data.ner_tags_num, 
                maxlen=self.data.MAX_LENGTH,
                padding="post", 
                dtype="str",
                value=Data.unique_ner_tags.get("O")
            )
    def word2num(self):
        x, y = self.data.ner_tags_num.shape
        NUM_CLASSES = len(Data.unique_ner_tags)
        self.data.y = np.zeros((x, y, NUM_CLASSES))
        for i in range(x):
            for j in range(y):
                self.data.y[i][j] = to_categorical(self.data.ner_tags_num[i][j], num_classes=NUM_CLASSES)
    def vectorized_y(self):
        self.padding_y()
        self.word2num()        

# Main

## Pretraining for CONLL2003

In [50]:
def pretraining_CoNLL3(path: str):
    data = Data()
    base_file = "../Data/conll2003_english/"
    # base_file = "/content/NERC/Data/conll2003_english/"
    Loading(data = data, file=base_file + path)
    return data

def load(data:Data):
    preprocessing = Preprocessing(data=data)
    preprocessing.remove_stopword()
    preprocessing.lowercasing()
    preprocessing.lemmatize()
    data.unicity()
    
def vectorize(data:Data):
    vector = Vectorization(data=data)
    vector.vectorized_x()
    vector.vectorized_y()


## Define Dataset

## New input text

In [51]:
# test_text = Data()

# preprocessing = Preprocessing(data = test_text, text = "Obama is the president of the United States. I am from Guinea, nice to meet you.")
# preprocessing.tokenize()
# preprocessing.lowercasing()
# preprocessing.lemmatize()
# print(test_text.sentences)

# vector = Vectorization(test_text)
# vector.vectorized_x()
# print(test_text.x.shape)

# Trainning

In [52]:
NUM_WORDS = len(Data.unique_words)
NUM_CLASSES = len(Data.unique_ner_tags)
# Hyperparameters
EMBEDDING_DIM = 100
NUM_FILTERS = 256
KERNEL_SIZE = 3
DROPOUT_RATE = 0.5
BATCH_SIZE = 32
EPOCHS = 10

### CNN model

In [53]:
# # from sklearn.feature_extraction.text import TfidfVectorizer
# from keras.models import Sequential
# from keras.layers import Dense, Dropout, Conv1D, MaxPooling1D


# # Build CNN model
# model = Sequential()
# model.add(Conv1D(64, KERNEL_SIZE, activation='relu', input_shape=(MAX_LENGTH, EMBEDDING_DIM), padding='same'))
# # model.add(MaxPooling1D(2, padding='same'))
# model.add(Dropout(DROPOUT_RATE))
# model.add(Conv1D(32, KERNEL_SIZE, activation='relu', padding='same'))
# # model.add(MaxPooling1D(2))
# # model.add(Dropout(DROPOUT_RATE))
# # model.add(Dense(HIDDEN_DIM, activation='relu'))
# model.add(Dropout(DROPOUT_RATE))
# model.add(Dense(NUM_CLASSES, activation='softmax'))
# model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.summary()

In [54]:
# !pip install tf2crf

In [55]:
# # from sklearn.feature_extraction.text import TfidfVectorizer
# from keras.models import Model
# from keras.layers import Dense, Conv1D
# from tf2crf import CRF, ModelWithCRFLoss
# from keras import Input

# # Build CNN model
# # model = Sequential()
# inputs = Input(shape=(MAX_LENGTH, EMBEDDING_DIM))
# outputs = Conv1D(64, KERNEL_SIZE, activation='relu', padding='same')(inputs)
# # model.add(MaxPooling1D(2, padding='same'))
# # outputs = Dropout(DROPOUT_RATE)(inputs)
# outputs = Conv1D(32, KERNEL_SIZE, activation='relu', padding='same')(inputs)
# # model.add(MaxPooling1D(2))
# # model.add(Dropout(DROPOUT_RATE))
# # model.add(Dense(HIDDEN_DIM, activation='relu'))
# # outputs = Dropout(DROPOUT_RATE)(outputs)
# outputs = Dense(NUM_CLASSES, activation='relu')(outputs)
# # outputs.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# # outputs.summary()
# crf = CRF(units=9)
# # cnn_model.add(crf)
# output = crf(outputs)
# cnn_crf_model = Model(inputs, output)
# cnn_crf_model.summary()
# # cnn_crf_model = ModelWithCRFLoss(base_model, sparse_target=True)
# # cnn_crf_model.summary()

In [56]:
# cnn_crf_model.compile(optimizer='adam')
# cnn_crf_model.fit(train.x, train.y, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=(valid.x, valid.y))

In [57]:
# # Evaluation
# loss, accuracy = cnn_crf_model.evaluate(test.x, test.y, batch_size=BATCH_SIZE)

# print('Test Loss:', loss)
# print('Test Accuracy:', accuracy)

In [58]:
# y_predict_cnn_crf = cnn_crf_model.predict(test.x)

In [59]:
# y_predict_cnn_crf.shape
# print(len(valid.unique_ner_tags))

In [60]:
# cnn_crf_model.export("../data/model_cnn.keras")

In [61]:
# cnn_model = tf.keras.models.load_model("model_cnn.keras")

## Model LSTM

In [62]:
from keras.backend import dropout
from keras.models import Sequential
from keras.layers import LSTM, Dense
# import tensorflow_addons as tfa

class Model_LSTM:
  def __init__(self):
    # Define the model architecture
    self.model_LSTM = Sequential()
    self.model_LSTM.add(LSTM(256, input_shape=(Data.MAX_LENGTH, Data.VOCAB_SIZE), return_sequences=True, dropout=0.5))
    self.model_LSTM.add(LSTM(128, return_sequences=True, dropout=0.5))
    self.model_LSTM.add(LSTM(64, return_sequences=True, dropout=0.5))
    self.model_LSTM.add(LSTM(32, return_sequences=True, dropout=0.5))
    self.model_LSTM.add(Dense(9, activation='softmax'))
  def summary(self):
    self.model_LSTM.summary()
  def trainning(self, train:Data, valid:Data=None):
    cat_accuracy = tf.keras.metrics.CategoricalAccuracy()
    recall = tf.keras.metrics.Recall()
    self.model_LSTM.compile(optimizer=tf.keras.optimizers.Adam(), loss=tf.keras.losses.CategoricalCrossentropy(), metrics=[cat_accuracy, recall])
    if valid == None:
      self.model_LSTM.fit(train.x, train.y, batch_size=BATCH_SIZE, epochs=EPOCHS)
    else:
      self.model_LSTM.fit(train.x, train.y, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=(valid.x, valid.y))
  def testing(self, test:Data):
    return self.model_LSTM.evaluate(test.x, test.y)
  def predicting(self, test:Data):
    return self.model_LSTM.predict(test.x, batch_size=BATCH_SIZE)


In [63]:
# !pip install tensorflow-addons

In [64]:
# model_LSTM.save("../Data/model_lstm.keras")

In [65]:
# model_LSTM = tf.keras.models.load_model("../Data/model_lstm.keras")

In [66]:
# entities = dict(zip(Data.unique_ner_tags.keys(), [0 for i in range(len(Data.unique_ner_tags))]))
# for tags in test.ner_tags:
#     for tag in tags:
#         entities[tag] += 1
# is_entities = 0
# is_not_entities = 0
# for tag, nbr in entities.items():
#     if tag != 'O': is_entities += nbr
#     else: is_not_entities += nbr
# print(entities)
# print(is_entities, is_not_entities)

In [67]:
def evaluation(test:Data, y_predict_lstm):
  true, false, total, predict = 0, 0, 0, 0
  x, y, z = test.y.shape
  for i in range(x):
    for j in range(y):
      real_tag = np.argmax(test.y[i][j]) 
      predict_tag = np.argmax(y_predict_lstm[i][j])
      if predict_tag == 0: predict +=1
      if real_tag != 0:
        total = total + 1
        if real_tag == predict_tag: true = true + 1
        else: false = false + 1
  print("----------------------- Evaluation -------------------------")
  print(test.y.shape)
  print(predict, x*y)
  print(true, false, total, round(true/total, 3), round(false/total, 3), end="\n\n")

In [71]:
def loadData(param:dict):
  dico = {"params":[], "metrics":[]}
  if param.get("max_length", 0) != 0:
      max_lengths = param["max_length"]
      for max_length in max_lengths:   
        Data.MAX_LENGTH = max_length     
        train = pretraining_CoNLL3("train.txt")
        test = pretraining_CoNLL3("test.txt")
        valid = pretraining_CoNLL3("valid.txt")
        load(train)
        load(test)
        load(valid)
        vectorize(train)
        vectorize(test)
        vectorize(valid)
        model_lstm = Model_LSTM()
        model_lstm.trainning(train, valid)
        model_lstm.testing(test)
        y_predict_lstm = model_lstm.predicting(test)
        evaluation(test, y_predict_lstm)
        return train, test, valid
        

def checkDataset(train, test, valid):    
    print("X_train", train.x.shape)
    print("y_train", train.y.shape, "\n")
    print("X_test", test.x.shape)
    print("y_test", test.y.shape, "\n")    
    print("X_valid", valid.x.shape)
    print("y_valid", valid.y.shape)

In [72]:
train, test, valid = loadData({"max_length":[100]})

KeyboardInterrupt: 

In [70]:
# np.quantile(sort([1, 2, 3, 8, 7]), 0.50)
dico = {}
for tags in test.sentences + train.sentences + valid.sentences:
  if dico.get(len(tags), None) == None:
    dico[len(tags)] = 1
  dico[len(tags)] += 1
sorted(list(dico.items()), key= lambda x: x[1])

[(66, 2),
 (71, 2),
 (72, 2),
 (124, 2),
 (80, 2),
 (59, 2),
 (77, 2),
 (50, 2),
 (62, 2),
 (78, 2),
 (96, 2),
 (109, 2),
 (93, 2),
 (49, 2),
 (52, 2),
 (91, 2),
 (105, 2),
 (83, 2),
 (79, 2),
 (42, 3),
 (48, 3),
 (64, 3),
 (45, 3),
 (46, 3),
 (51, 4),
 (43, 4),
 (44, 4),
 (41, 5),
 (39, 6),
 (40, 7),
 (38, 11),
 (36, 17),
 (37, 18),
 (35, 26),
 (34, 29),
 (33, 40),
 (32, 58),
 (31, 85),
 (30, 86),
 (29, 110),
 (28, 134),
 (27, 198),
 (26, 218),
 (1, 239),
 (25, 263),
 (24, 296),
 (23, 335),
 (22, 404),
 (21, 411),
 (20, 438),
 (19, 459),
 (17, 481),
 (18, 503),
 (16, 539),
 (15, 607),
 (14, 613),
 (13, 629),
 (12, 659),
 (11, 700),
 (10, 799),
 (3, 1026),
 (5, 1122),
 (6, 1173),
 (4, 1294),
 (9, 1324),
 (2, 1719),
 (8, 1814),
 (7, 1852)]