In [1]:
!git clone https://github.com/taslimamindia/NERC.git

Cloning into 'NERC'...
remote: Enumerating objects: 34, done.[K
remote: Counting objects: 100% (34/34), done.[K
remote: Compressing objects: 100% (28/28), done.[K
remote: Total 34 (delta 7), reused 31 (delta 4), pack-reused 0[K
Unpacking objects: 100% (34/34), 1.36 MiB | 5.31 MiB/s, done.


# Importation

In [1]:
import pandas as pd

import numpy as np

from nltk import word_tokenize, sent_tokenize, download
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from gensim.models import Word2Vec

import tensorflow as tf

from keras.layers import Embedding
from keras.utils import to_categorical, pad_sequences

In [3]:
# download('wordnet') # for google colab

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# Class define form data.

In [2]:
class Data(object):
    unique_words = {}
    unique_ner_tags = {}
    MAX_LENGTH = 200
    def __init__(self):
        self.sentences = []
        self.sentences_num = None
        self.ner_tags = []
        self.ner_tags_num = None
        self.chunk_tags = []
        self.pos_tags = []
        self.x, self.y = None, None
    def word2vec(self, vector_size=100):
        word2vec_model = Word2Vec(self.sentences, vector_size=vector_size, window=5, min_count=1, workers=4)
        return word2vec_model   
    def word2idx(self, word:str):
        return Data.unique_words.get(word, None)
    def idx2word(self, index:int):
        for word, value in Data.unique_words.items():
            if index is value: return word
        return None    
    def tag2idx(self, tag):
        return Data.unique_ner_tags.get(tag, None)
    def idx2tag(self, index):
        for tag, value in Data.unique_ner_tags.items():
            if index == value: return tag
        return None

# Loading data

In [3]:
class Loading():
    def __init__(self, data: Data, file):
        self.data = data
        self.load_sentences(file)
        print("Loading successfully")
    def load_sentences(self, filepath):
        tokens, pos_tags, chunk_tags, ner_tags = [], [], [], []
        with open(filepath, 'r') as f:
            for line in f.readlines():
                if (line == ('-DOCSTART- -X- -X- O\n') or line == '\n'):
                    if len(tokens) > 0:
                        self.data.sentences.append(tokens)
                        self.data.pos_tags.append(pos_tags)
                        self.data.chunk_tags.append(chunk_tags)
                        self.data.ner_tags.append(ner_tags)
                        tokens, pos_tags, chunk_tags, ner_tags = [], [], [], []
                else:
                    l = line.split(' ')
                    tokens.append(l[0])
                    pos_tags.append(l[1])
                    chunk_tags.append(l[2])
                    ner_tags.append(l[3].strip('\n'))

# Preprocessing

In [4]:
class Preprocessing():
    def __init__(self, data:Data, text=None, lang="english"):
        self.data = data
        self.text = text
        self.lang = lang
    def tokenize(self):
        if self.text != None:
            sentenses = [word_tokenize(sentence, language=self.lang) for sentence in sent_tokenize(self.text, language=self.lang)]
            self.data.sentences = [[token for token in sentence if token not in stopwords.words(self.lang)] for sentence in sentenses]
    def lowercasing(self):
        self.data.sentences = [[word.lower() for word in sentence] for sentence in self.data.sentences]
    def lemmatize(self):
        lemmatizer = WordNetLemmatizer()
        self.data.sentences = [[lemmatizer.lemmatize(word) for word in sentence] for sentence in self.data.sentences]
        self.unicity() # must be changed
    def unicity(self):
        temp = set() 
        [temp.update(word) for word in self.data.sentences]
        Data.unique_words = dict(zip(list(temp), range(1, len(temp) + 1)))
        temp = set() 
        [temp.update(word) for word in self.data.ner_tags]
        Data.unique_ner_tags = dict(zip(list(temp), range(len(temp))))

# Vectorization

In [5]:
class Vectorization():
    def __init__(self, data:Data):
        self.data = data
        # self.data.max_length = max([len(sentence) for sentence in data.sentences])
        data.sentences_num = [[Data.unique_words[word] for word in sentence] for sentence in data.sentences]
        data.ner_tags_num = [[Data.unique_ner_tags[tag] for tag in tags] for tags in data.ner_tags] 
    def padding_x(self):
        if len(self.data.sentences_num) > 0:
            self.data.sentences_num = self.word2vec()
            self.data.x = pad_sequences(
                sequences=self.data.sentences_num, 
                maxlen=self.data.MAX_LENGTH, 
                dtype="float32", 
                padding="post", 
                value=0
            )
    def padding_y(self):
        if len(self.data.ner_tags_num) > 0:
            self.data.y = pad_sequences(
                sequences=self.data.ner_tags_num, 
                maxlen=self.data.MAX_LENGTH, 
                dtype="float32", 
                padding="post", 
                value=self.data.unique_ner_tags.get("O")
            )
    def word2vec(self, min_count=1, vector_size=100, window=5):
        word2vec_model = Word2Vec(self.data.sentences, min_count=min_count, vector_size=vector_size, window=window)
        vectors= [[word2vec_model.wv[word] for word in sentence] for sentence in self.data.sentences]
        return vectors
    def vectorized_x(self):
        self.padding_x() 
        self.data.x = np.array(self.data.x, dtype="float32")
    def vectorized_y(self):
        self.padding_y()
        self.data.y = [[to_categorical(tag, num_classes=len(Data.unique_ner_tags)) for tag in tags] for tags in self.data.y]
        self.data.y = np.array(self.data.y, dtype='float32')

# Main

## Pretraining for CONLL2003

In [6]:

def pretraining_CoNLL3(path: str):
    data = Data()
    base_file = "../Data/conll2003_english/"
    # base_file = "/content/NERC/Data/conll2003_english/"
    Loading(data = data, file=base_file + path)
    preprocessing = Preprocessing(data=data)
    preprocessing.lowercasing()
    preprocessing.lemmatize()
    vector = Vectorization(data=data)
    vector.vectorized_x()
    vector.vectorized_y()
    return data

## Define Trainset

In [7]:
train = pretraining_CoNLL3("train.txt")

Loading successfully


In [8]:
print(train.sentences[0])
print(train.ner_tags[0], "\n")
print("X_train", train.x.shape)
print("y_train", train.y.shape, "\n")
print(type(train.x))

['eu', 'reject', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.']
['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O'] 

X_train (14041, 200, 100)
y_train (14041, 200, 9) 

<class 'numpy.ndarray'>


## Define Testset

In [9]:
test = pretraining_CoNLL3("test.txt")
print("X_test", test.x.shape)
print("y_test", test.y.shape)

Loading successfully
X_test (3453, 200, 100)
y_test (3453, 200, 9)


## Define ValidSet

In [10]:
valid = pretraining_CoNLL3("valid.txt")
print("X_valid", valid.x.shape)
print("y_valid", valid.y.shape)

Loading successfully
X_valid (3250, 200, 100)
y_valid (3250, 200, 9)


## New input text

In [None]:
# test_text = Data()

# preprocessing = Preprocessing(data = test_text, text = "Obama is the president of the United States. I am from Guinea, nice to meet you.")
# preprocessing.tokenize()
# preprocessing.lowercasing()
# preprocessing.lemmatize()
# print(test_text.sentences)

# vector = Vectorization(test_text)
# vector.vectorized_x()
# print(test_text.x.shape)

# Trainning

In [11]:
NUM_WORDS = len(Data.unique_words)
NUM_CLASSES = len(Data.unique_ner_tags)
MAX_LENGTH = Data.MAX_LENGTH
# Hyperparameters
EMBEDDING_DIM = 100
NUM_FILTERS = 256
KERNEL_SIZE = 3
DROPOUT_RATE = 0.5
BATCH_SIZE = 32
EPOCHS = 10
print(MAX_LENGTH, NUM_CLASSES, NUM_WORDS)

200 9 8419


In [None]:
# # from sklearn.feature_extraction.text import TfidfVectorizer
# from keras.models import Sequential
# from keras.layers import Dense, Dropout, Conv1D, MaxPooling1D


# # Build CNN model
# model = Sequential()
# model.add(Conv1D(64, KERNEL_SIZE, activation='relu', input_shape=(MAX_LENGTH, EMBEDDING_DIM), padding='same'))
# # model.add(MaxPooling1D(2, padding='same'))
# model.add(Dropout(DROPOUT_RATE))
# model.add(Conv1D(32, KERNEL_SIZE, activation='relu', padding='same'))
# # model.add(MaxPooling1D(2))
# # model.add(Dropout(DROPOUT_RATE))
# # model.add(Dense(HIDDEN_DIM, activation='relu'))
# model.add(Dropout(DROPOUT_RATE))
# model.add(Dense(NUM_CLASSES, activation='softmax'))
# model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.summary()

In [None]:
# !pip install tf2crf

In [None]:
# from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import Model
from keras.layers import Dense, Conv1D
from tf2crf import CRF, ModelWithCRFLoss
from keras import Input

# Build CNN model
# model = Sequential()
inputs = Input(shape=(MAX_LENGTH, EMBEDDING_DIM))
outputs = Conv1D(64, KERNEL_SIZE, activation='relu', padding='same')(inputs)
# model.add(MaxPooling1D(2, padding='same'))
# outputs = Dropout(DROPOUT_RATE)(inputs)
outputs = Conv1D(32, KERNEL_SIZE, activation='relu', padding='same')(inputs)
# model.add(MaxPooling1D(2))
# model.add(Dropout(DROPOUT_RATE))
# model.add(Dense(HIDDEN_DIM, activation='relu'))
# outputs = Dropout(DROPOUT_RATE)(outputs)
outputs = Dense(NUM_CLASSES, activation='relu')(outputs)
# outputs.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# outputs.summary()
crf = CRF(units=9)
# cnn_model.add(crf)
output = crf(outputs)
cnn_crf_model = Model(inputs, output)
cnn_crf_model.summary()
# cnn_crf_model = ModelWithCRFLoss(base_model, sparse_target=True)
# cnn_crf_model.summary()

In [None]:
cnn_crf_model.compile(optimizer='adam')
cnn_crf_model.fit(train.x, train.y, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=(valid.x, valid.y))

In [None]:
# Evaluation
loss, accuracy = cnn_crf_model.evaluate(test.x, test.y, batch_size=BATCH_SIZE)

print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

In [None]:
y_predict_cnn_crf = cnn_crf_model.predict(test.x)

In [None]:
y_predict_cnn_crf.shape
print(len(valid.unique_ner_tags))

In [None]:
# cnn_crf_model.export("../data/model_cnn.keras")

In [None]:
# cnn_model = tf.keras.models.load_model("model_cnn.keras")

## Model LSTM

In [15]:
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Define the model architecture
model_LSTM = Sequential()
model_LSTM.add(LSTM(128, input_shape=(MAX_LENGTH, EMBEDDING_DIM), return_sequences=True))
model_LSTM.add(LSTM(64, return_sequences=True))
model_LSTM.add(Dense(9, activation='softmax'))

model_LSTM.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 200, 128)          117248    
                                                                 
 lstm_1 (LSTM)               (None, 200, 64)           49408     
                                                                 
 dense (Dense)               (None, 200, 9)            585       
                                                                 
Total params: 167,241
Trainable params: 167,241
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Compile the model
model_LSTM.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model_LSTM.fit(train.x, train.y, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=(valid.x, valid.y))

Epoch 1/10
Epoch 2/10
Epoch 3/10

KeyboardInterrupt: 

In [12]:
model_LSTM = tf.keras.models.load_model("../Data/model_lstm.keras")

In [14]:
# Evaluation
loss, accuracy = model_LSTM.evaluate(test.x, test.y, batch_size=32)

print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

Test Loss: 15.438606262207031
Test Accuracy: 0.00039241238846443594


In [64]:
y_predict_lstm = model_LSTM.predict(test.x, batch_size=BATCH_SIZE)
y_predict_lstm.shape



(3453, 200, 9)

In [81]:
# k = 2
# tags = test.ner_tags[k]
# y = test.y[k]
# y_pred = y_predict_lstm[k]
# for i in range(len(tags)):
#     # if tags[i] != 'O':
#     print(np.argmax(y_pred[i]), test.idx2tag(np.argmax(y_pred[i])), sep="   ")
#     print(tags[i], y[i], sep="   ")
true = 0
false = 0
total = 0    
for i in range(len(train.ner_tags)):
    for j in range(len(train.ner_tags[i])):
        total += 1
        y_real = np.argmax(test.y[i][j])
        y_predict = np.argmax(y_predict_lstm[i][j])
        if y_real == y_predict: true += 1
        else: false += 1
print(true, false, total, true/total, false/total)

1281 519 1800 0.7116666666666667 0.28833333333333333


In [39]:
y_predict_lstm[0][0]

array([2.4998310e-01, 6.2051479e-04, 5.7046771e-02, 1.5939010e-04,
       6.6610635e-04, 2.6537925e-03, 4.9815875e-01, 5.7965193e-02,
       1.3274638e-01], dtype=float32)