In [1]:
!git clone https://github.com/taslimamindia/NERC.git

Cloning into 'NERC'...
remote: Enumerating objects: 23, done.[K
remote: Counting objects: 100% (23/23), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 23 (delta 4), reused 21 (delta 2), pack-reused 0[K
Unpacking objects: 100% (23/23), 1.07 MiB | 4.19 MiB/s, done.


In [19]:
cd /content/NERC

/content/NERC


# Imports

In [24]:
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from gensim.models import Word2Vec

import tensorflow as tf

from keras.layers import Embedding

from keras.utils import to_categorical, pad_sequences

In [25]:
# Download every NLTK resource this notebook's Preprocessing class relies on:
#   wordnet            -> WordNetLemmatizer (Preprocessing.lemmatize)
#   omw-1.4            -> presumably required next to wordnet by some NLTK releases; verify
#   punkt / punkt_tab  -> sent_tokenize / word_tokenize (Preprocessing.tokenize);
#                         which of the two is looked up depends on the NLTK version
#   stopwords          -> stopwords.words (Preprocessing.tokenize)
for resource in ("wordnet", "omw-1.4", "punkt", "punkt_tab", "stopwords"):
    nltk.download(resource, quiet=True)

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# Data container class

In [26]:
class Data(object):
    """Container for one dataset split plus the vocabularies shared by all splits.

    Class attributes (shared by every instance):
        unique_words: mapping word -> integer index.
        unique_ner_tags: mapping NER tag -> integer index.
        MAX_LENGTH: sequence length used when padding.

    Instance attributes:
        sentences / pos_tags / chunk_tags / ner_tags: one list per sentence.
        sentences_num / ner_tags_num: index-encoded versions (None until computed).
        x, y: final arrays (None until computed).
    """

    # Shared, class-level state: the vocabularies live on the class, not the instance.
    unique_words = {}
    unique_ner_tags = {}
    MAX_LENGTH = 200

    def __init__(self):
        self.sentences = []
        self.sentences_num = None
        self.ner_tags = []
        self.ner_tags_num = None
        self.chunk_tags = []
        self.pos_tags = []
        self.x, self.y = None, None

    def word2vec(self, vector_size=100):
        """Train and return a Word2Vec model on this object's sentences."""
        word2vec_model = Word2Vec(self.sentences, vector_size=vector_size, window=5, min_count=1, workers=4)
        return word2vec_model

    @staticmethod
    def _key_for(mapping, index):
        """Return the first key of `mapping` whose value equals `index`, else None."""
        for key, value in mapping.items():
            if value == index:
                return key
        return None

    def word2idx(self, word):
        """Return the index of `word` in the shared vocabulary, or None if unknown."""
        return self.unique_words.get(word)

    def idx2word(self, index):
        """Return the word stored under `index`, or None if no word has that index."""
        return self._key_for(self.unique_words, index)

    def tag2idx(self, tag):
        """Return the index of NER tag `tag`, or None if unknown."""
        return self.unique_ner_tags.get(tag)

    def idx2tag(self, index):
        """Return the NER tag stored under `index`, or None if no tag has that index."""
        return self._key_for(self.unique_ner_tags, index)

# Loading data

In [27]:
class Loading():
    """Read a CoNLL-2003 style file into a ``Data`` object.

    Each non-blank line is expected to hold four whitespace-separated columns:
    token, POS tag, chunk tag, NER tag. Blank lines and ``-DOCSTART-`` lines
    mark sentence boundaries. Parsed sentences are appended to
    ``data.sentences``, ``data.pos_tags``, ``data.chunk_tags`` and
    ``data.ner_tags``.

    Args:
        data: object whose four lists above are extended in place.
        file: path of the file to read.
    """
    def __init__(self, data: Data, file):
        self.data = data
        self.load_sentences(file)
        print("Loading successfully")

    def _store_sentence(self, tokens, pos_tags, chunk_tags, ner_tags):
        """Append one finished sentence (all four columns) to the data object."""
        self.data.sentences.append(tokens)
        self.data.pos_tags.append(pos_tags)
        self.data.chunk_tags.append(chunk_tags)
        self.data.ner_tags.append(ner_tags)

    def load_sentences(self, filepath):
        """Parse `filepath` and append every sentence found to ``self.data``."""
        tokens, pos_tags, chunk_tags, ner_tags = [], [], [], []
        with open(filepath, 'r', encoding='utf-8') as f:
            # Iterate lazily instead of materialising the whole file with readlines().
            for line in f:
                # split() drops the trailing newline (and any '\r'), so the boundary
                # test no longer depends on the exact line terminator.
                fields = line.split()
                if not fields or fields[0] == '-DOCSTART-':
                    if tokens:
                        self._store_sentence(tokens, pos_tags, chunk_tags, ner_tags)
                        tokens, pos_tags, chunk_tags, ner_tags = [], [], [], []
                    continue
                tokens.append(fields[0])
                pos_tags.append(fields[1])
                chunk_tags.append(fields[2])
                ner_tags.append(fields[3])
        # A file that does not end with a blank line still has a pending sentence.
        if tokens:
            self._store_sentence(tokens, pos_tags, chunk_tags, ner_tags)

# Preprocessing

In [28]:
class Preprocessing():
    """Text normalisation steps applied to the sentences of a ``Data`` object.

    Args:
        data: object whose ``sentences`` (and ``ner_tags``) are read and rewritten.
        text: optional raw text; only used by ``tokenize``.
        lang: language name passed to the NLTK tokenizers and stop-word list.
    """
    def __init__(self, data:Data, text=None, lang="english"):
        self.data = data
        self.text = text
        self.lang = lang

    def tokenize(self):
        """Split ``self.text`` into sentences of tokens, dropping stop words.

        Does nothing when no text was supplied.
        """
        if self.text is None:
            return
        # Build the stop-word set once instead of re-reading the list for every token.
        # NOTE(review): filtering runs before lowercasing(), so presumably
        # capitalised stop words survive — confirm that is intended.
        stop_words = set(stopwords.words(self.lang))
        self.data.sentences = [
            [token for token in word_tokenize(sentence, language=self.lang) if token not in stop_words]
            for sentence in sent_tokenize(self.text, language=self.lang)
        ]

    def lowercasing(self):
        """Lower-case every token of every sentence."""
        self.data.sentences = [[word.lower() for word in sentence] for sentence in self.data.sentences]

    def lemmatize(self):
        """Lemmatize every token with WordNet, then rebuild the shared vocabularies."""
        lemmatizer = WordNetLemmatizer()
        self.data.sentences = [[lemmatizer.lemmatize(word) for word in sentence] for sentence in self.data.sentences]
        self.unicity() # must be changed

    def unicity(self):
        """Rebuild ``Data.unique_words`` and ``Data.unique_ner_tags`` from this data.

        Both mappings are replaced on the ``Data`` class itself. Keys are sorted
        before indices are assigned, so the same set of words / tags always
        yields the same indices (plain set iteration order is not stable).
        """
        words = sorted({word for sentence in self.data.sentences for word in sentence})
        # Word indices start at 1 (0 is left unused — presumably reserved for padding).
        Data.unique_words = {word: index for index, word in enumerate(words, start=1)}
        tags = sorted({tag for sentence_tags in self.data.ner_tags for tag in sentence_tags})
        Data.unique_ner_tags = {tag: index for index, tag in enumerate(tags)}

# Vectorization

In [29]:
class Vectorization():
    """Turn the token / tag lists of a ``Data`` object into padded arrays.

    On construction, ``data.sentences_num`` and ``data.ner_tags_num`` are filled
    with the indices found in ``Data.unique_words`` / ``Data.unique_ner_tags``
    (a missing word or tag raises ``KeyError``).

    Args:
        data: object providing ``sentences``, ``ner_tags`` and ``MAX_LENGTH``.
        word2vec_model: optional, already trained gensim ``Word2Vec`` model.
            When given it is used for every embedding lookup, which lets several
            splits share one embedding space. When omitted (the default) a new
            model is trained on ``data.sentences`` each time ``word2vec`` runs.
    """
    def __init__(self, data:Data, word2vec_model=None):
        self.data = data
        self.word2vec_model = word2vec_model
        data.sentences_num = [[Data.unique_words[word] for word in sentence] for sentence in data.sentences]
        data.ner_tags_num = [[Data.unique_ner_tags[tag] for tag in tags] for tags in data.ner_tags]

    def padding_x(self):
        """Embed the sentences and pad/truncate them to ``MAX_LENGTH`` into ``data.x``.

        Leaves ``data.x`` untouched when there are no sentences.
        """
        if len(self.data.sentences_num) > 0:
            # NOTE: from here on sentences_num holds embedding vectors, not indices.
            self.data.sentences_num = self.word2vec()
            self.data.x = pad_sequences(
                sequences=self.data.sentences_num,
                maxlen=self.data.MAX_LENGTH,
                dtype="float32",
                padding="post",
                value=0
            )

    def padding_y(self):
        """Pad/truncate the tag indices to ``MAX_LENGTH`` into ``data.y``.

        Padding positions receive the index of the "O" tag. Leaves ``data.y``
        untouched when there are no tags.
        """
        if len(self.data.ner_tags_num) > 0:
            self.data.y = pad_sequences(
                sequences=self.data.ner_tags_num,
                maxlen=self.data.MAX_LENGTH,
                dtype="float32",
                padding="post",
                value=self.data.unique_ner_tags.get("O")
            )

    def word2vec(self, min_count=1, vector_size=100, window=5):
        """Return, per sentence, the list of embedding vectors of its words.

        The training parameters are only used when no model was supplied to the
        constructor. Words absent from the model's vocabulary map to a zero
        vector instead of raising ``KeyError``.
        """
        word2vec_model = self.word2vec_model
        if word2vec_model is None:
            word2vec_model = Word2Vec(self.data.sentences, min_count=min_count, vector_size=vector_size, window=window)
        keyed_vectors = word2vec_model.wv
        unknown = np.zeros(keyed_vectors.vector_size, dtype="float32")
        vectors = [
            [keyed_vectors[word] if word in keyed_vectors else unknown for word in sentence]
            for sentence in self.data.sentences
        ]
        return vectors

    def vectorized_x(self):
        """Build ``data.x`` as a float32 array; no-op when there is nothing to embed."""
        self.padding_x()
        if self.data.x is None:
            # padding_x produced nothing (no sentences): keep x as None rather
            # than wrapping None in an array.
            return
        self.data.x = np.array(self.data.x, dtype="float32")

    def vectorized_y(self):
        """Build ``data.y`` as a one-hot float32 array; no-op when there are no tags."""
        self.padding_y()
        if self.data.y is None:
            return
        # One-hot encode the whole padded index matrix in a single indexing
        # operation instead of one call per token.
        identity = np.eye(len(Data.unique_ner_tags), dtype="float32")
        self.data.y = identity[np.asarray(self.data.y).astype("int64")]

# Main

## Pretraining for CONLL2003

In [30]:

def pretraining_CoNLL3(path: str, base_dir: str = "conll2003_english/"):
    """Load one CoNLL-2003 split and run the full preprocessing pipeline on it.

    Steps: load the file, lower-case and lemmatize the tokens, then build the
    padded ``x`` / ``y`` arrays.

    Args:
        path: file name of the split inside `base_dir` (e.g. "train.txt").
        base_dir: directory holding the split files; defaults to the
            repository-relative "conll2003_english/" folder.

    Returns:
        The populated ``Data`` object.

    Note:
        ``Preprocessing.lemmatize`` rebuilds ``Data.unique_words`` and
        ``Data.unique_ner_tags`` on every call, so after this function returns
        those class-level mappings describe the split that was just processed.
    """
    import os  # local import: the notebook's import cell does not provide os

    data = Data()
    Loading(data=data, file=os.path.join(base_dir, path))
    preprocessing = Preprocessing(data=data)
    preprocessing.lowercasing()
    preprocessing.lemmatize()
    vector = Vectorization(data=data)
    vector.vectorized_x()
    vector.vectorized_y()
    return data
    

    # Loading(data = test, file=base_file + "test.txt")

## Define Trainset

In [31]:
# Build the training split, then show its first sentence followed by that
# sentence's NER tags.
train = pretraining_CoNLL3("train.txt")
for column in (train.sentences, train.ner_tags):
    print(column[0])

Loading successfully
['eu', 'reject', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.']
['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']


In [32]:
# First training sentence and its NER tags, one per line.
print(train.sentences[0], train.ner_tags[0], sep="\n")

['eu', 'reject', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.']
['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']


In [33]:
# Shapes of the training features / labels, then the container type of the features.
for label, array in (("X_train", train.x), ("y_train", train.y)):
    print(label, array.shape)
print(type(train.x))

X_train (14041, 200, 100)
y_train (14041, 200, 9)
<class 'numpy.ndarray'>


## Define Testset

In [34]:
# Build the test split and report the shapes of its features / labels.
test = pretraining_CoNLL3("test.txt")
for label, array in (("X_test", test.x), ("y_test", test.y)):
    print(label, array.shape)

Loading successfully
X_test (3453, 200, 100)
y_test (3453, 200, 9)


## Define ValidSet

In [37]:
# Build the validation split and report the shapes of its features / labels.
valid = pretraining_CoNLL3("valid.txt")
for label, array in (("X_valid", valid.x), ("y_valid", valid.y)):
    print(label, array.shape)

Loading successfully
X_valid (3250, 200, 100)
y_valid (3250, 200, 9)


## New input text

In [None]:
# Preprocessing.lemmatize() calls unicity(), which REPLACES the class-level
# vocabularies (Data.unique_words / Data.unique_ner_tags) with ones built from
# the data it is given. Snapshot them first and restore them afterwards so this
# demo does not wipe the vocabularies derived from the CoNLL splits.
saved_vocabularies = (Data.unique_words, Data.unique_ner_tags)
try:
    test_text = Data()

    preprocessing = Preprocessing(data = test_text, text = "Obama is the president of the United States. I am from Guinea, nice to meet you.")
    preprocessing.tokenize()
    preprocessing.lowercasing()
    preprocessing.lemmatize()
    print(test_text.sentences)

    # Vectorization looks words up in Data.unique_words, so it has to run while
    # the temporary vocabulary for this text is still installed.
    # NOTE(review): this trains a Word2Vec model on these sentences only; confirm
    # the resulting vectors are meant to be fed to a model trained on other data.
    vector = Vectorization(test_text)
    vector.vectorized_x()
    print(test_text.x.shape)
finally:
    Data.unique_words, Data.unique_ner_tags = saved_vocabularies

# Training

In [35]:
# Sizes read from the shared Data class.
MAX_LENGTH = Data.MAX_LENGTH                # padded sequence length
NUM_CLASSES = len(Data.unique_ner_tags)     # number of distinct NER tags
NUM_WORDS = len(Data.unique_words)          # vocabulary size
OUTPUT_DIM = 100

# Hyperparameters
EMBEDDING_DIM = 100     # feature size per token expected by the models' input_shape
NUM_FILTERS = 256       # not referenced by the cells visible in this notebook
KERNEL_SIZE = 3         # Conv1D kernel width
HIDDEN_DIM = 200        # not referenced by the cells visible in this notebook
DROPOUT_RATE = 0.5
BATCH_SIZE = 32
EPOCHS = 10

print(MAX_LENGTH, NUM_CLASSES, NUM_WORDS, OUTPUT_DIM)


200 9 8050 100


In [38]:
# from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv1D, MaxPooling1D


# Build CNN model
model = Sequential()
model.add(Conv1D(64, KERNEL_SIZE, activation='relu', input_shape=(MAX_LENGTH, EMBEDDING_DIM), padding='same'))
# model.add(MaxPooling1D(2, padding='same'))
model.add(Dropout(DROPOUT_RATE))
model.add(Conv1D(32, KERNEL_SIZE, activation='relu', padding='same'))
# model.add(MaxPooling1D(2))
# model.add(Dropout(DROPOUT_RATE))
# model.add(Dense(HIDDEN_DIM, activation='relu'))
model.add(Dropout(DROPOUT_RATE))
model.add(Dense(NUM_CLASSES, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.summary()
# Train CNN model
model.fit(train.x, train.y, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=(valid.x, valid.y))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f278e4a3850>

In [39]:
# Persist the trained CNN (`model`) to the file model_cnn.keras.
model.save("model_cnn.keras")

In [40]:
# Reload the saved CNN under the name `cnn_model`, which the CRF cells below use.
# (The original line ended with a stray `cnn` token, which is a syntax error.)
cnn_model = tf.keras.models.load_model("model_cnn.keras")

In [42]:
# pip install git+https://www.github.com/keras-team/keras-contrib.git # For google colab.

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://www.github.com/keras-team/keras-contrib.git
  Cloning https://www.github.com/keras-team/keras-contrib.git to /tmp/pip-req-build-5lzsr_mu
  Running command git clone --filter=blob:none --quiet https://www.github.com/keras-team/keras-contrib.git /tmp/pip-req-build-5lzsr_mu
  Resolved https://www.github.com/keras-team/keras-contrib.git to commit 3fc5ef709e061416f4bc8a92ca3750c824b5d2b0
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: keras-contrib
  Building wheel for keras-contrib (setup.py) ... [?25l[?25hdone
  Created wheel for keras-contrib: filename=keras_contrib-2.0.8-py3-none-any.whl size=101078 sha256=55a2a0d40a72b548f1f5b128d839fdee9220ebac1f1d7f8d10da165a59deca28
  Stored in directory: /tmp/pip-ephem-wheel-cache-viuln86z/wheels/83/b3/99/125cd08d3e8224b434b1b4badd6f8b2651e0251b7e8f983e60
Successfully built keras-co

In [43]:
from keras_contrib.layers import CRF

# cnn_model.trainable = False 

# Build CRF layer
# `cnn_model` must already hold the CNN reloaded from model_cnn.keras before this
# cell runs; the CRF layer is appended on top of its softmax Dense output.
# NOTE(review): the recorded output of the following fit cell is an AttributeError;
# keras_contrib's CRF may not be compatible with the installed Keras/TensorFlow — verify.
crf = CRF(NUM_CLASSES, learn_mode='marginal')
cnn_model.add(crf)
# Recompile so that training uses the CRF layer's own loss and accuracy.
cnn_model.compile(loss=crf.loss_function, optimizer='adam', metrics=[crf.accuracy])

cnn_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_2 (Conv1D)           (None, 200, 64)           19264     
                                                                 
 dropout_2 (Dropout)         (None, 200, 64)           0         
                                                                 
 conv1d_3 (Conv1D)           (None, 200, 32)           6176      
                                                                 
 dropout_3 (Dropout)         (None, 200, 32)           0         
                                                                 
 dense_1 (Dense)             (None, 200, 9)            297       
                                                                 
 crf (CRF)                   (None, 200, 9)            189       
                                                                 
Total params: 25,926
Trainable params: 25,926
Non-trai



In [47]:
# Train the CNN+CRF stack with the same data, batch size and epoch count as the plain CNN.
# NOTE(review): the recorded output of this cell stops at epoch 1 with an AttributeError.
cnn_model.fit(train.x, train.y, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=(valid.x, valid.y))

Epoch 1/10


AttributeError: ignored

## Model LSTM

In [45]:
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Define the model architecture: two stacked LSTMs that return the full sequence
# (one output per time step), followed by a softmax over the NER classes.
model_LSTM = Sequential()
model_LSTM.add(LSTM(128, input_shape=(MAX_LENGTH, EMBEDDING_DIM), return_sequences=True))
model_LSTM.add(LSTM(64, return_sequences=True))
# Use NUM_CLASSES rather than a literal 9 so the output size always matches the
# one-hot width of the labels, as the CNN model does.
model_LSTM.add(Dense(NUM_CLASSES, activation='softmax'))

model_LSTM.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 200, 128)          117248    
                                                                 
 lstm_1 (LSTM)               (None, 200, 64)           49408     
                                                                 
 dense_2 (Dense)             (None, 200, 9)            585       
                                                                 
Total params: 167,241
Trainable params: 167,241
Non-trainable params: 0
_________________________________________________________________


In [46]:
# Configure the optimisation: Adam, categorical cross-entropy, accuracy metric.
model_LSTM.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Fit on the training split; the validation split is evaluated after every epoch.
model_LSTM.fit(
    x=train.x,
    y=train.y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(valid.x, valid.y),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f27f5d50fa0>

In [None]:
# Evaluation of the CNN tagger (`model`) on the test split.
# Uses the shared BATCH_SIZE constant instead of repeating the literal 32.
loss, accuracy = model.evaluate(test.x, test.y, batch_size=BATCH_SIZE)

print('Test Loss:', loss)
print('Test Accuracy:', accuracy)