# Importation

In [None]:
import pandas as pd
import numpy as np

from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from gensim.models import Word2Vec

import tensorflow as tf

from keras.layers import Embedding

from keras.utils import to_categorical, pad_sequences

# Class definition for the data container.

In [None]:
class Data:
    """Hold the raw and numeric forms of one dataset split.

    The two vocabularies and ``MAX_LENGTH`` are class attributes, so they are
    shared by every ``Data`` instance instead of being stored per split.
    """

    # word -> integer id; empty here, assigned from outside the class.
    unique_words = {}
    # NER tag -> integer id; empty here, assigned from outside the class.
    unique_ner_tags = {}
    # Fixed sequence length made available to the padding code.
    MAX_LENGTH = 200

    def __init__(self):
        """Create an empty split with no sentences, tags or model arrays."""
        # Per-sentence token lists and their parallel annotations.
        self.sentences = []
        self.pos_tags = []
        self.chunk_tags = []
        self.ner_tags = []
        # Numeric encodings; None until something computes them.
        self.sentences_num = None
        self.ner_tags_num = None
        # Model input and target; None until something computes them.
        self.x = None
        self.y = None

    def word2vec(self, vector_size=100):
        """Train a gensim ``Word2Vec`` model on ``self.sentences``.

        Args:
            vector_size: dimensionality of the learned word vectors.

        Returns:
            The trained model (window=5, min_count=1, workers=4).
        """
        return Word2Vec(
            self.sentences,
            vector_size=vector_size,
            window=5,
            min_count=1,
            workers=4,
        )
        

# Loading data

In [None]:
class Loading():
    """Parse a CoNLL-2003 style annotation file into a ``Data`` container.

    Every non-separator line is split on single spaces and its first four
    fields are read as token, POS tag, chunk tag and NER tag. A blank line or
    the ``-DOCSTART- -X- -X- O`` marker closes the current sentence.
    Constructing the object performs the load and prints a confirmation.
    """

    # Document marker line, compared after the trailing newline is removed.
    _DOC_START = '-DOCSTART- -X- -X- O'

    def __init__(self, data: "Data", file):
        """Load ``file`` into ``data``.

        Args:
            data: container whose ``sentences``, ``pos_tags``, ``chunk_tags``
                and ``ner_tags`` lists are appended to in place.
            file: path of the annotation file to read.
        """
        self.data = data
        self.load_sentences(file)
        print("Loading successfully")

    def load_sentences(self, filepath):
        """Append every sentence found in ``filepath`` to ``self.data``.

        Args:
            filepath: path of the annotation file to read.

        Raises:
            OSError: if the file cannot be opened.
            IndexError: if a data line has fewer than four space-separated
                fields.
        """
        tokens, pos_tags, chunk_tags, ner_tags = [], [], [], []
        with open(filepath, 'r') as f:
            # Iterate lazily instead of materialising the file with readlines().
            for line in f:
                content = line.rstrip('\n')
                if content == '' or content == self._DOC_START:
                    if tokens:
                        self._store_sentence(tokens, pos_tags, chunk_tags, ner_tags)
                        # Fresh lists: the stored ones now belong to self.data.
                        tokens, pos_tags, chunk_tags, ner_tags = [], [], [], []
                    continue
                fields = content.split(' ')
                # Read all four fields first so the lists stay aligned if a
                # malformed line raises IndexError.
                token, pos, chunk, ner = fields[0], fields[1], fields[2], fields[3]
                tokens.append(token)
                pos_tags.append(pos)
                chunk_tags.append(chunk)
                ner_tags.append(ner)
        # A file that does not end with a blank line still has a pending
        # sentence here; without this it would be silently dropped.
        if tokens:
            self._store_sentence(tokens, pos_tags, chunk_tags, ner_tags)

    def _store_sentence(self, tokens, pos_tags, chunk_tags, ner_tags):
        """Append one finished sentence and its tag lists to ``self.data``."""
        self.data.sentences.append(tokens)
        self.data.pos_tags.append(pos_tags)
        self.data.chunk_tags.append(chunk_tags)
        self.data.ner_tags.append(ner_tags)

# Preprocessing

In [None]:
class Preprocessing():
    """Text normalisation steps applied in place to a ``Data`` container.

    The steps (tokenize, lowercasing, lemmatize) are independent methods and
    each one rewrites ``data.sentences``. ``lemmatize`` additionally rebuilds
    the vocabularies stored on the ``Data`` class.
    """

    def __init__(self, data: "Data", text=None, lang="english"):
        """Bind the container and optional raw text.

        Args:
            data: container whose ``sentences`` are rewritten by each step.
            text: optional raw text; only used by ``tokenize``.
            lang: language name passed to the NLTK tokenizers and stop-word
                list.
        """
        self.data = data
        self.text = text
        self.lang = lang

    def tokenize(self):
        """Split ``self.text`` into sentences of tokens, minus stop words.

        Does nothing when no text was supplied. The stop-word comparison is
        case-sensitive and runs on the tokens as produced by the tokenizer.
        """
        if self.text is None:
            return
        # Load the stop-word list once; calling stopwords.words() inside the
        # comprehension would re-read it for every single token.
        stop_words = set(stopwords.words(self.lang))
        tokenized = [
            word_tokenize(sentence, language=self.lang)
            for sentence in sent_tokenize(self.text, language=self.lang)
        ]
        self.data.sentences = [
            [token for token in sentence if token not in stop_words]
            for sentence in tokenized
        ]

    def lowercasing(self):
        """Lower-case every token of every sentence."""
        self.data.sentences = [[word.lower() for word in sentence] for sentence in self.data.sentences]

    def lemmatize(self):
        """Lemmatize every token, then rebuild the shared vocabularies."""
        lemmatizer = WordNetLemmatizer()
        self.data.sentences = [[lemmatizer.lemmatize(word) for word in sentence] for sentence in self.data.sentences]
        # TODO: the original author flagged this coupling as "must be changed".
        self.unicity()

    def unicity(self):
        """Rebuild ``Data.unique_words`` and ``Data.unique_ner_tags``.

        Both mappings live on the ``Data`` class, so they are replaced for
        every instance, using only this container's sentences and tags.
        Entries are sorted before numbering so ids are reproducible instead
        of depending on set iteration order. Word ids start at 1; tag ids
        start at 0.
        """
        words = sorted({word for sentence in self.data.sentences for word in sentence})
        Data.unique_words = {word: index for index, word in enumerate(words, start=1)}
        tags = sorted({tag for sentence_tags in self.data.ner_tags for tag in sentence_tags})
        Data.unique_ner_tags = {tag: index for index, tag in enumerate(tags)}

# Vectorization

In [None]:
class Vectorization():
    """Turn a ``Data`` container's sentences and NER tags into model arrays.

    ``vectorized_x`` and ``vectorized_y`` are the entry points; they write
    ``data.x`` and ``data.y``. When the container has no sentences (or no
    tags) the corresponding array is left as ``None``.

    NOTE(review): ``padding_x`` trains a new Word2Vec model on this
    container's own sentences, so vectors built for different containers
    come from independently trained models.
    """

    def __init__(self, data: "Data"):
        """Encode words and tags as integer ids using the ``Data`` vocabularies.

        Args:
            data: container to vectorise; ``sentences_num`` and
                ``ner_tags_num`` are set on it.

        Raises:
            KeyError: if a word or tag is missing from ``Data.unique_words``
                or ``Data.unique_ner_tags``.
        """
        self.data = data
        data.sentences_num = [[Data.unique_words[word] for word in sentence] for sentence in data.sentences]
        data.ner_tags_num = [[Data.unique_ner_tags[tag] for tag in tags] for tags in data.ner_tags]

    def padding_x(self):
        """Replace word ids by Word2Vec vectors and pad them into ``data.x``.

        Note that ``data.sentences_num`` is overwritten with the vectors.
        """
        if len(self.data.sentences_num) > 0:
            self.data.sentences_num = self.word2vec()
            self.data.x = pad_sequences(
                sequences=self.data.sentences_num,
                maxlen=self.data.MAX_LENGTH,
                dtype="float32",
                padding="post",
                value=0
            )

    def padding_y(self):
        """Pad the tag ids into ``data.y``, filling with the id of tag "O"."""
        if len(self.data.ner_tags_num) > 0:
            self.data.y = pad_sequences(
                sequences=self.data.ner_tags_num,
                maxlen=self.data.MAX_LENGTH,
                dtype="float32",
                padding="post",
                value=self.data.unique_ner_tags.get("O")
            )

    def word2vec(self, min_count=1, vector_size=100, window=5):
        """Train Word2Vec on ``data.sentences`` and look up every token.

        Args:
            min_count: forwarded to ``Word2Vec``.
            vector_size: dimensionality of the word vectors.
            window: forwarded to ``Word2Vec``.

        Returns:
            One list per sentence holding one vector per token.
        """
        word2vec_model = Word2Vec(self.data.sentences, min_count=min_count, vector_size=vector_size, window=window)
        return [[word2vec_model.wv[word] for word in sentence] for sentence in self.data.sentences]

    def vectorized_x(self):
        """Build ``data.x`` as a float32 array; leave it ``None`` without data."""
        self.padding_x()
        if self.data.x is None:
            # Nothing was padded; np.array(None) would yield a 0-d NaN array.
            return
        self.data.x = np.array(self.data.x, dtype="float32")

    def vectorized_y(self):
        """Build ``data.y`` as one-hot float32; leave it ``None`` without tags."""
        self.padding_y()
        if self.data.y is None:
            # No tags (e.g. unlabeled input text): nothing to encode.
            return
        labels = np.asarray(self.data.y)
        # One call on the whole array instead of one call per tag.
        one_hot = to_categorical(labels, num_classes=len(Data.unique_ner_tags))
        one_hot = np.asarray(one_hot, dtype='float32')
        # to_categorical drops a trailing axis of size 1; restore the
        # per-token layout so the result is always labels.shape + (classes,).
        self.data.y = one_hot.reshape(labels.shape + (one_hot.shape[-1],))

# Main

## Training dataset

In [None]:
# Empty container for the training split; filled by the cells below.
train = Data()
# test = Data()
# valid = Data()

In [None]:
# Folder holding the dataset files. The trailing slash is required because
# file names are appended with plain string concatenation.
base_file = "conll2003_english/"

# Constructing Loading fills `train` in place; the instance is not kept.
Loading(file=base_file + "train.txt", data=train)
# Loading(data = valid, file=base_file + "valid.txt")

# Show the first sentence followed by its NER tags.
print(train.sentences[0], train.ner_tags[0], sep="\n")

In [None]:
# Normalise the training tokens: lower-case first, then lemmatize.
# lemmatize() also rebuilds the vocabularies stored on the Data class.
preprocessing = Preprocessing(train)
preprocessing.lowercasing()
preprocessing.lemmatize()

# First sentence after normalisation, followed by its (unchanged) NER tags.
print(train.sentences[0], train.ner_tags[0], sep="\n")

In [None]:
# Build the model input (train.x) and the one-hot target (train.y).
vector = Vectorization(data=train)
vector.vectorized_x()
vector.vectorized_y()

# Report the shapes of both arrays.
print("X_train", train.x.shape)
print("y_train", train.y.shape)

## Testing set

In [None]:
# Same pipeline as for the training split, applied to the test file.
test = Data()
Loading(file=base_file + "test.txt", data=test)

# NOTE(review): lemmatize() calls unicity(), which replaces the vocabularies
# on the Data class with ones built from this split only.
preprocessing = Preprocessing(test)
preprocessing.lowercasing()
preprocessing.lemmatize()

# NOTE(review): vectorized_x() trains its own Word2Vec model on the test
# sentences; confirm that this is intended.
vector = Vectorization(data=test)
vector.vectorized_x()
vector.vectorized_y()

print(test.x.shape, test.y.shape)

## New input text

In [None]:
# Run the text pipeline on a raw string instead of an annotated file.
test_text = Data()

preprocessing = Preprocessing(
    text="Obama is the president of the United States. I am from Guinea, nice to meet you.",
    data=test_text,
)
preprocessing.tokenize()
preprocessing.lowercasing()
# NOTE(review): lemmatize() rebuilds the Data-level vocabularies; this
# container has no NER tags, so Data.unique_ner_tags becomes empty here.
preprocessing.lemmatize()
print(test_text.sentences)

# Only the input side is vectorised; there are no labels for raw text.
vector = Vectorization(data=test_text)
vector.vectorized_x()
print(test_text.x.shape)

# Training

In [None]:
# Read the model dimensions from the prepared training arrays instead of
# repeating magic numbers, so they cannot drift from the data.
# train.y is (sentences, MAX_LENGTH, classes): the last axis is the one-hot width.
num_classes = train.y.shape[-1]
max_length = train.MAX_LENGTH
# train.x is (sentences, MAX_LENGTH, vector_size): the last axis is the
# Word2Vec vector size.
embedding_dim = train.x.shape[-1]
# NOTE: despite its name this is the number of training sentences.
input_dim = len(train.sentences)
print(num_classes, max_length, embedding_dim, input_dim)

In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Define the model architecture: two stacked LSTMs that keep one output per
# time step (return_sequences=True), followed by a per-token softmax.
# Dimensions come from the hyper-parameter cell rather than literals, so the
# output width always matches the one-hot targets.
model = Sequential()
model.add(LSTM(128, input_shape=(max_length, embedding_dim), return_sequences=True))
model.add(LSTM(64, return_sequences=True))
model.add(Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(train.x, train.y, epochs=10, batch_size=32)

In [None]:
# Score the trained model on the test arrays. With the single 'accuracy'
# metric configured at compile time, evaluate() yields loss then accuracy.
loss, accuracy = model.evaluate(
    x=test.x,
    y=test.y,
    batch_size=32,
)

print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

# Noisy 

In [None]:
# from keras_contrib.layers import CRF

# word2vec_model = train.word2vec()

# inputs = tf.keras.layers.Input(shape=(max_length, embedding_dim), dtype=tf.float32, name='sequence_input')
# conv1D = tf.keras.layers.Conv1D(filters=32, kernel_size=3, padding='same', activation='relu')(inputs)
# maxPooling1D = tf.keras.layers.MaxPooling1D(pool_size=2)(conv1D)
# outputs = tf.keras.layers.Dense(num_classes, activation='relu')(maxPooling1D)
# base = tf.keras.Model(inputs=inputs, outputs=outputs)

# crf_layer = CRF(num_classes, sparse_target=False)
# model = crf_layer(base)

# model.summary()

In [None]:
# model = tf.keras.Sequential([
#   tf.keras.layers.Input(shape=(max_length, 100), dtype=tf.float32, name='sequence_input'),
#   tf.keras.layers.Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'),
#   tf.keras.layers.MaxPooling1D(pool_size=2),
#   tf.keras.layers.Dense(num_classes, activation='relu'),
#   tf.keras.layers.Flatten(),
#   tf.keras.layers.Dense(num_classes, activation='softmax')
# ])

# model.summary()

In [None]:
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# model.compile(loss=crf_layer.loss_function, optimizer='adam', metrics=[crf_layer.accuracy])


In [None]:
# # Trainning
# batch_size = 56
# num_epochs = 5

# model.fit(train.x, train.y, epochs=num_epochs, batch_size=batch_size)

# # # Evaluation
# # loss, accuracy = model.evaluate(X_test, y_test, batch_size=batch_size)

# # print('Test Loss:', loss)
# # print('Test Accuracy:', accuracy)

In [None]:
# model = tf.keras.Sequential()
# model.add(tf.keras.layers.Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(113, 100)))
# model.add(tf.keras.layers.MaxPooling1D(pool_size=2))
# model.add(tf.keras.layers.Conv1D(filters=32, kernel_size=3, activation='relu'))
# model.add(tf.keras.layers.MaxPooling1D(pool_size=2))
# model.add(tf.keras.layers.Flatten())
# model.add(tf.keras.layers.Dense(units=128, activation='relu'))
# model.add(tf.keras.layers.Dense(units=9, activation='softmax'))

# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# model.fit(train.x, train.y, epochs=10, validation_split=0.2)

In [None]:
# # # Compile the model
# # model.compile(optimizer='adam', loss=tfa.losses.SigmoidFocalCrossEntropy(), metrics=[tfa.metrics.F1Score(num_classes=9, threshold=0.5, dtype='float32')])
# import tensorflow_addons as tfa
# # no need to specify a loss for CRFModel, model will compute crf loss by itself
# # model.compile(optimizer=tf.keras.optimizers.Adam(3e-4), metrics=['acc'])
# model.compile(
#     optimizer=tf.keras.optimizers.Adam(3e-4),
#     loss="categorical_crossentropy",
#     metrics=[tfa.metrics.MultiLabelConfusionMatrix(num_classes=num_classes)]
# )

In [None]:
        # vocab_size = len(word2vec_model.wv)
        # embedding_dim = 100
        # embedding_matrix = np.zeros((vocab_size, embedding_dim))
        # for i, vec in enumerate(word2vec_model.wv):
        #     embedding_matrix[i] = vec
        # embedding_layer = Embedding(
        #     input_dim=vocab_size,
        #     output_dim=embedding_dim,
        #     weights=[embedding_matrix],
        #     trainable=False)
        # return embedding_layer


# class DataSet():
#     def __init__(self):
#         self.labels = {"ner_tags": set(), "pos_tags": set(), "chunk_tags": set()}
#         self.word2vec_model = None
#     def unique_values(self, index = "ner_tags"):
#         if self.labels[index].__len__() > 0:
#             return dict(zip(self.labels[index], range(0, len(self.labels[index]))))
#         raise KeyError("Key does not exist !!!")