In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt
import os

In [None]:
# Load the NER corpus; the file is read with latin1 encoding.
# Alternative source that is currently disabled:
# df1 = pd.read_csv("../data/ner.csv", encoding="latin1")
df = pd.read_csv("../data/ner_dataset.csv", encoding="latin1")

In [None]:
# Peek at the first rows to check that the CSV parsed as expected.
df.head()

In [None]:
def WordMapper(data):
    """Build word <-> integer-id lookup tables from the ``Word`` column.

    Ids are assigned in the order in which each distinct word first appears
    in ``data``, so the mapping is identical on every run. Enumerating a
    ``set`` of strings would not guarantee that: its iteration order can
    change between interpreter sessions, silently re-labelling every id.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``Word`` column.

    Returns
    -------
    tuple of (dict, dict)
        ``(tok2idx, idx2tok)``: word -> id and id -> word.
    """
    # Series.unique() keeps first-appearance order.
    vocab = data['Word'].unique().tolist()

    tok2idx = {tok: idx for idx, tok in enumerate(vocab)}
    idx2tok = dict(enumerate(vocab))
    return tok2idx, idx2tok

def TagMapper(data):
    """Build tag <-> integer-id lookup tables from the ``Tag`` column.

    Ids are assigned in the order in which each distinct tag first appears
    in ``data``, so the class index of every tag is identical on every run.
    Enumerating a ``set`` of strings would not guarantee that, because its
    iteration order can change between interpreter sessions.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``Tag`` column.

    Returns
    -------
    tuple of (dict, dict)
        ``(tag2idx, idx2tag)``: tag -> id and id -> tag.
    """
    # Series.unique() keeps first-appearance order.
    tags = data['Tag'].unique().tolist()

    tag2idx = {tag: idx for idx, tag in enumerate(tags)}
    idx2tag = dict(enumerate(tags))
    return tag2idx, idx2tag

# Build the vocabulary and tag lookup tables once; later cells read these
# four dictionaries as notebook-level globals.
token2idx, idx2token = WordMapper(df)
tag2idx, idx2tag = TagMapper(df)

In [None]:
# Add integer-encoded copies of the Word and Tag columns to df (in place),
# using the lookup dictionaries, then preview the result.
df['WordIdx'] = df['Word'].map(token2idx)
df['TagIdx'] = df['Tag'].map(tag2idx)
df.head(10)

In [None]:
# Forward-fill missing values down every column.
# NOTE(review): presumably this is meant for 'Sentence #' (which looks to be
# set only on the first row of each sentence -- confirm); it also fills any
# gaps in the other columns with the value from the row above.
data_fillna = df.ffill(axis=0)
data_fillna

In [None]:
# Collapse the token-level rows into one row per sentence: every remaining
# column becomes a Python list holding that sentence's values in row order.
data_group = (
    data_fillna
    .groupby(['Sentence #'], as_index=False)
    .agg(list)
)

In [None]:
# Drop incomplete rows and the grouping key in one step.
# The result is re-bound instead of mutated in place, and errors='ignore'
# makes the column drop a no-op once 'Sentence #' is already gone, so the
# cell can be re-run without raising KeyError.
data_group = data_group.dropna().drop(columns=['Sentence #'], errors='ignore')

In [None]:
# Summarise the columns, dtypes and non-null counts of the per-sentence frame.
data_group.info()

In [None]:
# Length of the longest sentence (in tokens); 0 when there are no sentences.
count = max((len(words) for words in data_group['Word']), default=0)
print(f"maxlen {count}")

In [None]:
# Every sequence is padded to a fixed length of 104 tokens, the model's input length.

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

def PaddingTestTrainSplit(data_group, data, maxlen=104):
    """Pad the encoded sentences, one-hot encode the tags and split the data.

    Reads the notebook-level ``token2idx`` and ``tag2idx`` dictionaries.

    Parameters
    ----------
    data_group : pandas.DataFrame
        Per-sentence frame whose ``WordIdx`` and ``TagIdx`` columns hold
        lists of integer ids.
    data : object
        Unused; kept so existing calls keep working.
    maxlen : int, default 104
        Length every token and tag sequence is padded to (post-padding).

    Returns
    -------
    tuple
        ``(train_tokens, val_tokens, test_tokens,
        train_tags, val_tags, test_tags)``. The token items come from the
        padded int32 array; the tag items are lists of one-hot matrices.
    """
    UniqToken = len(token2idx)
    UniqTag = len(tag2idx)

    tokens = data_group['WordIdx'].tolist()
    # NOTE(review): the padding id ``UniqToken - 1`` is also the id of a real
    # vocabulary word, so padding and that word share an embedding row.
    # Changing it must be done together with the inference cell and the
    # models' ``input_dim``.
    padded = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value=UniqToken - 1)

    # Pad the tag sequences with the "O" tag id, then one-hot encode each
    # sentence separately.
    tags = data_group['TagIdx'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value=tag2idx["O"])
    pad_tags = [to_categorical(row, num_classes=UniqTag) for row in pad_tags]

    # First hold out the test set, then carve the validation set out of the
    # remainder; both splits use fixed seeds.
    tokens_, test_tokens, tags_, test_tags = train_test_split(
        padded, pad_tags, test_size=0.11, train_size=0.89, random_state=48)
    train_tokens, val_tokens, train_tags, val_tags = train_test_split(
        tokens_, tags_, test_size=0.11, train_size=0.89, random_state=50)

    print(
        'train_tokens length:', len(train_tokens),
        # This line used to repeat the train_tokens count under the wrong label.
        '\ntrain_tags length:', len(train_tags),
        '\ntest_tokens length:', len(test_tokens),
        '\ntest_tags:', len(test_tags),
        '\nval_tokens:', len(val_tokens),
        '\nval_tags:', len(val_tags),
    )

    return train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags

# Run the preprocessing; `df` is passed as the second argument, which the
# function body never reads.
train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags = PaddingTestTrainSplit(data_group, df)

In [None]:
import numpy as np
import tensorflow as tf
tf.config.set_visible_devices([], 'GPU')
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model


In [None]:
# To find out which devices your operations and tensors are assigned to
# NOTE(review): device-placement logging is switched on here and is not
# switched off again in any visible cell.
tf.debugging.set_log_device_placement(True)

# Create some tensors and perform an operation
a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
c = tf.matmul(a, b)

print(c)

# The string below records output from an earlier run. It is a bare string
# expression and the last statement of the cell.
"""
2.3.1
Executing op MatMul in device /job:localhost/replica:0/task:0/device:CPU:0
tf.Tensor(
[[22. 28.]
 [49. 64.]], shape=(2, 2), dtype=float32)
"""

In [None]:
# Seed NumPy's and TensorFlow's global random generators; intended to make
# model initialisation and training repeatable between runs.
from numpy.random import seed
seed(1)
tf.random.set_seed(2)

In [None]:
def model_train():
    """Build, compile and summarise a BiLSTM sequence tagger.

    Despite the name, no training happens here. Vocabulary and tag counts
    are read from the notebook-level ``token2idx`` and ``tag2idx``.

    Returns
    -------
    The compiled Keras ``Sequential`` model.
    """
    vocab_size = len(token2idx)
    n_classes = len(tag2idx)

    # Layers are listed in the order they are applied.
    model = Sequential([
        Input(shape=(104,)),
        Embedding(input_dim=vocab_size, output_dim=104, input_length=104),
        Dropout(0.1),
        Bidirectional(LSTM(units=104, return_sequences=True, recurrent_dropout=0.1)),
        TimeDistributed(Dense(n_classes)),
        Dense(n_classes, activation='softmax'),
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()
    return model

In [None]:
# Globals read by get_bilstm_lstm_model().
# input_dim is the number of distinct words plus one extra embedding row
# (presumably reserved for a padding/unknown id -- confirm).
input_dim = len(set(df['Word'].to_list())) + 1
output_dim = 64
# Longest sentence, in tokens.
input_length = max(len(s) for s in data_group['Word'].tolist())
n_tags = len(tag2idx)
print(
    'input_dim: ', input_dim,
    '\noutput_dim: ', output_dim,
    '\ninput_length: ', input_length,
    '\nn_tags: ', n_tags,
)

In [None]:
def get_bilstm_lstm_model():
    """Build, compile and summarise the Embedding -> BiLSTM -> LSTM tagger.

    Reads the notebook-level ``input_dim``, ``output_dim``, ``input_length``
    and ``n_tags``.

    Returns
    -------
    The compiled Keras ``Sequential`` model. Its output is a per-token
    softmax distribution over the ``n_tags`` classes.
    """
    model = Sequential()

    # Embedding layer: one vector of size output_dim per token id.
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))

    # Bidirectional LSTM; forward and backward outputs are concatenated.
    model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode='concat'))

    # Second, unidirectional LSTM on top.
    model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))

    # Per-token classifier. categorical_crossentropy expects a probability
    # distribution over the classes, so the activation must be softmax; relu
    # can output all zeros for a token, which the loss cannot handle.
    model.add(TimeDistributed(Dense(n_tags, activation="softmax")))

    # Optimiser with the tuned learning rate.
    adam = tf.keras.optimizers.Adam(learning_rate=0.0005, beta_1=0.9, beta_2=0.999)

    # Pass the configured instance: the string 'adam' would build a fresh
    # default optimiser and ignore the settings above.
    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
    model.summary()

    return model

In [None]:
# Instantiate the BiLSTM-LSTM network (untrained at this point).
# NOTE(review): the training cell further down fits a separate instance,
# `model_bilstm_lstm`, not this `model`.
model = get_bilstm_lstm_model()

In [None]:

def train_model(X, y, model, epochs=5, batch_size=1000, validation_split=0.2):
    """Fit ``model`` one epoch at a time and collect the training loss.

    Parameters
    ----------
    X, y
        Training inputs and targets, passed straight to ``model.fit``.
    model
        Object with a Keras-style ``fit`` method whose return value exposes
        ``history['loss']``.
    epochs : int, default 5
        Number of single-epoch ``fit`` calls.
    batch_size : int, default 1000
        Batch size forwarded to ``fit``.
    validation_split : float, default 0.2
        Validation fraction forwarded to ``fit``.

    Returns
    -------
    list
        The training loss reported by each ``fit`` call, in order.
    """
    loss = []
    for _ in range(epochs):
        # One epoch per call, so exactly one loss value is read each time.
        hist = model.fit(X, y, batch_size=batch_size, verbose=1, epochs=1,
                         validation_split=validation_split)
        loss.append(hist.history['loss'][0])
    return loss

In [None]:
# Build the alternative architecture from model_train(); no visible cell
# trains this instance.
model_2 = model_train()

In [None]:
# Build a fresh BiLSTM-LSTM model, draw its layer graph, then train it and
# keep the per-epoch training loss in a one-column results table.
# (To try the other architecture, build it with model_train() and pass that
# model to plot_model / train_model instead.)
model_bilstm_lstm = get_bilstm_lstm_model()
plot_model(model_bilstm_lstm)
results = pd.DataFrame({
    'with_add_lstm': train_model(train_tokens, np.array(train_tags), model_bilstm_lstm),
})

In [None]:
# Tag a sample sentence with the trained model.
sentence = "London has such beautiful weather"
sentence = sentence.split()
# Every word must already be in the vocabulary; an unseen word raises
# KeyError here.
sentence = [token2idx[w] for w in sentence]
sentence = np.array(sentence).reshape(1,-1)
UniqToken = len(token2idx)
UniqTag = len(tag2idx)
tokens = sentence.tolist()
print(tokens)
# Pad exactly as the training data was padded: same length, same padding id.
maxlen = 104
paddded = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value= UniqToken-1)
# Predict with the instance that was actually fitted (`model_bilstm_lstm`);
# `model` is a separate instance that no visible cell trains.
predictions = model_bilstm_lstm.predict(paddded)
# Most likely class id for each of the maxlen positions.
predictions = np.argmax(predictions, axis=-1)
print(predictions)

In [None]:
# `predictions` holds integer class ids at this point.
print(predictions)
# Translate the ids of the first (only) sentence into tag strings; padded
# positions are included.
# NOTE(review): this rebinds `predictions` from an id array to a list of tag
# strings, so running this cell twice in a row looks strings up in idx2tag --
# re-run the previous cell first.
predictions = [idx2tag[e] for e in predictions[0]]
print(predictions)
