In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import keras
from keras.utils import to_categorical
import numpy as np
import os
import pickle as pkl

def _load_pickle(path):
    """Return the object pickled at ``path``, closing the file once it is read.

    NOTE: unpickling can execute arbitrary code, so only load trusted files.
    """
    with open(path, "rb") as fh:
        return pkl.load(fh)


# The three dataset splits; their keys are printed below for inspection.
train_dict = _load_pickle("/kaggle/input/4901k-project-data/train.pkl")
val_dict = _load_pickle("/kaggle/input/4901k-project-data/val.pkl")
test_dict = _load_pickle("/kaggle/input/4901k-project-data/test.pkl")
print("keys in train_dict:", train_dict.keys())
print("keys in val_dict:", val_dict.keys())
print("keys in test_dict:", test_dict.keys())

In [None]:
import tensorflow as tf

In [None]:
# Show the first training example: its id, then every (word, tag) pair.
print("index:", train_dict["id"][0])
first_words = train_dict["word_seq"][0]
first_tags = train_dict["tag_seq"][0]
print(*zip(first_words, first_tags))

In [None]:
from itertools import chain
# Distinct tags seen anywhere in the training tag sequences.
distinct_tags = set(chain.from_iterable(train_dict["tag_seq"]))
print("count of the NER tags:", len(distinct_tags))
print("all the NER tags:", distinct_tags)

In [None]:
# Word vocabulary: id 0 is the unknown-word token, id 1 the word padding token.
# Every new training word gets the next free id, in order of first appearance.
vocab_dict = {'_unk_': 0, '_w_pad_': 1}
for sentence in train_dict['word_seq']:
    for token in sentence:
        vocab_dict.setdefault(token, len(vocab_dict))

# Tag vocabulary: id 0 is the tag padding token.
tag_dict = {'_t_pad_': 0}
for sentence_tags in train_dict['tag_seq']:
    for label in sentence_tags:
        tag_dict.setdefault(label, len(tag_dict))

# Forward aliases (same dict objects) and the inverse id -> string lookups.
word2idx, tag2idx = vocab_dict, tag_dict
idx2word = {idx: word for word, idx in word2idx.items()}
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

print("size of word vocab:", len(vocab_dict), "size of tag_dict:", len(tag_dict))

In [None]:
# NOTE(review): max_sent_length is not referenced by the visible code; the
# sentences are presumably already padded to this length -- confirm.
max_sent_length = 128


def _to_ids(sentences, lookup):
    """Apply ``lookup`` to every word of every sentence and stack into an array."""
    return np.array([[lookup(tok) for tok in sent] for sent in sentences])


def _one_hot_tags(tag_seqs):
    """Map tag strings to ids via tag2idx, then one-hot encode each sequence."""
    id_seqs = [[tag2idx[t] for t in seq] for seq in tag_seqs]
    n_classes = len(tag_dict)
    return np.array([to_categorical(ids, num_classes=n_classes) for ids in id_seqs])


# Training words are all in the vocabulary by construction, so a plain lookup
# is used; validation/test words fall back to id 0 ('_unk_') when unseen.
train_tokens = _to_ids(train_dict['word_seq'], lambda tok: word2idx[tok])
val_tokens = _to_ids(val_dict['word_seq'], lambda tok: word2idx.get(tok, 0))
test_tokens = _to_ids(test_dict['word_seq'], lambda tok: word2idx.get(tok, 0))

train_tags = _one_hot_tags(train_dict['tag_seq'])
val_tags = _one_hot_tags(val_dict['tag_seq'])


In [None]:
# Report the token and tag array shapes for the training and validation splits.
for size_label, split_tokens, split_tags in (
    ("training size:", train_tokens, train_tags),
    ("validating size:", val_tokens, val_tags),
):
    print(size_label, split_tokens.shape, "tag size:", split_tags.shape)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Embedding, Dropout, BatchNormalization, Input, Add, Concatenate,\
    Bidirectional, SimpleRNN, LSTM, GRU, TimeDistributed, SpatialDropout1D

In [None]:
# Sizes the model builders read: first two axes of the token array, the
# vocabulary size, and the third axis of the one-hot tag array.
num_training_data, sequence_length = train_tokens.shape[:2]
num_tags = train_tags.shape[2]
vocabulary_size = len(vocab_dict)

In [None]:
# Hyperparameters for the single-layer baselines.
drop = 0.3
epochs = 40
batch_size = 150
embedding_dim = 20

# Units per direction in the recurrent layer.
hidden_size = 30

def build_RNN(model_type):
    """Build a one-layer bidirectional recurrent tagger.

    Args:
        model_type: "lstm" selects LSTM, "gru" selects GRU; any other value
            falls back to SimpleRNN.

    Returns:
        An uncompiled Sequential model: Input -> Embedding -> Dropout ->
        Bidirectional recurrent layer -> BatchNormalization ->
        TimeDistributed softmax Dense with ``num_tags`` units.

    The module-level ``sequence_length``, ``vocabulary_size``, ``embedding_dim``,
    ``drop``, ``hidden_size`` and ``num_tags`` are read when the function is called.
    """
    if model_type == "lstm":
        recurrent_cls = LSTM
    elif model_type == "gru":
        recurrent_cls = GRU
    else:
        recurrent_cls = SimpleRNN

    model = Sequential()
    model.add(Input(shape=(sequence_length,), dtype='int32'))
    model.add(
        Embedding(
            input_dim=vocabulary_size,
            output_dim=embedding_dim,
            input_length=sequence_length,
        )
    )
    model.add(Dropout(drop))
    # return_sequences=True keeps one output per time step for the per-token classifier.
    model.add(Bidirectional(recurrent_cls(units=hidden_size, return_sequences=True)))
    model.add(BatchNormalization())
    model.add(TimeDistributed(Dense(units=num_tags, activation='softmax')))
    return model

In [None]:
lstm = build_RNN("lstm")
gru = build_RNN("gru")
# A SimpleRNN baseline can be built with build_RNN("rnn") and compiled the same way.

# Each model gets its own Adam instance: an optimizer keeps per-model state
# (step count, moment estimates), so sharing one across models is avoided.
# `learning_rate` replaces the deprecated `lr` alias.
lstm.compile(
    loss="categorical_crossentropy",
    optimizer=keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999),
    metrics=["categorical_accuracy"],
)
gru.compile(
    loss="categorical_crossentropy",
    optimizer=keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999),
    metrics=["categorical_accuracy"],
)

# summary() must be *called*; printing the attribute only shows the bound method.
lstm.summary()
gru.summary()

In [None]:
# Train the LSTM baseline on the full training split (no validation data here).
print("Training Model...")
history = lstm.fit(
    x=train_tokens,
    y=train_tags,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
)
print("Finish!")

In [None]:
# Hyperparameters for the stacked models (these overwrite the baseline values).
drop = 0.3
epochs = 15
batch_size = 150
embedding_dim = 60

# Units of the first (bidirectional) layer and of each following layer.
hidden_size = 120
second_hidden = 120

def build_rnn(model_type):
    """Build a stacked recurrent sequence tagger named after ``model_type``.

    The first recurrent layer is wrapped in ``Bidirectional`` and uses
    ``hidden_size`` units; every following recurrent layer is unidirectional
    with ``second_hidden`` units. All recurrent layers return full sequences
    and use ``recurrent_dropout=0.1``.

    Args:
        model_type: one of 'double-gru', 'double-lstm', 'lstm-gru',
            'gru-lstm', 'triple-lstm'.

    Returns:
        An uncompiled Sequential model: Input -> Embedding -> SpatialDropout1D
        -> recurrent stack -> BatchNormalization -> TimeDistributed softmax Dense.

    Raises:
        ValueError: if ``model_type`` is not a supported name. Previously an
            unknown name silently produced a model with no recurrent layer.

    The module-level ``sequence_length``, ``vocabulary_size``, ``embedding_dim``,
    ``drop``, ``hidden_size``, ``second_hidden`` and ``num_tags`` are read when
    the function is called.
    """
    # Cell kinds per layer, first entry being the bidirectional layer.
    stacks = {
        'double-gru': ('gru', 'gru'),
        'double-lstm': ('lstm', 'lstm'),
        'lstm-gru': ('lstm', 'gru'),
        'gru-lstm': ('gru', 'lstm'),
        'triple-lstm': ('lstm', 'lstm', 'lstm'),
    }
    # Validate before touching Keras so a typo fails fast and loudly.
    if model_type not in stacks:
        raise ValueError(
            "unknown model_type %r; expected one of %s" % (model_type, sorted(stacks))
        )
    cell_classes = {'lstm': LSTM, 'gru': GRU}
    first_kind, *later_kinds = stacks[model_type]

    model = Sequential(name = model_type)
    model.add(Input(shape=(sequence_length,), dtype='int32'))
    model.add(Embedding(input_dim=vocabulary_size,
                        output_dim=embedding_dim,
                        input_length=sequence_length))
    model.add(SpatialDropout1D(drop))
    model.add(Bidirectional(cell_classes[first_kind](
        units=hidden_size, return_sequences=True, recurrent_dropout=0.1)))
    for kind in later_kinds:
        model.add(cell_classes[kind](
            units=second_hidden, return_sequences=True, recurrent_dropout=0.1))
    model.add(BatchNormalization())
    model.add(TimeDistributed(Dense(units=num_tags,
        activation='softmax')))
    return model

In [None]:
double_lstm = build_rnn('double-lstm')
adam = keras.optimizers.Adam(lr=0.005, beta_1=0.9, beta_2=0.999)
double_lstm.compile(loss="categorical_crossentropy", optimizer=adam, metrics=["categorical_accuracy"])
print(double_lstm.summary())

In [None]:
double_gru = build_rnn('double-gru')
adam = keras.optimizers.Adam(lr=0.005, beta_1=0.9, beta_2=0.999)
double_gru.compile(loss="categorical_crossentropy", optimizer=adam, metrics=["categorical_accuracy"])
print(double_gru.summary())

In [None]:
lstm_gru = build_rnn('lstm-gru')
# The original passed an undefined name `lr` here, which raises NameError on a
# fresh kernel. 0.005 matches the other two-layer models in this notebook.
adam = keras.optimizers.Adam(learning_rate=0.005, beta_1=0.9, beta_2=0.999)
lstm_gru.compile(loss="categorical_crossentropy", optimizer=adam, metrics=["categorical_accuracy"])
# summary() prints the table itself and returns None, so it is not wrapped in print().
lstm_gru.summary()

In [None]:
gru_lstm = build_rnn('gru-lstm')
adam = keras.optimizers.Adam(lr=0.005, beta_1=0.9, beta_2=0.999)
gru_lstm.compile(loss="categorical_crossentropy", optimizer=adam, metrics=["categorical_accuracy"])
print(gru_lstm.summary())

In [None]:
triple_lstm = build_rnn('triple-lstm')
initial_learning_rate = 0.01
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=300,
    decay_rate=0.9,
    staircase=True)
adam = tf.keras.optimizers.Adam(learning_rate=lr_schedule, beta_1=0.9, beta_2=0.999)
triple_lstm.compile(loss="categorical_crossentropy", optimizer=adam, metrics=["categorical_accuracy"])
print(triple_lstm.summary())

In [None]:
# Turn the one-hot validation tags back into tag strings for accuracy scoring.
val_tags_by_idx = val_tags.argmax(axis=2)
val_labels = np.array([[idx2tag[tag_id] for tag_id in row] for row in val_tags_by_idx])

In [None]:
def calc_accuracy(preds, tags, padding_id="_t_pad_"):
    """Token-level accuracy that ignores padded positions.

    Input:
        preds (array-like): predicted labels, e.g. (num_data, length_sentence)
        tags  (array-like): gold labels with the same number of elements
        padding_id: label marking padding in ``tags``; those positions are skipped
    Output:
        float: proportion of non-padding positions where ``preds`` equals
        ``tags``. Returns 0.0 when every position is padding (the original
        divided by zero in that case).
    Raises:
        ValueError: if ``preds`` and ``tags`` hold different numbers of elements,
        since positions could not be aligned.
    """
    preds_flat = np.asarray(preds).ravel()
    tags_flat = np.asarray(tags).ravel()
    if preds_flat.shape != tags_flat.shape:
        raise ValueError(
            "preds and tags must have the same number of elements, got %d and %d"
            % (preds_flat.size, tags_flat.size)
        )

    # Positions are scored only where the gold tag is a real label.
    scored = tags_flat != padding_id
    n_scored = int(np.count_nonzero(scored))
    if n_scored == 0:
        return 0.0

    n_correct = int(np.count_nonzero(preds_flat[scored] == tags_flat[scored]))
    return n_correct / n_scored

In [None]:
class valScore(keras.callbacks.Callback):
    def __init__(self, data):
        self.x = data
    def on_epoch_end(self, epoch, logs={}):
        preds = self.model.predict(self.x, batch_size = 150, verbose = 1)
        preds_tags_by_idx = np.argmax(preds, axis=2)
        preds_labels = np.array([[idx2tag[p] for p in pred] for pred in preds_tags_by_idx])
        c = calc_accuracy(preds_labels, val_labels)
        print("validation accuracy:", c)

In [None]:
val_score = valScore(val_tokens)

In [None]:
print("Training Model...")
history = double_gru.fit(
        train_tokens, 
        train_tags, 
        batch_size=batch_size, 
        epochs=epochs,
        verbose=1, validation_data=(val_tokens,val_tags), callbacks=[val_score])
print("Finish!")

In [None]:
print("Training Model...")
history = double_lstm.fit(
        train_tokens, 
        train_tags, 
        batch_size=batch_size, 
        epochs=epochs,
        verbose=1, validation_data=(val_tokens,val_tags),callbacks=[val_score])
print("Finish!")

In [None]:
print("Training Model...")
history = lstm_gru.fit(
        train_tokens, 
        train_tags, 
        batch_size=batch_size, 
        epochs=epochs,
        verbose=1, validation_data=(val_tokens,val_tags), callbacks=[val_score])
print("Finish!")

In [None]:
print("Training Model...")
history = gru_lstm.fit(
        train_tokens, 
        train_tags, 
        batch_size=batch_size, 
        epochs=1,
        verbose=1, validation_data=(val_tokens,val_tags), callbacks=[val_score])
print("Finish!")

In [None]:
print("Training Model...")
history = triple_lstm.fit(
        train_tokens, 
        train_tags, 
        batch_size=batch_size, 
        epochs=epochs,
        verbose=1, validation_data=(val_tokens,val_tags),callbacks=[val_score])
print("Finish!")

In [None]:
double_gru.save('/kaggle/working/double_gru_better.h5')

In [None]:
double_lstm.save('/kaggle/working/double_lstm_better.h5')

In [None]:
lstm_gru.save('/kaggle/working/lstm_gru_better.h5')

In [None]:
gru_lstm.save('/kaggle/working/gru_lstm_better.h5')

In [None]:
triple_lstm.save('/kaggle/working/triple_lstm_better.h5')

In [None]:
double_gru = keras.models.load_model('/kaggle/input/4901k-project/double_gru_better.h5')
double_gru.summary()

In [None]:
double_lstm = keras.models.load_model('/kaggle/input/4901k-project/double_lstm_better.h5')
double_lstm.summary()

In [None]:
lstm_gru = keras.models.load_model('/kaggle/input/4901k-project/lstm_gru_better.h5')
lstm_gru.summary()

In [None]:
gru_lstm = keras.models.load_model('/kaggle/input/4901k-project/gru_lstm_better.h5')
gru_lstm.summary()

In [None]:
triple_lstm = keras.models.load_model('/kaggle/input/4901k-project/triple_lstm_better.h5')
triple_lstm.summary()

In [None]:
preds =(double_gru.predict(val_tokens))
preds_tags_by_idx = np.argmax(preds, axis=2)
preds_labels = np.array([[idx2tag[p] for p in pred] for pred in preds_tags_by_idx])

In [None]:
c=calc_accuracy(preds_labels, val_labels)
print('double_gru', c)

In [None]:
preds =(double_lstm.predict(val_tokens))
preds_tags_by_idx = np.argmax(preds, axis=2)
preds_labels = np.array([[idx2tag[p] for p in pred] for pred in preds_tags_by_idx])

In [None]:
c = calc_accuracy(preds_labels, val_labels)
print('double_lstm', c)

In [None]:
preds =(lstm_gru.predict(val_tokens))
preds_tags_by_idx = np.argmax(preds, axis=2)
preds_labels = np.array([[idx2tag[p] for p in pred] for pred in preds_tags_by_idx])

In [None]:
c = calc_accuracy(preds_labels, val_labels)
print('lstm_gru', c)

In [None]:
preds =(gru_lstm.predict(val_tokens))
preds_tags_by_idx = np.argmax(preds, axis=2)
preds_labels = np.array([[idx2tag[p] for p in pred] for pred in preds_tags_by_idx])

In [None]:
c = calc_accuracy(preds_labels, val_labels)
print('gru_lstm', c)

In [None]:
preds =(triple_lstm.predict(val_tokens))
preds_tags_by_idx = np.argmax(preds, axis=2)
preds_labels = np.array([[idx2tag[p] for p in pred] for pred in preds_tags_by_idx])

In [None]:
c = calc_accuracy(preds_labels, val_labels)
print('triple_lstm', c)

In [None]:
# Members whose per-token tag probabilities are averaged by the ensemble.
# The original appended gru_lstm twice and never added triple_lstm, which
# looks like a copy-paste slip; each member now appears exactly once.
# lstm_gru stays excluded, as in the original (it was commented out).
# NOTE(review): if double-weighting gru_lstm was intentional, restore the duplicate.
models = [double_gru, double_lstm, gru_lstm, triple_lstm]

In [None]:
model_input = Input(shape=(sequence_length,))

In [None]:
def ensembleModels(models, model_input):
    yModels=[model(model_input) for model in models] 
    yAvg=keras.layers.average(yModels) 
    modelEns = Model(inputs=model_input, outputs=yAvg, name='ensemble')  
   
    return modelEns

In [None]:
ens = ensembleModels(models, model_input)
ens.summary()

In [None]:
preds =(ens.predict(val_tokens))
preds_tags_by_idx = np.argmax(preds, axis=2)
preds_labels = np.array([[idx2tag[p] for p in pred] for pred in preds_tags_by_idx])

In [None]:
c = calc_accuracy(preds_labels, val_labels)
print('ensemble', c)

In [None]:
test_preds_numerical = ens.predict(test_tokens)
test_preds_tags_by_idx = np.argmax(test_preds_numerical, axis=2)
test_preds = np.array([[idx2tag[p] for p in preds] for preds in test_preds_tags_by_idx])
print(test_preds_numerical.shape)
print(test_preds[0])

In [None]:
import json
import pandas as pd

df = pd.DataFrame({'id': test_dict["id"],
                   'labels': [json.dumps(np.array(preds).tolist()) for preds in test_preds]})
df.to_csv('./test_preds.csv', index=False)

In [None]:
pd.read_csv("test_preds.csv")

In [None]:
val_preds_numerical = ens.predict(val_tokens)
val_preds_tags_by_idx = np.argmax(val_preds_numerical, axis=2)
val_preds = np.array([[idx2tag[p] for p in preds] for preds in val_preds_tags_by_idx])
print(val_preds_numerical.shape)
print(val_preds[0])

In [None]:
import json
import pandas as pd

df = pd.DataFrame({'id': val_dict["id"],
                   'labels': [json.dumps(np.array(preds).tolist()) for preds in val_preds]})
df.to_csv('./val_preds.csv', index=False)

In [None]:
def evaluate(pred_file, ground_file):
    """Score a predictions CSV against a pickled ground-truth split.

    Args:
        pred_file: path to a CSV whose "labels" column holds one JSON-encoded
            list of tag strings per row.
        ground_file: path to a pickle containing a mapping with a "tag_seq" entry.
            NOTE: unpickling can execute arbitrary code; only use trusted files.

    Returns:
        The value of calc_accuracy(predicted labels, ground-truth tags).
    """
    # Context manager so the ground-truth file is closed once it is read.
    with open(ground_file, "rb") as fh:
        file_dict = pkl.load(fh)
    file_preds = pd.read_csv(pred_file)
    pred_labels = np.array([json.loads(line) for line in file_preds["labels"]])
    gold_labels = np.array(file_dict["tag_seq"])
    return calc_accuracy(pred_labels, gold_labels)

In [None]:
import json
import pandas as pd

# Write the validation predictions again, then score the file against the
# pickled validation split.
val_labels_json = [json.dumps(np.array(row).tolist()) for row in val_preds]
df = pd.DataFrame({'id': val_dict["id"], 'labels': val_labels_json})
df.to_csv('val_preds.csv', index=False)

val_file_accuracy = evaluate('val_preds.csv', "/kaggle/input/4901k-project-data/val.pkl")
print("val accuracy", val_file_accuracy)