In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
from sklearn.utils import shuffle
from datetime import datetime
from sklearn.metrics import f1_score

In [None]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, LSTM, GRU
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.optimizers import Adam 

In [None]:
# Upper bounds passed as `num_words` to the Keras tokenizers.
max_tags = 100          # cap used by the tag tokenizer
max_vocab_size = 20000  # cap used by the word tokenizer

In [None]:
def get_data_pos(split_sequences = False):
    """Read ``../Data/chunking/train.txt`` and ``test.txt`` as words and tags.

    Every non-blank line must hold exactly three whitespace-separated fields:
    the first is kept as the word, the second as its tag, the third is
    discarded. Blank lines separate sentences.

    Parameters
    ----------
    split_sequences : bool, default False
        If True, return one list of words (and one list of tags) per
        sentence. If False, return a single flat list of words and a single
        flat list of tags per file.

    Returns
    -------
    Xtrain, Ytrain, Xtest, Ytest : list
        Words and tags read from the training file, then from the test file.

    Raises
    ------
    FileNotFoundError
        If the data directory or one of the two files does not exist.
    ValueError
        If a non-blank line does not consist of exactly three fields.
    """
    data_dir = '../Data/chunking'
    if not os.path.exists(data_dir):
        # Raise instead of calling exit(): exit() would stop the whole
        # interpreter session rather than report an error the caller can see.
        raise FileNotFoundError("No data existed: %s" % data_dir)

    def read_file(path):
        """Parse one file into (words, tags), grouped per sentence if requested."""
        sentences_x, sentences_y = [], []
        words, tags = [], []
        # The context manager closes the handle even if a line fails to parse.
        with open(path) as handle:
            for line in handle:
                line = line.rstrip()
                if line:
                    word, tag, _ = line.split()
                    words.append(word)
                    tags.append(tag)
                elif split_sequences and words:
                    # `and words` skips the empty sentences that consecutive
                    # blank lines would otherwise produce.
                    sentences_x.append(words)
                    sentences_y.append(tags)
                    words, tags = [], []

        if not split_sequences:
            return words, tags

        # Flush the last sentence when the file does not end with a blank line.
        if words:
            sentences_x.append(words)
            sentences_y.append(tags)
        return sentences_x, sentences_y

    Xtrain, Ytrain = read_file(data_dir + '/train.txt')
    Xtest, Ytest = read_file(data_dir + '/test.txt')

    return Xtrain, Ytrain, Xtest, Ytest

In [None]:
def get_data_ner(split_sequences = False, path = 'ner.txt', test_fraction = 0.3, random_state = None):
    """Read a word/tag file, shuffle it and split it into train and test parts.

    Every non-blank line must hold exactly two whitespace-separated fields:
    a word (lower-cased on load) and its tag. Blank lines separate sentences.

    Parameters
    ----------
    split_sequences : bool, default False
        If True, samples are whole sentences (one list of words and one list
        of tags each). If False, samples are the individual words and tags of
        the file, in one flat list each.
    path : str, default 'ner.txt'
        File to read.
    test_fraction : float, default 0.3
        Share of the shuffled samples placed in the test split
        (``int(test_fraction * n_samples)`` samples).
    random_state : int, RandomState instance or None, default None
        Forwarded to ``sklearn.utils.shuffle``; pass an int to make the
        train/test split reproducible across runs.

    Returns
    -------
    Xtrain, Ytrain, Xtest, Ytest : list
        Words and tags of the training split, then of the test split.

    Raises
    ------
    ValueError
        If a non-blank line does not consist of exactly two fields.
    """
    sentences_x, sentences_y = [], []
    words, tags = [], []
    # The context manager closes the handle even if a line fails to parse.
    with open(path) as handle:
        for line in handle:
            line = line.rstrip()
            if line:
                word, tag = line.split()
                words.append(word.lower())
                tags.append(tag)
            elif split_sequences and words:
                # `and words` skips the empty sentences that consecutive
                # blank lines would otherwise produce.
                sentences_x.append(words)
                sentences_y.append(tags)
                words, tags = [], []

    if split_sequences:
        # Flush the last sentence when the file does not end with a blank line.
        if words:
            sentences_x.append(words)
            sentences_y.append(tags)
        X, Y = sentences_x, sentences_y
    else:
        X, Y = words, tags

    # Shuffle words and tags together so that the pairs stay aligned.
    X, Y = shuffle(X, Y, random_state = random_state)
    Ntest = int(test_fraction * len(X))
    Xtest, Ytest = X[:Ntest], Y[:Ntest]
    Xtrain, Ytrain = X[Ntest:], Y[Ntest:]

    return Xtrain, Ytrain, Xtest, Ytest

In [None]:
# Load the data as one word list and one tag list per sentence.
# To use the chunking files instead, call
# get_data_pos(split_sequences = True) here.
Xtrain, Ytrain, Xtest, Ytest = get_data_ner(
    split_sequences = True
)

In [None]:
# Map every word to an integer id; id 0 is not assigned to any word, so it
# stays free for the padding value.
# oov_token makes the tokenizer emit a placeholder id for words it does not
# know (or that fall outside the num_words cap) instead of silently dropping
# them. A dropped word would make the word sequence shorter than its tag
# sequence and shift the two out of alignment once both are padded.
tokenizer = Tokenizer(num_words = max_vocab_size, oov_token = 'UNK')
tokenizer.fit_on_texts(Xtrain)
Xtrain = tokenizer.texts_to_sequences(Xtrain)
Xtest = tokenizer.texts_to_sequences(Xtest)
word2idx = tokenizer.word_index  # also contains the 'UNK' placeholder entry
print('Found %s unique tokens.' % len(word2idx))
vocab_size = min(max_vocab_size, len(word2idx) + 1) # +1 for padding

In [None]:
# Map every tag to an integer id; id 0 is not assigned to any tag, so it
# doubles as the tag of padded positions.
# oov_token gives tags that never occur in the training split a placeholder
# id instead of dropping them, which would shorten the tag sequence and shift
# it out of alignment with its word sequence.
tokenizer2 = Tokenizer(num_words = max_tags, oov_token = 'UNK')
tokenizer2.fit_on_texts(Ytrain)
Ytrain = tokenizer2.texts_to_sequences(Ytrain)
Ytest = tokenizer2.texts_to_sequences(Ytest)
tag2idx = tokenizer2.word_index  # also contains the 'UNK' placeholder entry
print('Found %s unique tags.' % len(tag2idx))
num_tags = min(max_tags, len(tag2idx) + 1)  # +1 for the padding id 0

In [None]:
# Pad all word and tag sequences to one common length: the length of the
# longest word sequence found in either split.
sequence_length = max(map(len, Xtrain + Xtest))
Xtrain, Ytrain = (
    pad_sequences(sequences, maxlen = sequence_length)
    for sequences in (Xtrain, Ytrain)
)
Xtest, Ytest = (
    pad_sequences(sequences, maxlen = sequence_length)
    for sequences in (Xtest, Ytest)
)
print('Xtrain.shape:', Xtrain.shape)
print('Ytrain.shape:', Ytrain.shape)

In [None]:
def _one_hot_tags(tag_ids, depth):
    """One-hot encode an integer array of tag ids.

    Parameters
    ----------
    tag_ids : array of int, shape (N, T)
        Tag id of every position of every padded sequence.
    depth : int
        Number of classes; every id must be smaller than this.

    Returns
    -------
    numpy.ndarray of float32, shape (N, T, depth)
        ``out[n, t, k]`` is 1 where ``tag_ids[n, t] == k`` and 0 elsewhere.
    """
    # Row k of the identity matrix is the one-hot vector of class k, so
    # indexing it with the id array encodes every position at once instead of
    # looping over samples and time steps in Python.
    return np.eye(depth, dtype = 'float32')[np.asarray(tag_ids)]

Ytrain_onehot = _one_hot_tags(Ytrain, num_tags)
Ytest_onehot = _one_hot_tags(Ytest, num_tags)

In [None]:
# Training schedule.
batch_size = 32
epochs = 30

# Model capacity.
embedding_dim = 10      # output size of the Embedding layer
hidden_layer_size = 10  # number of units of the recurrent layer

In [None]:
# Tagger graph: word ids -> embeddings -> GRU outputs -> per-step tag softmax.
# Shape legend: N samples, T = sequence_length, D = embedding_dim,
# M = hidden_layer_size, K = num_tags.
embedding_layer = Embedding(vocab_size, embedding_dim)
recurrent_layer = GRU(hidden_layer_size, return_sequences = True)
tag_classifier = Dense(num_tags, activation = 'softmax')

input_ = Input(shape = (sequence_length,))
x = embedding_layer(input_)   # N x T x D
x = recurrent_layer(x)        # N x T x M
output = tag_classifier(x)    # N x T x K

In [None]:
# Build the model from the graph endpoints, then train it; the test split is
# passed as validation data and is evaluated after every epoch.
model = Model(input_, output)
optimizer = Adam(lr = 1e-2)
model.compile(loss = 'categorical_crossentropy', optimizer = optimizer, metrics = ['accuracy'])

fit_options = dict(
    batch_size = batch_size,
    epochs = epochs,
    validation_data = (Xtest, Ytest_onehot),
)
# `r` is the History object whose per-epoch metrics the plotting cells read.
r = model.fit(Xtrain, Ytrain_onehot, **fit_options)

In [None]:
# Training and validation loss per epoch; each curve is labelled with its
# history key.
for curve in ('loss', 'val_loss'):
    plt.plot(r.history[curve], label = curve)
plt.legend()
plt.show()

In [None]:
# Training and validation accuracy per epoch.
# Keras releases differ in the history key used for the accuracy metric
# ('acc' vs 'accuracy'), so use whichever one this run recorded.
acc_key = 'acc' if 'acc' in r.history else 'accuracy'
plt.plot(r.history[acc_key], label = 'accuracy')
plt.plot(r.history['val_' + acc_key], label = 'val_accuracy')
plt.legend()
plt.show()