In [None]:
import re
import string
import nltk
import pickle as pk
import gensim
import numpy as np
from nltk.stem import *
stemmer = PorterStemmer()
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
import pandas as pd
from sklearn.metrics import accuracy_score  
from xgboost import XGBClassifier
import data_helpers 
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers.core import Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Embedding,Input,BatchNormalization,Dense,Bidirectional,LSTM,Dropout
from keras.callbacks import History ,ModelCheckpoint, EarlyStopping
%env CUDA_VISIBLE_DEVICES=0

## Load Data

In [None]:
def _shuffle(X, Y):
    randomize = np.arange(len(X))
    np.random.shuffle(randomize)
    return (X[randomize], Y[randomize])

In [None]:
# Load both splits. The loader returns four values per file; judging by the
# names they are sentence text, two position features and labels
# (NOTE(review): inferred from names only - confirm against data_helpers).
(x_train_text, pos1_train, pos2_train, labels_train) = \
    data_helpers.load_data_and_labels('data/TRAIN_FILE.txt')
(x_test_text, pos1_test, pos2_test, labels_test) = \
    data_helpers.load_data_and_labels('data/TEST_FILE_FULL.txt')

# Train texts followed by test texts, used below to fit the tokenizer.
x_total = x_train_text + x_test_text

## Tokenizer

In [None]:
# Word-level, lower-casing tokenizer fitted on the combined train + test text.
tokenizer = Tokenizer(
    num_words=25000,
    lower=True,
    split=' ',
    char_level=False,
)
tokenizer.fit_on_texts(x_total)

# word -> integer index mapping learned by the tokenizer
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
# Turn every sentence into a list of integer word indices.
train_sentence_seq = tokenizer.texts_to_sequences(x_train_text)
test_sentence_seq = tokenizer.texts_to_sequences(x_test_text)

# Longest sentence across both splits determines the padded length.
sentence_lengths = [len(seq) for seq in train_sentence_seq]
sentence_lengths += [len(seq) for seq in test_sentence_seq]
max_length = np.max(sentence_lengths)
print("max length:", max_length)

# Pad every sequence to the common length so they stack into 2-D arrays.
x_train_seq = sequence.pad_sequences(train_sentence_seq, maxlen=max_length)
x_test_seq = sequence.pad_sequences(test_sentence_seq, maxlen=max_length)

## Build embedding_matrix

In [None]:
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
# download pre-trained word vector from "https://nlp.stanford.edu/projects/glove/"
# The file must be in word2vec text format because it is read with
# load_word2vec_format (presumably converted beforehand, e.g. with
# glove2word2vec - TODO confirm).
#
# The path is used directly: gensim's get_tmpfile() is a test helper that joins
# its argument onto the system temp directory, and it only "worked" here
# because an absolute path makes that join a no-op.
tmp_file = "/home/thtang/LifeLog/data/glove_pretrained/gensim_crawl_300d.txt"

w2vModel = KeyedVectors.load_word2vec_format(tmp_file)

In [None]:
# prepare embedding matrix
# Row i holds the pre-trained vector of the word whose tokenizer index is i.
# The matrix has len(word_index)+1 rows so that every index in word_index is a
# valid row; row 0 is never assigned and stays all-zeros.
embedding_size = w2vModel.vector_size  # width of the loaded vectors
num_words = len(word_index)+1
embedding_matrix = np.zeros((num_words, embedding_size))
oov = 0
for word, i in word_index.items():
    # `word in w2vModel` works on both gensim 3.x and 4.x, whereas the old
    # `w2vModel.wv.vocab` lookup is deprecated in 3.x and removed in 4.x.
    if word in w2vModel:
        embedding_matrix[i] = w2vModel[word]
    else:
        # words not found in embedding index will be all-zeros.
        oov+=1
print("OOV:",oov)

In [None]:
from keras.utils.generic_utils import get_custom_objects
from keras.layers import Embedding, Input,InputLayer,BatchNormalization, Dense, Bidirectional,LSTM,Dropout,GRU,Activation
from keras import backend as K
def swish(x):
    """Swish activation: the input scaled element-wise by its own sigmoid."""
    gate = K.sigmoid(x)
    return gate * x

# Register the activation under the name 'swish' in Keras' custom-object table
# (presumably so saved models that use it can be deserialised - TODO confirm).
get_custom_objects()['swish'] = Activation(swish)

def train_BiLSTM(x_train,y_train,x_val,y_val,embedding_matrix, max_length):
    """Build, train and return a 3-layer bidirectional-LSTM classifier.

    Architecture: frozen pre-trained embedding -> BiLSTM(256) -> BiLSTM(128)
    -> BiLSTM(64) -> BatchNorm -> Dense(64, swish) -> Dropout -> Dense(32,
    swish) -> Dropout -> 19-way softmax, trained with Adam and categorical
    cross-entropy.

    Args:
        x_train: training inputs, integer sequences of length ``max_length``.
        y_train: training targets; the 19-unit softmax with categorical
            cross-entropy expects one row of 19 class indicators per sample.
        x_val, y_val: validation inputs/targets, same layout as the training
            data. ``val_acc`` on this set decides which epoch is checkpointed.
        embedding_matrix: 2-D array ``(vocabulary_size, vector_size)`` used as
            the fixed weights of the embedding layer.
        max_length: length of every (padded) input sequence.

    Returns:
        The trained Keras ``Model`` in its state after the final epoch. The
        epoch with the best ``val_acc`` is saved to ``models/BiLSTM.hdf5``.
    """
    import os  # local import: only needed to make sure the checkpoint dir exists

    # Take the vocabulary size and vector width from the matrix itself; the
    # Embedding layer requires its weights to be exactly (input_dim, output_dim),
    # so hard-coded values break as soon as the vocabulary changes.
    max_features, embedding_size = embedding_matrix.shape
    batch_size = 64
    epochs = 100
    embedding_layer = Embedding(max_features,output_dim= embedding_size,
                            weights=[embedding_matrix],
                            input_length=max_length,
                            trainable=False)  # keep pre-trained vectors frozen
    sequence_input = Input(shape=(max_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    # The first two recurrent layers return the full sequence so the next LSTM
    # can consume it; the last one returns only its final state.
    lstm0 = Bidirectional(LSTM(256,activation="tanh",dropout=0.2,return_sequences = True,
                kernel_initializer='he_uniform'))(embedded_sequences)
    lstm1 = Bidirectional(LSTM(128,activation="tanh",dropout=0.2,return_sequences = True,
                kernel_initializer='he_uniform'))(lstm0)
    lstm2 = Bidirectional(LSTM(64,activation="tanh",dropout=0.2,return_sequences = False,
                kernel_initializer='he_uniform'))(lstm1)
    bn1 = BatchNormalization()(lstm2)
    dense1 = Dense(64, activation=swish)(bn1)
    dropout1 = Dropout(0.5)(dense1)
    dense2 = Dense(32, activation=swish)(dropout1)
    dropout2 = Dropout(0.5)(dense2)
    preds = Dense(19, activation='softmax')(dropout2)  # one unit per class
    model = Model(sequence_input, preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    filepath = "models/BiLSTM.hdf5"
    # Saving the checkpoint fails if the target directory does not exist.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    checkpoint = ModelCheckpoint(filepath,monitor='val_acc',save_best_only=True)
    callbacks_list = [checkpoint]

    history = model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=epochs, batch_size=batch_size, callbacks=callbacks_list)

    # Note: this is accuracy on the *training* data with the final-epoch weights.
    scores = model.evaluate(x_train, y_train, verbose=0)
    print("Accuracy: %.2f%%" % (scores[1]*100))

    # Previously nothing was returned, so callers doing
    # `model = train_BiLSTM(...)` silently received None.
    return model

In [None]:
# Shuffle the training pairs, then train.
# NOTE(review): the test split is passed as validation data, so the checkpoint
# chosen on `val_acc` is selected on the same data that is scored later.
train_X, train_y = _shuffle(x_train_seq, labels_train)
model = train_BiLSTM(
    x_train=train_X,
    y_train=train_y,
    x_val=x_test_seq,
    y_val=labels_test,
    embedding_matrix=embedding_matrix,
    max_length=max_length,
)

## Testing

In [None]:
# Gold class index per test example: position of the first 1 in each label row.
y_test = [np.where(label_row == 1)[0][0] for label_row in labels_test]

# Reload the checkpoint written during training (best `val_acc` epoch).
model = load_model('models/BiLSTM.hdf5')

# Class scores for every test sentence, then the highest-scoring class index.
prediction = model.predict(x_test_seq, batch_size=1000)
pred_y = np.argmax(prediction, axis=1)

In [None]:
# accuracy_score's signature is (y_true, y_pred): pass the gold labels first.
# (Accuracy is symmetric, so the printed value is the same as before.)
print("accuracy:",accuracy_score(y_test, pred_y.tolist()))

In [None]:
# Relation types in class-id order. Every type contributes two directed
# classes, "(e1,e2)" then "(e2,e1)", with consecutive ids; 'Other' is class 0.
_RELATION_TYPES = (
    'Message-Topic',
    'Product-Producer',
    'Instrument-Agency',
    'Entity-Destination',
    'Cause-Effect',
    'Component-Whole',
    'Entity-Origin',
    'Member-Collection',
    'Content-Container',
)

# label string -> class id (19 classes in total)
labelsMapping = {'Other': 0}
labelsMapping.update(
    (relation + direction, 2 * rank + offset)
    for rank, relation in enumerate(_RELATION_TYPES)
    for offset, direction in enumerate(('(e1,e2)', '(e2,e1)'), start=1)
)

# class id -> label string
labelsMapping_inv = dict(zip(labelsMapping.values(), labelsMapping.keys()))

In [None]:
# Translate predicted class ids into label strings.
# NOTE(review): this rebinds `pred_y` from class ids to strings, so the cell is
# not idempotent - re-running it (or the accuracy cell) needs the prediction
# cell to be run again first.
pred_y = list(map(labelsMapping_inv.__getitem__, pred_y))

# Sentence ids written next to each prediction: consecutive, starting at 8001.
test_id = list(range(8001, 8001 + len(pred_y)))

# One "<id>\t<label>" line per test sentence.
with open("proposed_answers.txt", "w") as f:
    for sentence_id, relation in zip(test_id, pred_y):
        f.write(str(sentence_id) + "\t" + relation + "\n")

Then run the scoring (judgement) script on `proposed_answers.txt` to evaluate the predictions.