In [2]:
import pandas as pd
import numpy as np
import json
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F
from torch.nn import Module
from sklearn import metrics
from datetime import datetime
from sklearn.model_selection import StratifiedKFold
from gensim.models import KeyedVectors
from underthesea import word_tokenize, sent_tokenize
from keras.preprocessing.sequence import pad_sequences
from pandas.io.json import json_normalize
from underthesea import word_tokenize, sent_tokenize
from operator import itemgetter

import warnings
warnings.filterwarnings('ignore')
from IPython.core.debugger import set_trace

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Load Data

In [3]:
# Input locations. NOTE(review): these are absolute, machine-specific paths;
# adjust them (or derive them from a configurable data directory) before re-running.
# Labelled training set; read with pd.read_json in load_data.
train_file = '/data/ai_challenge/vietnameseqa/data/train.json'
# Test set; parsed with json.load + json_normalize in load_data.
test_file = '/data/ai_challenge/vietnameseqa/data/test.json'
# Pretrained word vectors; loaded with KeyedVectors.load_word2vec_format(..., binary=True).
embedding_file = '/data/ai_challenge/vietnameseqa/embedding/wiki.vi.model.bin.gz'

In [34]:
# Sequence length (in tokens) that questions are padded/truncated to in load_data.
quest_max_len = 20
# Sequence length (in tokens) that paragraphs are padded/truncated to in load_data.
para_max_len = 60
# NOTE(review): not referenced by any cell visible in this notebook — confirm before removing.
title_max_len = 10
# Vocabulary cap: passed to VNTokenizer and used to bound the embedding matrix size.
max_words = 60000

In [35]:
class VNTokenizer:
    """Frequency-ranked word-level tokenizer producing integer id sequences.

    Ids start at 1 (the most frequent token) so that 0 stays free for padding.
    Only the `num_words` most frequent tokens receive an id.

    Args:
        num_words: Maximum number of distinct tokens kept in `word_index`.
        tokenize_fn: Optional callable `text -> list of tokens`. When omitted,
            the module-level `word_tokenize` is used.

    Attributes:
        word_counts: token -> number of occurrences seen by `fit_on_texts`.
        word_index: token -> integer id (1-based) for the retained tokens.
    """

    def __init__(self, num_words, tokenize_fn=None):
        super(VNTokenizer, self).__init__()
        self.num_words = num_words
        self.word_counts = {}
        self.word_index = {}
        # The default is only looked up when no custom tokenizer is supplied.
        self.tokenize_fn = tokenize_fn if tokenize_fn is not None else word_tokenize

    def fit_on_texts(self, texts):
        """Accumulate token counts from `texts` and rebuild `word_index`.

        Counts accumulate across calls; the index is rebuilt from scratch each
        time so tokens that dropped out of the top `num_words` do not keep a
        stale id.
        """
        for text in texts:
            for token in self.tokenize_fn(text):
                self.word_counts[token] = self.word_counts.get(token, 0) + 1

        # sorted() is stable, so tokens with equal counts keep first-seen order.
        ranked = sorted(self.word_counts.items(), key=itemgetter(1), reverse=True)
        self.word_index = {
            token: rank
            for rank, (token, _) in enumerate(ranked[: self.num_words], start=1)
        }

    def texts_to_sequences(self, texts):
        """Convert each text to a list of ids, skipping tokens without an id.

        Tokens absent from `word_index` (cut by `num_words`, or never seen
        during fitting) are dropped instead of raising KeyError.

        Returns:
            A list with one list of integer ids per input text.
        """
        sequences = []
        for text in texts:
            ids = (self.word_index.get(token) for token in self.tokenize_fn(text))
            sequences.append([token_id for token_id in ids if token_id is not None])

        return sequences

In [37]:
def load_data(train_file, test_file):
    """Load the train/test JSON files and turn them into padded id sequences.

    The training frame supplies the 'question', 'text' and 'label' columns.
    The test JSON is flattened over its 'paragraphs' records and supplies
    '__id__', 'question', 'id' and 'text'. One VNTokenizer is fitted on the
    questions and paragraphs of both sets, then every text is converted to ids
    and padded/truncated to `quest_max_len` / `para_max_len`. Training rows are
    shuffled with a fixed seed.

    Args:
        train_file: Path to the training JSON file.
        test_file: Path to the test JSON file.

    Returns:
        Tuple of (train_quests, train_paras, train_y, test_quest_ids,
        test_quests, test_para_ids, test_paras, word_index), where the
        question/paragraph entries are padded integer arrays, `train_y` is a
        float array of labels and `word_index` maps token -> id.
    """
    train_df = pd.read_json(train_file, encoding='utf-8')

    # Read as UTF-8 explicitly: the platform default encoding is not guaranteed
    # to decode Vietnamese text, and the train file is already read as UTF-8.
    with open(test_file, encoding='utf-8') as json_file:
        test_json = json.load(json_file)
    test_df = json_normalize(test_json, 'paragraphs', ['__id__', 'question', 'title'])
    print('Train data:', train_df.shape)
    print('Test data:', test_df.shape)

    train_quests = train_df['question'].values
    train_paras = train_df['text'].values
    train_y = train_df['label'].astype(float).values

    test_quest_ids = test_df['__id__'].values
    test_quests = test_df['question'].values
    test_para_ids = test_df['id'].values
    test_paras = test_df['text'].values

    # The vocabulary is built from train and test text together.
    tokenizer = VNTokenizer(max_words)
    tokenizer.fit_on_texts(list(train_quests) + list(train_paras) + list(test_quests) + list(test_paras))
    train_quests = tokenizer.texts_to_sequences(train_quests)
    train_paras = tokenizer.texts_to_sequences(train_paras)
    test_quests = tokenizer.texts_to_sequences(test_quests)
    test_paras = tokenizer.texts_to_sequences(test_paras)

    train_quests = pad_sequences(train_quests, maxlen=quest_max_len)
    train_paras = pad_sequences(train_paras, maxlen=para_max_len)
    test_quests = pad_sequences(test_quests, maxlen=quest_max_len)
    test_paras = pad_sequences(test_paras, maxlen=para_max_len)

    # Fixed seed so the shuffled row order is the same on every run.
    # Note this reseeds NumPy's global random state.
    np.random.seed(2019)
    trn_idx = np.random.permutation(len(train_quests))

    train_quests = train_quests[trn_idx]
    train_paras = train_paras[trn_idx]
    train_y = train_y[trn_idx]

    return train_quests, train_paras, train_y, test_quest_ids, test_quests, test_para_ids, test_paras, tokenizer.word_index


def create_embed_matrix(word_index, embedding_file, word_vectors=None, num_words=None):
    """Build an embedding matrix whose row i holds the vector of the word with id i.

    Row 0 is left as zeros (ids in `word_index` start at 1). Words with no
    match in the pretrained vectors also keep an all-zero row.

    Args:
        word_index: Mapping token -> integer id (1-based).
        embedding_file: Path to a word2vec binary file; only read when
            `word_vectors` is not supplied.
        word_vectors: Optional preloaded vectors. Must support `in`, `[]` and
            expose `vector_size`. Avoids reloading the file on repeated calls.
        num_words: Optional vocabulary cap; defaults to the module-level
            `max_words`.

    Returns:
        Float array of shape (min(num_words, len(word_index)) + 1, vector_size).
    """
    if word_vectors is None:
        word_vectors = KeyedVectors.load_word2vec_format(embedding_file, binary=True)
    if num_words is None:
        num_words = max_words

    nb_words = min(num_words, len(word_index))
    embedding_matrix = np.zeros((nb_words + 1, word_vectors.vector_size))

    for word, i in word_index.items():
        # Valid rows are 1..nb_words inclusive; the previous `i < nb_words`
        # check left the last word's row empty.
        if i > nb_words:
            continue

        # Multi-word tokens are looked up with '_' in place of spaces
        # (presumably how the pretrained vocabulary joins compounds — confirm).
        key = word.replace(' ', '_')
        # Try the token as-is first, then progressively different casings.
        for candidate in (key, key.lower(), key.capitalize(), key.upper()):
            if candidate in word_vectors:
                embedding_matrix[i] = word_vectors[candidate]
                break

    return embedding_matrix

In [38]:
def search_threshold(y_true, preds):
    """Pick the probability cut-off in [0.30, 0.50] that maximises F1.

    Cut-offs are tried in steps of 0.01. On ties the lowest cut-off wins.
    If no cut-off achieves an F1 above zero, (0.5, 0.0) is returned.

    Args:
        y_true: Ground-truth binary labels.
        preds: Predicted probabilities, aligned with `y_true`.

    Returns:
        (best_threshold, best_f1)
    """
    # Round each step so accumulated float error in arange does not leak
    # into the reported threshold.
    cutoffs = [np.round(raw, 2) for raw in np.arange(0.3, 0.501, 0.01)]
    f1_values = [
        metrics.f1_score(y_true, (preds > cutoff).astype(int))
        for cutoff in cutoffs
    ]

    # argmax returns the first occurrence of the maximum, i.e. the lowest
    # cut-off among equally good ones.
    top = int(np.argmax(f1_values))
    if f1_values[top] > 0.0:
        return cutoffs[top], f1_values[top]
    return 0.5, 0.0

def sigmoid(x):
    """Element-wise logistic function 1 / (1 + e^(-x)) for scalars or arrays."""
    decay = np.exp(-x)
    return 1 / (1 + decay)

## Model

In [39]:
class Net(Module):
    """Two-branch LSTM scorer for (question, paragraph) id sequences.

    Both inputs share one frozen pretrained embedding table. Each is encoded
    by its own single-layer LSTM, and the two final hidden states are
    concatenated and passed through two linear layers to produce raw logits.

    Args:
        embedding_matrix: 2-D array of pretrained vectors (rows = token ids).
        hidden_dim_1: Hidden size of each LSTM.
        hidden_dim_2: Output size of the first linear layer.
        target_size: Number of output logits per example.
    """

    def __init__(self, embedding_matrix, hidden_dim_1, hidden_dim_2, target_size):
        super().__init__()
        embed_dim = embedding_matrix.shape[1]

        # freeze=True keeps the pretrained vectors fixed during training.
        pretrained = torch.tensor(embedding_matrix, dtype=torch.float32)
        self.embedding = nn.Embedding.from_pretrained(pretrained, freeze=True)

        # Separate encoders for the question and the paragraph.
        self.lstm_ques = nn.LSTM(embed_dim, hidden_dim_1, num_layers=1, batch_first=True)
        self.lstm_para = nn.LSTM(embed_dim, hidden_dim_1, num_layers=1, batch_first=True)

        self.drop_ques = nn.Dropout(p=0.5)
        self.drop_para = nn.Dropout(p=0.5)
        self.drop_output = nn.Dropout(p=0.5)

        # The classifier input is the two hidden states side by side.
        self.linear_1 = nn.Linear(2 * hidden_dim_1, hidden_dim_2)
        self.linear_2 = nn.Linear(hidden_dim_2, target_size)

    def forward(self, input_):
        """Score a batch.

        Args:
            input_: Pair (question, text) of integer id tensors, batch first.

        Returns:
            Tensor of raw logits with one row per example.
        """
        question, text = input_

        # Only the final hidden state h_n of each LSTM is used.
        _, (ques_state, _) = self.lstm_ques(self.embedding(question))
        _, (para_state, _) = self.lstm_para(self.embedding(text))

        # h_n carries a leading layer axis of size 1; drop it.
        ques_vec = self.drop_ques(ques_state.squeeze(0))
        para_vec = self.drop_para(para_state.squeeze(0))

        joined = torch.cat((ques_vec, para_vec), dim=1)
        # NOTE(review): there is no non-linearity between the two linear layers.
        hidden = self.drop_output(self.linear_1(joined))
        return self.linear_2(hidden)

## Training

In [None]:
# Training hyperparameters.
# Number of passes over each fold's training split.
# NOTE(review): the recorded log for the training cell shows epochs well past 5,
# so that run apparently used a different value — confirm before trusting the log.
n_epoches = 5
# Hidden size of each LSTM in Net.
hidden_dim_1 = 200
# Output size of Net's first linear layer.
hidden_dim_2 = 200
# NOTE(review): not referenced by any cell visible in this notebook.
hidden_dim_3 = 400
# Number of output logits (one logit for the binary label).
target_size = 1
# Mini-batch size for the train, validation and test loaders.
batch_size = 32
# Initial decision threshold; overwritten by search_threshold after training.
threshold = 0.5

In [41]:
# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('Device:', device)

# Seed PyTorch so weight initialisation, dropout masks and batch shuffling
# start from the same state on every run.
torch.manual_seed(2019)

train_quests, train_paras, train_y, test_quest_ids, test_quests, test_para_ids, test_paras, word_index = load_data(train_file, test_file)
embedding_matrix = create_embed_matrix(word_index, embedding_file)
print('Embedding matrix:', embedding_matrix.shape)

# The test set is fixed across folds, so build its loader once.
test_quests = torch.tensor(test_quests, dtype=torch.long).to(device)
test_paras = torch.tensor(test_paras, dtype=torch.long).to(device)
test_dataset = TensorDataset(test_quests, test_paras)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

splits = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=2019).split(train_quests, train_y))
valid_preds = np.zeros(train_y.shape)  # out-of-fold probabilities, one per training row
test_preds = []                        # one probability vector per fold
for idx, (train_idx, valid_idx) in enumerate(splits):
    print('\nTrain Fold {}'.format(idx))

    # A fresh model and optimizer per fold.
    train_model = Net(embedding_matrix, hidden_dim_1, hidden_dim_2, target_size)
    train_model.to(device)

    loss_fn = torch.nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(train_model.parameters())

    quests_train = torch.tensor(train_quests[train_idx], dtype=torch.long).to(device)
    quests_valid = torch.tensor(train_quests[valid_idx], dtype=torch.long).to(device)
    paras_train = torch.tensor(train_paras[train_idx], dtype=torch.long).to(device)
    paras_valid = torch.tensor(train_paras[valid_idx], dtype=torch.long).to(device)
    # Labels get a trailing axis so they match the (batch, 1) logits.
    y_train = torch.tensor(train_y[train_idx, np.newaxis], dtype=torch.float32).to(device)
    y_valid = torch.tensor(train_y[valid_idx, np.newaxis], dtype=torch.float32).to(device)

    train_dataset = TensorDataset(quests_train, paras_train, y_train)
    valid_dataset = TensorDataset(quests_valid, paras_valid, y_valid)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

    # Track the epoch with the lowest validation loss. Its weights and
    # out-of-fold predictions are the ones kept, not those of the last epoch.
    best_valid_loss = float('inf')
    best_state = None

    for epoch in range(n_epoches):
        train_model.train()
        train_loss = 0

        # Batch variables use their own names so the full fold tensors
        # (quests_train, y_train, ...) are not overwritten.
        for quests_batch, paras_batch, y_batch in train_loader:
            y_pred = train_model((quests_batch, paras_batch))

            loss = loss_fn(y_pred, y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Weight by batch size so the epoch loss is a per-example mean.
            train_loss += loss.item() * quests_batch.shape[0]

        train_loss = train_loss / len(train_idx)

        train_model.eval()
        valid_loss = 0
        fold_val_preds = np.zeros(len(valid_idx))
        with torch.no_grad():
            for i_batch, (quests_batch, paras_batch, y_batch) in enumerate(valid_loader):
                y_pred = train_model((quests_batch, paras_batch))
                n_rows = quests_batch.shape[0]
                valid_loss += loss_fn(y_pred, y_batch).item() * n_rows
                start = i_batch * batch_size
                # reshape(-1) keeps a 1-D result even for a final batch of one row.
                fold_val_preds[start : start + n_rows] = sigmoid(y_pred.cpu().numpy().reshape(-1))

        valid_loss = valid_loss / len(valid_idx)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            # state_dict() tensors alias the live parameters, so clone them.
            best_state = {name: tensor.detach().clone() for name, tensor in train_model.state_dict().items()}
            valid_preds[valid_idx] = fold_val_preds

        print("Epoch: {} - train_loss: {:.4f} - valid_loss: {:.4f}".format(epoch, train_loss, valid_loss))

    # Predict the test set with the best-validation weights of this fold.
    if best_state is not None:
        train_model.load_state_dict(best_state)
    train_model.eval()
    fold_test_preds = np.zeros(len(test_quest_ids))
    with torch.no_grad():
        for i_batch, (quests_batch, paras_batch) in enumerate(test_loader):
            y_pred = train_model((quests_batch, paras_batch))
            start = i_batch * batch_size
            fold_test_preds[start : start + quests_batch.shape[0]] = sigmoid(y_pred.cpu().numpy().reshape(-1))

    test_preds.append(fold_test_preds)

# Output
# Tune the decision threshold on the out-of-fold predictions, then apply it
# to the fold-averaged test probabilities.
threshold, val_score = search_threshold(train_y, valid_preds)
print('Valid score: {} - Best threshold: {}'.format(val_score, threshold))
preds = np.mean(test_preds, axis=0)
y_output = (preds > threshold).astype(int)

# Only (question id, paragraph id) pairs predicted positive are written out.
is_answer = y_output == 1
submit_df = pd.DataFrame()
submit_df['test_id'] = test_quest_ids[is_answer]
submit_df['answer'] = test_para_ids[is_answer]
submit_df.to_csv('../submits/sample_submission_' + datetime.now().strftime("%Y_%m_%d_%H_%M_%S") + '.csv', index=False)

Device: cuda:0
Train data: (18108, 5)
Test data: (2678, 5)
Embedding matrix: (54325, 400)

Train Fold 0
Epoch: 0 - train_loss: 0.6284 - valid_loss: 0.6116
Epoch: 1 - train_loss: 0.6095 - valid_loss: 0.6164
Epoch: 2 - train_loss: 0.5708 - valid_loss: 0.5939
Epoch: 3 - train_loss: 0.5160 - valid_loss: 0.6186
Epoch: 4 - train_loss: 0.4428 - valid_loss: 0.6428
Epoch: 5 - train_loss: 0.3709 - valid_loss: 0.6717
Epoch: 6 - train_loss: 0.2953 - valid_loss: 0.8689
Epoch: 7 - train_loss: 0.2407 - valid_loss: 0.9884
Epoch: 8 - train_loss: 0.1953 - valid_loss: 1.1075
Epoch: 9 - train_loss: 0.1604 - valid_loss: 1.0507
Epoch: 10 - train_loss: 0.1315 - valid_loss: 1.3517
Epoch: 11 - train_loss: 0.1174 - valid_loss: 1.3854
Epoch: 12 - train_loss: 0.1070 - valid_loss: 1.5658
Epoch: 13 - train_loss: 0.1008 - valid_loss: 1.4183
Epoch: 14 - train_loss: 0.0822 - valid_loss: 1.6388
Epoch: 15 - train_loss: 0.0914 - valid_loss: 1.5885
Epoch: 16 - train_loss: 0.0851 - valid_loss: 1.8697
Epoch: 17 - train_loss

KeyboardInterrupt: 