In [2]:
import re
import os
import sys
import random
import string
import logging
import argparse
from shutil import copyfile
from datetime import datetime
from collections import Counter
import torch
import msgpack
from drqa.model import DocReaderModel
from drqa.utils import str2bool

# Command-line options for training. Each option becomes an attribute of
# `args`; `vars(args)` is later handed to `load_data`, which adds
# data-dependent entries (vocabulary and tag sizes) to the same dict.
parser = argparse.ArgumentParser(
    description='Train a Document Reader model.'
)
# system
parser.add_argument('--log_file', default='output.log',
                    help='path for log file.')
parser.add_argument('--log_per_updates', type=int, default=3,
                    help='log model loss per x updates (mini-batches).')
parser.add_argument('--data_file', default='SQuAD/data.msgpack',
                    help='path to preprocessed data file.')
parser.add_argument('--model_dir', default='models',
                    help='path to store saved models.')
parser.add_argument('--save_last_only', action='store_true',
                    help='only save the final models.')
parser.add_argument('--eval_per_epoch', type=int, default=1,
                    help='perform evaluation per x epochs.')
parser.add_argument('--seed', type=int, default=1013,
                    help='random seed for data shuffling, dropout, etc.')
# `nargs='?'` with `const=True` lets `--cuda` be given bare (meaning True) or
# with an explicit value that is parsed by `str2bool`.
parser.add_argument("--cuda", type=str2bool, nargs='?',
                    const=True, default=torch.cuda.is_available(),
                    help='whether to use GPU acceleration.')
# training
parser.add_argument('-e', '--epochs', type=int, default=40)
parser.add_argument('-bs', '--batch_size', type=int, default=32)
parser.add_argument('-rs', '--resume', default='',
                    help='previous model file name (in `model_dir`). '
                         'e.g. "checkpoint_epoch_11.pt"')
parser.add_argument('-ro', '--resume_options', action='store_true',
                    help='use previous model options, ignore the cli and defaults.')
# The default of 0. is falsy, so `main` skips the learning-rate reduction
# unless a non-zero factor is supplied.
parser.add_argument('-rlr', '--reduce_lr', type=float, default=0.,
                    help='reduce initial (resumed) learning rate by this factor.')
parser.add_argument('-op', '--optimizer', default='adamax',
                    help='supported optimizer: adamax, sgd')
parser.add_argument('-gc', '--grad_clipping', type=float, default=10)
parser.add_argument('-wd', '--weight_decay', type=float, default=0)
parser.add_argument('-lr', '--learning_rate', type=float, default=0.1,
                    help='only applied to SGD.')
parser.add_argument('-mm', '--momentum', type=float, default=0,
                    help='only applied to SGD.')
parser.add_argument('-tp', '--tune_partial', type=int, default=1000,
                    help='finetune top-x embeddings.')
parser.add_argument('--fix_embeddings', action='store_true',
                    help='if true, `tune_partial` will be ignored.')
parser.add_argument('--rnn_padding', action='store_true',
                    help='perform rnn padding (much slower but more accurate).')
# model
parser.add_argument('--question_merge', default='self_attn')
parser.add_argument('--doc_layers', type=int, default=3)
parser.add_argument('--question_layers', type=int, default=3)
parser.add_argument('--hidden_size', type=int, default=128)
parser.add_argument('--num_features', type=int, default=4)
parser.add_argument('--pos', type=str2bool, nargs='?', const=True, default=True,
                    help='use pos tags as a feature.')
parser.add_argument('--ner', type=str2bool, nargs='?', const=True, default=True,
                    help='use named entity tags as a feature.')
parser.add_argument('--use_qemb', type=str2bool, nargs='?', const=True, default=True)
parser.add_argument('--concat_rnn_layers', type=str2bool, nargs='?',
                    const=True, default=True)
parser.add_argument('--dropout_emb', type=float, default=0.4)
parser.add_argument('--dropout_rnn', type=float, default=0.4)
parser.add_argument('--dropout_rnn_output', type=str2bool, nargs='?',
                    const=True, default=True)
parser.add_argument('--max_len', type=int, default=15)
parser.add_argument('--rnn_type', default='lstm',
                    help='supported types: rnn, gru, lstm')

# parse_known_args() collects unrecognized arguments in `unknown` instead of
# exiting on them -- presumably so the notebook kernel's own command line does
# not abort parsing (TODO confirm).
args, unknown = parser.parse_known_args()

# set model dir
# Create the checkpoint directory up front and keep its absolute path so
# later os.path.join() calls do not depend on the working directory.
model_dir = args.model_dir
os.makedirs(model_dir, exist_ok=True)
model_dir = os.path.abspath(model_dir)

# set random seed
# Seed Python's RNG (used for batch shuffling) and torch's CPU/GPU RNGs.
random.seed(args.seed)
torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

# setup logger
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
# logging.getLogger() returns the same logger object on every call, so
# re-running this cell would stack another file/stream handler pair and every
# message would be emitted twice. Detach and close earlier handlers first.
for stale_handler in list(log.handlers):
    log.removeHandler(stale_handler)
    stale_handler.close()
# The file receives everything from DEBUG up; stdout only INFO and above.
fh = logging.FileHandler(args.log_file)
fh.setLevel(logging.DEBUG)
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.INFO)
# NOTE(review): '%I' is a 12-hour clock and the format has no AM/PM marker,
# so timestamps are ambiguous; left unchanged to keep the log format stable.
formatter = logging.Formatter(fmt='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S')
fh.setFormatter(formatter)
ch.setFormatter(formatter)
log.addHandler(fh)
log.addHandler(ch)


def main():
    """Train the document reader, evaluating on dev and saving checkpoints.

    Loads the data, builds a fresh ``DocReaderModel`` or restores one from
    ``args.resume`` (inside ``model_dir``), then runs ``args.epochs`` epochs.
    Every ``args.eval_per_epoch`` epochs the dev set is scored. Checkpoints
    are written each epoch, or only after the last one when
    ``args.save_last_only`` is set. A checkpoint is copied to
    ``best_model.pt`` when its dev exact-match beats the best score so far.
    """
    log.info('[program starts.]')
    train, dev, dev_y, embedding, opt = load_data(vars(args))
    log.info(opt)
    log.info('[Data loaded.]')

    if args.resume:
        log.info('[loading previous model...]')
        checkpoint = torch.load(os.path.join(model_dir, args.resume))
        if args.resume_options:
            opt = checkpoint['config']
        state_dict = checkpoint['state_dict']
        model = DocReaderModel(opt, embedding, state_dict)
        epoch_0 = checkpoint['epoch'] + 1
        # Consume one shuffle per finished epoch so the RNG stream matches
        # the one an uninterrupted run would have at this point.
        for i in range(checkpoint['epoch']):
            random.shuffle(list(range(len(train))))  # synchronize random seed
        if args.reduce_lr:
            lr_decay(model.optimizer, lr_decay=args.reduce_lr)
    else:
        model = DocReaderModel(opt, embedding)
        epoch_0 = 1

    if args.cuda:
        model.cuda()

    if args.resume:
        # Score the restored model so that only real improvements replace
        # the current best checkpoint.
        batches = BatchGen(dev, batch_size=args.batch_size, evaluation=True, gpu=args.cuda)
        predictions = []
        for batch in batches:
            predictions.extend(model.predict(batch))
        em, f1 = score(predictions, dev_y)
        log.info("[dev EM: {} F1: {}]".format(em, f1))
        best_val_score = em
    else:
        best_val_score = 0.0

    for epoch in range(epoch_0, epoch_0 + args.epochs):
        log.warning('Epoch {}'.format(epoch))
        # train
        batches = BatchGen(train, batch_size=args.batch_size, gpu=args.cuda)
        start = datetime.now()
        for i, batch in enumerate(batches):
            model.update(batch)
            if i % args.log_per_updates == 0:
                # Remaining time = average time per batch so far * batches left.
                log.info('epoch [{0:2}] updates[{1:6}] train loss[{2:.5f}] remaining[{3}]'.format(
                    epoch, model.updates, model.train_loss.avg,
                    str((datetime.now() - start) / (i + 1) * (len(batches) - i - 1)).split('.')[0]))
        # eval
        # `em` is reset every epoch: with eval_per_epoch > 1 some epochs are
        # saved without being evaluated, and the old code then hit an unbound
        # (or stale) `em` in the best-model check below.
        em = None
        if epoch % args.eval_per_epoch == 0:
            batches = BatchGen(dev, batch_size=args.batch_size, evaluation=True, gpu=args.cuda)
            predictions = []
            for batch in batches:
                predictions.extend(model.predict(batch))
            em, f1 = score(predictions, dev_y)
            log.warning("dev EM: {} F1: {}".format(em, f1))
        # save
        if not args.save_last_only or epoch == epoch_0 + args.epochs - 1:
            model_file = os.path.join(model_dir, 'checkpoint_epoch_{}.pt'.format(epoch))
            model.save(model_file, epoch)
            if em is not None and em > best_val_score:
                best_val_score = em
                copyfile(
                    model_file,
                    os.path.join(model_dir, 'best_model.pt'))
                log.info('[new best model saved.]')


def lr_decay(optimizer, lr_decay):
    """Multiply the learning rate of every parameter group by ``lr_decay``.

    The groups are updated in place; the same ``optimizer`` object is
    returned after the change has been logged.
    """
    scale = lr_decay
    for group in optimizer.param_groups:
        group['lr'] *= scale
    message = '[learning rate reduced by {}]'.format(scale)
    log.info(message)
    return optimizer


def load_data(opt):
    """Load the embedding metadata and the preprocessed train/dev samples.

    ``opt`` is updated in place with the embedding dimensions and the sizes
    of the tag and entity vocabularies. The dev samples are sorted in place
    by the length of their second field, and each sample's last field is
    split off into ``dev_y``.

    Returns ``(train, dev, dev_y, embedding, opt)``.
    """
    with open('SQuAD/meta.msgpack', 'rb') as meta_file:
        meta = msgpack.load(meta_file, encoding='utf8')
    embedding = torch.Tensor(meta['embedding'])

    # Values that depend on the data rather than on the command line.
    opt['pretrained_words'] = True
    opt['vocab_size'] = embedding.size(0)
    opt['embedding_dim'] = embedding.size(1)
    opt['pos_size'] = len(meta['vocab_tag'])
    opt['ner_size'] = len(meta['vocab_ent'])

    with open(args.data_file, 'rb') as data_file:
        data = msgpack.load(data_file, encoding='utf8')
    train = data['train']

    dev_samples = data['dev']
    dev_samples.sort(key=lambda sample: len(sample[1]))
    dev, dev_y = [], []
    for sample in dev_samples:
        dev.append(sample[:-1])
        dev_y.append(sample[-1])
    return train, dev, dev_y, embedding, opt


class BatchGen:
    """Split samples into mini-batches and yield them as padded tensors.

    Training samples must have 10 fields, evaluation samples 8. Fields are
    read by position: 1 = context token ids, 2 = per-token feature vectors,
    3 = per-token tag ids, 4 = per-token entity ids, 5 = question token ids,
    6 = text, 7 = span, and (training only) 8 / 9 = answer start / end.
    """

    def __init__(self, data, batch_size, gpu, evaluation=False,
                 pos_size=None, ner_size=None):
        """
        input:
            data - list of lists
            batch_size - int
            gpu - bool; when true the yielded tensors are put in pinned memory
            evaluation - bool; keep the sample order and expect 8 fields
            pos_size - int or None; width of the one-hot tag tensor
            ner_size - int or None; width of the one-hot entity tensor

        When `pos_size` / `ner_size` are None the widths are read from the
        module-level `args` while iterating, as before. Those attributes only
        exist after `load_data(vars(args))` has run, so passing the sizes
        explicitly removes that hidden dependency.
        """
        self.batch_size = batch_size
        self.eval = evaluation
        self.gpu = gpu
        self.pos_size = pos_size
        self.ner_size = ner_size

        # shuffle
        if not evaluation:
            indices = list(range(len(data)))
            random.shuffle(indices)
            data = [data[i] for i in indices]
        # chunk into batches
        data = [data[i:i + batch_size] for i in range(0, len(data), batch_size)]
        self.data = data

    def __len__(self):
        """Return the number of mini-batches."""
        return len(self.data)

    def __iter__(self):
        """Yield one tuple of tensors (plus raw text and spans) per batch.

        Evaluation batches have 9 entries; training batches have 11, with
        the answer start/end tensors inserted before text and span.
        """
        for batch in self.data:
            batch_size = len(batch)
            # Transpose: list of samples -> one tuple per field.
            batch = list(zip(*batch))
            if self.eval:
                assert len(batch) == 8
            else:
                assert len(batch) == 10

            # Resolved per batch so the fallback to `args` is only touched
            # when there is something to iterate over.
            pos_size = args.pos_size if self.pos_size is None else self.pos_size
            ner_size = args.ner_size if self.ner_size is None else self.ner_size

            # Context ids, right-padded with 0 to the longest context.
            context_len = max(len(x) for x in batch[1])
            context_id = torch.LongTensor(batch_size, context_len).fill_(0)
            for i, doc in enumerate(batch[1]):
                context_id[i, :len(doc)] = torch.LongTensor(doc)

            # Feature width is taken from the first token of the first sample.
            feature_len = len(batch[2][0][0])

            context_feature = torch.Tensor(batch_size, context_len, feature_len).fill_(0)
            for i, doc in enumerate(batch[2]):
                for j, feature in enumerate(doc):
                    context_feature[i, j, :] = torch.Tensor(feature)

            # One-hot encoding of the per-token tag ids.
            context_tag = torch.Tensor(batch_size, context_len, pos_size).fill_(0)
            for i, doc in enumerate(batch[3]):
                for j, tag in enumerate(doc):
                    context_tag[i, j, tag] = 1

            # One-hot encoding of the per-token entity ids.
            context_ent = torch.Tensor(batch_size, context_len, ner_size).fill_(0)
            for i, doc in enumerate(batch[4]):
                for j, ent in enumerate(doc):
                    context_ent[i, j, ent] = 1

            question_len = max(len(x) for x in batch[5])
            question_id = torch.LongTensor(batch_size, question_len).fill_(0)
            for i, doc in enumerate(batch[5]):
                question_id[i, :len(doc)] = torch.LongTensor(doc)

            # Masks are set wherever the id equals the padding value 0.
            context_mask = torch.eq(context_id, 0)
            question_mask = torch.eq(question_id, 0)
            text = list(batch[6])
            span = list(batch[7])
            if not self.eval:
                y_s = torch.LongTensor(batch[8])
                y_e = torch.LongTensor(batch[9])
            if self.gpu:
                context_id = context_id.pin_memory()
                context_feature = context_feature.pin_memory()
                context_tag = context_tag.pin_memory()
                context_ent = context_ent.pin_memory()
                context_mask = context_mask.pin_memory()
                question_id = question_id.pin_memory()
                question_mask = question_mask.pin_memory()
            if self.eval:
                yield (context_id, context_feature, context_tag, context_ent, context_mask,
                       question_id, question_mask, text, span)
            else:
                yield (context_id, context_feature, context_tag, context_ent, context_mask,
                       question_id, question_mask, y_s, y_e, text, span)


def _normalize_answer(s):
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def _exact_match(pred, answers):
    if pred is None or answers is None:
        return False
    pred = _normalize_answer(pred)
    for a in answers:
        if pred == _normalize_answer(a):
            return True
    return False


def _f1_score(pred, answers):
    def _score(g_tokens, a_tokens):
        common = Counter(g_tokens) & Counter(a_tokens)
        num_same = sum(common.values())
        if num_same == 0:
            return 0
        precision = 1. * num_same / len(g_tokens)
        recall = 1. * num_same / len(a_tokens)
        f1 = (2 * precision * recall) / (precision + recall)
        return f1

    if pred is None or answers is None:
        return 0
    g_tokens = _normalize_answer(pred).split()
    scores = [_score(g_tokens, _normalize_answer(a).split()) for a in answers]
    return max(scores)


def score(pred, truth):
    """Return ``(em, f1)`` percentages averaged over aligned predictions.

    ``pred`` and ``truth`` must have the same length; each truth entry is
    the collection of acceptable answers for the matching prediction.
    """
    assert len(pred) == len(truth)
    pairs = list(zip(pred, truth))
    count = len(pairs)
    em_hits = sum(_exact_match(p, t) for p, t in pairs)
    f1_sum = sum(_f1_score(p, t) for p, t in pairs)
    return 100. * em_hits / count, 100. * f1_sum / count



In [3]:

# Same first steps as main(), run at notebook top level so that `train`,
# `dev`, `dev_y`, `embedding` and `opt` stay available to the cells below.
# `vars(args)` is the namespace's own dict, so the keys load_data() adds
# also become attributes of `args`.
log.info('[program starts.]')
train, dev, dev_y, embedding, opt = load_data(vars(args))
log.info(opt)
log.info('[Data loaded.]')


11/10/2017 03:15:51 [program starts.]
11/10/2017 03:16:22 {'dropout_emb': 0.4, 'dropout_rnn': 0.4, 'grad_clipping': 10, 'pos_size': 51, 'log_file': 'output.log', 'rnn_padding': False, 'vocab_size': 91555, 'weight_decay': 0, 'embedding_dim': 300, 'eval_per_epoch': 1, 'resume': '', 'reduce_lr': 0.0, 'learning_rate': 0.1, 'fix_embeddings': False, 'concat_rnn_layers': True, 'resume_options': False, 'hidden_size': 128, 'save_last_only': False, 'log_per_updates': 3, 'epochs': 40, 'ner_size': 19, 'pos': True, 'batch_size': 32, 'cuda': True, 'question_layers': 3, 'rnn_type': 'lstm', 'pretrained_words': True, 'dropout_rnn_output': True, 'optimizer': 'adamax', 'ner': True, 'use_qemb': True, 'num_features': 4, 'seed': 1013, 'tune_partial': 1000, 'question_merge': 'self_attn', 'max_len': 15, 'momentum': 0, 'doc_layers': 3, 'model_dir': 'models', 'data_file': 'SQuAD/data.msgpack'}
11/10/2017 03:16:22 [Data loaded.]


In [129]:

# Restore a trained model from a checkpoint instead of training one here.
# NOTE(review): the path is absolute and machine-specific; it has to be
# adapted before this cell can run anywhere else.
checkpoint = torch.load('/home/zawlin/g/DrQA2/models_e80_bs128/best_model.pt')
# With --resume_options the options stored in the checkpoint replace the ones
# built from the command line and load_data().
if args.resume_options:
    opt = checkpoint['config']
state_dict = checkpoint['state_dict']
model = DocReaderModel(opt, embedding, state_dict)

if args.cuda:
    model.cuda()

In [10]:

# Inline copy of the body of load_data(), run at top level so the raw `meta`
# and `data` objects stay in the kernel. Later cells read `data['dev']` to
# recover the question ids. Requires `opt` and `args` from earlier cells.
with open('SQuAD/meta.msgpack', 'rb') as f:
    meta = msgpack.load(f, encoding='utf8')
embedding = torch.Tensor(meta['embedding'])
opt['pretrained_words'] = True
opt['vocab_size'] = embedding.size(0)
opt['embedding_dim'] = embedding.size(1)
opt['pos_size'] = len(meta['vocab_tag'])
opt['ner_size'] = len(meta['vocab_ent'])
with open(args.data_file, 'rb') as f:
    data = msgpack.load(f, encoding='utf8')
train = data['train']
# In-place sort: `data['dev']` ends up in the same order as `dev` / `dev_y`.
data['dev'].sort(key=lambda x: len(x[1]))
dev = [x[:-1] for x in data['dev']]
dev_y = [x[-1] for x in data['dev']]
# The function's `return` is disabled because this code is not inside a def.
#return train, dev, dev_y, embedding, opt

SyntaxError: 'return' outside function (<ipython-input-10-e39b283d226f>, line 16)

In [13]:
# Inspect the first dev sample (after sorting) to see the field layout.
print(data['dev'][0])

['067438', [944, 1045, 82, 6439, 821, 1907, 29292, 920, 928, 31, 6, 1668, 18, 51, 35042, 825, 22, 2118, 52621, 7, 6473, 42205, 2454, 216], [[False, False, False, 0.08333333333333333], [True, True, True, 0.041666666666666664], [True, True, True, 0.041666666666666664], [False, False, True, 0.041666666666666664], [False, False, False, 0.041666666666666664], [False, False, False, 0.041666666666666664], [False, False, False, 0.041666666666666664], [False, False, False, 0.041666666666666664], [False, False, False, 0.041666666666666664], [False, False, True, 0.041666666666666664], [False, False, False, 0.041666666666666664], [False, False, False, 0.041666666666666664], [False, False, False, 0.041666666666666664], [False, False, False, 0.08333333333333333], [False, False, False, 0.041666666666666664], [False, False, False, 0.041666666666666664], [False, False, False, 0.041666666666666664], [False, False, False, 0.041666666666666664], [False, False, False, 0.041666666666666664], [False, False, 

In [130]:

# Run the restored model over the dev set and score it. `predictions` is kept
# in the kernel and written to a CSV file by a later cell.
batches = BatchGen(dev, batch_size=args.batch_size, evaluation=True, gpu=args.cuda)
predictions = []
for batch in batches:
    predictions.extend(model.predict(batch))
em, f1 = score(predictions, dev_y)
log.info("[dev EM: {} F1: {}]".format(em, f1))
best_val_score = em

11/10/2017 05:15:23 [dev EM: 59.5080184832835 F1: 72.54025201624347]


In [131]:
#print(answer_single.keys())
import string
import re
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    # Deleting via a translation table drops exactly the characters listed in
    # string.punctuation.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = s.lower().translate(strip_punctuation)
    cleaned = re.sub(r'\b(a|an|the)\b', ' ', cleaned)
    words = cleaned.split()
    return ' '.join(words)


# Export this model's dev predictions as "Id,Answer" rows.
# `data['dev']` was sorted in place before `dev` was derived from it, so
# entry i of `data['dev']` is the sample that produced predictions[i]; its
# first field is the question id.
rows = ['Id,Answer\r\n']
for sample, prediction in zip(data['dev'], predictions):
    rows.append(str(sample[0]) + ',' + normalize_answer(prediction) + '\r\n')
ret = ''.join(rows)
# newline='' writes the explicit '\r\n' terminators unchanged on every
# platform; the context manager closes the file even if the write fails.
with open('/home/zawlin/answers/6.csv', 'w', newline='') as f:
    f.write(ret)


In [137]:

import pandas as pd

def get_ans_dict(aa,path):
    """Append the answers from the CSV file at ``path`` to ``aa``.

    The first line is treated as a header and skipped. For every other line
    the text before the first comma is the key and the second
    comma-separated field (stripped) is appended to ``aa[key]``, creating the
    list when the key is new. Blank lines are ignored. ``aa`` is modified in
    place and nothing is returned.
    """
    # Read inside a context manager so the file handle is always closed.
    with open(path, 'r') as answer_file:
        rows = answer_file.readlines()[1:]
    for row in rows:
        if not row.strip():
            continue
        fields = row.split(',')
        aa.setdefault(fields[0], []).append(fields[1].strip())
                
def get_ans_dict1(aa,path):
    """Append the answers from the CSV file at ``path`` to ``aa``, asserting
    that every key is already present.

    The first line is treated as a header and skipped. For every other line
    the text before the first comma is the key and the second
    comma-separated field (stripped) is appended to ``aa[key]``. Blank lines
    are ignored. An unknown key fails the assertion, with the key as the
    message. ``aa`` is modified in place and nothing is returned.
    """
    # Read inside a context manager so the file handle is always closed.
    with open(path, 'r') as answer_file:
        rows = answer_file.readlines()[1:]
    for row in rows:
        if not row.strip():
            continue
        fields = row.split(',')
        assert fields[0] in aa,str(fields[0])
        # setdefault keeps the old fallback (create the list) for runs where
        # assertions are disabled.
        aa.setdefault(fields[0], []).append(fields[1].strip())
                
def fill_gt(aa,path):
    """Append the answer from each row of the CSV at ``path`` to ``aa[key]``.

    The first line is treated as a header and skipped. For every other line
    the text before the first comma is the key and the second
    comma-separated field (stripped) is appended to the existing list
    ``aa[key]``. Blank lines are ignored. A key that is not in ``aa`` raises
    KeyError. ``aa`` is modified in place and nothing is returned.
    """
    # Read inside a context manager so the file handle is always closed.
    with open(path, 'r') as gt_file:
        rows = gt_file.readlines()[1:]
    for row in rows:
        if not row.strip():
            continue
        fields = row.split(',')
        aa[fields[0]].append(fields[1].strip())
# Maps a question id to a list of answers, one per prediction file in the
# order the files are read below.
ans_all = {}

# NOTE(review): the paths are absolute and machine-specific.
get_ans_dict(ans_all,'/home/zawlin/answers/1.csv')
get_ans_dict(ans_all,'/home/zawlin/answers/2.csv')
get_ans_dict(ans_all,'/home/zawlin/answers/3.csv')
get_ans_dict(ans_all,'/home/zawlin/answers/4.csv')
get_ans_dict(ans_all,'/home/zawlin/answers/5.csv')
get_ans_dict(ans_all,'/home/zawlin/answers/6.csv')

# The ground truth is appended last, so it is the final entry (ans[-1]) of
# every list.
fill_gt(ans_all,'/home/zawlin/answer_gt.csv')
# One voting weight per prediction file; filled in by find_auto_weight().
w = [0,0,0,0,0,0]

def find_auto_weight(w,ans_all):
    """Set every weight to the accuracy of the answer at the same position.

    For each index ``i`` of ``w``, ``w[i]`` becomes the fraction of entries
    in ``ans_all`` whose ``i``-th answer equals their last answer (the ground
    truth). ``w`` is modified in place and nothing is returned.
    """
    for model_idx in range(len(w)):
        n_questions = float(len(ans_all.keys()))
        hits = 0.
        for answers in ans_all.values():
            if answers[-1] == answers[model_idx]:
                hits += 1
        w[model_idx] = hits / n_questions
# Fill `w` in place with each prediction file's accuracy and show it.
find_auto_weight(w,ans_all)
print(w)
#ans_gt = get_ans_dict('/home/zawlin/answer_gt.csv')
#for a in pd.read_csv('/home/zawlin/answers/1.csv'):
 #   print(a)
  #  break
    
#pd.read_csv('/home/zawlin/answers/2.csv')
#pd.read_csv('/home/zawlin/answers/3.csv')

#answer_gt = pd.read_csv('/home/zawlin/answer_gt.csv')

[0.5840717586300626, 0.5939113889643926, 0.5438434357162273, 0.5432182658331068, 0.5933677629790703, 0.5833922261484099]


In [201]:
import operator
# Score the weighted-vote ensemble against the ground truth, which is the
# last entry of every answer list.
total = float(len(ans_all.keys()))
correct = 0.
dup =0.  # never incremented below, so the first print is always 0.0
# Reset the weights to the per-model accuracies before experimenting. The
# former `w[:]=w_orig[:]` needed a `w_orig` that none of the saved cells
# defines, so it raised NameError on a fresh kernel.
find_auto_weight(w, ans_all)
# Manual re-weighting experiments (e.g. `w[2] *= 1.1`) belong here.
for a in ans_all.keys():
    ans = ans_all[a]
    gt = ans[-1]
    # Sum the weights of all models that proposed the same answer string.
    voter = {}
    for i in range(len(w)):
        voter[ans[i]] = voter.get(ans[i], 0) + w[i]
    # Stable ascending sort: the last item carries the highest weight, and
    # ties resolve exactly as with the former `[::-1]` followed by `[0]`.
    predict = sorted(voter.items(), key=operator.itemgetter(1))[-1][0]

    if gt == predict:
        correct += 1
print(dup/total)
print(correct)
print(correct/total)
    

0.0
22624.0
0.6149497145963577


In [194]:
import operator
# Write the weighted-vote ensemble answer for every question id as
# "Id,Answer" rows.
rows = ['Id,Answer\r\n']
for a in ans_all.keys():
    ans = ans_all[a]
    # Sum the weights of all models that proposed the same answer string.
    voter = {}
    for i in range(len(w)):
        voter[ans[i]] = voter.get(ans[i], 0) + w[i]
    # Stable ascending sort: the last item carries the highest weight, and
    # ties resolve exactly as with the former `[::-1]` followed by `[0]`.
    predict = sorted(voter.items(), key=operator.itemgetter(1))[-1][0]

    rows.append(a + ',' + predict + '\r\n')
ret = ''.join(rows)

# newline='' writes the explicit '\r\n' terminators unchanged on every
# platform; the context manager closes the file even if the write fails.
with open('/home/zawlin/answer_f2.csv', 'w', newline='') as f:
    f.write(ret)
    

In [202]:
# Show the voting weights currently held in the kernel.
print(w)

[0.5840717586300626, 0.5939113889643926, 0.5438434357162273, 0.5432182658331068, 0.5933677629790703, 0.5833922261484099]
