python run-squad.py --restore_from aMLP-Summalizer-large-ja --pred_dataset summalize-testdata.json --verbose 

In [3]:
import json
import os
import sys
import numpy as np
import time
import shutil

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.optimizers import Adam

In [4]:

from modeling import model as build_model
from modeling import projection
from encoder import get_encoder
# Root directory for run outputs; a per-run subdirectory named after
# `run_name` is created under it further down in the notebook.
CHECKPOINT_DIR = 'checkpoint'

In [5]:
# Directory that holds the saved model together with its `vocabulary.txt`
# and `hparams.json`.  Set the AMLP_RESTORE_FROM environment variable to point
# the notebook at another location without editing this cell; the fallback is
# the path the notebook was originally run with.
restore_from = os.environ.get(
    "AMLP_RESTORE_FROM",
    r"E:\CloudSource\CloudSource_share\tharhtetsan\testing\aMLP-Summalizer-large-ja",
)
# Name of the per-run subdirectory created under CHECKPOINT_DIR.
run_name = "aMLP-Summalizer-large-ja"

In [6]:
def read_squad_json(filename, to_val=False):
    """Flatten a SQuAD-style JSON file into six parallel lists.

    Every answer of an answerable question yields one row (only the first
    answer when ``to_val`` is true).  A question flagged ``is_impossible``
    yields a single row with an empty answer and start/end of ``-1``, unless
    ``to_val`` is true, in which case it is skipped.

    Parameters
    ----------
    filename : str
        Path of the UTF-8 encoded JSON file.
    to_val : bool
        Validation mode: keep at most one answer per question and drop
        impossible questions.

    Returns
    -------
    tuple of list
        ``(context, question, answer_start, answer_end, question_id, answer)``.
        ``answer_end`` is ``answer_start + len(answer text)``.  When a question
        has no ``"id"``, the running row count (as a string) is used instead.
    """
    with open(filename, encoding="utf-8") as f:
        squad = json.load(f)

    context, question, answer_start, answer_end, question_id, answer = [], [], [], [], [], []

    def add_row(ctx, qa, text, start, end):
        # The fallback id is the number of rows emitted so far.
        fallback_id = str(len(question_id))
        answer.append(text)
        context.append(ctx)
        question.append(qa["question"])
        question_id.append(qa["id"] if "id" in qa else fallback_id)
        answer_start.append(start)
        answer_end.append(end)

    for article in squad["data"]:
        for paragraph in article["paragraphs"]:
            ctx = paragraph["context"]
            for qa in paragraph["qas"]:
                flagged_impossible = "is_impossible" in qa and qa["is_impossible"]
                if not flagged_impossible:
                    candidates = qa["answers"][:1] if to_val else qa["answers"]
                    for ans in candidates:
                        begin = ans["answer_start"]
                        add_row(ctx, qa, ans["text"], begin, begin + len(ans["text"]))
                elif not to_val:
                    add_row(ctx, qa, "", -1, -1)

    print(f'read {len(context)} contexts from {filename}.')
    return context, question, answer_start, answer_end, question_id, answer


In [7]:
def jaccard(str1, str2):
    """Word-level Jaccard similarity between two strings.

    Both strings are lower-cased and split on whitespace; the score is
    ``|A ∩ B| / |A ∪ B|`` over the resulting word sets.

    Parameters
    ----------
    str1, str2 : str
        Strings to compare.

    Returns
    -------
    float
        Similarity in ``[0.0, 1.0]``.  When neither string contains a word the
        union is empty; the two (empty) sets are identical, so ``1.0`` is
        returned instead of dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        # Two empty word sets: treat as a perfect match rather than raising
        # ZeroDivisionError.
        return 1.0
    return float(len(a & b)) / union_size

def jaccard_wd(str1, str2):
    """Element-level Jaccard similarity (character-level for strings).

    Each argument is turned into a set of its elements, so for strings the
    comparison is over distinct characters, case-sensitively and without any
    tokenisation.

    Parameters
    ----------
    str1, str2 : iterable
        Sequences to compare (strings in this notebook).

    Returns
    -------
    float
        ``|A ∩ B| / |A ∪ B|`` in ``[0.0, 1.0]``.  When both inputs are empty
        the sets are identical, so ``1.0`` is returned instead of dividing by
        zero.
    """
    a = set(str1)
    b = set(str2)
    union_size = len(a | b)
    if union_size == 0:
        # Both inputs empty: a perfect match, not a ZeroDivisionError.
        return 1.0
    return float(len(a & b)) / union_size

def get_best_indexes(logits, n_best_size):
    """Return the positions of the ``n_best_size`` largest values in ``logits``.

    Positions are ordered from highest to lowest score; equal scores keep
    their original relative order because the sort is stable.  Fewer than
    ``n_best_size`` positions are returned when ``logits`` is shorter.
    """
    ranked = sorted(enumerate(logits), key=lambda pair: pair[1], reverse=True)
    return [position for rank, (position, _score) in enumerate(ranked) if rank < n_best_size]

In [8]:
class squad_model(tf.keras.Model):
    """Wraps a language model with a two-channel output head for span prediction.

    ``model`` is called with ``[input_ids, input_weights]`` and its first
    output is passed through ``projection(conf_dict["num_hidden"], 2, ...)``.
    The projected tensor is split along its last axis into start and end
    logits.
    """

    def __init__(self, model, conf_dict):
        super().__init__(name='squad_model')
        self.model = model
        # presumably maps hidden states to 2 channels (start/end) -- confirm
        # against modeling.projection
        self.projection = projection(conf_dict["num_hidden"], 2, name='squad_output')

    def call(self, inputs):
        """Return ``[start_logits, end_logits]`` for ``(input_ids, input_weights)``."""
        input_ids, input_weights = inputs
        hidden, _ = self.model(inputs=[input_ids, input_weights])
        projected = self.projection(hidden)
        # Bring the last axis to the front so the two heads can be separated.
        head_first = tf.transpose(projected, [2, 0, 1])
        heads = tf.unstack(head_first, axis=0)
        return [heads[0], heads[1]]

In [9]:
def crossentropy(labels, logits):
    """Mean softmax cross-entropy of integer ``labels`` against ``logits``.

    ``labels`` is flattened to one dimension and cast to int32; ``logits`` is
    reshaped to ``[-1, C]`` where ``C`` is the static size of its last axis.
    The loss is the negative log-softmax probability of each label, averaged
    over all flattened positions.
    """
    num_classes = logits.shape.as_list()[-1]
    targets = tf.cast(tf.reshape(labels, [-1]), tf.int32)
    scores = tf.reshape(logits, [-1, num_classes])
    target_mask = tf.one_hot(targets, depth=num_classes, dtype=tf.float32)
    log_probs = tf.nn.log_softmax(scores)
    per_position = -tf.reduce_sum(log_probs * target_mask, axis=[-1])
    return tf.reduce_mean(per_position)

In [10]:
# Files expected inside the model directory `restore_from`:
# the tokenizer vocabulary and the model hyperparameters.
bpe_path = os.path.join(restore_from, "vocabulary.txt")
hpm_path = os.path.join(restore_from,"hparams.json")

In [11]:
# Load the model hyperparameters.  The encoding is given explicitly so the
# read does not depend on the platform's locale default, matching how the
# other files in this notebook are opened.
with open(hpm_path, encoding="utf-8") as f:
    conf_dict = json.load(f)

In [12]:
# `ww` is True when any vocabulary line contains the '##' marker
# (presumably a WordPiece-style vocabulary -- confirm against get_encoder).
# Iterating the file lazily lets `any` stop at the first match instead of
# reading every line into memory and summing.
with open(bpe_path, encoding="utf-8") as f:
    ww = any('##' in line for line in f)
enc = get_encoder(bpe_path, 'emoji.json', ww)

In [13]:
vocab_size = conf_dict["num_vocab"]
# The four highest vocabulary ids serve as the special tokens, in this order:
# CLS = vocab_size-4, SEP = vocab_size-3, MASK = vocab_size-2, EOT = vocab_size-1.
CLS_TOKEN, SEP_TOKEN, MASK_TOKEN, EOT_TOKEN = (vocab_size - back for back in (4, 3, 2, 1))

batch_size = 4 # default

# Encoded inputs are padded/truncated to this many tokens.
max_seq_length = conf_dict["num_ctx"]
max_predictions = 1
log_dir = ""
# Longest answer span (in tokens) kept when decoding predictions.
max_answer_length = 50
# Number of top start / end positions considered per example.
num_best_indexes = 20

# The same file is used as the (training-style) dataset and as the
# prediction input.
dataset = pred_dataset = "summalize-testdata.json"

In [14]:
# Make sure the per-run output directory exists (no error if it already does).
os.makedirs(os.path.join(CHECKPOINT_DIR,run_name), exist_ok=True)
# Use whatever distribution strategy is currently active; the model is
# loaded inside its scope in a later cell.
strategy = tf.distribute.get_strategy()
print(f"Running on {strategy.num_replicas_in_sync} replicas")

Running on 1 replicas


In [15]:
# NOTE(review): this cell repeats the `squad_model` definition from an earlier
# cell; this later definition shadows the earlier one.  Consider keeping one.
class squad_model(tf.keras.Model):
    """Wraps a language model with a two-channel output head for span prediction.

    ``model`` is called with ``[input_ids, input_weights]`` and its first
    output is passed through ``projection(conf_dict["num_hidden"], 2, ...)``.
    The projected tensor is split along its last axis into start and end
    logits.
    """

    def __init__(self, model, conf_dict):
        super().__init__(name='squad_model')
        self.model = model
        # presumably maps hidden states to 2 channels (start/end) -- confirm
        # against modeling.projection
        self.projection = projection(conf_dict["num_hidden"], 2, name='squad_output')

    def call(self, inputs):
        """Return ``[start_logits, end_logits]`` for ``(input_ids, input_weights)``."""
        input_ids, input_weights = inputs
        hidden, _ = self.model(inputs=[input_ids, input_weights])
        projected = self.projection(hidden)
        # Bring the last axis to the front so the two heads can be separated.
        head_first = tf.transpose(projected, [2, 0, 1])
        heads = tf.unstack(head_first, axis=0)
        return [heads[0], heads[1]]

In [16]:
# NOTE(review): this cell repeats the `crossentropy` definition from an earlier
# cell; this later definition is the one passed to `load_model` below.
# Consider keeping only one.
def crossentropy(labels, logits):
    """Mean softmax cross-entropy of integer ``labels`` against ``logits``.

    ``labels`` is flattened to one dimension and cast to int32; ``logits`` is
    reshaped to ``[-1, C]`` where ``C`` is the static size of its last axis.
    The loss is the negative log-softmax probability of each label, averaged
    over all flattened positions.
    """
    num_classes = logits.shape.as_list()[-1]
    targets = tf.cast(tf.reshape(labels, [-1]), tf.int32)
    scores = tf.reshape(logits, [-1, num_classes])
    target_mask = tf.one_hot(targets, depth=num_classes, dtype=tf.float32)
    log_probs = tf.nn.log_softmax(scores)
    per_position = -tf.reduce_sum(log_probs * target_mask, axis=[-1])
    return tf.reduce_mean(per_position)

In [17]:
# Restore the saved model inside the distribution strategy's scope.  The
# custom loss must be supplied by name so Keras can resolve it while loading.
with strategy.scope():
    counter = 1
    lossmodel = tf.keras.models.load_model(
        restore_from,
        custom_objects={'crossentropy': crossentropy},
    )

In [18]:
print('Loading dataset...')
def encode_json(filename):
    """Encode every example of a SQuAD-style file into fixed-length model input.

    Each example becomes the token sequence
    ``[CLS] question [SEP] [CLS] context [EOT]``, padded with ``EOT_TOKEN``
    (weight 0.0) or truncated to ``max_seq_length``.  Character-level answer
    offsets are converted to token indexes within that sequence.

    Parameters
    ----------
    filename : str
        Path passed on to ``read_squad_json``.

    Returns
    -------
    list of dict
        One dict per example with the keys ``tokens``, ``tokens_weights``,
        ``token_start``, ``token_end``, ``question``, ``ctx_offset``,
        ``ctx_posisions``, ``context``, ``answer`` and ``question_id``.
        ``answer`` is re-derived as ``context[answer_start:answer_end]``.
    """
    def token_index_for(char_pos, positions):
        # Index of the first token whose position is >= char_pos.  The large
        # sentinel appended at the end gives argmax a match (index
        # len(positions)) when no real token qualifies.
        return np.argmax(np.array(positions+[1000000]) >= char_pos)

    result_chunks = []
    for context, question, answer_start, answer_end, question_id, _ in zip(*read_squad_json(filename)):
        # Normalise a non-empty question so it contains a full-width '？'.
        if question and '？' not in question:
            question = question.replace('?', '？')
            if '？' not in question:
                question += '？'

        enc_context, ctx_posisions = enc.encode(context, clean=False, position=True)
        enc_question = enc.encode(question, clean=False, position=False)

        # Token indexes relative to the context; -1 / 0 mark "no answer".
        if answer_start < 0:
            token_start = -1
        else:
            token_start = token_index_for(answer_start, ctx_posisions)
        if answer_end <= 0:
            token_end = 0
        else:
            token_end = token_index_for(answer_end, ctx_posisions)

        # Context tokens start after: leading CLS, the question, SEP and CLS.
        ctx_offset = 1 + len(enc_question) + 2
        tokens = [CLS_TOKEN] + enc_question + [SEP_TOKEN, CLS_TOKEN] + enc_context + [EOT_TOKEN]
        tokens_weights = [1.0] * len(tokens)
        token_start = min(len(tokens)-2, token_start + ctx_offset)
        token_end = max(token_start, token_end + ctx_offset - 1)

        # Pad to max_seq_length (a negative count adds nothing), then truncate.
        padding = max_seq_length - len(tokens)
        tokens = (tokens + [EOT_TOKEN] * padding)[:max_seq_length]
        tokens_weights = (tokens_weights + [0.0] * padding)[:max_seq_length]

        if token_start >= max_seq_length:
            # The answer was truncated away entirely: point both ends at the
            # CLS token right before the context.
            token_start = token_end = ctx_offset-1
        elif token_end >= max_seq_length:
            token_end = max_seq_length-1

        result_chunks.append({
            "tokens": tokens,
            "tokens_weights": tokens_weights,
            "token_start": token_start,
            "token_end": token_end,
            "question": question,
            "ctx_offset": ctx_offset,
            "ctx_posisions": ctx_posisions,
            "context": context,
            "answer": context[answer_start:answer_end],
            "question_id": question_id,
        })
    return result_chunks

Loading dataset...


In [19]:
# NOTE(review): the trailing guard "if do_training else None" is commented
# out, so `dataset` is always encoded here; the prediction cells below encode
# `pred_dataset` separately.
global_chunks = encode_json(dataset) # guard disabled: if do_training else None
# Position / epoch counters; not read anywhere in the visible cells.
global_chunk_index = 0
global_epochs = 0

read 2 contexts from summalize-testdata.json.


In [20]:
def pred(fn, chunks):
    """Run prediction on ``chunks`` and save the result as SQuAD-style JSON.

    Each prediction returned by ``run_predict`` becomes one ``data`` entry
    holding a single paragraph with a single question.  Every non-empty
    predicted string is stored as an answer together with its start position
    in the context.

    Parameters
    ----------
    fn : str
        Output path; the file is written as UTF-8 with non-ASCII characters
        kept as-is and an indent of 2.
    chunks : list of dict
        Encoded examples, passed straight to ``run_predict``.
    """
    data = []
    for item in run_predict(chunks):
        answers = [
            {"text": text, "answer_start": start}
            for text, start in zip(item["predictionstrings"], item["predictionpositions"])
            if len(text) > 0
        ]
        qas = {
            "id": item["question_id"],
            "question": item["question"],
            "is_impossible": item["impossible"],
            "answers": answers,
        }
        data.append({"paragraphs": [{"context": item["context"], "qas": [qas]}]})
    with open(fn, "w", encoding="utf-8") as wf:
        wf.write(json.dumps({"data": data}, ensure_ascii=False, indent=2))


In [21]:
def run_predict(input_chunks):
    """Predict answer spans for encoded chunks and map them back to text.

    The model ``lossmodel`` is run once over all chunks.  For every chunk the
    ``num_best_indexes`` highest-scoring start and end positions are paired
    up (start rank first, then end rank) and each pair is kept when both
    ends fall inside the context tokens, the end is not before the start and
    the span is at most ``max_answer_length`` tokens long.

    Parameters
    ----------
    input_chunks : iterable of dict
        Chunks as produced by ``encode_json``.  The keys read here are
        ``tokens``, ``tokens_weights``, ``ctx_offset``, ``ctx_posisions``,
        ``context``, ``question_id``, ``answer`` and ``question``.

    Returns
    -------
    list of dict
        One entry per chunk with the keys ``predictionstrings`` (context
        substrings of the kept spans), ``predictionpositions`` (their start
        character offsets), ``impossible``, ``answer``, ``question_id``,
        ``context`` and ``question``.  ``impossible`` is True when the pair
        pointing at the token right before the context (``ctx_offset - 1`` for
        both ends, the position ``encode_json`` assigns to unanswerable
        questions) is met before any valid span has been kept.  An empty
        input yields an empty list without calling the model.
    """
    chunks = list(input_chunks)
    if not chunks:
        # Nothing to predict; avoid invoking the model on an empty batch.
        return []

    tokens = np.array([chunk["tokens"] for chunk in chunks], dtype=np.int32)
    tokens_weights = np.array([chunk["tokens_weights"] for chunk in chunks], dtype=np.float32)
    logits = lossmodel.predict([tokens, tokens_weights], batch_size=batch_size)

    result = []
    for starts, ends, chunk in zip(logits[0], logits[1], chunks):
        off = chunk["ctx_offset"]
        pos = chunk["ctx_posisions"]
        ctx = chunk["context"]
        p_starts = get_best_indexes(starts, num_best_indexes)
        p_ends = get_best_indexes(ends, num_best_indexes)

        selected = []
        impossible = False
        for start_index in p_starts:
            for end_index in p_ends:
                if start_index == off-1 and end_index == off-1 and len(selected) == 0:
                    impossible = True
                # Both ends must point at a context token.
                if start_index-off >= len(pos) or start_index < off:
                    continue
                if end_index-off >= len(pos) or end_index < off:
                    continue
                if end_index < start_index:
                    continue
                length = end_index - start_index + 1
                if length > max_answer_length:
                    continue
                selected.append((start_index, end_index))

        predictionstrings = []
        predictionpositions = []
        for p_start, p_end in selected:
            start_token = p_start-off
            end_token = p_end-off
            start_pos = pos[start_token]
            # The span ends where the next token starts, or at the end of
            # the context for the last token.
            end_pos = pos[end_token+1] if end_token+1 < len(pos) else len(ctx)
            predictionstrings.append(ctx[start_pos:end_pos])
            predictionpositions.append(start_pos)

        result.append({"predictionstrings": predictionstrings, "predictionpositions": predictionpositions,
                       "impossible": impossible, "answer": chunk["answer"], "question_id": chunk["question_id"],
                       "context": ctx, "question": chunk["question"]})
    return result

In [22]:
# Prediction is enabled only when `pred_dataset` is a non-empty path to an
# existing file; otherwise `prediction_chunks` is left as None.
do_prediction = len(pred_dataset) > 0 and os.path.isfile(pred_dataset)
prediction_chunks = encode_json(pred_dataset) if do_prediction else None

read 2 contexts from summalize-testdata.json.


In [23]:
# `prediction_chunks` is None when the prediction dataset was not found;
# fail with a clear message instead of an opaque TypeError from inside
# run_predict.
if not do_prediction:
    raise FileNotFoundError(f"prediction dataset not found: {pred_dataset!r}")

# Write the predictions as SQuAD-style JSON, then read them back through the
# same encoder so every predicted answer becomes one row of `result`.
pred('squad-predicted.json', prediction_chunks)
result = encode_json('squad-predicted.json')
question_id = np.array([res["question_id"] for res in result])
question = [res["question"] for res in result]
answer = [res["answer"] for res in result]

read 270 contexts from squad-predicted.json.


In [25]:
index = np.arange(len(result))

print('Question\tAnswer')
for qid in np.unique(question_id):
    # Only the first row of each question id is reported.
    i = index[question_id == qid].min()

    print("[Context]")
    print(result[i]["context"])
    if question[i]:
        print("[Question]")
        print(question[i])
    print("[Answer]")
    print(answer[i])

Question	Answer
[Context]
東京株式市場において日経平均株価が値上がりし、3万670円10銭の値で終えた。株高の背景には新型コロナウイルス感染拡大の終息と景気回復への期待感があり、今後は企業業績の回復が焦点になる。日経平均株価が3万円の大台を回復するのは約30年半ぶり。関係者には過熱感を警戒する見方もあり、しばらくは国内外の感染状況を見ながらの取り引きが続きそう。トピックスも21円16銭値上がりし、2118円87銭で終える。出来高は13億3901万株。
[Answer]
日経平均株価が3万円の大台を回復するのは約30年半ぶり
[Context]
リーガ・エスパニョーラのレガネスはバジャドリードと対戦。23分、エリア内でバジャドリードのハンドによりPKを獲得するも惜しくも外れる。その後の30分にはオスカル・ロドリゲスが先制点を挙げる。1点ビハインドのバジャドリードは49分、エネス・ウナルがゴールを決めるがオフサイドの判定でゴールは取り消された。試合はそのままレガネスが1対0で逃げ切る。
[Answer]
試合はそのままレガネスが1対0で逃げ切る
