In [1]:
import torch
from torch import nn
import torch.nn.functional as F

from transformers import BertModel, BertTokenizer
from transformers import RobertaModel, RobertaTokenizer

import json
import pickle
import collections
import random

In [2]:
from config import parameters as conf
import sampling as sampling
from Main import *

Using roberta
Reading: /data2/yikyungkim/dataset/finqa_original/operation_list.txt
Reading: /data2/yikyungkim/dataset/finqa_original/constant_list.txt


In [3]:
from transformers import RobertaTokenizer
from transformers import RobertaConfig
# Load the RoBERTa tokenizer and model configuration for the checkpoint named
# by conf.model_size.
# NOTE(review): conf.model_size is presumably a Hugging Face model id or a
# local checkpoint path (anything from_pretrained accepts) — confirm in config.
tokenizer = RobertaTokenizer.from_pretrained(conf.model_size)
model_config = RobertaConfig.from_pretrained(conf.model_size)

In [4]:
# Build the output layout for this run under conf.output_path:
#   <output_path>/<model_save_name>/results      -> holds log.txt
#   <output_path>/<model_save_name>/saved_model
# NOTE(review): `os` is not imported in any visible cell; it presumably arrives
# through `from Main import *`. An explicit `import os` would make this cell
# independent of that star import.
model_dir_name = conf.model_save_name
model_dir = os.path.join(conf.output_path, model_dir_name)
results_path = os.path.join(model_dir, "results")
saved_model_path = os.path.join(model_dir, "saved_model")
os.makedirs(saved_model_path, exist_ok=True)       # exist_ok: a restarted run reuses the existing directory
os.makedirs(results_path, exist_ok=True)
# Path handed to the project helpers below as their log destination.
log_file = os.path.join(results_path, 'log.txt')

In [5]:
# Build the reserved program vocabulary from the operation and constant lists.
op_list = read_txt(conf.op_list_file, log_file)
# Operators carry a trailing '(' (e.g. 'divide('), the same form that
# program_tokenization emits for operator tokens.
op_list = [op + '(' for op in op_list]
# Four control tokens occupy the first positions of the operator vocabulary.
op_list = ['EOF', 'UNK', 'GO', ')'] + op_list
const_list = read_txt(conf.const_list_file, log_file)
# Normalise constants: lower-case and '.' replaced by '_'.
const_list = [const.lower().replace('.', '_') for const in const_list]
# Total number of entries in the operator/control list plus the constant list.
# NOTE(review): op_list and const_list are rebound by later read_examples
# calls, so this size reflects the lists only as they are at this point.
reserved_token_size = len(op_list) + len(const_list)

Reading: /data2/yikyungkim/dataset/finqa_original/operation_list.txt
Reading: /data2/yikyungkim/dataset/finqa_original/constant_list.txt


In [4]:
# Register '[QNP]' as an additional special token on the tokenizer.
special_token = {'additional_special_tokens': ['[QNP]']}
# add_special_tokens returns how many tokens were actually added.
# NOTE(review): if this is > 0 the model's embedding matrix normally has to be
# resized to match the tokenizer; no resize is visible in this part of the
# notebook — confirm it happens where the model is built.
num_added_toks = tokenizer.add_special_tokens(special_token)

In [5]:
# Load the training split (the cell output shows the file being read).
train_data = load_data(conf.train_file, log_file)
# Keyword arguments for sampling.load_dataset; every value comes from the
# project config except the fixed 'train' mode.
kwargs_load={
    'finqa_dataset_path': conf.train_case,
    'constants_path': conf.const_list_file,
    'archive_path': conf.archive_path,
    'mode': 'train',
    'q_score_available': conf.q_score_available,
    'p_score_available': conf.p_score_available,
    'candidates_available': conf.candidates_available,
    'pos_pool': conf.pos_pool,
    'neg_pool': conf.neg_pool
}
# load_dataset returns seven values, unpacked in this order.
# NOTE(review): judging by the names and the progress messages printed by this
# cell, these are question scores, program scores with gold indices, and
# gold / non-gold candidate pools — confirm against sampling.load_dataset.
input_data, q_scores, p_scores, gold_indices, constants, gold_cands, non_gold_cands = sampling.load_dataset(**kwargs_load)

Reading /data2/yikyungkim/dataset/finqa_retriever_output/train_retrieve.json
starts loading question score
starts loading program score and gold indices
starts loading question similar candidates


In [6]:
# Sample the cases attached to each training example.
kwargs_sample={
    'seed': 0,  # fixed seed passed to the sampler
    'input_data': input_data,
    'p_scores': p_scores,
    'gold_cands': gold_cands,
    'non_gold_cands': non_gold_cands,
    'num_case': conf.num_case,
    # NOTE(review): a config value named "..._val" is used while sampling
    # *training* cases — confirm this is intended and not a train/valid mix-up.
    'top3_precision': conf.top3_precision_val
}
train_case = sampling.get_random_cases(**kwargs_sample)

In [7]:
# Turn the raw training data plus the sampled cases into example objects.
# NOTE(review): read_examples also returns op_list and const_list, which are
# rebound here. Whether the returned lists differ from the inputs is not
# visible in this notebook — verify in Main.read_examples.
train_examples, op_list, const_list = \
    read_examples(input_data=train_data, case_data=train_case, tokenizer=tokenizer,
                op_list=op_list, const_list=const_list, log_file=log_file, 
                num_case=conf.num_case, input_concat=conf.input_concat, program_type=conf.program_type)


In [12]:
# Convert the training examples into model input features.
# NOTE(review): the "Convert to model input features" cell further down builds
# an identical kwargs dict and converts train_examples again. The execution
# counts are out of order, so one of the two cells is likely redundant — keep
# only one before relying on Restart & Run All.
kwargs = {"examples": train_examples,
        "tokenizer": tokenizer,
        "max_seq_length": conf.max_seq_length,
        "max_program_length": conf.max_program_length,
        "is_training": True,
        "op_list": op_list,
        "op_list_size": len(op_list),
        "const_list": const_list,
        "const_list_size": len(const_list),
        "verbose": True}

train_features = convert_examples_to_features(**kwargs)

too long
too long
too long
too long
too long
too long
too long
too long
too long
too long
too long
too long
too long
too long
too long
too long
too long


In [9]:
# Load the validation split and its pre-computed cases. Unlike the training
# cases, which are sampled in the notebook, the validation cases are read from
# a file (the cell output shows a predictions.json being loaded).
valid_data = load_data(conf.valid_file, log_file)
valid_case = load_data(conf.valid_case, log_file)

# Build validation examples with the same settings as the training examples.
# op_list and const_list are rebound from the return value again here.
valid_examples, op_list, const_list = \
    read_examples(input_data=valid_data, case_data=valid_case, tokenizer=tokenizer,
                op_list=op_list, const_list=const_list, log_file=log_file, 
                num_case=conf.num_case, input_concat=conf.input_concat, program_type=conf.program_type)


Reading /data2/yikyungkim/dataset/finqa_retriever_output/dev_retrieve.json
Reading /data2/yikyungkim/case_retriever/inference/cross_roberta-base_q+p_100200/generator_input/valid_300/results/predictions.json


In [10]:
""" Convert to model input features """
# Shared keyword arguments for convert_examples_to_features; the training
# conversion runs first, then the same dict is reused for validation.
# NOTE(review): an earlier cell already converts train_examples with an
# identical kwargs dict, so the training conversion here may be duplicated
# work — confirm which cell is meant to stay.
kwargs = {"examples": train_examples,
        "tokenizer": tokenizer,
        "max_seq_length": conf.max_seq_length,
        "max_program_length": conf.max_program_length,
        "is_training": True,
        "op_list": op_list,
        "op_list_size": len(op_list),
        "const_list": const_list,
        "const_list_size": len(const_list),
        "verbose": True}

train_features = convert_examples_to_features(**kwargs)
# Mutate the dict in place for the validation pass: different examples and
# is_training switched off. After this cell, `kwargs` describes validation.
kwargs["examples"] = valid_examples
kwargs["is_training"] = False
valid_features = convert_examples_to_features(**kwargs)

too long
too long
too long
too long
too long
too long
too long
too long
too long
too long
too long
too long
too long


In [16]:
# Inspect the program of the first retrieved case for the first training
# example. Note that the displayed string already ends with ', EOF'.
train_case[0]['case_retrieved'][0]['program']

'divide(19557, const_1000), divide(2.2, #0), EOF'

In [15]:
# Inspect the program stored under 'qa' for the same training example; this
# string has no trailing 'EOF', unlike the retrieved case's program.
train_case[0]['qa']['program']

'divide(100, 100), divide(3.8, #0)'

In [19]:
def program_tokenization(original_program):
    """Split a program string into a flat token list terminated by 'EOF'.

    The string is first cut on ', '. Within each piece, '(' closes the
    current token and stays attached to it (giving e.g. 'divide('), while
    ')' flushes whatever is pending and is emitted as its own token.
    A final 'EOF' is always appended, even if the input already ends in one.

    Args:
        original_program: program text such as
            'divide(100, 100), divide(3.8, #0)'.

    Returns:
        list[str]: the tokens in order, with 'EOF' as the last element.
    """
    tokens = []
    for piece in original_program.split(', '):
        pending = ''
        for ch in piece:
            if ch == '(':
                # Operator name plus its opening parenthesis form one token.
                tokens.append(pending + ch)
                pending = ''
            elif ch == ')':
                # Emit the argument collected so far, then ')' by itself.
                if pending:
                    tokens.append(pending)
                tokens.append(ch)
                pending = ''
            else:
                pending += ch
        if pending:
            tokens.append(pending)
    tokens.append('EOF')
    return tokens

# compute levenshtein distance
def levenshtein(s1, s2, debug=False):
    """Return the Levenshtein (edit) distance between two sequences.

    Works on any pair of sized, iterable sequences whose elements can be
    compared with ``!=`` (strings, lists of tokens, ...). Insertions,
    deletions and substitutions all cost 1.

    Args:
        s1: first sequence.
        s2: second sequence.
        debug: when True, print each completed row of the DP table
            (without its leading column).

    Returns:
        int: the minimum number of single-element edits turning one
        sequence into the other.
    """
    # The longer sequence drives the outer loop, so each row has
    # len(shorter) + 1 cells.
    if len(s1) < len(s2):
        longer, shorter = s2, s1
    else:
        longer, shorter = s1, s2

    if len(shorter) == 0:
        return len(longer)

    prev = range(len(shorter) + 1)
    for row_no, item_long in enumerate(longer, start=1):
        row = [row_no]
        for col, item_short in enumerate(shorter):
            cost_insert = prev[col + 1] + 1
            cost_delete = row[col] + 1
            cost_subst = prev[col] + (item_long != item_short)
            row.append(min(cost_insert, cost_delete, cost_subst))
        if debug:
            print(row[1:])
        prev = row

    return prev[-1]

# get operators in program
def operator(program):
    """Collect the operator tokens of a tokenized program.

    Reads every fourth token starting at index 0, i.e. it relies on each
    step occupying four tokens (operator, two arguments, ')'). Any 'EOF'
    found at those positions is dropped.

    Note: this function shares its name with the standard-library
    ``operator`` module and hides it if that module is imported by name.

    Args:
        program: token list as produced by program_tokenization.

    Returns:
        list: the tokens at indices 0, 4, 8, ... that are not 'EOF'.
    """
    return [
        program[idx]
        for idx in range(0, len(program), 4)
        if program[idx] != 'EOF'
    ]

# get arguments in program
def arguments(program, constants):
    """Abstract the argument tokens of a tokenized program.

    The program is read in groups of four tokens (operator, first argument,
    second argument, ')'). Each argument is mapped as follows:

    * not listed in ``constants``  -> 'arg1' or 'arg2', by its slot;
    * listed and contains 'const'  -> 'const';
    * listed otherwise             -> the token itself.

    'EOF' tokens are never treated as arguments. This matters when the
    program string already ended in ', EOF': program_tokenization then
    appends a second 'EOF', which falls on a first-argument index and would
    otherwise be counted as a spurious 'arg1'.

    Args:
        program: token list as produced by program_tokenization.
        constants: container of tokens that count as constants.

    Returns:
        list[str]: one abstracted entry per argument, in program order.
    """
    slot_labels = {1: 'arg1', 2: 'arg2'}
    args = []
    for position, token in enumerate(program):
        slot = position % 4
        # Slots 0 and 3 hold the operator and ')'; 'EOF' is a terminator.
        if slot not in slot_labels or token == 'EOF':
            continue
        if token not in constants:
            args.append(slot_labels[slot])
        elif 'const' in token:
            args.append('const')
        else:
            args.append(token)
    return args

# compute program score
def _normalised_similarity(query_seq, cand_seq):
    """Return (len(query_seq) - edit distance) / len(query_seq), guarding the empty query."""
    distance = levenshtein(query_seq, cand_seq)
    if len(query_seq) == 0:
        # Nothing to normalise by: identical (both empty) counts as a full
        # match, anything else as no match. This avoids a ZeroDivisionError.
        return 1.0 if distance == 0 else 0.0
    return (len(query_seq) - distance) / len(query_seq)


def distance_score(query, cand, constants, weight):
    """Score how closely a candidate program matches a query program.

    Both program strings are tokenized, then compared on two views:
    the operator sequence and the abstracted argument sequence. Each view
    is scored as ``(len(query_view) - levenshtein) / len(query_view)`` and
    the two scores are blended linearly.

    The normalisation uses the query length only, so a candidate that is
    much longer than the query can produce a negative score.

    Args:
        query: query program string.
        cand: candidate program string.
        constants: container of constant tokens, forwarded to ``arguments``.
        weight: weight of the operator score; the argument score receives
            ``1 - weight``.

    Returns:
        float: ``weight * ops_score + (1 - weight) * arg_score``. A query
        with no operators or no arguments scores 1.0 on that view when the
        candidate is empty there too, and 0.0 otherwise, instead of raising
        ZeroDivisionError.
    """
    query_program = program_tokenization(query)
    cand_program = program_tokenization(cand)

    ops_score = _normalised_similarity(operator(query_program),
                                       operator(cand_program))
    arg_score = _normalised_similarity(arguments(query_program, constants),
                                       arguments(cand_program, constants))

    return weight * ops_score + (1-weight) * arg_score