From 5cadad33f94fa38dd5e45615cab9ad54ef60d159 Mon Sep 17 00:00:00 2001 From: Enwei Zhu Date: Fri, 12 Nov 2021 09:12:00 +0800 Subject: [PATCH 1/4] add SciBERT --- scripts/exp_launcher.py | 4 ++-- scripts/utils.py | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/scripts/exp_launcher.py b/scripts/exp_launcher.py index 3d2fcb7..5e5ab8d 100644 --- a/scripts/exp_launcher.py +++ b/scripts/exp_launcher.py @@ -194,12 +194,12 @@ def call_command(command: str): else: sampler = OptionSampler(num_epochs=50, lr=[1e-3, 2e-3], - finetune_lr=[1e-5, 2e-5], + finetune_lr=[5e-5, 1e-4], batch_size=48, ck_decoder='span_classification', bert_drop_rate=0.2, use_interm2=[False, True], - bert_arch=['BERT_base', 'RoBERTa_base']) + bert_arch=['BERT_base', 'RoBERTa_base', 'SciBERT']) elif args.task == 'text2text': COMMAND = " ".join([COMMAND, "@scripts/options/tf2text.opt"]) diff --git a/scripts/utils.py b/scripts/utils.py index cb2edcd..1cd5e47 100644 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -389,6 +389,11 @@ def load_pretrained(pretrained_str, args: argparse.Namespace, cased=False): return (transformers.BertModel.from_pretrained(PATH, hidden_dropout_prob=args.bert_drop_rate, attention_probs_dropout_prob=args.bert_drop_rate), transformers.BertTokenizer.from_pretrained(PATH, model_max_length=512, do_lower_case=False)) + elif pretrained_str.lower().startswith('scibert'): + PATH = "assets/transformers/allenai/scibert_scivocab_cased" if cased else "assets/transformers/allenai/scibert_scivocab_uncased" + return (transformers.BertModel.from_pretrained(PATH, hidden_dropout_prob=args.bert_drop_rate, attention_probs_dropout_prob=args.bert_drop_rate), + transformers.BertTokenizer.from_pretrained(PATH, model_max_length=512)) + elif args.language.lower() == 'chinese': if pretrained_str.lower().startswith('bert'): PATH = "assets/transformers/hfl/chinese-bert-wwm-ext" From c0f365c4d318c6db0916bf43399ff07e3df5509d Mon Sep 17 00:00:00 2001 From: Enwei Zhu Date: Mon, 15 Nov 2021 10:02:37 +0800 Subject: [PATCH 2/4] fix bugs --- data/ace-luan2019naacl/ace-luan2019naacl-process.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/data/ace-luan2019naacl/ace-luan2019naacl-process.py b/data/ace-luan2019naacl/ace-luan2019naacl-process.py index 19fc3d2..c8abf48 100644 --- a/data/ace-luan2019naacl/ace-luan2019naacl-process.py +++ b/data/ace-luan2019naacl/ace-luan2019naacl-process.py @@ -35,6 +35,8 @@ 'head': spans.index((rel[0]-curr_start, rel[1]-curr_start+1)), 'tail': spans.index((rel[2]-curr_start, rel[3]-curr_start+1))} for rel in ex['relations'][k]] new_data.append(new_ex) + curr_start += len(new_ex['tokens']) + with open(trg_fn, 'w') as f: json.dump(new_data, f) From 4a2a1ac09f001af1e3bacaa4602e542a8372a229 Mon Sep 17 00:00:00 2001 From: Enwei Zhu Date: Mon, 15 Nov 2021 10:02:44 +0800 Subject: [PATCH 3/4] fix bugs --- scripts/exp_results_collector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/exp_results_collector.py b/scripts/exp_results_collector.py index a01caa1..ca872dd 100644 --- a/scripts/exp_results_collector.py +++ b/scripts/exp_results_collector.py @@ -10,7 +10,7 @@ dict_re = re.compile("\{[^\{\}]+\}") -metircs_re = {'acc': re.compile("(?<=Accuracy: )\d+\.\d+(?=%)"), +metrics_re = {'acc': re.compile("(?<=Accuracy: )\d+\.\d+(?=%)"), 'micro_f1': re.compile("(?<=Micro F1-score: )\d+\.\d+(?=%)"), 'bleu4': re.compile("(?<=BLEU-4: )\d+\.\d+(?=%)")} From ed12d103f2b662b1582880cfeefbd05b259b7c65 Mon Sep 17 00:00:00 2001 From: Enwei Zhu Date: Mon, 15 Nov 2021 10:03:01 +0800 Subject: [PATCH 4/4] update scripts --- scripts/exp_launcher.py | 6 ++-- scripts/joint_extraction.py | 4 +-- scripts/utils.py | 59 ++++++++++++++++++++++++++++--------- 3 files changed, 51 insertions(+), 18 deletions(-) diff --git a/scripts/exp_launcher.py b/scripts/exp_launcher.py index 5e5ab8d..aaeef47 100644 --- a/scripts/exp_launcher.py +++ b/scripts/exp_launcher.py @@ -193,8 +193,10 @@ def call_command(command: str): ck_label_emb_dim=[10, 25]) else: sampler = OptionSampler(num_epochs=50, - lr=[1e-3, 2e-3], - finetune_lr=[5e-5, 1e-4], + # lr=[1e-3, 2e-3], + lr=numpy.logspace(-3.0, -2.5, num=100, base=10).tolist(), # 1e-3 ~ 3e-3 + # finetune_lr=[5e-5, 1e-4], + finetune_lr=numpy.logspace(-4.5, -4.0, num=100, base=10).tolist(), # 3e-5 ~ 1e-4 batch_size=48, ck_decoder='span_classification', bert_drop_rate=0.2, diff --git a/scripts/joint_extraction.py b/scripts/joint_extraction.py index 64f9242..ac28985 100644 --- a/scripts/joint_extraction.py +++ b/scripts/joint_extraction.py @@ -208,10 +208,10 @@ def save_callback(model): logger.info("Evaluating on dev-set") evaluate_joint_extraction(trainer, dev_set, has_attr=(args.attr_decoder!='None'), has_rel=(args.rel_decoder!='None'), eval_chunk_type_for_relation=True, batch_size=args.batch_size) - # evaluate_joint_extraction(trainer, dev_set, has_attr=(args.attr_decoder!='None'), has_rel=(args.rel_decoder!='None'), eval_chunk_type_for_relation=False, batch_size=args.batch_size) + evaluate_joint_extraction(trainer, dev_set, has_attr=(args.attr_decoder!='None'), has_rel=(args.rel_decoder!='None'), eval_chunk_type_for_relation=False, batch_size=args.batch_size) logger.info("Evaluating on test-set") evaluate_joint_extraction(trainer, test_set, has_attr=(args.attr_decoder!='None'), has_rel=(args.rel_decoder!='None'), eval_chunk_type_for_relation=True, batch_size=args.batch_size) - # evaluate_joint_extraction(trainer, test_set, has_attr=(args.attr_decoder!='None'), has_rel=(args.rel_decoder!='None'), eval_chunk_type_for_relation=False, batch_size=args.batch_size) + evaluate_joint_extraction(trainer, test_set, has_attr=(args.attr_decoder!='None'), has_rel=(args.rel_decoder!='None'), eval_chunk_type_for_relation=False, batch_size=args.batch_size) logger.info(" ".join(sys.argv)) logger.info(pprint.pformat(args.__dict__)) diff --git a/scripts/utils.py b/scripts/utils.py index 1cd5e47..e0394cb 100644 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -119,6 +119,7 @@ def parse_to_args(parser: argparse.ArgumentParser): 'ace2005': 'English', 'conll2004': 'English', 'SciERC': 'English', + 'ace2005_rel': 'English', 'ResumeNER': 'Chinese', 'WeiboNER': 'Chinese', 'SIGHAN2006': 'Chinese', @@ -136,6 +137,9 @@ def parse_to_args(parser: argparse.ArgumentParser): 'flickr8k': 'English', 'flickr30k': 'English', 'mscoco': 'English'} +dataset2language.update({f'ADE_cv{k}': 'English' for k in range(10)}) +dataset2language.update({f'ace2004_rel_cv{k}': 'English' for k in range(5)}) + def load_data(args: argparse.Namespace): if args.dataset == 'conll2003': @@ -194,22 +198,49 @@ def load_data(args: argparse.Namespace): f"Corruption Retrieval F1-score: {ave_scores['micro']['f1']*100:2.3f}%") elif args.dataset == 'conll2004': - json_io = JsonIO(text_key='tokens', - chunk_key='entities', chunk_type_key='type', chunk_start_key='start', chunk_end_key='end', - relation_key='relations', relation_type_key='type', relation_head_key='head', relation_tail_key='tail', - case_mode='None', number_mode='Zeros') - train_data = json_io.read("data/conll2004/conll04_train.json") - dev_data = json_io.read("data/conll2004/conll04_dev.json") - test_data = json_io.read("data/conll2004/conll04_test.json") + io = JsonIO(text_key='tokens', + chunk_key='entities', chunk_type_key='type', chunk_start_key='start', chunk_end_key='end', + relation_key='relations', relation_type_key='type', relation_head_key='head', relation_tail_key='tail', + case_mode='None', number_mode='Zeros') + train_data = io.read("data/conll2004/conll04_train.json") + dev_data = io.read("data/conll2004/conll04_dev.json") + test_data = io.read("data/conll2004/conll04_test.json") elif args.dataset == 'SciERC': - json_io = JsonIO(text_key='tokens', - chunk_key='entities', chunk_type_key='type', chunk_start_key='start', chunk_end_key='end', - relation_key='relations', relation_type_key='type', relation_head_key='head', relation_tail_key='tail', - case_mode='None', number_mode='Zeros') - train_data = json_io.read("data/SciERC/scierc_train.json") - dev_data = json_io.read("data/SciERC/scierc_dev.json") - test_data = json_io.read("data/SciERC/scierc_test.json") + io = JsonIO(text_key='tokens', + chunk_key='entities', chunk_type_key='type', chunk_start_key='start', chunk_end_key='end', + relation_key='relations', relation_type_key='type', relation_head_key='head', relation_tail_key='tail', + case_mode='None', number_mode='Zeros') + train_data = io.read("data/SciERC/scierc_train.json") + dev_data = io.read("data/SciERC/scierc_dev.json") + test_data = io.read("data/SciERC/scierc_test.json") + + elif args.dataset.startswith('ADE_cv'): + io = JsonIO(text_key='tokens', + chunk_key='entities', chunk_type_key='type', chunk_start_key='start', chunk_end_key='end', + relation_key='relations', relation_type_key='type', relation_head_key='head', relation_tail_key='tail', + case_mode='None', number_mode='Zeros') + k = int(args.dataset.replace('ADE_cv', '')) + train_data = io.read(f"data/ADE/ade_split_{k}_train.json") + dev_data = [] + test_data = io.read(f"data/ADE/ade_split_{k}_test.json") + args.train_with_dev = True + + elif args.dataset.startswith('ace2004_rel_cv'): + io = JsonIO(relation_key='relations', relation_type_key='type', relation_head_key='head', relation_tail_key='tail', + case_mode='None', number_mode='Zeros') + k = int(args.dataset.replace('ace2004_rel_cv', '')) + train_data = io.read(f"data/ace-luan2019naacl/ace04/cv{k}.train.json") + dev_data = [] + test_data = io.read(f"data/ace-luan2019naacl/ace04/cv{k}.test.json") + args.train_with_dev = True + + elif args.dataset == 'ace2005_rel': + io = JsonIO(relation_key='relations', relation_type_key='type', relation_head_key='head', relation_tail_key='tail', + case_mode='None', number_mode='Zeros') + train_data = io.read("data/ace-luan2019naacl/ace05/train.json") + dev_data = io.read("data/ace-luan2019naacl/ace05/dev.json") + test_data = io.read("data/ace-luan2019naacl/ace05/test.json") elif args.dataset == 'ResumeNER': conll_io = ConllIO(text_col_id=0, tag_col_id=1, scheme='BMES', encoding='utf-8', token_sep="", pad_token="")