/
bert_base_uncased.yaml
114 lines (109 loc) · 2.75 KB
/
bert_base_uncased.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# Additional dependencies declared for this run.
# NOTE(review): presumably names a local package the runner imports before
# building the components below — confirm with the consuming script.
dependencies:
  - name: 'custom'
# Dataset definitions.
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been stripped; verify it against the consumer's schema.
datasets:
  glue:
    # Anchored so other sections can reuse the task name via *dataset_name.
    name: &dataset_name 'cola'
    raw_data_kwargs:
      # Explicit nulls: no local train/validation files are given here.
      train_file_path: null
      valid_file_path: null
    # `!join` is a custom (non-core) tag; presumably it concatenates the list
    # items into a single string such as 'cola/train' — confirm with the loader.
    dataset_id_map:
      train: &glue_train !join [*dataset_name, '/train']
      validation: &glue_val !join [*dataset_name, '/val']
      test: &glue_test !join [*dataset_name, '/test']
# Model definitions.
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been stripped; verify it against the consumer's schema.
models:
  student_model:
    # Single source of truth for the checkpoint name; every
    # `pretrained_model_name_or_path` below aliases this anchor.
    key: &student_model_key 'bert-base-uncased'
    model_name_or_path: *student_model_key
    config_kwargs:
      pretrained_model_name_or_path: *student_model_key
      num_labels: 2
    tokenizer_kwargs:
      pretrained_model_name_or_path: *student_model_key
      do_lower: True
      use_fast: True
    model_kwargs:
      pretrained_model_name_or_path: *student_model_key
    # Helper anchor only: combines dataset and model names for the checkpoint
    # path below (`!join` is a custom tag; presumably string concatenation).
    _experiment: &student_experiment !join [*dataset_name, '-', *student_model_key]
    # Explicit null: no source checkpoint is configured.
    src_ckpt: null
    dst_ckpt: !join ['./resource/ckpt/glue/', *dataset_name, '/ce/', *student_experiment]
# Training configuration.
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been stripped; verify it against the trainer's schema.
train:
  log_freq: 50
  num_epochs: 3
  train_data_loader:
    dataset_id: *glue_train
    sampler:
      # `!import_get` is a custom (non-core) tag applied to the mapping below.
      class_or_func: !import_get
        key: 'torch.utils.data.RandomSampler'
      # Explicit null: no extra sampler arguments.
      kwargs: null
    kwargs:
      batch_size: 16
      num_workers: 0
      pin_memory: True
      drop_last: False
    # NOTE(review): placed beside `kwargs` (not inside it) on the assumption
    # that this name is looked up by the loader builder instead of being
    # forwarded as-is, since a data loader's collate_fn must be callable —
    # confirm against the consuming code.
    collate_fn: 'DataCollatorWithPadding'
    requires_supp: False
    cache_output: null
  val_data_loader:
    dataset_id: *glue_val
    # Anchored so the test and private sections can reuse the same sampler.
    sampler: &val_sampler
      class_or_func: !import_get
        key: 'torch.utils.data.SequentialSampler'
      kwargs: null
    kwargs:
      batch_size: 32
      num_workers: 0
      pin_memory: True
      drop_last: False
    # NOTE(review): same placement assumption as in train_data_loader.
    collate_fn: 'DataCollatorWithPadding'
    requires_supp: False
  model:
    forward_proc: 'forward_batch_as_kwargs'
    adaptations: null
    sequential: []
    wrapper: null
    requires_grad: True
    frozen_modules: []
  optimizer:
    key: 'optimizer_no_decay'
    kwargs:
      optimizer_key: 'AdamW'
      lr: 5.0e-5
      weight_decay: 0.0
    filters_params: False
    max_grad_norm: 1.0
    grad_accum_step: 1
  scheduler:
    key: 'get_linear_schedule_with_warmup'
    kwargs:
      num_warmup_steps: 0
      # Left unset in this file.
      # NOTE(review): presumably supplied at run time — confirm.
      num_training_steps: null
    scheduling_step: 1
  criterion:
    key: 'WeightedSumLoss'
    func2extract_model_loss: 'extract_transformers_loss'
    kwargs:
      model_term:
        weight: 1.0
# Evaluation configuration.
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been stripped; verify it against the consumer's schema.
test:
  test_data_loader:
    # Points at the validation split anchor (*glue_val).
    dataset_id: *glue_val
    # Reuses the sampler anchored as &val_sampler in the train section.
    sampler: *val_sampler
    kwargs:
      batch_size: 32
      num_workers: 0
      pin_memory: True
      drop_last: False
    # NOTE(review): placed beside `kwargs` (not inside it) on the assumption
    # that this name is looked up by the loader builder instead of being
    # forwarded as-is — confirm against the consuming code.
    collate_fn: 'DataCollatorWithPadding'
# Prediction settings for the test split (*glue_test), one list item per
# output file.
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been stripped; verify it against the consumer's schema.
private:
  - private_data_loader:
      dataset_id: *glue_test
      task_name: cola
      # Reuses the sampler anchored as &val_sampler in the train section.
      sampler: *val_sampler
      kwargs:
        batch_size: 32
        num_workers: 0
        pin_memory: True
        drop_last: False
      # NOTE(review): placed beside `kwargs` (not inside it) on the assumption
      # that this name is looked up by the loader builder instead of being
      # forwarded as-is — confirm against the consuming code.
      collate_fn: 'DataCollatorWithPadding'
    # NOTE(review): assumed to be per-entry settings (siblings of
    # `private_data_loader`) rather than loader options — confirm.
    idx2str: False
    pred_output: 'CoLA.tsv'