upload gluon_electra_small_owt
zheyuye committed Jun 29, 2020
1 parent ca83fac commit 1fb8eb8
Showing 4 changed files with 73 additions and 16 deletions.
75 changes: 60 additions & 15 deletions scripts/pretraining/README.md
python preprocesse_owt.py --input prepared_owt --output preprocessed_owt --shuffle
The above command generates the preprocessed NumPy features, saved as `.npz` files.
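Before launching a long pretraining run, it can be worth sanity-checking the generated shards. Below is a minimal inspection sketch; the array names stored inside each `.npz` are not documented here, so the snippet simply enumerates whatever arrays each shard contains:

```python
import glob

import numpy as np

# List the preprocessed shards and peek inside the first one. The set of
# array names is whatever preprocesse_owt.py wrote; nothing is assumed
# about them beyond the .npz container format.
shard_paths = sorted(glob.glob('preprocessed_owt/*.npz'))
print(f'found {len(shard_paths)} shards')

shard = np.load(shard_paths[0])
for name in shard.files:
    print(name, shard[name].shape, shard[name].dtype)
```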
# Pretrain Model
## ELECTRA
Following the [Official Quickstart](https://github.com/google-research/electra#quickstart-pre-train-a-small-electra-model), we pretrain a small model using OpenWebText as the pretraining corpus. Note that [horovod](https://github.com/horovod/horovod) needs to be installed in advance if `comm_backend` is set to `horovod`.

```bash
horovodrun -np 2 -H localhost:2 python -m run_electra \
--model_name google_electra_small \
--data `preprocessed_owt/*.npz` \
--generator_units_scale 0.25 \
--gpus 0,1 \
--do_train \
--do_eval \
--output_dir ${OUTPUT} \
--num_accumulated 1 \
--batch_size 64 \
--lr 5e-4 \
--wd 0.01 \
--max_seq_len 128 \
--max_grad_norm 1 \
--warmup_steps 10000 \
--num_train_steps 1000000 \
--comm_backend horovod \
```
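With these settings, the effective batch size is 2 GPUs × 64 samples × 1 accumulation step = 128 sequences per update. If you train on different hardware, adjust `--gpus`, `--batch_size`, or `--num_accumulated` so the effective batch size stays comparable; otherwise the learning-rate schedule above may need retuning.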

Alternatively, we can preprocess the features on the fly and train the model directly on the raw `.txt` files:
```bash
horovodrun -np 2 -H localhost:2 python -m run_electra \
--model_name google_electra_small \
--generator_units_scale 0.25 \
--data `prepared_owt/*.txt` \
--from_raw \
--gpus 0,1 \
--do_train \
--do_eval \
--output_dir ${OUTPUT} \
--num_accumulated 1 \
--batch_size 64 \
--lr 5e-4 \
--wd 0.01 \
--max_seq_len 128 \
--max_grad_norm 1 \
--warmup_steps 10000 \
--num_train_steps 1000000 \
--mask_prob 0.15 \
--comm_backend horovod \
```
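Compared with the first command, `--from_raw` skips the separate preprocessing pass and tokenizes the text on the fly, which trades a faster start for extra CPU work during training.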

For ease of verification, the small model pretrained on OpenWebText, named `gluon_electra_small_owt`, has been released and uploaded to S3 with the following directory structure:

```
gluon_electra_small_owt
├── vocab-{short_hash}.json
├── model-{short_hash}.params
├── model-{short_hash}.yml
├── gen_model-{short_hash}.params
└── disc_model-{short_hash}.params
```
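A sketch of how the released checkpoint could be loaded for inference or further fine-tuning. The `get_backbone` helper and its five-element return value are assumptions based on how the fine-tuning scripts load backbones; verify against the installed GluonNLP version:

```python
from gluonnlp.models import get_backbone

# Assumed signature: returns the model class, its config, the tokenizer,
# the local path of the downloaded backbone parameters, and extras.
model_cls, cfg, tokenizer, params_path, _ = get_backbone('gluon_electra_small_owt')

model = model_cls.from_cfg(cfg)       # build the network from its config
model.load_parameters(params_path)    # load the pretrained backbone weights
model.hybridize()
```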

After pretraining, the model can be fine-tuned on downstream NLP tasks such as question answering. Here is an example of fine-tuning a local pretrained model on [SQuAD 1.1/2.0](../question_answering#squad).

```bash
python run_squad.py \
--model_name google_electra_small \
--data_dir squad \
--backbone_path ${OUTPUT}/model-{short_hash}.params \
--output_dir ${FINETUNE_OUTPUT} \
--version ${VERSION} \
--do_eval \
--do_train \
--batch_size 32 \
--num_accumulated 1 \
--gpus 0 \
--epochs 2 \
--lr 3e-4 \
--layerwise_decay 0.8 \
--warmup_ratio 0.1 \
--max_saved_ckpt 6 \
--all_evaluate \
--wd=0 \
--max_seq_length 128 \
--max_grad_norm 0.1 \
```
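Here `--layerwise_decay 0.8` decays the learning rate geometrically from the top transformer layer downwards: the top layer fine-tunes at the full 3e-4, the layer below it at 3e-4 × 0.8 = 2.4e-4, and so on, so the lowest layers are updated the least.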

This results in the following dev-set performance (reported as EM/F1):

| Model Name              | SQuAD 1.1 dev | SQuAD 2.0 dev |
|-------------------------|---------------|---------------|
| gluon_electra_small_owt | 69.40/76.98   | 67.63/69.89   |
2 changes: 1 addition & 1 deletion scripts/question_answering/run_squad.py
Within `train(args)`:
logging.info(
'Step: {}/{}, Loss span/answer/total={:.4f}/{:.4f}/{:.4f},'
' LR={:.8f}, grad_norm={:.4f}. Time cost={:.2f}, Throughput={:.2f} samples/s'
' ETA={:.2f}h'.format((step_num + 1), num_train_steps, log_span_loss,
log_answerable_loss, log_total_loss, trainer.learning_rate, total_norm,
toc - tic, log_sample_num / (toc - tic),
(num_train_steps - (step_num + 1)) / ((step_num + 1) / (toc - global_tic)) / 3600))
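The ETA term is the number of remaining steps divided by the average step rate so far, `(step_num + 1) / (toc - global_tic)`, converted from seconds to hours.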
7 changes: 7 additions & 0 deletions src/gluonnlp/models/electra.py
In the pretrained-model URL registry (near `get_generator_cfg(model_config)`):
'params': 'google_electra_large/model-9baf9ff5.params',
'disc_model': 'google_electra_large/disc_model-5b820c02.params',
'gen_model': 'google_electra_large/gen_model-667121df.params',
},
'gluon_electra_small_owt': {
'cfg': 'gluon_electra_small_owt/model-6e276d98.yml',
'vocab': 'gluon_electra_small_owt/vocab-e6d2b21d.json',
'params': 'gluon_electra_small_owt/model-e9636891.params',
'disc_model': 'gluon_electra_small_owt/disc_model-87836017.params',
'gen_model': 'gluon_electra_small_owt/gen_model-45a6fb67.params',
}
}
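
With this entry in place, the new model is discoverable through the module's helpers. A sketch assuming the `list_pretrained_electra`/`get_pretrained_electra` naming convention that GluonNLP model files follow; the exact return signature is an assumption:

```python
from gluonnlp.models.electra import (get_pretrained_electra,
                                     list_pretrained_electra)

# 'gluon_electra_small_owt' should now appear next to the google_electra_* names.
print(list_pretrained_electra())

# Assumed to download and cache the cfg/vocab/params recorded above,
# returning at least the config, tokenizer, and backbone parameter path.
cfg, tokenizer, params_path, *_ = get_pretrained_electra('gluon_electra_small_owt')
```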

5 changes: 5 additions & 0 deletions src/gluonnlp/models/model_zoo_checksums/electra.txt
google_electra_large/gen_model-667121df.params 667121df73c6b521687f96894316dbc9adf27126 205211944
google_electra_large/model-31b7dfdd.yml 31b7dfdd343bd2b2e43e200a735c83b0af1963f1 476
google_electra_large/disc_model-5b820c02.params 5b820c026aa2ad779c1e9a41ff4ff1408fefacbf 1340602227
gluon_electra_small_owt/vocab-e6d2b21d.json e6d2b21d910ccb356aa18f27a1c7d70660edc058 323235
gluon_electra_small_owt/model-e9636891.params e9636891daae9f2940b2b3210cca3c34c3d8f21e 53748654
gluon_electra_small_owt/model-6e276d98.yml 6e276d98360fbb7c379d28bac34a3ca2918a90ab 473
gluon_electra_small_owt/gen_model-45a6fb67.params 45a6fb67e1e6cb65d22b80498f2152ce9780d579 33926624
gluon_electra_small_owt/disc_model-87836017.params 878360174ac71c3fdc7071be7835bea532c09b8d 54015367
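Each row records a relative path, a 40-character SHA-1 digest, and the file size in bytes. A hypothetical helper sketching the integrity check the downloader performs automatically:

```python
import hashlib
import os

def verify_entry(local_path: str, expected_sha1: str, expected_size: int) -> bool:
    """Check a downloaded file against one checksum-file row."""
    if os.path.getsize(local_path) != expected_size:
        return False
    digest = hashlib.sha1()
    with open(local_path, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 20), b''):
            digest.update(chunk)
    return digest.hexdigest() == expected_sha1

# Example against the vocab row above.
print(verify_entry('gluon_electra_small_owt/vocab-e6d2b21d.json',
                   'e6d2b21d910ccb356aa18f27a1c7d70660edc058', 323235))
```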
