In [0]:
import numpy as np
import json
import os

# Mount Google Drive into the Colab runtime and make the Drive root the
# working directory, so the relative paths used in later cells resolve
# against Drive.
from google.colab import drive
drive.mount('/content/gdrive')
os.chdir("/content/gdrive/My Drive")

# Install requirements (tensorflow_hub and tensorflow-gpu are pinned;
# sentencepiece is left unpinned).
! pip install tensorflow_hub==0.7 tensorflow-gpu==1.15 sentencepiece
# NOTE(review): at this point the working directory is "My Drive", so this
# import only resolves if an `albert` package is importable from there (or is
# installed elsewhere). The imported name is not used in the cells shown here
# (the script is run with `python -m` instead) — confirm it is still needed.
from albert import create_pretraining_data

In [0]:
# Switch to the project folder and display the working directory as a check.
# An absolute path is used so that this cell can be re-run safely: a relative
# chdir("NLU_Project") fails once the kernel is already inside that folder.
os.chdir("/content/gdrive/My Drive/NLU_Project")
os.getcwd()

'/content/gdrive/My Drive/NLU_Project'

In [0]:
# Build one raw text file that holds every document; ALBERT's
# create_pretraining_data script is later run on this file.
# Only the pdf_json folders are read. The pmc_json folders below are not used:
# "CORD-19-research-challenge/comm_use_subset/comm_use_subset/pmc_json/",
# "CORD-19-research-challenge/custom_license/custom_license/pmc_json/",
# "CORD-19-research-challenge/noncomm_use_subset/noncomm_use_subset/pmc_json/"
# The encoding is given explicitly so that non-ASCII characters in the papers
# are written the same way regardless of the runtime's default locale.
txtf = open("raw_pdfdata.txt", "w+", encoding="utf-8")
# Change data path to your data path
data_paths = [
    "CORD-19-research-challenge/biorxiv_medrxiv/biorxiv_medrxiv/pdf_json/",
    "CORD-19-research-challenge/comm_use_subset/comm_use_subset/pdf_json/",
    "CORD-19-research-challenge/custom_license/custom_license/pdf_json/",
    "CORD-19-research-challenge/noncomm_use_subset/noncomm_use_subset/pdf_json/",
]
# Every file path visited by load_data is appended here (used as a sanity check).
all_path_list = []
# Create a function to load all data
def load_data(data_paths, write_to_f):
    """Write the body text of every JSON paper under ``data_paths`` to ``write_to_f``.

    Each file is parsed as JSON and the ``"text"`` of every entry in its
    ``"body_text"`` list is written one sentence per line, followed by exactly
    one blank line per non-empty section. Blank lines are what the ALBERT
    pretraining-data script uses to separate documents, so they must not
    appear inside a section (e.g. because of "e.g." or "..") and must not be
    missing when a section does not end with a period.

    NOTE: sentences are still split naively on ".", so abbreviations and
    decimal numbers are broken into several lines.

    Args:
        data_paths: iterable of directory paths containing the JSON files.
        write_to_f: writable text file-like object receiving the output.

    Side effects:
        Appends the path of every visited file to the module-level
        ``all_path_list``.
    """
    for folder in data_paths:  # All data folders
        # Sorted so the generated corpus is identical from run to run
        # (os.listdir returns entries in arbitrary order).
        for f in sorted(os.listdir(folder)):
            # os.path.join also works when `folder` has no trailing slash.
            path = os.path.join(folder, f)
            all_path_list.append(path)  # keep track so we don't miss anything
            # JSON is read as UTF-8 explicitly instead of the locale default.
            with open(path, encoding="utf-8") as data:
                samp = json.load(data)
            # Split sentences (one sentence per line) and give a line break for each section
            for section in samp["body_text"]:
                fragments = (piece.strip() for piece in section["text"].split("."))
                sentences = [sen for sen in fragments if sen]
                if not sentences:
                    continue  # nothing to write for an empty section
                write_to_f.writelines(sen + ".\n" for sen in sentences)
                write_to_f.write("\n")  # Section line break

# Now we load all data
try:
    load_data(data_paths, txtf)
finally:
    # Always close the file so that buffered text is flushed to disk before
    # later cells read raw_pdfdata.txt (re-running this cell reopens it).
    txtf.close()

In [0]:
# Debugging snippet (disabled): print the first file name found in one data folder.
# for f in os.listdir('CORD-19-research-challenge/comm_use_subset/comm_use_subset/pdf_json/'):
#     print(f)
#     break
# Number of file paths recorded in all_path_list by load_data.
len(all_path_list)

59311

In [0]:
# Ignore this part since we are using ALBERT's vocab and tokenizer
# (the pretraining-data cell passes the 30k-clean vocab/model files instead).
# Create vocab for our model
# See this page: https://github.com/kwonmha/bert-vocab-builder

# ! python ./bert-vocab-builder/subword_builder.py \
# --corpus_filepattern "raw_data.txt" \
# --output_filename "vocab.txt" \
# --min_count 1

## Or use sentencepiece (which I used)
# Trains a unigram SentencePiece model with a 60000-piece vocabulary on
# raw_pdfdata.txt, using "covid19" as the output prefix. [CLS], [SEP] and
# [MASK] are passed as control symbols; the bos/eos ids are set to -1
# (presumably disabling them — confirm against the SentencePiece docs).
# The trailing backslashes continue the single-quoted string itself, so the
# leading spaces of each continuation line are part of the argument string.
import sentencepiece as spm
spm.SentencePieceTrainer.Train('--input=raw_pdfdata.txt --model_prefix=covid19 --vocab_size=60000 \
                                --pad_id=0 --unk_id=1 --eos_id=-1 --bos_id=-1 \
                                --control_symbols=[CLS],[SEP],[MASK] \
                                --user_defined_symbols="(,),\",-,.,–,£,€" \
                                --shuffle_input_sentence=true --input_sentence_size=1000000 \
                                --character_coverage=0.99995 --model_type=unigram')

True

In [0]:
# Create pretraining data
# Runs the albert.create_pretraining_data module on raw_pdfdata.txt with the
# 30k-clean vocab and SentencePiece model files under 3/assets, and
# dupe_factor 5.
# NOTE(review): the output is named "pre_train_data", but the pretraining
# commands read "pre_train_data_same_v" — confirm whether the file was renamed.
! python -m albert.create_pretraining_data \
  --input_file "raw_pdfdata.txt" \
  --output_file "pre_train_data" \
  --vocab_file "3/assets/30k-clean.vocab" \
  --spm_model_file "3/assets/30k-clean.model"\
  --dupe_factor 5

In [0]:
# Side note

# I think it's better for us to do everything on colab...
# To download the dataset
# Do:

! pip install kaggle

# Need to get a kaggle API for this to work fine
# FYI, see https://github.com/Kaggle/kaggle-api
from google.colab import drive
drive.mount('/content/gdrive')
!cp kaggle.json ~/.kaggle/kaggle.json
! kaggle datasets download allen-institute-for-ai/CORD-19-research-challenge

In [0]:
! unzip CORD-19-research-challenge.zip

In [0]:
# Pretrain ALBERT from this notebook (no --use_tpu flag is passed here).
# Reads "pre_train_data_same_v", uses "train.record" as the output directory,
# and runs both training and evaluation.
# NOTE(review): the create-pretraining-data cell writes "pre_train_data", not
# "pre_train_data_same_v" — confirm the input file name.
! python -m albert.run_pretraining \
    --input_file "pre_train_data_same_v" \
    --output_dir "train.record" \
    --albert_config_file "albert_config.json" \
    --albert_hub_module_handle "https://tfhub.dev/google/albert_base/3" \
    --do_train \
    --do_eval \
    --train_batch_size 64 \
    --eval_batch_size 64 \
    --max_seq_length 512 \
    --max_predictions_per_seq 20 \
    --optimizer 'lamb' \
    --learning_rate 0.00176 \
    --num_train_steps 125000 \
    --num_warmup_steps 3125 \
    --save_checkpoints_steps 5000

In [0]:
# To pretrain our model on Google Cloud: below is an example usage (need to operate on google cloud shell)
# NOTE: the lines in this cell are shell commands and reference notes, not
# Python. Run them in Google Cloud Shell instead of executing this cell.

# Environment: create and activate a virtualenv named cmle-env
virtualenv cmle-env
source cmle-env/bin/activate

# TPU: configuration fragment holding the TPU service account (replace <account>)
# NOTE(review): presumably this account needs access to the storage bucket — confirm
"config": {
    "tpuServiceAccount": "<account>"
  }

# Using self models: Cloud TPU BERT tutorial
https://cloud.google.com/tpu/docs/tutorials/bert

# Upload files: Cloud Storage guide for uploading objects
https://cloud.google.com/storage/docs/uploading-objects

# Set up and training
# The name must be the one used by the ssh command, TPU_NAME and the final
# `ctpu delete` (cov-alb-pre). It also must not contain underscores, because
# Compute Engine resource names allow only lowercase letters, digits and hyphens.
ctpu up --tpu-size=v3-8 \
 --machine-type=n1-standard-8 \
 --zone=us-central1-b \
 --tf-version=1.15.2 \
 --name=cov-alb-pre

# Log in to the Compute Engine VM named cov-alb-pre.
gcloud compute ssh cov-alb-pre --zone=us-central1-b

# Storage bucket and TPU name referenced by the pretraining commands below.
export STORAGE_BUCKET=gs://co-albert
export TPU_NAME=cov-alb-pre

# Fetch the ALBERT code; the commands below invoke albert.run_pretraining,
# presumably from this clone — confirm.
git clone https://github.com/Heimine/albert.git

# Pretrain on the TPU: input data and config are read from
# ${STORAGE_BUCKET}/Data, and output goes to ${STORAGE_BUCKET}/train_output/.
# Runs both training and evaluation on the TPU named by ${TPU_NAME}.
python3 -m albert.run_pretraining \
    --input_file "${STORAGE_BUCKET}/Data/pre_train_data_same_v" \
    --output_dir "${STORAGE_BUCKET}/train_output/" \
    --albert_config_file "${STORAGE_BUCKET}/Data/albert_config.json" \
    --albert_hub_module_handle "https://tfhub.dev/google/albert_base/3" \
    --do_train \
    --do_eval \
    --use_tpu \
    --tpu_name "${TPU_NAME}" \
    --train_batch_size 512 \
    --eval_batch_size 32 \
    --max_seq_length 512 \
    --max_predictions_per_seq 20 \
    --optimizer 'lamb' \
    --learning_rate 0.00176 \
    --num_train_steps 100000 \
    --num_warmup_steps 500 \
    --save_checkpoints_steps 2000

# To start from a checkpoint: same flags as the TPU command above, except that
# --init_checkpoint (pointing at the train_output folder) replaces
# --albert_hub_module_handle.
python3 -m albert.run_pretraining \
    --input_file "${STORAGE_BUCKET}/Data/pre_train_data_same_v" \
    --output_dir "${STORAGE_BUCKET}/train_output/" \
    --albert_config_file "${STORAGE_BUCKET}/Data/albert_config.json" \
    --init_checkpoint "${STORAGE_BUCKET}/train_output/" \
    --do_train \
    --do_eval \
    --use_tpu \
    --tpu_name "${TPU_NAME}" \
    --train_batch_size 512 \
    --eval_batch_size 32 \
    --max_seq_length 512 \
    --max_predictions_per_seq 20 \
    --optimizer 'lamb' \
    --learning_rate 0.00176 \
    --num_train_steps 100000 \
    --num_warmup_steps 500 \
    --save_checkpoints_steps 2000

# When finished: exit the VM, then delete the ctpu resources named cov-alb-pre.
# exit
ctpu delete --zone=us-central1-b --name=cov-alb-pre