In [None]:
# Install the notebook's dependencies: the `tokenizers` package (upgraded to the
# newest available release) and the GPU build of TensorFlow pinned to 1.15.
# NOTE(review): `tokenizers` is not pinned, so its API can differ from run to run
# (a later cell already notes an argument that stopped working) — consider pinning it.
!pip install -U tokenizers
!pip install tensorflow-gpu==1.15

In [None]:
# Target vocabulary size passed to the WordPiece tokenizer training; the value is
# also embedded in the name of the vocab file that the training cell saves.
vocab_length = 50000


In [None]:
# Download a zip snapshot of the google-research/bert master branch, unpack it
# (later cells call scripts under bert-master/) and delete the archive.
# NOTE(review): the headers below mimic a browser session (User-Agent, Referer, Cookie);
# they are probably not needed for this download — confirm and drop them.
!wget --header="Host: codeload.github.com" \
    --header="User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36" \
    --header="Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9" \
    --header="Accept-Language: en-GB,en-US;q=0.9,en;q=0.8" \
    --header="Referer: https://github.com/google-research/bert" \
    --header="Cookie: _octo=GH1.1.68793831.1588906101; _ga=GA1.2.19990328.1588906163; logged_in=no; _gat=1; tz=Asia%2FKarachi" \
    --header="Connection: keep-alive" "https://codeload.github.com/google-research/bert/zip/master" \
    -c -O 'bert-master.zip'

# -o overwrites already-extracted files without asking, so re-running this cell
# does not stall on unzip's interactive "replace?" prompt.
!unzip -o bert-master.zip
!rm bert-master.zip

In [None]:
# Download the uncased L-12 / H-768 / A-12 BERT archive, extract it into
# bert-base-uncased/ (later cells read vocab.txt and bert_config.json from there)
# and delete the archive.
!wget https://storage.googleapis.com/bert_models/2020_02_20/uncased_L-12_H-768_A-12.zip \
    -c -O 'bert-base-uncased.zip'

# -o overwrites already-extracted files without asking, so re-running this cell
# does not stall on unzip's interactive "replace?" prompt.
!unzip -o bert-base-uncased.zip -d bert-base-uncased
!rm bert-base-uncased.zip

In [None]:
# Fetch six files from Google Drive (addressed by file id) into data/; the next
# cell reads every file in that directory as text.
# -p makes mkdir succeed when data/ already exists, so the cell can be re-run.
!mkdir -p data
!gdown --id 1dRYeLV7NvcN2GmYb3X0CdX93juTM2rQi -O data/
!gdown --id 102gHSTw_XxBs31XM6ZGe4KkVfV0VuSLy -O data/
!gdown --id 1OxvR5pdR5CgHBotf2YuLM0-_vFNe6u6a -O data/
!gdown --id 1Pofed4RbRlCQiDmv4MjNM0ogus0XErBB -O data/
!gdown --id 1tE8f4-c0ZqYQKNiAfEnfojcsZIxnvEzU -O data/
!gdown --id 1eTq3ngxff0Npt1for_i8iBgD66hvaNx8 -O data/

In [None]:
import glob
import os


def merge_text_files(paths, out_path, encoding='utf-8'):
    """Concatenate text files into a single file, dropping blank lines.

    Every input file is read completely, lines that consist only of a newline
    are removed, and the per-file texts are joined with one newline character
    and written to ``out_path`` (which is overwritten).

    Args:
        paths: Iterable of input file paths, processed in the given order.
        out_path: Path of the merged output file.
        encoding: Text encoding used for both reading and writing.

    Returns:
        A list holding one cleaned text string per input file.
    """
    cleaned = []
    for path in paths:
        with open(path, 'r', encoding=encoding) as handle:
            cleaned.append(''.join(line for line in handle if line != '\n'))

    with open(out_path, 'w', encoding=encoding) as merged:
        merged.write('\n'.join(cleaned))
    return cleaned


# glob returns paths in arbitrary order, so sort them to get the same merged
# corpus on every run; sub-directories are skipped because open() cannot read them.
files = sorted(path for path in glob.glob('data/' + '*') if os.path.isfile(path))

text_data = merge_text_files(files, 'all_data.txt')

In [None]:
import tokenizers

# Constructor options for the WordPiece tokenizer.
# `add_special_tokens=True` is deliberately left out: per the original author's
# note it does not work in the latest version of BertWordPieceTokenizer.
tokenizer_options = dict(
    unk_token='[UNK]',
    sep_token='[SEP]',
    cls_token='[CLS]',
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=True,
    lowercase=True,
    wordpieces_prefix='##',
)

# Training options: learn from the merged corpus, up to `vocab_length` entries.
training_options = dict(
    files=["all_data.txt"],
    vocab_size=vocab_length,
    min_frequency=3,
    limit_alphabet=1000,
    show_progress=True,
    special_tokens=['[PAD]', '[UNK]', '[CLS]', '[MASK]', '[SEP]'],
)

roman_BWPT = tokenizers.BertWordPieceTokenizer(**tokenizer_options)
roman_BWPT.train(**training_options)

# Save into the current directory; the vocabulary size is part of the file prefix.
vocab_file_prefix = "roman-urdu-vocab-uncased_" + str(vocab_length)
roman_BWPT.save_model(".", vocab_file_prefix)

In [None]:
def read_vocab(path, encoding='utf-8'):
    """Read a one-token-per-line vocabulary file into a list of tokens.

    The file is closed after reading, and the last token is kept even when the
    file does not end with a newline.
    """
    with open(path, 'r', encoding=encoding) as handle:
        return [line.rstrip('\n') for line in handle]


def build_augmented_vocab(english_vocab, roman_urdu_vocab):
    """Reorder the Roman Urdu vocab so shared tokens keep their English index.

    The result has the same length and the same tokens as ``roman_urdu_vocab``.
    A token that also occurs in ``english_vocab`` is placed at the index of its
    first occurrence there; every remaining slot is filled with the other Roman
    Urdu tokens in their original order.

    A shared token whose English index does not fit into the result (English
    index >= len(roman_urdu_vocab)) cannot be aligned; it is kept as an
    ordinary filler token instead of raising IndexError.

    Args:
        english_vocab: List of tokens of the English vocabulary.
        roman_urdu_vocab: List of tokens of the Roman Urdu vocabulary (not modified).

    Returns:
        A new list of tokens with len(roman_urdu_vocab) entries.
    """
    size = len(roman_urdu_vocab)

    # token -> first index in the English vocab (same result as list.index,
    # but a single pass instead of one scan per token).
    english_index = {}
    for position, token in enumerate(english_vocab):
        english_index.setdefault(token, position)

    slots = [None] * size
    placed = set()
    leftovers = []
    for token in roman_urdu_vocab:
        position = english_index.get(token)
        if position is not None and position < size and token not in placed:
            slots[position] = token
            placed.add(token)
        else:
            leftovers.append(token)

    # Exactly one leftover exists per empty slot, so the iterator never runs dry.
    filler = iter(leftovers)
    return [token if token is not None else next(filler) for token in slots]


english_vocab = read_vocab('bert-base-uncased/vocab.txt')
roman_urdu_vocab = read_vocab('roman-urdu-vocab-uncased_'+str(vocab_length)+'-vocab.txt')

common_vocab = list(set(english_vocab).intersection(roman_urdu_vocab))
print('No. of common tokens: ',len(common_vocab))

augmented_vocab = build_augmented_vocab(english_vocab, roman_urdu_vocab)

with open('augmented_vocab.txt', 'w', encoding='utf-8') as v:
    v.write('\n'.join(augmented_vocab))


In [None]:
BERT_BASE_DIR='bert-base-uncased'

import json

# The vocabulary size is the number of lines in the augmented vocab file.
# The file is opened in a `with` block so the handle is closed after counting.
with open('augmented_vocab.txt') as vocab_file:
    augmented_vocab_size = sum(1 for _ in vocab_file)

# Rewrite bert_config.json in place with the new vocab_size: load it, rewind,
# dump the updated dict, then truncate in case the new text is shorter than the old.
with open(BERT_BASE_DIR+'/bert_config.json', "r+") as jsonFile:
    data = json.load(jsonFile)
    data["vocab_size"] = augmented_vocab_size
    jsonFile.seek(0)  # rewind
    json.dump(data, jsonFile)
    jsonFile.truncate()

In [None]:
# Low-memory alternative to the single create_pretraining_data.py call in the next
# cell: run this cell instead of that one when the machine has less than 64GB of RAM.
# The merged corpus is split into pieces of at most 100 MiB of complete lines
# (numeric suffixes, written to data_parts/), then a helper script downloaded from
# Google Drive is made executable and run.
# NOTE(review): create_pretraining.sh is not part of this notebook; presumably it runs
# create_pretraining_data.py per piece and writes to pretraining_data/ — verify its
# contents and output paths before relying on it.
# -p makes mkdir succeed when the directories already exist, so the cell can be re-run.
!mkdir -p data_parts pretraining_data
!split -C 100m --numeric-suffixes all_data.txt data_parts/all_data
!gdown --id 13UCnkCcLO30aw03n9t1chlYXybifuOeB
!chmod +x create_pretraining.sh
!./create_pretraining.sh

In [None]:
# Generate the pre-training examples file (tf_examples_multi.tfrecord) from the
# merged corpus, tokenized with the augmented vocabulary.
# Flags use the --name=value form, matching the run_pretraining cell. This matters
# for the boolean flag: written as "--do_lower_case True", the flag parser treats
# "True" as a stray positional argument, so "--do_lower_case False" would silently
# still enable lower-casing.
!python bert-master/create_pretraining_data.py \
    --input_file=all_data.txt \
    --output_file=tf_examples_multi.tfrecord \
    --vocab_file=augmented_vocab.txt \
    --do_lower_case=True \
    --max_seq_length=128 \
    --max_predictions_per_seq=20 \
    --masked_lm_prob=0.15 \
    --random_seed=42 \
    --dupe_factor=5

In [None]:
!python bert-master/run_pretraining.py \
    --input_file=tf_examples_multi.tfrecord \
    --output_dir=bert_bilingual_roman_urdu \
    --do_train=True \
    --do_eval=True \
    --bert_config_file=$BERT_BASE_DIR/bert_config.json \
    --train_batch_size=32 \
    --max_seq_length=128 \
    --max_predictions_per_seq=20 \
    --num_train_steps=100000 \
    --num_warmup_steps=10 \
    --learning_rate=2e-5 \
    --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \