In [1]:
!pip install transformers numpy nltk pandas pyarrow



In [1]:
from pathlib import Path
from pprint import pprint

from transformers import BertTokenizer

from utils import (
    create_balanced_vocabulary,
    sharding,
    create_training_instances
)

In [2]:
# Shared configuration for the cells below.
# Directory that receives the vocabulary and the saved tokenizer files.
tokenizer_path = './balanced_vocabulary/'
# The two input corpora.
small_path = Path('data', 'one_article_per_line', 'PubMedAbstract_1MB.txt')
large_path = Path('data', 'one_article_per_line', 'Wiki_EN_10MB.txt')
# Divisor (in bytes) used to derive the number of shards from a file's size: 100 kB.
sharded_size = 100 * 1000

## Create a Balanced Vocabulary from Two Corpora

In [3]:
# Build one vocabulary of 8000 entries from the small and the large corpus
# and write it into `tokenizer_path`.
create_balanced_vocabulary(
    large_path=large_path,
    small_path=small_path,
    vocab_size=8000,
    output_dir=tokenizer_path,
)






In [4]:
# Build a BertTokenizer from the `vocab.txt` located in `tokenizer_path`.
# `vocab_file` is documented as a string path, so the joined Path is
# converted with str() instead of being passed as a Path object.
tokenizer = BertTokenizer(
    vocab_file=str(Path(tokenizer_path) / 'vocab.txt'),
    do_lower_case=True,
    do_basic_tokenize=True,
)
# Save the tokenizer files into the same directory. Kept as the last
# expression of the cell so the returned value is displayed as cell output.
tokenizer.save_pretrained(tokenizer_path)

('./balanced_vocabulary/tokenizer_config.json',
 './balanced_vocabulary/special_tokens_map.json',
 './balanced_vocabulary/vocab.txt',
 './balanced_vocabulary/added_tokens.json')

## Text Sharding

In [5]:
# Shard the PubMed abstract corpus.
# The shard count is the input file's size in bytes divided by `sharded_size`,
# rounded to an integer (about 10 for a file of roughly 1 MB).
sharding(
    input_files=small_path,
    output_dir='./data/sharded_text/',
    output_file_prefix='PubMed1M',
    n_shards=round(small_path.stat().st_size / sharded_size),
)

Start: Init Output Files
End: Init Output Files
Start: Loading Articles
input file: data/one_article_per_line/PubMedAbstract_1MB.txt
End: Loading Articles: There are 725 articles.
Start: Sentence Segmentation
Segmenting article 0
End: Sentence Segmentation
Start: Distribute Articles Over Shards
Distributing data over shards: 710 articles remaining.
Distributing data over shards: 705 articles remaining.
Distributing data over shards: 698 articles remaining.
Distributing data over shards: 693 articles remaining.
Distributing data over shards: 687 articles remaining.
Distributing data over shards: 682 articles remaining.
Distributing data over shards: 676 articles remaining.
Distributing data over shards: 669 articles remaining.
Distributing data over shards: 662 articles remaining.
Distributing data over shards: 656 articles remaining.
Distributing data over shards: 651 articles remaining.
Distributing data over shards: 646 articles remaining.
Distributing data over shards: 641 articles 

In [6]:
# Shard the English Wikipedia corpus.
# The shard count is the input file's size in bytes divided by `sharded_size`,
# rounded to an integer (about 100 for a file of roughly 10 MB).
sharding(
    input_files=large_path,
    output_dir='./data/sharded_text/',
    output_file_prefix='Wiki10M',
    n_shards=round(large_path.stat().st_size / sharded_size),
)

Start: Init Output Files
End: Init Output Files
Start: Loading Articles
input file: data/one_article_per_line/Wiki_EN_10MB.txt
End: Loading Articles: There are 2196 articles.
Start: Sentence Segmentation
Segmenting article 0
End: Sentence Segmentation
Start: Distribute Articles Over Shards
Distributing data over shards: 2046 articles remaining.
Distributing data over shards: 1996 articles remaining.
Distributing data over shards: 1945 articles remaining.
Distributing data over shards: 1893 articles remaining.
Distributing data over shards: 1842 articles remaining.
Distributing data over shards: 1791 articles remaining.
Distributing data over shards: 1741 articles remaining.
Distributing data over shards: 1688 articles remaining.
Distributing data over shards: 1635 articles remaining.
Distributing data over shards: 1584 articles remaining.
Distributing data over shards: 1534 articles remaining.
Distributing data over shards: 1484 articles remaining.
Distributing data over shards: 1428 a

## (Demo) Create Instances for BERT Pretraining

In [7]:
# Demo: generate pretraining instance files from the two sharded corpora,
# passing the tokenizer directory; the return value is kept in `composition`.
# NOTE(review): the corpus directories below are named after the input file
# stems, not the `output_file_prefix` values given to sharding() — confirm
# this is where the shards are actually written.
composition = create_training_instances(
    small_corpus_dir='./data/sharded_text/PubMedAbstract_1MB/',
    large_corpus_dir='./data/sharded_text/Wiki_EN_10MB/',
    tokenizer_path=tokenizer_path,
    output_dir='./data/pretraining_instances',
    filename_prefix='sample',
    n_training_files=50,
    n_convoys=5,
    n_escorts=5,
)

Number of instances: 867. Save to data/pretraining_instances/sample_32.txt
Number of instances: 841. Save to data/pretraining_instances/sample_18.txt
Number of instances: 834. Save to data/pretraining_instances/sample_30.txt
Number of instances: 845. Save to data/pretraining_instances/sample_26.txt
Number of instances: 852. Save to data/pretraining_instances/sample_37.txt
Number of instances: 839. Save to data/pretraining_instances/sample_42.txt
Number of instances: 847. Save to data/pretraining_instances/sample_25.txt
Number of instances: 850. Save to data/pretraining_instances/sample_35.txt
Number of instances: 840. Save to data/pretraining_instances/sample_22.txt
Number of instances: 845. Save to data/pretraining_instances/sample_23.txt
Number of instances: 855. Save to data/pretraining_instances/sample_49.txt
Number of instances: 840. Save to data/pretraining_instances/sample_29.txt
Number of instances: 860. Save to data/pretraining_instances/sample_16.txt
Number of instances: 819.

Number of instances: 884. Save to data/pretraining_instances/sample_0.txt
Number of instances: 867. Save to data/pretraining_instances/sample_39.txt
Number of instances: 838. Save to data/pretraining_instances/sample_1.txt
Number of instances: 861. Save to data/pretraining_instances/sample_6.txt
Number of instances: 856. Save to data/pretraining_instances/sample_20.txt
Number of instances: 846. Save to data/pretraining_instances/sample_34.txt
Number of instances: 857. Save to data/pretraining_instances/sample_46.txt
Number of instances: 861. Save to data/pretraining_instances/sample_3.txt
Number of instances: 841. Save to data/pretraining_instances/sample_4.txt
Number of instances: 842. Save to data/pretraining_instances/sample_44.txt
Number of instances: 891. Save to data/pretraining_instances/sample_48.txt
Number of instances: 841. Save to data/pretraining_instances/sample_13.txt
Number of instances: 826. Save to data/pretraining_instances/sample_36.txtNumber of instances: 873. Save 

In [None]:
# Pretty-print the value returned by create_training_instances().
# NOTE(review): this cell has no execution count in the saved notebook, so
# its output is not recorded here.
pprint(composition)