In [10]:
import os
import shutil
from google.cloud import storage
from tokenizers import ByteLevelBPETokenizer

#--------------------------------------------------------------------------------------

# Cloud Storage bucket names: the first receives the trained tokenizer files,
# the second is where the tokenizer training text is read from.
tokenizerBucketName = 'mpr-research-tokenizers'
modelBucketName = 'mwrite-data-bucket-1'

client = storage.Client()

# Local scratch folder; each tokenizer is written to its own sub-folder here.
tokenizerParentFolder = 'peerBERT_tokenizers'
os.makedirs(tokenizerParentFolder, exist_ok=True)


print('Retrieiving tokenizer builder text files...')
# The local folder name doubles as the blob-name prefix inside the bucket, so
# every blob is downloaded to a local path identical to its blob name.
tokenizerTrainerDataFolder = 'PeerBERT_training_text'
os.makedirs(tokenizerTrainerDataFolder, exist_ok=True)

fileBucket = client.get_bucket(modelBucketName)

for file in fileBucket.list_blobs(prefix=tokenizerTrainerDataFolder):
    # A blob name ending in '/' cannot be written as a local file (presumably
    # a folder placeholder object -- TODO confirm); skip it explicitly rather
    # than relying on the download raising.
    if file.name.endswith('/'):
        continue
    try:
        file.download_to_filename(file.name)
    except Exception as e:
        # Stay best-effort (one bad blob must not abort the whole run), but
        # report the failure instead of swallowing it silently.
        print(f'Warning: could not download {file.name}: {e}')
print('Tokenizer builder retrieved.')
    
#--------------------------------------------------------------------------------------


def tokenizerMaker(vocabularySize):
    """Train a byte-level BPE tokenizer and upload its files to Cloud Storage.

    The tokenizer is trained on the regular files found directly inside
    ``tokenizerTrainerDataFolder``, saved under
    ``tokenizerParentFolder/<vocabularySize>``, and each saved file is then
    uploaded to the ``tokenizerBucketName`` bucket under the blob prefix
    ``<vocabularySize>/``.

    Args:
        vocabularySize: Target vocabulary size passed to the trainer; its
            string form is also used as the local folder and blob prefix.

    Returns:
        True if every tokenizer file was uploaded, False if the upload failed
        and the files are only available in the local folder.

    Raises:
        FileNotFoundError: If the training folder contains no files.
    """
    print(f'Building tokenizer for vocabulary size {vocabularySize}...')

    tokenizerFolder = str(vocabularySize)
    localTokenizerPath = os.path.join(tokenizerParentFolder, tokenizerFolder)
    os.makedirs(localTokenizerPath, exist_ok=True)

    # Only regular files are valid trainer inputs; sorting makes the order
    # handed to the trainer independent of the directory listing order.
    candidatePaths = (
        os.path.join(tokenizerTrainerDataFolder, name)
        for name in sorted(os.listdir(tokenizerTrainerDataFolder))
    )
    pathList = [path for path in candidatePaths if os.path.isfile(path)]
    if not pathList:
        # Fail before training so a tokenizer built from no text is never
        # uploaded to the bucket.
        raise FileNotFoundError(
            f'No training files found in {tokenizerTrainerDataFolder}.'
        )

    baseTokenizer = ByteLevelBPETokenizer(lowercase=False)
    baseTokenizer.train(
        files=pathList,
        vocab_size=vocabularySize,
        min_frequency=2,
        special_tokens=[
            "<s>",
            "<pad>",
            "</s>",
            "<unk>",
            "<mask>",
        ],
    )

    baseTokenizer.save_model(localTokenizerPath)

    try:
        tokenizerBucket = client.bucket(tokenizerBucketName)
        for name in os.listdir(localTokenizerPath):
            localFile = os.path.join(localTokenizerPath, name)
            if not os.path.isfile(localFile):
                continue
            # Blob names always use '/', independent of the local OS separator.
            blob = tokenizerBucket.blob("/".join([tokenizerFolder, name]))
            blob.upload_from_filename(localFile)
    except Exception as e:
        # Upload is best-effort: report the problem and tell the caller, so
        # the local copy is not cleaned up afterwards.
        print(f'Error: {e}')
        print(f'Saved tokenizer files instead locally at {localTokenizerPath}.')
        return False

    print(f'Saved model files in gs://{tokenizerBucketName}/{tokenizerFolder}')
    return True



allUploaded = True
for vocabularySize in [30522, 50265]:
    if not tokenizerMaker(vocabularySize):
        allUploaded = False

# The downloaded training text is no longer needed once training is done.
shutil.rmtree(tokenizerTrainerDataFolder)
if allUploaded:
    shutil.rmtree(tokenizerParentFolder)
else:
    # At least one upload failed: the local folder holds the only copy of
    # those tokenizer files, so it must not be deleted.
    print(f'Keeping local tokenizer files in {tokenizerParentFolder}.')

Retrieiving tokenizer builder text files...
Tokenizer builder retrieved.
Building tokenizer for vocabulary size 30522...



Saved model files in gs://mpr-research-tokenizers/30522
Building tokenizer for vocabulary size 50265...



Saved model files in gs://mpr-research-tokenizers/50265
