In [None]:
!pip install -q transformers fugashi unidic_lite


In [1]:
DIR = 'jpRoberta'
!mkdir $DIR
!gsutil cp gs://jp-text-data/vocab.json $DIR/
!gsutil cp gs://jp-text-data/merges.txt $DIR/



Updates are available for some Cloud SDK components.  To install them,
please run:
  $ gcloud components update

Copying gs://jp-text-data/vocab.json...
- [1 files][715.4 KiB/715.4 KiB]                                                
Operation completed over 1 objects/715.4 KiB.                                    
Copying gs://jp-text-data/merges.txt...
/ [1 files][253.4 KiB/253.4 KiB]                                                
Operation completed over 1 objects/253.4 KiB.                                    


In [2]:
!ls $DIR

merges.txt  vocab.json


In [None]:
!gsutil cp gs://jp-text-data/jp-wiki-500k-sample.txt

In [None]:
from transformers import Trainer, TrainingArguments
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.dataset import IterableDataset
import torch

class MyTrainer(Trainer):
    """Trainer whose training DataLoader also works with iterable-style datasets.

    A PyTorch ``DataLoader`` must not be given a sampler when the dataset is an
    ``IterableDataset``. This subclass builds a sampler-free loader in that
    case and defers to the stock ``Trainer`` behaviour for every other dataset.
    """

    def __init__(self, *args, **kwargs):
        # Forward everything (model, args, data_collator, train_dataset, ...)
        # so the subclass can be constructed exactly like ``Trainer``.
        super().__init__(*args, **kwargs)

    def get_train_dataloader(self) -> DataLoader:
        """Return the DataLoader used for training.

        Raises:
            ValueError: if no ``train_dataset`` was supplied.
        """
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")

        # Map-style datasets: the parent class already picks the right sampler
        # (random / distributed / TPU), so there is nothing to customise.
        if not isinstance(self.train_dataset, IterableDataset):
            return super().get_train_dataloader()

        # Some collators expose ``collate_batch``; others are plain callables.
        # Accept both so the loader does not depend on one collator style.
        collate_fn = getattr(self.data_collator, "collate_batch", self.data_collator)

        # Iterable-style dataset: no sampler, the dataset dictates the order.
        return DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            collate_fn=collate_fn,
        )

In [None]:
from glob import glob
from robarta_japanese_tokenizer import RobertaJapaneseTokenizer

# Build the tokenizer from the vocab/merges files previously copied into DIR.
tokenizer = RobertaJapaneseTokenizer.from_pretrained(
    DIR,
    max_length=512,
)

In [None]:
class CustomIterableDataset(IterableDataset):
    """Stream a UTF-8 text file line by line, yielding one tensor of token ids per line.

    Args:
        filename: path of the text file to read; each line is one example.
        tokenizer: callable returning a mapping with an ``"input_ids"`` entry.
        block_size: passed to the tokenizer as ``max_length`` (with truncation on).
        len: value reported by ``__len__``; the file is not scanned to verify it.
    """

    def __init__(self, filename, tokenizer, block_size, len):
        self.filename = filename
        self.tokenizer = tokenizer
        self.block_size = block_size
        # `len` shadows the builtin, but the parameter name is part of the
        # public signature (callers pass it by keyword), so it is kept.
        self.len = len

    def preprocess(self, text):
        """Tokenize one line (surrounding newlines removed) into a 1-D tensor of ids."""
        batch_encoding = self.tokenizer(
            text.strip("\n"),
            add_special_tokens=True,
            truncation=True,
            max_length=self.block_size,
        )
        return torch.tensor(batch_encoding["input_ids"])

    def line_mapper(self, line):
        """Map a raw line of the file to a training example."""
        return self.preprocess(line)

    def __iter__(self):
        # Generator + `with` so the file handle is closed once iteration ends,
        # instead of being left open for every pass over the dataset.
        with open(self.filename, encoding="utf-8") as lines:
            for line in lines:
                yield self.line_mapper(line)

    def __len__(self):
        return self.len

# Streaming dataset over the downloaded corpus. `len` is only what __len__ reports
# (presumably the line count of the 500k sample — verify against the file).
dataset = CustomIterableDataset(
    filename="jp-wiki-500k-sample.txt",
    tokenizer=tokenizer,
    block_size=256,
    len=500_000,
)

In [None]:
from transformers import LineByLineTextDataset
from transformers import RobertaConfig
from transformers import RobertaForMaskedLM



# Architecture of the model trained from scratch.
config = RobertaConfig(
    vocab_size=52000,
    # RoBERTa assigns position ids starting at padding_idx + 1 (= 2 by default),
    # so 512-token inputs (the tokenizer's max_length) need 512 + 2 position
    # slots. With 512 the embedding table only covers sequences up to 510 tokens.
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

# Randomly initialised masked-LM; no pretrained weights are loaded here.
model = RobertaForMaskedLM(config=config)
# Enable if the tokenizer's vocabulary size differs from `vocab_size` above:
# model.resize_token_embeddings(len(tokenizer))

from transformers import DataCollatorForLanguageModeling

# Collator for the masked-language-modelling objective (mlm_probability=0.15).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, mlm=True)

from transformers import Trainer, TrainingArguments

# Training hyper-parameters; checkpoints are written under "jpBert".
training_args = TrainingArguments(
    output_dir="jpBert",
    overwrite_output_dir=True,
    # Schedule
    num_train_epochs=1,
    # max_steps=2500,
    warmup_steps=500,
    per_device_train_batch_size=64,
    # Checkpointing
    save_steps=10_000,
    save_total_limit=2,
)

# Wire the model, hyper-parameters, streaming dataset and MLM collator together.
trainer = Trainer(
    args=training_args, model=model, train_dataset=dataset, data_collator=data_collator
)


In [None]:
# Run the training loop with the arguments configured in `training_args`
# (a single epoch, per `num_train_epochs=1`).
trainer.train()

In [None]:
# Save the trained model into DIR, the same directory the tokenizer's
# vocab.json / merges.txt were downloaded to.
trainer.save_model(DIR)

In [None]:
!gsutil cp jpRoberta/* gs://jp-text-data/jpRoberta/