<span style="font-family: Arial; font-size: 14pt;"><b>Intermediate Training and Fine-tuning of BERT on Geneva Bible</b></span><br><br>
<span style="font-family: Arial; font-size: 10pt;">Author: Lucas Ma</span>

In [5]:
import os
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset, Dataset, DatasetDict

# Step 1: Load Pre-trained Model and Tokenizer
modelName = "emanjavacas/MacBERTh"
tokenizer = AutoTokenizer.from_pretrained(modelName)
model = AutoModelForMaskedLM.from_pretrained(modelName)

# Step 2: Prepare the Dataset
def load_and_tokenize_dataset(file_path, tokenizer, block_size=128):
    # Read lines from the file
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    # Create a dataset from lines
    dataset = Dataset.from_dict({"text": lines})
    
    # Tokenize the dataset
    def tokenize_function(examples):
        return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=block_size)
    
    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    return tokenized_dataset

bible_text = "data/bible_full_text.txt"
virginia_docs = "data/A10010.txt"

# Load and tokenize datasets
bible_dataset = load_and_tokenize_dataset(bible_text, tokenizer)
virginia_dataset = load_and_tokenize_dataset(virginia_docs, tokenizer)

# Concatenate the datasets
#combined_dataset = DatasetDict({"train": torch.utils.data.ConcatDataset([bible_dataset, virginia_dataset])})
combined_dataset = bible_dataset
combined_dataset = DatasetDict({"train": combined_dataset})

# Step 3: Data Collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)

# Step 4: Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Step 5: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=combined_dataset["train"]
)

# Step 6: Train
trainer.train()

# Save the model
model.save_pretrained('./fine-tuned-MacBERTh')
tokenizer.save_pretrained('./fine-tuned-MacBERTh')




Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

  0%|          | 0/3 [00:00<?, ?it/s]

{'train_runtime': 2.5776, 'train_samples_per_second': 1.164, 'train_steps_per_second': 1.164, 'train_loss': 0.3129093249638875, 'epoch': 3.0}


('./fine-tuned-MacBERTh/tokenizer_config.json',
 './fine-tuned-MacBERTh/special_tokens_map.json',
 './fine-tuned-MacBERTh/vocab.txt',
 './fine-tuned-MacBERTh/added_tokens.json',
 './fine-tuned-MacBERTh/tokenizer.json')