<span style="font-family: Arial; font-size: 14pt;"><b>Intermediate Training and Fine-tuning of BERT on Geneva Bible</b></span><br><br>
<span style="font-family: Arial; font-size: 10pt;">Author: Lucas Ma</span>\
<span style="font-family: Arial; font-size: 10pt;">Edited by: Jerry Zou</span>

In [3]:
import os, torch
from transformers import AutoModelForMaskedLM, AutoTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset, Dataset, DatasetDict

# Step 1: Load Pre-trained Model and Tokenizer
modelName = "emanjavacas/MacBERTh"
tokenizer = AutoTokenizer.from_pretrained(modelName)
model = AutoModelForMaskedLM.from_pretrained(modelName)

# Step 2: Prepare the Dataset
def load_and_tokenize_dataset(file_path, tokenizer, block_size=128):
    # Read lines from the file
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    # Create a dataset from lines
    dataset = Dataset.from_dict({"text": lines})
    
    # Tokenize the dataset
    def tokenize_function(examples):
        return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=block_size)
    
    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    return tokenized_dataset

bible_text = "data/bible_full_text.txt"
virginia_docs = "data/A10010.txt"

# Load and tokenize datasets
bible_dataset = load_and_tokenize_dataset(bible_text, tokenizer)
virginia_dataset = load_and_tokenize_dataset(virginia_docs, tokenizer)

# Concatenate the datasets
#combined_dataset = DatasetDict({"train": torch.utils.data.ConcatDataset([bible_dataset, virginia_dataset])})
combined_dataset = bible_dataset
combined_dataset = DatasetDict({"train": combined_dataset})

# Step 3: Data Collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)

# Step 4: Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Step 5: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=combined_dataset["train"]
)

# Step 6: Train
trainer.train()

# Save the model
model.save_pretrained('./fine-tuned-MacBERTh')
tokenizer.save_pretrained('./fine-tuned-MacBERTh')


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

  0%|          | 0/3 [00:00<?, ?it/s]

{'train_runtime': 10.205, 'train_samples_per_second': 0.294, 'train_steps_per_second': 0.294, 'train_loss': 0.8662434418996176, 'epoch': 3.0}


('./fine-tuned-MacBERTh/tokenizer_config.json',
 './fine-tuned-MacBERTh/special_tokens_map.json',
 './fine-tuned-MacBERTh/vocab.txt',
 './fine-tuned-MacBERTh/added_tokens.json',
 './fine-tuned-MacBERTh/tokenizer.json')

Test: another approach to training the Geneva Bible through using CSV instead of TXT.

In [None]:
import torch, os
from transformers import AutoTokenizer, AutoModel
from torch.nn.functional import cosine_similarity
import pandas as pd
from transformers import AutoModelForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset, Dataset, DatasetDict

2024-06-14 11:35:45.832792: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Process Geneva Bible CSV
csvFilePath = "PtMac_Bible/data/genevaBible.csv"
dataFrame = pd.read_csv(csvFilePath)
# dataFrame.head(10)
verses = dataFrame["Text"].tolist()
print(verses[:10])

In [None]:
tokenizer = AutoTokenizer.from_pretrained("emanjavacas/MacBERTh")
model = AutoModel.from_pretrained("emanjavacas/MacBERTh")

sentences = ["Example sentence for dimension checking."]

inputs = tokenizer(sentences, return_tensors="pt", padding="max_length", truncation=True, max_length=512)

with torch.no_grad():
    outputs = model(**inputs)

sentence_embeddings = outputs.last_hidden_state[:, 0, :]

print(f"Shape of the sentence embeddings: {sentence_embeddings.shape}")