<span style="font-family: Arial; font-size: 14pt;"><b>Intermediate Training and Fine-tuning of BERT on Geneva Bible</b></span><br><br>
<span style="font-family: Arial; font-size: 10pt;">Author: Lucas Ma</span><br><br>
<span style="font-family: Arial; font-size: 10pt;"><b>Edit History:</b></span>\
<span style="font-family: Arial; font-size: 10pt;">Jerry Zou (Jun 15)<br>Lucas Ma (Jun 16)</span>

In [3]:
import os, torch
from transformers import AutoModelForMaskedLM, AutoTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset, Dataset, DatasetDict

# Step 1: Load Pre-trained Model and Tokenizer
modelName = "emanjavacas/MacBERTh"
tokenizer = AutoTokenizer.from_pretrained(modelName)
model = AutoModelForMaskedLM.from_pretrained(modelName)

# Step 2: Prepare the Dataset
def load_and_tokenize_dataset(file_path, tokenizer, block_size=128):
    # Read lines from the file
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    # Create a dataset from lines
    dataset = Dataset.from_dict({"text": lines})
    
    # Tokenize the dataset
    def tokenize_function(examples):
        return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=block_size)
    
    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    return tokenized_dataset

bible_text = "data/bible_full_text.txt"
virginia_docs = "data/A10010.txt"

# Load and tokenize datasets
bible_dataset = load_and_tokenize_dataset(bible_text, tokenizer)
virginia_dataset = load_and_tokenize_dataset(virginia_docs, tokenizer)

# Concatenate the datasets
#combined_dataset = DatasetDict({"train": torch.utils.data.ConcatDataset([bible_dataset, virginia_dataset])})
combined_dataset = bible_dataset
combined_dataset = DatasetDict({"train": combined_dataset})

# Step 3: Data Collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)

# Step 4: Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Step 5: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=combined_dataset["train"]
)

# Step 6: Train
trainer.train()

# Save the model
model.save_pretrained('./fine-tuned-MacBERTh')
tokenizer.save_pretrained('./fine-tuned-MacBERTh')


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

  0%|          | 0/3 [00:00<?, ?it/s]

{'train_runtime': 10.205, 'train_samples_per_second': 0.294, 'train_steps_per_second': 0.294, 'train_loss': 0.8662434418996176, 'epoch': 3.0}


('./fine-tuned-MacBERTh/tokenizer_config.json',
 './fine-tuned-MacBERTh/special_tokens_map.json',
 './fine-tuned-MacBERTh/vocab.txt',
 './fine-tuned-MacBERTh/added_tokens.json',
 './fine-tuned-MacBERTh/tokenizer.json')

Test: another approach to training the Geneva Bible through using CSV instead of TXT.

In [2]:
import torch, os
from transformers import AutoTokenizer, AutoModel
from torch.nn.functional import cosine_similarity
import pandas as pd
from transformers import AutoModelForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset, Dataset, DatasetDict

In [4]:
# Process Geneva Bible CSV
csvFilePath = "./data/shortened_bible.csv"
dataFrame = pd.read_csv(csvFilePath)
# dataFrame.head(10)
verses = dataFrame["Text"].tolist()
print(verses[:10])

['In the beginning God created the heauen and the earth.', 'And the earth was without forme and voide, and darkenesse was vpon the deepe, and the Spirit of God moued vpon ye waters.', 'Then God saide, Let there be light: And there was light.', 'And God sawe the light that it was good, and God separated the light from the darkenes.', 'And God called the light, Day, and the darkenes, he called Night. So the euening and the morning were the first day.', 'Againe God saide, Let there be a firmament in the middes of the waters: and let it separate the waters from the waters.', 'Then God made the firmament, and separated the waters, which were vnder the firmament, from the waters which were aboue the firmament; it was so.', 'And God called the firmament Heauen. So the Euening and the morning were the seconde day.', 'God saide againe, Let the waters vnder the heauen be gathered into one place, and let the dry land appeare; it was so.', 'And God called the dry land, Earth, and he called the gat

In [5]:
tokenizer = AutoTokenizer.from_pretrained("emanjavacas/MacBERTh")
model = AutoModel.from_pretrained("emanjavacas/MacBERTh")

sentences = ["Example sentence for dimension checking."]

inputs = tokenizer(sentences, return_tensors="pt", padding="max_length", truncation=True, max_length=512)

with torch.no_grad():
    outputs = model(**inputs)

sentence_embeddings = outputs.last_hidden_state[:, 0, :]

print(f"Shape of the sentence embeddings: {sentence_embeddings.shape}")



Shape of the sentence embeddings: torch.Size([1, 768])


The following code is used to fine tune MacBERTh on Bible saved in .csv. By Lucas Ma

In [3]:
import os
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset

# Load the dataset
df = pd.read_csv('data/genevaBible.csv')

# Initialize the tokenizer for MacBERTh
modelName = "emanjavacas/MacBERTh"
tokenizer = AutoTokenizer.from_pretrained(modelName)
model = AutoModelForMaskedLM.from_pretrained(modelName)

# Tokenize the texts
texts = df['Text'].tolist()
encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)

# Create a Dataset object
dataset = Dataset.from_dict(encodings)

# Define the data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator
)

# Start training
trainer.train()

# Save the trained model and tokenizer
trainer.save_model('./fine-tuned-MacBERTh')
tokenizer.save_pretrained('./fine-tuned-MacBERTh')


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/11664 [00:00<?, ?it/s]

{'loss': 1.4583, 'grad_norm': 24.043272018432617, 'learning_rate': 4.7856652949245545e-05, 'epoch': 0.13}
{'loss': 1.4668, 'grad_norm': 18.28766441345215, 'learning_rate': 4.571330589849109e-05, 'epoch': 0.26}
{'loss': 1.4698, 'grad_norm': 17.81636619567871, 'learning_rate': 4.3569958847736625e-05, 'epoch': 0.39}
{'loss': 1.4387, 'grad_norm': 17.122652053833008, 'learning_rate': 4.142661179698217e-05, 'epoch': 0.51}
{'loss': 1.4443, 'grad_norm': 12.819314002990723, 'learning_rate': 3.928326474622771e-05, 'epoch': 0.64}
{'loss': 1.3786, 'grad_norm': 19.872961044311523, 'learning_rate': 3.7139917695473254e-05, 'epoch': 0.77}
{'loss': 1.3927, 'grad_norm': 11.46866226196289, 'learning_rate': 3.49965706447188e-05, 'epoch': 0.9}
{'loss': 1.4125, 'grad_norm': 25.160419464111328, 'learning_rate': 3.285322359396434e-05, 'epoch': 1.03}
{'loss': 1.2937, 'grad_norm': 18.157798767089844, 'learning_rate': 3.0709876543209876e-05, 'epoch': 1.16}
{'loss': 1.2929, 'grad_norm': 18.392080307006836, 'learn

('./fine-tuned-MacBERTh/tokenizer_config.json',
 './fine-tuned-MacBERTh/special_tokens_map.json',
 './fine-tuned-MacBERTh/vocab.txt',
 './fine-tuned-MacBERTh/added_tokens.json',
 './fine-tuned-MacBERTh/tokenizer.json')