In [10]:
# !wget -O temp_file.txt https://sacred-texts.com/bib/osrc/kjvdat.zip && mv temp_file.txt kjvdat.txt
# !unzip kjvdat.zip

In [11]:
import torch
import concurrent.futures
from book_titles import bible_book_titles
from transformers import AutoTokenizer, AutoModel

In [12]:
with open("kjvdat.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [13]:
print(bible_book_titles)

{'gen': 'Genesis', 'exo': 'Exodus', 'lev': 'Leviticus', 'num': 'Numbers', 'deu': 'Deuteronomy', 'jos': 'Joshua', 'jdg': 'Judges', 'rut': 'Ruth', 'sa1': '1 Samuel', 'sa2': '2 Samuel', 'kg1': '1 Kings', 'kg2': '2 Kings', 'ch1': '1 Chronicles', 'ch2': '2 Chronicles', 'ezr': 'Ezra', 'neh': 'Nehemiah', 'est': 'Esther', 'job': 'Job', 'psa': 'Psalms', 'pro': 'Proverbs', 'ecc': 'Ecclesiastes', 'sol': 'Song of Solomon', 'isa': 'Isaiah', 'jer': 'Jeremiah', 'lam': 'Lamentations', 'eze': 'Ezekiel', 'dan': 'Daniel', 'hos': 'Hosea', 'joe': 'Joel', 'amo': 'Amos', 'oba': 'Obadiah', 'jon': 'Jonah', 'mic': 'Micah', 'nah': 'Nahum', 'hab': 'Habakkuk', 'zep': 'Zephaniah', 'hag': 'Haggai', 'zac': 'Zechariah', 'mal': 'Malachi', 'es1': '1 Esdras', 'es2': '2 Esdras', 'tob': 'Tobias', 'jdt': 'Judith', 'aes': 'Additions to Esther', 'wis': 'Wisdom', 'bar': 'Baruch', 'epj': 'Epistle of Jeremiah', 'sus': 'Susanna', 'bel': 'Bel and the Dragon', 'man': 'Prayer of Manasseh', 'ma1': '1 Macabees', 'ma2': '2 Macabees', '

In [14]:
split_text = text.split("\n")

In [15]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [16]:
def embed_text(text: list[str], tokenizer, model):
    tokens = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        output = model(**tokens).last_hidden_state
    avg_pooled = output.mean(dim=1)
    return avg_pooled.tolist()[0]

In [17]:
def process_text(text: str):
    try:
        abrv_title = text.split("|")[0]
        title = bible_book_titles[abrv_title.lower()]
        chapter = text.split("|")[1]
        verse_num = text.split("|")[2]
        verse = text.split("|")[3]
        embeddings = embed_text([verse], tokenizer, model)
        print(title, chapter, verse_num)
        return {
            "title": title,
            "chapter": chapter,
            "verse_num": verse_num,
            "verse": verse,
            "embeddings": embeddings,
        }
    except Exception as e:
        print(e)
        return None

In [18]:
bible_data = []
with concurrent.futures.ThreadPoolExecutor(max_workers=40) as executor:
    results = list(executor.map(process_text, split_text))

Genesis 1 18
Genesis 2 1
Genesis 1 9
Genesis 1 31
Genesis 1 3
Genesis 2 6
Genesis 2 5
Genesis 1 17
Genesis 1 4
Genesis 2 4
Genesis 1 6
Genesis 1 27
Genesis 1 10
Genesis 2 3
Genesis 1 2
Genesis 1 8
Genesis 1 16
Genesis 1 5
Genesis 2 2
Genesis 2 8
Genesis 1 15
Genesis 1 7
Genesis 1 23
Genesis 1 22
Genesis 2 7
Genesis 1 29
Genesis 1 24
Genesis 1 14
Genesis 1 19
Genesis 1 13
Genesis 1 1
Genesis 2 9
Genesis 1 12
Genesis 1 20
Genesis 1 25
Genesis 1 28
Genesis 1 21
Genesis 1 11
Genesis 2 10
Genesis 1 30
Genesis 2 25
Genesis 1 26
Genesis 3 2
Genesis 2 12
Genesis 2 16
Genesis 2 22
Genesis 2 11
Genesis 2 13
Genesis 2 17
Genesis 3 10
Genesis 2 24
Genesis 2 18
Genesis 2 23
Genesis 2 21
Genesis 2 15
Genesis 3 20
Genesis 3 7
Genesis 2 14
Genesis 2 20
Genesis 3 9
Genesis 3 5
Genesis 3 21
Genesis 3 4
Genesis 3 19
Genesis 3 3
Genesis 3 18
Genesis 3 12
Genesis 3 24
Genesis 3 1
Genesis 3 11
Genesis 3 13
Genesis 3 8
GenesisGenesis 2 19
 3 15
Genesis 3 23
Genesis 3 14
Genesis 4 4
Genesis 3 16
Genesis 3 17


KeyboardInterrupt: 

In [None]:
bible_data = [result for result in results if result is not None]

[{'title': 'Genesis',
  'chapter': '1',
  'verse_num': '1',
  'verse': ' In the beginning God created the heaven and the earth.~',
  'embeddings': [0.04957013577222824,
   0.4607415497303009,
   -0.18745778501033783,
   -0.20404331386089325,
   0.12461750954389572,
   0.10255878418684006,
   0.2679595351219177,
   0.9159358143806458,
   0.1619112342596054,
   -1.1376513242721558,
   0.3548547625541687,
   -0.04654086381196976,
   0.09585069119930267,
   0.45811960101127625,
   -0.3105769157409668,
   0.19701802730560303,
   0.36587977409362793,
   0.5445023775100708,
   0.15987132489681244,
   0.2817395031452179,
   0.05586225539445877,
   0.0768887847661972,
   -0.5967788100242615,
   0.7039211988449097,
   0.5315967798233032,
   0.06214641407132149,
   -0.21405114233493805,
   0.1462945193052292,
   0.19688954949378967,
   -0.31930360198020935,
   0.1858138144016266,
   0.40027883648872375,
   0.02462582290172577,
   -0.4171249270439148,
   -0.12945511937141418,
   -0.021051900461316