# Bible Text Data
* Download, store, read, manipulate

In [1]:
import os
import json
import sefaria_code as sef


In [2]:
# Get multiple versions of the 5 books of the Torah:
for book_code in sef.book_code2web.keys():
    for version_code in sef.version_code2web.keys():
        local = sef.sefaria_local(book_code, version_code)
        url = sef.sefaria_url(book_code, version_code)
        if os.path.exists(local):
            print(f"-- Skipping existing {local}")
            continue
        sef.download_json_file(url, local, skip_fail=True)

-- Skipping existing c:\Users\Yonatan\Documents\coding_projects\new_computer\data\sefaria\genesis.he.text_only.json
-- Skipping existing c:\Users\Yonatan\Documents\coding_projects\new_computer\data\sefaria\genesis.he.mesora.json
-- Skipping existing c:\Users\Yonatan\Documents\coding_projects\new_computer\data\sefaria\genesis.he.taamei.json
-- Skipping existing c:\Users\Yonatan\Documents\coding_projects\new_computer\data\sefaria\genesis.he.nikkud.json
-- Skipping existing c:\Users\Yonatan\Documents\coding_projects\new_computer\data\sefaria\genesis.en.jewish.json
-- Skipping existing c:\Users\Yonatan\Documents\coding_projects\new_computer\data\sefaria\genesis.en.modern.adam_cohn.json
-- Skipping existing c:\Users\Yonatan\Documents\coding_projects\new_computer\data\sefaria\genesis.en.new.jps1917.json
-- Skipping existing c:\Users\Yonatan\Documents\coding_projects\new_computer\data\sefaria\genesis.en.contemp.jps2006.json
-- Skipping existing c:\Users\Yonatan\Documents\coding_projects\new_c

In [3]:
# Read verses:
verses = sef.sefaria_read_content()
#verses = sefaria_read_content(only_book='leviticus', only_version='he.mesora')

++ 1533 from genesis (he.text_only)
++ 1533 from genesis (he.mesora)
++ 1533 from genesis (he.taamei)
++ 1533 from genesis (he.nikkud)
++ 146 from genesis (en.jewish)
++ 1533 from genesis (en.modern.adam_cohn)
++ 1533 from genesis (en.new.jps1917)
++ 1533 from genesis (en.contemp.jps2006)
++ 1533 from genesis (en.korean)
++ 1210 from exodus (he.text_only)
++ 1210 from exodus (he.mesora)
++ 1210 from exodus (he.taamei)
++ 1210 from exodus (he.nikkud)
-- Missing c:\Users\Yonatan\Documents\coding_projects\new_computer\data\sefaria\exodus.en.jewish.json
-- Missing c:\Users\Yonatan\Documents\coding_projects\new_computer\data\sefaria\exodus.en.modern.adam_cohn.json
++ 1210 from exodus (en.new.jps1917)
++ 1210 from exodus (en.contemp.jps2006)
++ 1210 from exodus (en.korean)
++ 859 from leviticus (he.text_only)
++ 859 from leviticus (he.mesora)
++ 859 from leviticus (he.taamei)
++ 859 from leviticus (he.nikkud)
-- Missing c:\Users\Yonatan\Documents\coding_projects\new_computer\data\sefaria\lev

In [4]:
print(verses[1])
print(verses[1000])
print(verses[10000])
print(verses[-10])

והארץ היתה תהו ובהו וחשך על פני תהום ורוח אלהים מרחפת על פני המים
ויבא חמור ושכם בנו אל שער עירם וידברו אל אנשי עירם לאמר
and said to the servant, “Who is that man walking in the field toward us?” And the servant said, “That is my master.” So she took her veil and covered herself.
and the Negev, and the plain; the valley of Yereĥo, city of the palm trees, as far as Żo῾ar.


## Model

In [5]:
# import torch_directml
# device = torch_directml.device()
device = None
device

In [108]:
from transformers import AutoTokenizer
import itertools
from transformers import GPT2Config, GPT2LMHeadModel
import torch
from transformers import Trainer, TrainingArguments
from transformers import pipeline
import sentencepiece as spm

USE_CUSTOM_TOK = True

def make_blocks(token_ids, block_size, device=None):
    for i in range(0, len(token_ids) - block_size, block_size):
        chunk = token_ids[i:i+block_size]
        chunk_tensor = torch.tensor(chunk, dtype=torch.long)
        chunk_dict = {
            'input_ids': chunk_tensor,
            'attention_mask': torch.ones_like(chunk_tensor),
            'labels': chunk_tensor.clone()
        }
        if device:
            chunk_dict = {k: v.to(device) for (k, v) in chunk_dict.items()}
        yield chunk_dict

def tokenize(tokenizer, texts, return_tensors=False):
    if USE_CUSTOM_TOK:
        encoded_texts = [tokenizer.encode(text) for text in texts]
        attention_mask = [[1 for token in line] for line in encoded_texts]
        enc_dict = {
            'input_ids': encoded_texts,
            'attention_mask': attention_mask,
        } 
    else:
        enc_dict = tokenizer(texts, truncation=False, padding=False)
    if return_tensors:
        enc_dict['input_ids'] = torch.tensor(enc_dict['input_ids'])
        enc_dict['attention_mask'] = torch.tensor(enc_dict['attention_mask'])
    return enc_dict
    
def generate(tokenizer, model, prompt, device=None, temperature=0.8):
    inputs = tokenize(tokenizer, [prompt], return_tensors=True)
    if device:
        inputs = inputs.to(device)
    outputs = model.generate(
        **inputs,
        max_length=500,
        max_new_tokens=50,
        do_sample=True,
        top_p=0.95,
        temperature=temperature
    )
    if USE_CUSTOM_TOK:
        outtext = tokenizer.decode(outputs[0].tolist())
    else:
        outtext = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return outtext

In [101]:
if USE_CUSTOM_TOK:
    tokenizer = spm.SentencePieceProcessor()
    tokenizer.load("tokenizers/hebrew_spm.model")
    vocab_size = tokenizer.vocab_size()
else:
    tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
    vocab_size = tokenizer.vocab_size
print(vocab_size)

16000


In [86]:
#encoded_verses = tokenizer(verses, truncation=False, padding=False)
encoded_verses = tokenize(tokenizer, verses)
all_ids = list(itertools.chain.from_iterable(encoded_verses['input_ids']))
len(all_ids)

1364789

In [87]:
trainset = list(make_blocks(all_ids, block_size=128, device=device))
trainset

[{'input_ids': tensor([  380, 11852,  9941,   976,   321,  2389,   475,   731,   683,  1006,
           6020,   279,   479,  1274,   719,  1274,   410,  9174,   453,  1451,
            479,  1274,   268,   410,  1142,   358,   976, 10361,  1850,   453,
           1451,  3286,   534,   976,   392,   279,   297,   789,   273,   957,
            789,   273,  1821,   976,   321,  1163,   294,   273,   447,  3137,
            837,  9348,   976,  2321,   275,  1163,   294,   273,  2017,   275,
            342,  9174,  1849,   976,   427,   294,   273,  2112,  1057,  9174,
           5670,  6826,   957,  6800,   957,  1968,  2112,  1090,   534,   976,
            392,   279,   297,  2916,  1839,  2016,   429,  3286,   957,  2483,
            302,  1335,  2321,   275,  2972,  1919,   316,  1459,   976,   321,
           1822,   335,  1839,   837,  9348,  2321,   275,  3286,   369,  5892,
            327,   273,   335,  1839,  2017,   275,  3286,   369,  2092,   327,
            273,   335,  18

In [88]:
trainset[0]['input_ids']

tensor([  380, 11852,  9941,   976,   321,  2389,   475,   731,   683,  1006,
         6020,   279,   479,  1274,   719,  1274,   410,  9174,   453,  1451,
          479,  1274,   268,   410,  1142,   358,   976, 10361,  1850,   453,
         1451,  3286,   534,   976,   392,   279,   297,   789,   273,   957,
          789,   273,  1821,   976,   321,  1163,   294,   273,   447,  3137,
          837,  9348,   976,  2321,   275,  1163,   294,   273,  2017,   275,
          342,  9174,  1849,   976,   427,   294,   273,  2112,  1057,  9174,
         5670,  6826,   957,  6800,   957,  1968,  2112,  1090,   534,   976,
          392,   279,   297,  2916,  1839,  2016,   429,  3286,   957,  2483,
          302,  1335,  2321,   275,  2972,  1919,   316,  1459,   976,   321,
         1822,   335,  1839,   837,  9348,  2321,   275,  3286,   369,  5892,
          327,   273,   335,  1839,  2017,   275,  3286,   369,  2092,   327,
          273,   335,  1839,   957,  1325,  1849,   976,   327])

In [89]:
print(f"Vocab size: {vocab_size}")

Vocab size: 16000


In [102]:
config = GPT2Config(
    vocab_size=vocab_size,
    n_positions=1024,
    n_ctx=1024,
    n_embd=256,
    n_layer=8,
    n_head=8
)
model = GPT2LMHeadModel(config)
#model.to(device)

print(f"Model has {model.num_parameters()} parameters. Model device: {model.device}")
model

Model has 10676736 parameters. Model device: cpu


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(16000, 256)
    (wpe): Embedding(1024, 256)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-7): 8 x GPT2Block(
        (ln_1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=768, nx=256)
          (c_proj): Conv1D(nf=256, nx=256)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=1024, nx=256)
          (c_proj): Conv1D(nf=256, nx=1024)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=256, out_features=16000, bias=False)
)

In [103]:
# Inference with fresh new model
prompt1 = "ויאמר אברהם"
prompt2 = "בראשית ברא"

print(generate(tokenizer, model, prompt1, device=device))
print(generate(tokenizer, model, prompt2, device=device))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=50) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


ויאמר אברהם -> torch.Size([1, 2])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=50) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


ויאמר אברהםֽח־ל אֶחָ֑תכלתֽח־לֽח־לֽט׃ֽט׃ וַיִּכְתֹּ בבא אֶת־הַחֲמֹרִ אֶת־הַחֲמֹרִ none כׇּל־בְּכוֹר לְבָךְ׃ seventeen גְּדִיֹ֥אכַל בעבראבנים נוֹדַלְבַבְכֶxceptekנָשִׂיא numerous קַֽעֲל־הַשֹּׁפֵ הָאֵֽלֶּה׃ וְגֵרַשְׁתִּמֶֽלֶךְ־הַבָּשָׁאָחִינוּיִרְאֶ enslave שָׁנִיםף־רַגְלְכֶ וְעָל המזבח אֵימָIt וַיְבָבָּנָֽיו׃ כַּפְתֹּרֶha בְעוֹר־בְּשָׂר הַיָּמִ֜ שְׁתֵּי Itamarלך
בראשית ברא -> torch.Size([1, 3])
בראשית ברא Aַל לָקַח צַעֲק לְמֶ ראיתי flow flow מַטּוֹ־בַשְּׁתִ הָֽ אֹ֤ מַמְאֶ־יַעֲקֹב loudְנֵֽי־יִשְׂרָאֵלִֽהְיֹתלִֽהְיֹת warז Noaĥ הַמַּלְאָכִ else� המדיני לְחַיֵּי־נֹ וַנֹּ וַיִּ֥בֶ וְנֶאֶסְפ כֶּֽ cloth אֶל־בִּלְעָם אַשְׁחִ Amram המנרה ותר בֶּ ללוים וחמש לְבָ ride� forever בָּאַגָּנֹיֵשְׁב strange יֻלְּד הִפְרִ עַד pursui


## Train model

In [14]:

# DML = torch_directml.device()

# class DMLTrainer(Trainer):
#     def _move_model_to_device(self, model, device):
#         # Force DirectML device
#         return model.to(DML)

#     def _prepare_inputs(self, inputs):
#         # Move all batch tensors to DirectML
#         for k, v in inputs.items():
#             if hasattr(v, "to"):
#                 inputs[k] = v.to(DML)
#         return inputs

In [30]:
print(f"Before setting trainint_args, model device is {next(model.parameters()).device}")
training_args = TrainingArguments(
    output_dir="./models/tiny-gpt-8layers-torah-books-2",
    per_device_train_batch_size=8,
    num_train_epochs=4,
    learning_rate=5e-4,
    warmup_steps=100,
    logging_steps=50,
    save_strategy="steps",
    save_steps=200
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=trainset
)
trainer.train()


Before setting trainint_args, model device is cpu


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
50,8.987
100,7.3585
150,6.6125
200,6.2635
250,6.0312
300,5.7928
350,5.6671
400,5.5395
450,5.3706
500,5.3444


TrainOutput(global_step=5332, training_loss=4.288268165845936, metrics={'train_runtime': 7612.6915, 'train_samples_per_second': 5.602, 'train_steps_per_second': 0.7, 'total_flos': 206957039321088.0, 'train_loss': 4.288268165845936, 'epoch': 4.0})

In [37]:
# Try inference after model is trained:
prompt1 = "ויאמר אברהם"
prompt2 = "בראשית ברא"

print(generate(tokenizer, model, prompt1, device=device))
print(generate(tokenizer, model, prompt2, device=device))
print(generate(tokenizer, model, "וַיַּצְמַ֞ח יְהֹוָ֤ה אֱלֹהִים֙ מִן־הָ֣אֲדָמָ֔ה כׇּל־עֵ֛ץ", device=device))
print(generate(tokenizer, model, "וַיַּצְמַ֞ח יְהֹוָ֤ה אֱלֹהִים֙ מִן־הָ֣אֲדָמָ֔ה כׇּל־עֵ֛ץ נֶחְמָ֥ד לְמַרְאֶ֖ה וְט֣וֹב לְמַאֲכָ֑ל", device=device))
print(generate(tokenizer, model, "וַתֵּרֶד בַּת־פַּרְעֹה לִרְחֹץ עַל־הַיְאֹר וְנַעֲרֹתֶיהָ הֹלְכֹת עַל־יַד הַיְאֹר וַתֵּרֶא אֶת־הַתֵּבָה בְּתוֹךְ", device=device))
print(generate(tokenizer, model, "And the serpent", device=device))
print(generate(tokenizer, model, "In the beginning", device=device))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=50) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=50) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


ויאמר אברהם ויאמר אברהם אל אבי אברהם ויאמר אליו אמר אל בני ישראל ואמר אל אל יעקב עבדתם הוא בשלחו ויאמר משה אל אברם עשו אליו אשר צוה יעשו ויאמר משה יהוה אלה


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=50) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


בראשית ברא והיתה לא ידו את כל מלאכתת את הנערה ולקחו אל בני ישראל ואת בנימן לא תשמר את החקים אתת ועזים החטאת האחד לאה אשר תעשו


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=50) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


וַיַּצְמַ֞ח יְהֹוָ֤ה אֱלֹהִים֙ מִן־הָ֣אֲדָמָ֔ה כּל־עֵ֛ץ וַיִּתֵּ֥ן ל֛וֹ כָּל־הַקִּנָּשִׁ֖ים לִשְׁמֹ֑שׁ וַיֹּ֥אמֶר ל֖וֹ אֶל־יוֹסֵ֥


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=50) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


וַיַּצְמַ֞ח יְהֹוָ֤ה אֱלֹהִים֙ מִן־הָ֣אֲדָמָ֔ה כּל־עֵ֛ץ נֶחְמָ֥ד לְמַרְאֶ֖ה וְט֣וֹב לְמַאֲכָ֑ל וַיֹּאמְר֤וּ אֱלֹהֵיכֶם֙ שֵׁ֣שׁ אֶל־בִּלְעָם֙ לֵאמֹ֔ר לָ֤נוּ וְשָׁ


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=50) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


וַתֵּרֶד בַּת־פַּרְעֹה לִרְחֹץ עַל־הַיְאֹר וְנַעֲרֹתֶיהָ הֹלְכֹת עַל־יַד הַיְאֹר וַתֵּרֶא אֶת־הַתֵּבָה בְּתוֹךְ וַיְהִי כַפְּרִיהֵ עֵשׂוֹר בַּתֵּשׁ הַיָּשִׁעוּעַת יְהוָה וַתִּתְּנֶכֶת ל


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=50) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


And the serpent And he gave the children of Yisra ̓el, and he gave the son of Yiżĥaq, and Ya ̔aqov, and he thy brothers, that thou comesthood to him, that he should
In the beginning that man were in all the land of the city, for the cattle, and the famine was in the land of Egypt; and the land of all that he were with it. But when the LORD was dead forth


## Loading a model from checkpoint

In [76]:
os.listdir('./models')

['tiny-gpt-8layers-torah-books',
 'tiny-gpt-8layers-torah-books-2',
 'tiny-gpt-trial1',
 'tiny-gpt-trial2']

In [57]:
os.listdir("./models/tiny-gpt-8layers-torah-books/checkpoint-2400")

['config.json',
 'generation_config.json',
 'model.safetensors',
 'optimizer.pt',
 'rng_state.pth',
 'scheduler.pt',
 'trainer_state.json',
 'training_args.bin']

In [77]:

checkpoint_path = "models/tiny-gpt-8layers-torah-books/checkpoint-2400"
print(checkpoint_path)

model_cp = GPT2LMHeadModel.from_pretrained(checkpoint_path, local_files_only=True)
tokenizer_cp = AutoTokenizer.from_pretrained("xlm-roberta-base")
print(f"Model vacab size: {model_cp.config.vocab_size}")
print(f"Tokenizer vocab : {len(tokenizer_cp)}")

models/tiny-gpt-8layers-torah-books/checkpoint-2400
Model vacab size: 250002
Tokenizer vocab : 250002


In [78]:
prompt1 = "ויאמר אברהם"
prompt2 = "בראשית ברא"
device = None

print(generate(tokenizer_cp, model_cp, prompt1, device=device))
print(generate(tokenizer_cp, model_cp, prompt2, device=device))
print(generate(tokenizer_cp, model_cp, "וַיַּצְמַ֞ח יְהֹוָ֤ה אֱלֹהִים֙ מִן־הָ֣אֲדָמָ֔ה כׇּל־עֵ֛ץ", device=device))
print(generate(tokenizer_cp, model_cp, "וַיַּצְמַ֞ח יְהֹוָ֤ה אֱלֹהִים֙ מִן־הָ֣אֲדָמָ֔ה כׇּל־עֵ֛ץ נֶחְמָ֥ד לְמַרְאֶ֖ה וְט֣וֹב לְמַאֲכָ֑ל", device=device))
print(generate(tokenizer_cp, model_cp, "וַתֵּרֶד בַּת־פַּרְעֹה לִרְחֹץ עַל־הַיְאֹר וְנַעֲרֹתֶיהָ הֹלְכֹת עַל־יַד הַיְאֹר וַתֵּרֶא אֶת־הַתֵּבָה בְּתוֹךְ", device=device))
print(generate(tokenizer_cp, model_cp, "And the serpent", device=device))
print(generate(tokenizer_cp, model_cp, "In the beginning", device=device))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=50) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=50) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


ויאמר אברהם ויאמר יהוה וַיֹּר יהוָה יְהוָה מוֹעֵשֶׁה עֶל־הַגְּבוּם לִי וָאֵלֶם אֲשֶׁר צ


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=50) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


בראשית ברא ויאמר יהוה אלהיך ונער על לא תמחנה יהוה אלהיך ואל האיש לשמר ואת יהוה אלהיך את היהוה ואתכם המצאם את עשו יהוה


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=50) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


וַיַּצְמַ֞ח יְהֹוָ֤ה אֱלֹהִים֙ מִן־הָ֣אֲדָמָ֔ה כּל־עֵ֛ץ בַּבָּ֖ה הַזַּֽחַֽיִם׃ וְהַמַּ֣ יַיהוָשִׁ֔ים בַּקְמַ֥ר אֵלָ֛


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=50) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


וַיַּצְמַ֞ח יְהֹוָ֤ה אֱלֹהִים֙ מִן־הָ֣אֲדָמָ֔ה כּל־עֵ֛ץ נֶחְמָ֥ד לְמַרְאֶ֖ה וְט֣וֹב לְמַאֲכָ֑ל וְהִתְּחַ֖ה מֹשִֽׁים׃ וַיִּקְרָ֣ה לֹ֑ר כֵּ֤א יְהוָה֙ נֶ֣אמֶר יְעְ


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=50) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


וַתֵּרֶד בַּת־פַּרְעֹה לִרְחֹץ עַל־הַיְאֹר וְנַעֲרֹתֶיהָ הֹלְכֹת עַל־יַד הַיְאֹר וַתֵּרֶא אֶת־הַתֵּבָה בְּתוֹךְ וְאֶת־עָד יִהְיָה וַבְּרָאֵל׃ וַיְהוָה אֱלֹהֶם נִקְבְרְיְהְ


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=50) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


And the serpent And he said: ‘I will go; and the land of the land of the hand of thee, and I will speak: and they are this, that they are no thing. And the land shall do his fathers;
In the beginning And the Lord spoke to ̓el, saying: ‘I will call your father’s, and he were flesh. And I should come up from the land of the sons. And I sent from Egypt, and said


## Load a model that was trained for a custom tokenizer:

In [82]:
os.listdir("models/tiny-gpt-8layers-torah-books-2/")

['checkpoint-1000',
 'checkpoint-1200',
 'checkpoint-1400',
 'checkpoint-1600',
 'checkpoint-1800',
 'checkpoint-200',
 'checkpoint-2000',
 'checkpoint-2200',
 'checkpoint-2400',
 'checkpoint-2600',
 'checkpoint-2800',
 'checkpoint-3000',
 'checkpoint-3200',
 'checkpoint-3400',
 'checkpoint-3600',
 'checkpoint-3800',
 'checkpoint-400',
 'checkpoint-4000',
 'checkpoint-4200',
 'checkpoint-4400',
 'checkpoint-4600',
 'checkpoint-4800',
 'checkpoint-5000',
 'checkpoint-5200',
 'checkpoint-5332',
 'checkpoint-600',
 'checkpoint-800']

In [107]:
checkpoint_path = "models/tiny-gpt-8layers-torah-books-2/checkpoint-5200"
print(checkpoint_path)

model_cp = GPT2LMHeadModel.from_pretrained(checkpoint_path, local_files_only=True)
tokenizer_cp = spm.SentencePieceProcessor()
tokenizer_cp.load("tokenizers/hebrew_spm.model")
print(f"Model vacab size: {model_cp.config.vocab_size}")
print(f"Tokenizer vocab : {len(tokenizer_cp)}")
print("Now, change the flag USE_CUSTOM_TOK to True and rerun loading the tokenizer...")

models/tiny-gpt-8layers-torah-books-2/checkpoint-5200
Model vacab size: 16000
Tokenizer vocab : 16000
Now, change the flag USE_CUSTOM_TOK to True and rerun loading the tokenizer...


In [109]:
prompt1 = "ויאמר אברהם"
prompt2 = "בראשית ברא"
device = None

print(generate(tokenizer_cp, model_cp, prompt1, device=device))
print(generate(tokenizer_cp, model_cp, prompt2, device=device))
print(generate(tokenizer_cp, model_cp, "וַיַּצְמַ֞ח יְהֹוָ֤ה אֱלֹהִים֙ מִן־הָ֣אֲדָמָ֔ה כׇּל־עֵ֛ץ", device=device))
print(generate(tokenizer_cp, model_cp, "וַיַּצְמַ֞ח יְהֹוָ֤ה אֱלֹהִים֙ מִן־הָ֣אֲדָמָ֔ה כׇּל־עֵ֛ץ נֶחְמָ֥ד לְמַרְאֶ֖ה וְט֣וֹב לְמַאֲכָ֑ל", device=device))
print(generate(tokenizer_cp, model_cp, "וַתֵּרֶד בַּת־פַּרְעֹה לִרְחֹץ עַל־הַיְאֹר וְנַעֲרֹתֶיהָ הֹלְכֹת עַל־יַד הַיְאֹר וַתֵּרֶא אֶת־הַתֵּבָה בְּתוֹךְ", device=device))
print(generate(tokenizer_cp, model_cp, "And the serpent", device=device))
print(generate(tokenizer_cp, model_cp, "In the beginning", device=device))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=50) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=50) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


ויאמר אברהם את משה אל משה ויאמר יעקב ויאמר אל אהרן ויאמר אליו אל משה אלהי אברהם איש איש אל משה ויאמר את הארץ ויאמר אל נא נא תרגאד ויאמר שם אברהם ויאמר יצחק אל אברם אל המקום אשר לא ישמגן כי הל וי


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=50) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


בראשית ברא את דברי יהוה אל משה ואל בניו אל משה אל משה ועל פני כל והיו לפני יהוה אל יהוה אל משה אל בני ישראל ואל אהל מועד ועל המשכן ויהושע בן נון ומעלה על פי יהוה אל אהרן ואת והיה קדש ומעלה ארץ כנען


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=50) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


וַיַּצְמַ֞ח יְהֹוָ֤ה אֱלֹהִים֙ מִן־הָ֣אֲדָמָ֔ה כׇּל־עֵ֛ץ יְהוָ֖ה יְהוָ֖ה אֱלֹהֵיכֶֽם׃ וַיֹּ֤אמֶר יְהוָה֙ אֶל־בְּנֵ֣י יִשְׂרָאֵ֔ל וְאָמַרְתָּ֣ אֲלֵהֶ֑ם אֲשֶׁ֥ר תִּרְאֶֽה׃ וַיֹּ֨אמֶר יְהוָ֜ה אֶל־מֹשֶׁ֗ה אֶל־מֹשֶׁ֥ה לֵּאמֹֽר׃ דַּבֵּ֛ר צִוָּ֥ה יְהוָ֖ה אֶל־מֹשֶׁ֑ה וַיְדַבֵּ֥ר יְהוָ֖ה אֶל־מֹשֶׁ֥ה לֵּאמֹֽר׃ יְהֹוָה֙ אֶל־מֹשֶׁ


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=50) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


וַיַּצְמַ֞ח יְהֹוָ֤ה אֱלֹהִים֙ מִן־הָ֣אֲדָמָ֔ה כׇּל־עֵ֛ץ נֶחְמָ֥ד לְמַרְאֶ֖ה וְט֣וֹב לְמַאֲכָ֑ל וְנִפְפּוֹדְשָׁ֖יו יִקְפָּד֑וֹן יְהֹוָה֙ אֶת־עִ֖יר אֲשֶׁ֣ר יִמְלַמְנַ֙ לֹ֣א תִדְּמֹּ֔אן וְלֹ֤א תֻֽחַ


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=50) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


וַתֵּרֶד בַּת־פַּרְעֹה לִרְחֹץ עַל־הַיְאֹר וְנַעֲרֹתֶיהָ הֹלְכֹת עַל־יַד הַיְאֹר וַתֵּרֶא אֶת־הַתֵּבָה בְּתוֹךְ עִירוֹ שְׁלָלֵן׃ וַתֵּרֶא׃ וַיֹּאמֶר אֲבִימֶלֶךְ הַנַּעַר כִּי־בֵא וַיֹּאמֶר יִצְחָק וַיָּשֶׁב יִשְׂרָאֵל וְהִנֵּה יֶנִי בְּאֶרֶץ הַשָּׁמַיִם וַיַּרְא אַבְרָהָם וַיֹּאמֶר לוֹ יַעֲקֹב וַתֹּאמֶר אַבְרָהָם וַיֹּאמֶר אַבְרָם לֵאמֹר כֹּה וַיֵּלֶךְ׃ וַיֹּאמֶר יְהוָה וַיֹּאמֶר בָּלָק 


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=50) and `max_length`(=500) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


And the serpents, and the plague of the plague was tip of his hand, that was the priest of the house of the hand of his hand, wherewith he was in to been in the tent. And he said: ‘Let the plague be
In the beginning of the first year, shall be a measure of an offering for a meal offering; of a meal offering made by fire to the Lord. And every burnt offering of a sacrifice of the Lord: and its meal offering of the Lord commanded it shal
