# Train a tokenizer


In [5]:
import os
import json
import sefaria_code as sef
import numpy as np

## Prepare training data for tokenization

In [21]:
def prepare_tokenizer_trainset(outfile, ratio=0.8, seed=None, verses=None):
    """Sample a fraction of the corpus verses and write them to a text file.

    The sampled verses are written in sampled (shuffled) order, one after the
    other, separated by newlines, as UTF-8.

    Args:
        outfile: Path of the text file to write. Missing parent directories
            are created.
        ratio: Fraction of the verses to keep, in the range (0, 1].
            Defaults to 0.8.
        seed: Optional integer seed. When given, the sample is reproducible
            across runs. When None, NumPy's global random state is used.
        verses: Optional sequence of strings to sample from. When None, the
            verses are loaded with ``sef.sefaria_read_content()``.

    Raises:
        ValueError: If ``ratio`` is not in the range (0, 1].
    """
    if not 0 < ratio <= 1:
        raise ValueError(f"ratio must be in (0, 1], got {ratio!r}")

    if verses is None:
        verses = sef.sefaria_read_content()

    n = len(verses)
    n_sample = int(n * ratio)

    # With no seed, keep drawing from NumPy's global state so callers that
    # seeded it globally still get the stream they expect; with a seed, use a
    # private generator so the sample does not depend on earlier cells.
    rng = np.random if seed is None else np.random.RandomState(seed)
    # Skip the draw entirely for an empty sample (tiny or empty corpus).
    inds = rng.choice(n, n_sample, replace=False) if n_sample else []
    big_text = '\n'.join(verses[ind] for ind in inds)

    # Make sure the target directory exists before writing.
    out_dir = os.path.dirname(outfile)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(outfile, 'w', encoding='utf-8') as f:
        f.write(big_text)

    print(f"Wrote {outfile}")
    return


In [26]:
# Absolute path of the text file that holds the tokenizer training sample.
token_trainset_file = os.path.abspath(
    './tokenizers/sefaria_token_trainset.1.txt'
)
print(token_trainset_file)

c:\Users\Yonatan\Documents\coding_projects\new_computer\tokenizers\sefaria_token_trainset.1.txt


In [28]:
# Build the sampled training text and write it to the path chosen above.
prepare_tokenizer_trainset(outfile=token_trainset_file)

++ 1533 from genesis (he.text_only)
++ 1533 from genesis (he.mesora)
++ 1533 from genesis (he.taamei)
++ 1533 from genesis (he.nikkud)
++ 146 from genesis (en.jewish)
++ 1533 from genesis (en.modern.adam_cohn)
++ 1533 from genesis (en.new.jps1917)
++ 1533 from genesis (en.contemp.jps2006)
++ 1533 from genesis (en.korean)
++ 1210 from exodus (he.text_only)
++ 1210 from exodus (he.mesora)
++ 1210 from exodus (he.taamei)
++ 1210 from exodus (he.nikkud)
-- Missing c:\Users\Yonatan\Documents\coding_projects\new_computer\data\sefaria\exodus.en.jewish.json
-- Missing c:\Users\Yonatan\Documents\coding_projects\new_computer\data\sefaria\exodus.en.modern.adam_cohn.json
++ 1210 from exodus (en.new.jps1917)
++ 1210 from exodus (en.contemp.jps2006)
++ 1210 from exodus (en.korean)
++ 859 from leviticus (he.text_only)
++ 859 from leviticus (he.mesora)
++ 859 from leviticus (he.taamei)
++ 859 from leviticus (he.nikkud)
-- Missing c:\Users\Yonatan\Documents\coding_projects\new_computer\data\sefaria\lev

## Read the tokenizer training set

In [29]:
# Load the whole training file back into memory as a single string.
with open(token_trainset_file, mode='r', encoding='utf-8') as fid:
    train_text = fid.read()

# Sanity check: show the first and the last 1000 characters of the file.
print(train_text[:1000], train_text[-1000:], sep='\n')

When thy son asketh thee in time to come, saying: ‘What mean the testimonies, and the statutes, and the ordinances, which the LORD our God hath commanded you?
וּמֵאָ֞ז בָּ֤אתִי אֶל־פַּרְעֹה֙ לְדַבֵּ֣ר בִּשְׁמֶ֔ךָ הֵרַ֖ע לָעָ֣ם הַזֶּ֑ה וְהַצֵּ֥ל לֹא־הִצַּ֖לְתָּ אֶת־עַמֶּֽךָ׃
and ye said: ‘Behold, the LORD our God hath shown us His glory and His greatness, and we have heard His voice out of the midst of the fire; we have seen this day that God doth speak with man, and he liveth.
וַתַּעֲנוּ אֹתִי וַתֹּאמְרוּ טוֹב־הַדָּבָר אֲשֶׁר־דִּבַּרְתָּ לַעֲשׂוֹת׃
And it came to pass in the first month in the second year, on the first day of the month, that the tabernacle was reared up.
Blessed shalt thou be in the city, and blessed shalt thou be in the field.
those that were numbered of them, of the tribe of Menashshe, were thirty two thousand, two hundred.
And they said one to another: ‘Let us make a captain, and let us return into Egypt.’
In the same day thou shalt give him his hire, neither shall 

## Train a tokenizer:

In [25]:
import sentencepiece as spm

In [54]:


# Train a SentencePiece model on the sampled text file; the outputs are
# written under the "tokenizers/hebrew_spm" prefix.
spm.SentencePieceTrainer.Train(
    # Corpus and output location.
    input=token_trainset_file,
    model_prefix="tokenizers/hebrew_spm",
    # Model family and vocabulary size.
    model_type="unigram",
    vocab_size=16000,
    # Character handling: full character coverage, byte fallback enabled,
    # and the "nmt_nfkc" normalization rule.
    character_coverage=1.0,
    byte_fallback=True,
    normalization_rule_name="nmt_nfkc",
    # Pre-tokenization boundaries.
    split_by_whitespace=True,
    split_by_unicode_script=True,
    split_by_number=True,
)

## Load the trained tokenizer and use it

In [55]:
# Create an empty SentencePiece processor; the model is attached in the next step.
sp = spm.SentencePieceProcessor()
# Load the model file; presumably this is the file produced by the training
# cell (its model_prefix is "tokenizers/hebrew_spm") - verify if the prefix changes.
# The value returned by load() is what the cell displays.
sp.load("tokenizers/hebrew_spm.model")

True

In [56]:
# Encode one English verse, one fully pointed Hebrew verse and the same Hebrew
# verse without diacritics, printing the token ids of each.
for sample_text in (
    "And thou shalt make them linen breeches to cover their nakedness; from the loins even to the thighs they shall reach:",
    "כִּ֛י לֹא־יֶחְדַּ֥ל אֶבְי֖וֹן מִקֶּ֣רֶב הָאָ֑רֶץ עַל־כֵּ֞ן אָנֹכִ֤י מְצַוְּךָ֙ לֵאמֹ֔ר פָּ֠תֹחַ תִּפְתַּ֨ח אֶת־יָדְךָ֜ לְאָחִ֧יךָ לַעֲנִיֶּ֛ךָ וּלְאֶבְיֹנְךָ֖ בְּאַרְצֶֽךָ׃ (ס)",
    "כי לא-יחדל אביון מקרב הארץ על-כן אנכי מצוך לאמר פתח תפתח את-ידך לאחיך לעניך ולאבינך בארצך: (ס)",
):
    print(sp.encode(sample_text))

[271, 319, 269, 314, 449, 304, 1318, 263, 444, 8833, 266, 1747, 331, 1577, 265, 326, 259, 5030, 691, 266, 259, 2247, 276, 310, 269, 270, 3728, 281]
[640, 987, 2397, 4211, 1962, 4116, 263, 5639, 386, 275, 2249, 448, 892, 263, 3436, 459, 2864, 660, 275, 1138, 582, 2315, 267, 1538, 620, 263, 1317, 802, 277, 4112, 1289, 3894, 2342, 7488, 439, 6180, 633, 506, 12576, 384, 339, 2727, 575, 1176, 5639, 352, 442, 931, 5340, 1310, 482, 338, 483]
[447, 427, 293, 297, 358, 302, 291, 2360, 275, 4697, 731, 453, 293, 499, 275, 1540, 3623, 429, 822, 263, 1990, 479, 1990, 321, 293, 1775, 429, 327, 10462, 327, 336, 783, 429, 679, 305, 297, 2878, 380, 5901, 281, 482, 338, 483]


In [40]:
# Round-trip a short pointed Hebrew phrase through the loaded model.
# The ids are taken from the current model instead of being hard-coded:
# literal ids are only valid for the exact model they were copied from and
# silently decode to different text once the tokenizer is retrained.
sp.decode(sp.encode("כִּ֛י לֹא־יֶחְדַּ֥ל"))

'כִּ֛י לֹא־יֶחְדַּ֥ל'

In [44]:
# Round-trip a short unpointed Hebrew fragment through the loaded model.
# The ids are taken from the current model instead of being hard-coded:
# literal ids are only valid for the exact model they were copied from and
# silently decode to different text once the tokenizer is retrained.
sp.decode(sp.encode("כי לא-י"))

'כי לא-י'

In [57]:
from transformers import AutoTokenizer

# Baseline for comparison: the pretrained "xlm-roberta-base" tokenizer, loaded
# through AutoTokenizer. NOTE(review): from_pretrained presumably needs network
# access or a local cache on first use - confirm for offline runs.
robtok = AutoTokenizer.from_pretrained("xlm-roberta-base")

In [58]:
# Encode the same three sample verses with the XLM-RoBERTa tokenizer and print
# the token ids of each.
for sample_text in (
    "And thou shalt make them linen breeches to cover their nakedness; from the loins even to the thighs they shall reach:",
    "כִּ֛י לֹא־יֶחְדַּ֥ל אֶבְי֖וֹן מִקֶּ֣רֶב הָאָ֑רֶץ עַל־כֵּ֞ן אָנֹכִ֤י מְצַוְּךָ֙ לֵאמֹ֔ר פָּ֠תֹחַ תִּפְתַּ֨ח אֶת־יָדְךָ֜ לְאָחִ֧יךָ לַעֲנִיֶּ֛ךָ וּלְאֶבְיֹנְךָ֖ בְּאַרְצֶֽךָ׃ (ס)",
    "כי לא-יחדל אביון מקרב הארץ על-כן אנכי מצוך לאמר פתח תפתח את-ידך לאחיך לעניך ולאבינך בארצך: (ס)",
):
    print(robtok.encode(sample_text))

[0, 3493, 14461, 34, 91, 23766, 3249, 2856, 96, 2217, 12562, 13, 17007, 47, 29256, 2363, 24, 12225, 7432, 74, 1295, 70, 54508, 7, 3853, 47, 70, 5675, 6958, 7, 1836, 35299, 58359, 12, 2]
[0, 3496, 228529, 246385, 710, 657, 192159, 15274, 710, 34192, 3149, 17698, 2648, 31467, 24843, 244131, 1644, 1470, 34192, 1982, 17698, 710, 244154, 45259, 1211, 198086, 4484, 31467, 34192, 243997, 2442, 34192, 1982, 364, 24001, 29771, 245020, 2442, 34192, 7611, 5332, 165607, 15274, 4944, 218675, 248805, 1211, 132025, 64098, 4944, 49597, 246670, 710, 874, 17698, 8300, 24843, 905, 134994, 4667, 24001, 245464, 657, 38678, 98229, 64098, 244782, 2442, 41764, 24001, 3, 609, 64098, 3149, 24843, 3947, 228529, 7201, 17698, 124470, 24843, 247423, 3149, 220460, 15274, 710, 24001, 2648, 17698, 4667, 24001, 248790, 177965, 29771, 3149, 49597, 247715, 18167, 24001, 657, 201542, 106044, 206872, 710, 31467, 34192, 246385, 4667, 24001, 6, 33809, 1644, 17698, 2808, 34192, 1982, 17698, 710, 64098, 5666, 17698, 4667, 2400

In [59]:
def compare_tokenizer_encodings(tok1, tok2, text):
    """Print how two tokenizers encode the same text.

    Prints a 40-dash separator, the text itself, and then one line per
    tokenizer in the form ``<number of ids>) <ids>``. The count covers
    everything ``encode`` returns for that tokenizer.

    Args:
        tok1: First tokenizer; must provide ``encode(text)``.
        tok2: Second tokenizer; must provide ``encode(text)``.
        text: The string to encode with both tokenizers.
    """
    # Encode with both tokenizers before printing any result line.
    encodings = [tokenizer.encode(text) for tokenizer in (tok1, tok2)]

    separator = 40 * "-"
    print(separator)
    print(text)
    for ids in encodings:
        print(f"{len(ids)}) {ids}")


In [60]:
# Compare XLM-RoBERTa against the trained SentencePiece model on English,
# unpointed Hebrew and pointed Hebrew samples.
for sample_text in (
    "And thou shalt",
    "In the beginning god created the heaven and the earth",
    "בראשית ברא אלוהים את השמים ואת הארץ",
    "כי לא-יחדל אביון מקרב הארץ על-כן אנכי מצוך לאמר פתח תפתח את-ידך לאחיך לעניך ולאבינך בארצך: (ס)",
    "כי לא-יחדל",
    "כִּ֛י לֹא־יֶחְדַּ֥ל",
):
    compare_tokenizer_encodings(robtok, sp, sample_text)

----------------------------------------
And thou shalt
7) [0, 3493, 14461, 34, 91, 23766, 2]
4) [271, 319, 269, 314]
----------------------------------------
In the beginning god created the heaven and the earth
12) [0, 360, 70, 86595, 2355, 75935, 70, 199320, 136, 70, 109270, 2]
10) [1646, 259, 5028, 4248, 3185, 259, 949, 261, 259, 519]
----------------------------------------
בראשית ברא אלוהים את השמים ואת הארץ
13) [0, 6, 202068, 564, 45629, 6, 173824, 689, 364, 121877, 39956, 55202, 2]
11) [380, 11852, 9941, 391, 294, 279, 316, 321, 2389, 475, 731]
----------------------------------------
כי לא-יחדל אביון מקרב הארץ על-כן אנכי מצוך לאמר פתח תפתח את-ידך לאחיך לעניך ולאבינך בארצך: (ס)
45) [0, 4074, 1157, 9, 65405, 55546, 44438, 2884, 874, 101146, 55202, 895, 9, 12984, 44458, 15949, 138733, 4667, 1157, 18553, 106683, 3947, 33043, 689, 9, 15435, 4667, 1157, 3149, 18167, 657, 76632, 4667, 14514, 1982, 24295, 4667, 9356, 29025, 4667, 12, 15, 2702, 16, 2]
43) [447, 427, 293, 297, 358, 302,