In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle as pkl

In [18]:
# Read the English Genesis text. This file is written further down in this
# notebook with DataFrame.to_csv, which emits UTF-8 by default, so decode it
# explicitly as UTF-8 instead of relying on the platform's default encoding.
# NOTE: on a fresh kernel the file must already exist on disk, because the cell
# that creates it appears after this one.
with open('data/genesis_english.txt', 'r', encoding='utf-8') as f:
    genesis_english = f.read()

# Vocabulary size: number of distinct whitespace-separated tokens in the text
tokens = set(genesis_english.split())
n_tokens = len(tokens)
n_tokens

4282

In [19]:
# Load the source workbook into a DataFrame. Later cells use its 'Ref' and
# 'uLemma' columns. Reading the .xlsb format relies on pandas' optional
# pyxlsb engine being installed.
torah_workbook = 'data/torah.xlsb'
heb_df = pd.read_excel(torah_workbook)

In [20]:
# Keep only the word reference and the lemma; every other column is discarded.
heb_df = heb_df.loc[:, ['Ref', 'uLemma']]
heb_df

Unnamed: 0,Ref,uLemma
0,01_Gn 01:01.01.4,בּ
1,01_Gn 01:01.01.5,רֵאשִׁית
2,01_Gn 01:01.02.5,בּרא
3,01_Gn 01:01.03.5,אֱלֹהִים
4,01_Gn 01:01.04.5,אֵת
...,...,...
111966,05_Dt 34:12.10.4,ל
111967,05_Dt 34:12.10.5,עַיִן
111968,05_Dt 34:12.11.5,כֹּל
111969,05_Dt 34:12.12.5,יִשְׂרָאֵל


In [21]:
# Drop the final row of the sheet.
# NOTE(review): presumably a trailing non-data row -- confirm against the
# workbook. This slice is not idempotent: each re-run of the cell removes one
# more row from heb_df.
heb_df = heb_df.iloc[:-1]
# Keep only rows whose Ref starts with '01' (the '01_Gn ...' references).
# na=False makes a missing Ref count as "no match" instead of producing an NA
# in the mask, which boolean indexing would reject.
heb_df = heb_df[heb_df['Ref'].str.startswith('01', na=False)]
# The last character of Ref splits the rows into two groups: refs ending in
# '5' become heb_words, refs with any other ending become stopwords.
ref_suffix = heb_df['Ref'].str[-1]
stopwords = heb_df[ref_suffix != '5']
heb_words = heb_df[ref_suffix == '5']
# Print the number of unique uLemma values in each group
print(heb_words['uLemma'].unique().shape)
print(stopwords['uLemma'].unique().shape)

(1744,)
(11,)


In [22]:
# Unique lemmas in the stopword group (described by the author as Hebrew
# prepositions)
print(stopwords['uLemma'].unique())
# The 20 most frequent lemmas among heb_words
print(heb_words['uLemma'].value_counts().head(20))
# Remove the lemmas ל, אֵת, אֶל, עַל (plus a literal ',') from heb_words.
# .copy() detaches the filtered result from the frame it was sliced from, so
# the column assignments made on heb_words in later cells act on an independent
# frame and do not raise SettingWithCopyWarning.
heb_words = heb_words[~heb_words['uLemma'].isin([',', 'ל', 'אֵת', 'אֶל', 'עַל'])].copy()


['בּ' 'ה' 'ו' 'ל/ה' 'מִן' 'ל' 'בּ/ה' 'ךּ' 'הֲ' 'שֶׁ' 'כּ/ה']
אֵת         1147
אמר          606
אֶל          475
ל            418
אֲשֶׁר       411
בֵּן         365
כֹּל         342
היה          316
אֶרֶץ        311
עַל          306
כִּי         290
אֱלֹהִים     219
בּוא         217
לֹא          212
אָב          208
יַעֲקֹב      180
אָח          178
ילד          170
יהוה         165
שָׁנָה       161
Name: uLemma, dtype: int64


In [23]:
# Give every distinct lemma a fixed-width code of the form 'E<index>', where the
# index is the lemma's position in order of first appearance.
unique_words = heb_words['uLemma'].unique()

# Pad width = number of digits in the count of distinct lemmas, so every code
# has the same length.
pad_length = len(str(len(unique_words)))

# Build the lemma -> code lookup first, then invert it to obtain the
# code -> lemma dictionary that is saved later.
lemma_to_code = {word: "E" + str(i).zfill(pad_length) for i, word in enumerate(unique_words)}
heb_enc_padded = {code: word for word, code in lemma_to_code.items()}

# Attach each row's code as a new 'enc' column
heb_words['enc'] = heb_words['uLemma'].map(lemma_to_code)
heb_words


Unnamed: 0,Ref,uLemma,enc
1,01_Gn 01:01.01.5,רֵאשִׁית,E0000
2,01_Gn 01:01.02.5,בּרא,E0001
3,01_Gn 01:01.03.5,אֱלֹהִים,E0002
6,01_Gn 01:01.05.5,שָׁמַיִם,E0003
10,01_Gn 01:01.07.5,אֶרֶץ,E0004
...,...,...,...
28533,01_Gn 50:26.06.5,שָׁנָה,E0054
28535,01_Gn 50:26.07.5,חנט,E1727
28538,01_Gn 50:26.09.5,שׂים,E0128
28540,01_Gn 50:26.10.5,אֲרוֺן,E1739


In [24]:
# Count the distinct lemmas over all rows of heb_df (both the stopword and the
# content-word groups). This rebinds `tokens`, which an earlier cell used for
# the English token set.
tokens = {lemma for lemma in heb_df['uLemma']}
len(tokens)

1751

In [12]:
# Reduce each word-level Ref (e.g. '01_Gn 01:01.01.5') to its verse
# ('01_Gn 01:01') by stripping the two trailing dot-separated numeric fields.
# The anchored pattern leaves an already-stripped Ref untouched, so re-running
# this cell is safe; a fixed-width slice (.str[:-5]) would cut five more
# characters on every run. Assumes both trailing fields are numeric, as in
# every Ref shown in this notebook.
verse_ref = heb_words['Ref'].str.replace(r'\.\d+\.\d+$', '', regex=True)
# assign() returns a new frame rather than writing into a possible slice
heb_words = heb_words.assign(Ref=verse_ref)
# One row per verse: that verse's lemmas joined with single spaces
heb_lines = heb_words.groupby('Ref')['uLemma'].apply(' '.join).reset_index()
# Replace the '01_Gn' book code with 'Genesis' (literal match, not a regex)
heb_lines['Ref'] = heb_lines['Ref'].str.replace('01_Gn', 'Genesis', regex=False)
heb_lines

Unnamed: 0,Ref,uLemma
0,Genesis 01:01,רֵאשִׁית בּרא אֱלֹהִים שָׁמַיִם אֶרֶץ
1,Genesis 01:02,אֶרֶץ היה תֹּהוּ בֹּהוּ חֹשֶׁךְ פָּנֶה תְּהוֺם...
2,Genesis 01:03,אמר אֱלֹהִים היה אוֺר היה אוֺר
3,Genesis 01:04,ראה אֱלֹהִים אוֺר כִּי טוֺב בּדל אֱלֹהִים בַּי...
4,Genesis 01:05,קרא אֱלֹהִים אוֺר יוֺם חֹשֶׁךְ קרא לַיְלָה היה...
...,...,...
1528,Genesis 50:22,ישׁב יוֺסֵף מִצְרַיִם הוּא בַּיִת אָב חיה יוֺס...
1529,Genesis 50:23,ראה יוֺסֵף אֶפְרַיִם בֵּן שִׁלֵּשִׁים גַּם בֵּ...
1530,Genesis 50:24,אמר יוֺסֵף אָח אָנֹכִי מות אֱלֹהִים פּקד פּקד ...
1531,Genesis 50:25,שׁבע יוֺסֵף בֵּן יִשְׂרָאֵל אמר פּקד פּקד אֱלֹ...


In [8]:
# Encoded counterpart of the per-verse lemma lines: group heb_words by Ref and
# join each group's 'enc' codes with spaces. This relies on heb_words['Ref']
# already holding verse-level references (set by the preceding cell).
heb_lines_enc = (
    heb_words
    .groupby('Ref')['enc']
    .apply(' '.join)
    .reset_index()
    .assign(Ref=lambda frame: frame['Ref'].str.replace('01_Gn', 'Genesis'))
)
heb_lines_enc

Unnamed: 0,Ref,enc
0,Genesis 01:01,E0000 E0001 E0002 E0003 E0004
1,Genesis 01:02,E0004 E0005 E0006 E0007 E0008 E0009 E0010 E001...
2,Genesis 01:03,E0014 E0002 E0005 E0015 E0005 E0015
3,Genesis 01:04,E0016 E0002 E0015 E0017 E0018 E0019 E0002 E002...
4,Genesis 01:05,E0021 E0002 E0015 E0022 E0008 E0021 E0023 E000...
...,...,...
1528,Genesis 50:22,E0287 E1211 E0495 E0141 E0376 E0182 E0243 E121...
1529,Genesis 50:23,E0016 E1211 E1543 E0226 E1737 E0196 E0226 E173...
1530,Genesis 50:24,E0014 E1211 E0256 E0209 E0161 E0002 E0915 E091...
1531,Genesis 50:25,E0931 E1211 E0226 E1302 E0014 E0915 E0915 E000...


In [9]:
# Remove the Ref column so only the verse text is written out.
# errors='ignore' keeps the cell re-runnable: once Ref has been dropped, a
# second run would otherwise raise KeyError.
heb_lines = heb_lines.drop(columns='Ref', errors='ignore')
heb_lines_enc = heb_lines_enc.drop(columns='Ref', errors='ignore')
# Save to text files without the index or a header row
heb_lines.to_csv('data/genesis_hebrew.txt', sep='\t', index=False, header=False)
heb_lines_enc.to_csv('data/genesis_encoded.txt', sep='\t', index=False, header=False)
# Pickle the encoding dictionary (code -> lemma) so the codes can be decoded later
with open('data/genesis_encoding_dict.pkl', 'wb') as f:
    pkl.dump(heb_enc_padded, f)

In [13]:
# Read the tab-separated AKJV file without a header row; columns are therefore
# labelled by position, and column 0 is the one tested against 'Genesis' below.
akjv_verses = pd.read_csv('data/akjv.txt', sep='\t', header=None)
# Keep the rows whose column 0 starts with 'Genesis', then drop column 0 itself
is_genesis = akjv_verses[0].str.startswith('Genesis')
gen_eng_df = akjv_verses[is_genesis].drop(columns=0)
# Save the remaining column(s) to a text file, without index or header
gen_eng_df.to_csv('data/genesis_english.txt', sep='\t', index=False, header=False)