# Topic Modeling on Genesis in Hebrew (Masoretic Text)

In [1]:
import sys
import os
from gensim import corpora
from gensim.models import LdaModel, HdpModel, LsiModel
from deep_translator import GoogleTranslator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from hebrew_tokenizer.tokenizer import Tokenizer
from collections import Counter
from sklearn.decomposition import NMF
from coherence.coherence_scores import compute_coherence_score_umass, compute_coherence_score_uci

/Users/eliasmann/Documents/NLP_NEU/hebrew_topic_modeling


  self.scanner = re.compile(


## Preprocessing and Tokenization 

In [2]:
# Add current directory to sys.path
sys.path.append(os.path.abspath(os.getcwd()))

file_path = 'data/genesis_hebrew.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    hebrew_text = file.read()
    
# Tokenize the Hebrew text
tokenizer = Tokenizer()
tokens = list(tokenizer.tokenize(hebrew_text))

In [55]:
# Create a list of tokenized documents
documents = [[word[1] for word in tokenizer.tokenize(line)] for line in hebrew_text.split('\n')]

# Create a dictionary from the tokenized documents
dictionary = corpora.Dictionary(documents)

dictionary.filter_extremes(no_below=5, no_above=0.5)

# Create a bag-of-words representation of the corpus
corpus = [dictionary.doc2bow(doc) for doc in documents]

## LDA

In [56]:
# Set the number of topics
num_topics = 5
# Train the LDA model
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary)

# Print the top 5 tokens with highest probability for each topic
topics = lda_model.show_topics(num_words=10, formatted=False)
for idx, topic in lda_model.show_topics(num_words=10, formatted=False):
    print(idx, [GoogleTranslator(source='auto', target='en').translate(word[0])
                    for word in topic])
    
for idx, topic in lda_model.show_topics(num_words=10, formatted=False):
    print(idx, [word[0] for word in topic])

0 ['son', 'all', 'which', 'country', 'was', 'Come on', 'said', 'Nathan', 'god', 'Lesson']
1 ['said', 'son', 'was', 'sleep', 'Boy', 'god', 'Because', 'woman', 'Gone', 'No']
2 ['said', 'which', 'brother', 'son', 'please', 'Yosef', 'was', 'all', 'Come on', 'No']
3 ['Ob', 'Because', 'was', 'all', 'country', 'See', 'said', 'read', 'Pharaoh', 'No']
4 ['said', 'which', 'Because', 'country', 'man', 'Jehovah', 'all', 'No', 'Come on', 'action']
0 ['בֵּן', 'כֹּל', 'אֲשֶׁר', 'אֶרֶץ', 'היה', 'בּוא', 'אמר', 'נתן', 'אֱלֹהִים', 'לקח']
1 ['אמר', 'בֵּן', 'היה', 'שָׁנָה', 'ילד', 'אֱלֹהִים', 'כִּי', 'אִשָּׁה', 'הלךְ', 'לֹא']
2 ['אמר', 'אֲשֶׁר', 'אָח', 'בֵּן', 'נָא', 'יוֺסֵף', 'היה', 'כֹּל', 'בּוא', 'לֹא']
3 ['אָב', 'כִּי', 'היה', 'כֹּל', 'אֶרֶץ', 'ראה', 'אמר', 'קרא', 'פַּרְעֹה', 'לֹא']
4 ['אמר', 'אֲשֶׁר', 'כִּי', 'אֶרֶץ', 'אִישׁ', 'יהוה', 'כֹּל', 'לֹא', 'בּוא', 'עשׂה']


In [57]:
hebrew_text_lines = [[word[1] for word in tokenizer.tokenize(line)] for line in hebrew_text.split('\n')]
hebrew_text_flat = [token[1] for token in tokens]


cumulative_coherence_umass = 0
cumulative_coherence_uci = 0

# Calculate coherence for each topic extracted by HDP
for topic_num, topic in topics:
    top_terms = [word[0] for word in topic]
    cumulative_coherence_umass += compute_coherence_score_umass(top_terms, hebrew_text_lines)
    # Assuming compute_coherence_score_uci is similarly defined and imported
    cumulative_coherence_uci += compute_coherence_score_uci(top_terms, hebrew_text_flat, window_size=10)

print("Average Coherence Score UMASS:", cumulative_coherence_umass/len(topics))
print("Average Coherence Score UCI:", cumulative_coherence_uci/len(topics))

Average Coherence Score UMASS: -91.67991199488442
Average Coherence Score UCI: 2.1692099877272732


## HDP

In [58]:
# Train the HDP model
hdp_model = HdpModel(corpus, dictionary)

#Print the topics
topics = hdp_model.show_topics()

for topic_id, topic in topics:
    topic_words = [word[6:] for word in topic.split(' + ')]
    print(f"Topic {topic_id}: {topic_words}\n")

Topic 0: ['גּנב', 'מַקֵּל', 'אַבְרָם', 'בְּכוֺר', 'אֱמֶת', 'צִבְעוֺן', 'מֵאָה', 'נגע', 'זקן', 'בַּת', 'דּוֺר', 'רָעָה', 'חִתִּי', 'עשׂה', 'נָחָשׁ', 'בִּנְיָמִן', 'תּוֺלָדוֺת', 'אִשָּׁה', 'בּרח', 'אַךְ']

Topic 1: ['הרה', 'ילד', 'נטה', 'עַתָּה', 'צחק', 'זָקֵן', 'זָכָר', 'יוֺנָה', 'חזק', 'קום', 'עֶלְיוֺן', 'קַיִן', 'נְקֵבָה', 'זֶה', 'דִּינָה', 'אֹזֶן', 'כֹּל', 'פֶּה', 'טוֺב', 'יְהוּדָה']

Topic 2: ['עלה', 'תֶּרַח', 'ירד', 'נַעֲרָה', 'חוה', 'קלל', 'תּוֺלָדוֺת', 'זֶרַח', 'עוֺף', 'עֲשָׂרָה', 'שְׁמֹנֶה', 'אָז', 'ישׁב', 'שֵׁנִי', 'אָח', 'חפר', 'קַיִן', 'מוֺלֶדֶת', 'מִן', 'שֵׁת']

Topic 3: ['אסף', 'אַתְּ', 'הַר', 'מְאוּמָה', 'אָב', 'שִׂמְלָה', 'שֵׁשׁ', 'יוֺסֵף', 'שְׁמֹנִים', 'אַחֵר', 'צַוָּאר', 'עֵשֶׂב', 'שׁאר', 'עשׂה', 'רחץ', 'פֶּה', 'לוֺט', 'רוּחַ', 'עזב', 'כְּנַעַן']

Topic 4: ['חָם', 'טַבָּח', 'כּוֺכָב', 'רוץ', 'צָעִיר', 'מַלְאָךְ', 'יכח', 'סְדֹם', 'יקץ', 'בּרךְ', 'הרג', 'מִדְבָּר', 'בְּאֵר', 'עִם', 'פֶּלֶג', 'הֵנָּה', 'עֵדֶר', 'כַּד', 'בָּקָר', 'נשׂא']

Topic 5: ['זֶרַע', 'יָם', 'מַרְאֶה'

In [29]:
for topic_id, topic in topics:
    topic_words = [GoogleTranslator(source='auto', target='en').translate(word[6:])
                   for word in topic.split(' + ')]
    print(f"Topic {topic_id}: {topic_words}\n")

Topic 0: ['came out', 'upper', 'Queen', 'night', 'ez', 'neck', 'Boy', 'past', 'First', 'Praise be to God', 'Blood', 'language', 'Sur', 'Location', 'Costs', 'knowledge', 'Goshen', 'Stood', 'very', 'sixty']

Topic 1: ['barrel', 'rum', 'water', 'herd', 'Ten', 'bag', 'hunger', 'Sermon', 'Moved', 'Neighbor', 'carry', 'sixty', 'hay', 'Haim', 'sir', 'an old', 'killing', 'against', 'multi-', 'six']

Topic 2: ['Crete', 'ninety', 'Eye', 'Rabetz', 'an animal', 'Bad', 'Appearance', 'Maybe', 'Tf', 'Weapon', 'Ree', 'Tool', 'ez', 'contract', 'More', 'Th', 'lie down', 'None', 'Manasseh', 'a dress']

Topic 3: ['seed', 'food', 'Moved', 'stick', 'Boy', 'lewdness', 'now', 'Hebron', 'property', 'Tf', 'send', 'Fraction', 'battle', 'Tohor', 'Mr', 'loved', 'past', 'a girl', 'Closed', 'Curse']

Topic 4: ['summer', 'Yosef', 'carry', 'Ask', 'Name', 'cow', 'which', 'Confusion', 'Rebecca', 'Oh, Eliboma', 'mortal', 'brother', 'victory', 'Closed', 'heart', 'to', 'Remesh', 'son', 'cement', 'Yes']

Topic 5: ['thank yo

In [59]:
cumulative_coherence_umass = 0
cumulative_coherence_uci = 0

# Calculate coherence for each topic extracted by HDP
for topic_num, topic in topics:
    top_terms = [word[6:] for word in topic.split(' + ')]
    cumulative_coherence_umass += compute_coherence_score_umass(top_terms, hebrew_text_lines)
    # Assuming compute_coherence_score_uci is similarly defined and imported
    cumulative_coherence_uci += compute_coherence_score_uci(top_terms, hebrew_text_flat, window_size=10)

print("Average Coherence Score UMASS:", cumulative_coherence_umass/len(topics))
print("Average Coherence Score UCI:", cumulative_coherence_uci/len(topics))

Average Coherence Score UMASS: -79.45595529795459
Average Coherence Score UCI: 3.3507918727205435


## LSA

### Bag of Words

In [60]:
# Train the HDP model
num_topics = 5
lsa_model = LsiModel(corpus, num_topics=num_topics, id2word=dictionary)

#Print the topics
topics = lsa_model.show_topics(formatted=False)

for topic_id, topic in topics:
    topic_words = [word[0] for word in topic]
    print(f"Topic {topic_id}: {topic_words}\n")

for topic_id, topic in topics:
    topic_words = [GoogleTranslator(source='auto', target='en').translate(word[0])
                    for word in topic]
    print(f"Topic {topic_id}: {topic_words}\n")

Topic 0: ['אמר', 'אֲשֶׁר', 'כֹּל', 'בֵּן', 'אֶרֶץ', 'כִּי', 'היה', 'אֱלֹהִים', 'אָב', 'לֹא']

Topic 1: ['אמר', 'כֹּל', 'אֶרֶץ', 'אֲשֶׁר', 'היה', 'שָׁנָה', 'לֹא', 'כִּי', 'מִצְרַיִם', 'אָח']

Topic 2: ['בֵּן', 'ילד', 'שָׁנָה', 'כֹּל', 'בַּת', 'אמר', 'מֵאָה', 'אֵלֶּה', 'שֵׁם', 'אִשָּׁה']

Topic 3: ['אֲשֶׁר', 'שָׁנָה', 'היה', 'בֵּן', 'יוֺם', 'אֶרֶץ', 'מֵאָה', 'כִּי', 'לֹא', 'ילד']

Topic 4: ['אֲשֶׁר', 'אֶרֶץ', 'שָׁנָה', 'בֵּן', 'היה', 'יוֺם', 'כִּי', 'עשׂה', 'אָב', 'אֱלֹהִים']

Topic 0: ['said', 'which', 'all', 'son', 'country', 'Because', 'was', 'god', 'Ob', 'No']

Topic 1: ['said', 'all', 'country', 'which', 'was', 'sleep', 'No', 'Because', 'Egypt', 'brother']

Topic 2: ['son', 'Boy', 'sleep', 'all', 'a girl', 'said', 'century', 'goddess', 'Name', 'woman']

Topic 3: ['which', 'sleep', 'was', 'son', 'a day', 'country', 'century', 'Because', 'No', 'Boy']

Topic 4: ['which', 'country', 'sleep', 'son', 'was', 'a day', 'Because', 'action', 'Ob', 'god']



In [61]:
for topic_num, topic in topics:
    top_terms = [word[0] for word in topic]
    cumulative_coherence_umass += compute_coherence_score_umass(top_terms, hebrew_text_lines)
    # Assuming compute_coherence_score_uci is similarly defined and imported
    cumulative_coherence_uci += compute_coherence_score_uci(top_terms, hebrew_text_flat, window_size=10)

print("Average Coherence Score UMASS:", cumulative_coherence_umass/len(topics))
print("Average Coherence Score UCI:", cumulative_coherence_uci/len(topics))

Average Coherence Score UMASS: -411.41778097625985
Average Coherence Score UCI: 15.571700900619172


### TF-IDF

In [65]:
# Tokenize the Hebrew text
tokenizer = Tokenizer()

documents = [' '.join(word[1] for word in tokenizer.tokenize(line)) for line in hebrew_text.split('\n')] 

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer()
# Fit and transform the documents into TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(documents)

# Define the number of components for LSA
num_components = 10

# Apply Truncated SVD to perform LSA
lsa_model = TruncatedSVD(n_components=num_components)
lsa_matrix = lsa_model.fit_transform(tfidf_matrix)

# Print the topics (components) and the top words for each component
terms = vectorizer.get_feature_names_out()
for i, component in enumerate(lsa_model.components_):
    top_terms = [terms[term_idx] for term_idx in component.argsort()[:-6:-1]]
    print(f"Component {i+1}: {', '.join(top_terms)}")


for i, component in enumerate(lsa_model.components_):
    top_terms = [GoogleTranslator(source='auto', target='en').translate(terms[term_idx] )
                 for term_idx in component.argsort()[:-6:-1]]
    print(f"Component {i+1}: {', '.join(top_terms)}")

Component 1: אמר, ים, היה, יו, יש
Component 2: ילד, חיה, ים, עו, הרה
Component 3: היה, יו, מות, הו, וא
Component 4: ים, עש, יו, מות, יהוה
Component 5: יש, ים, לקח, נתן, הלך
Component 6: יו, וא, יש, בר, ים
Component 7: נתן, וא, יו, לקח, עש
Component 8: וא, יהוה, הו, דו, ים
Component 9: דו, הו, יהוה, יו, קרא
Component 10: קרא, ראה, עו, יהוה, יצא
Component 1: said, sea, was, Yu, there is
Component 2: Boy, an animal, sea, O, pregnant
Component 3: was, Yu, death, Oh, And
Component 4: sea, moth, Yu, death, Jehovah
Component 5: there is, sea, Lesson, Nathan, Gone
Component 6: Yu, And, there is, bar, sea
Component 7: Nathan, And, Yu, Lesson, moth
Component 8: And, Jehovah, Oh, two, sea
Component 9: two, Oh, Jehovah, Yu, read
Component 10: read, See, O, Jehovah, came out


In [66]:
cumulative_coherence = 0
cumulative_uci = 0
for i, topic in enumerate(lsa_model.components_):
    top_terms_idx = topic.argsort()[:-11:-1]
    top_terms = [terms[idx] for idx in top_terms_idx]
    cumulative_coherence += compute_coherence_score_umass(top_terms, hebrew_text_lines)
    cumulative_uci += compute_coherence_score_uci(top_terms, hebrew_text_flat, window_size=10)

print("Average Coherence Score UMASS:", cumulative_coherence/len(lsa_model.components_))
print("Average Coherence Score UCI:", cumulative_uci/len(lsa_model.components_))

Average Coherence Score UMASS: -12.763825758372425
Average Coherence Score UCI: 2.5806430508707927


## NMF

In [11]:
# Apply NMF to the TF-IDF matrix
nmf_model = NMF(n_components=num_topics)
nmf_matrix = nmf_model.fit_transform(tfidf_matrix)

# Print the topics
print("NMF Topics:")
for i, component in enumerate(nmf_model.components_):
    top_terms = [terms[j] for j in component.argsort()[-5:]]  # Top 5 terms per topic
    print(f"Topic {i+1}: {' '.join(top_terms)}")


NMF Topics:
Topic 1: לקח הו ילד וא אמר
Topic 2: מו או עו קו ים
Topic 3: דו וא נתן עש יו
Topic 4: ראה לקח נתן עש היה
Topic 5: אש הו וא ית יש
