# Topic Modeling on English Translation of Genesis

In [3]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
from collections import defaultdict
from gensim.models import HdpModel
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from gensim import corpora, models
from gensim.corpora import Dictionary 
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from coherence.coherence_scores import compute_coherence_score_umass, compute_coherence_score_uci

## Data Preprocessing

In [4]:
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    # Lowercasing
    tokens = [token.lower() for token in tokens]
    
    # Remove punctuation
    table = str.maketrans('', '', string.punctuation)
    tokens = [token.translate(table) for token in tokens]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    # Remove empty tokens and tokens with length less than 2
    tokens = [token for token in tokens if token.strip() and len(token) > 1]
    
    return tokens

def create_dtm(text):
    dtm = defaultdict(dict)
    lines = text.strip().split('\n')        
    for idx, line in enumerate(lines):
        # Treat each line as a separate document
        verse = f"Line {idx + 1}"
        verse_text = preprocess_text(line)
        for word in verse_text:
            if word in dtm[verse]:
                dtm[verse][word] += 1
            else:
                dtm[verse][word] = 1
                        
    return dtm

# Path to the English Genesis file
eng_genesis_path = 'data/genesis_english.txt'
with open(eng_genesis_path, 'r') as file:
    eng_genesis_text = file.read()

# Preprocess the text
preprocessed_text = ' '.join(preprocess_text(eng_genesis_text))

# Create the Document-Term Matrix (DTM)
dtm = create_dtm(eng_genesis_text)

# Create the vocabulary
vocab = set()
for verse in dtm:
    for word in dtm[verse]:
        vocab.add(word)

#create numpy array of vocab
terms = list(vocab)
terms.sort()
terms = np.array(terms)

## Latent Dirichlet Allocation (LDA)
Assumes that each document is a mixture of topics and that each word in the document is attributable to one of the document's topics. The goal of LDA is to find the topics that best describe the collection of documents.  

In [9]:
# Create a list of tokenized documents
documents = [[word for word in word_counts.keys()] for word_counts in dtm.values()]

# Create a dictionary from the tokenized documents
dictionary = corpora.Dictionary(documents)

# Create a bag-of-words representation of the corpus
corpus = [dictionary.doc2bow(doc) for doc in documents]

# Apply LDA
num_topics = 5
lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary)

# Print the topics
topics = lda_model.show_topics(num_words=10)
for topic in topics:
    topic_words = [word[7:-1] for word in topic[1].split(' + ')]
    print(topic[0], topic_words)

0 ['lord', 'said', 'shall', 'men', 'god', 'everi', 'brother', 'joseph', 'son', 'man']
1 ['said', 'shall', 'god', 'son', 'jacob', 'day', 'father', 'abraham', 'lord', 'daughter']
2 ['son', 'said', 'god', 'abraham', 'daughter', 'took', 'name', 'land', 'wife', 'year']
3 ['said', 'came', 'land', 'father', 'lord', 'son', 'shall', 'jacob', 'god', 'brother']
4 ['said', 'father', 'shall', 'behold', 'brother', 'son', 'lord', 'went', 'go', 'hous']


In [11]:
preprocessed_text_flat = preprocess_text(eng_genesis_text)
preprocessed_text_lines = [preprocess_text(line) for line in eng_genesis_text.split('\n')]


# Example to compute UMass and UCI coherence for LDA topics
cumulative_coherence_umass = 0
cumulative_coherence_uci = 0

for topic_idx, topic in lda_model.show_topics(formatted=False, num_topics=num_topics, num_words=10):
    top_terms = [word for word, prob in topic]
    cumulative_coherence_umass += compute_coherence_score_umass(top_terms, preprocessed_text_lines)
    # Assuming compute_coherence_score_uci is defined similarly to compute_coherence_score_umass
    cumulative_coherence_uci += compute_coherence_score_uci(top_terms, preprocessed_text_flat, window_size=10)


print("Average Coherence Score UMASS:", cumulative_coherence_umass/num_topics)
print("Average Coherence Score UCI:", cumulative_coherence_uci/num_topics)

Average Coherence Score UMASS: -105.47551766940516
Average Coherence Score UCI: 2.1139594496717518


## Hierarchical Dirichlet Process (HDP)
A non-parametric Bayesian method that allows the number of topics to be inferred from the data. It is an extension of LDA that infers the number of topics from the data.

In [63]:
# Step 1: Preprocess text
preprocessed_text = [preprocess_text(verse_text) for verse_text in eng_genesis_text.split('\n')]

# Step 2: Create a dictionary and document-term matrix
dictionary = Dictionary(preprocessed_text)
corpus = [dictionary.doc2bow(text) for text in preprocessed_text]

# Step 3: Apply HDP model
hdp_model = HdpModel(corpus, dictionary)

# Get topics
topics = hdp_model.show_topics()
for topic in topics:
    topic_words = [word[6:] for word in topic[1].split(' + ')]
    print(topic[0], topic_words)

0 ['philistin', 'without', 'royal', 'laban', 'everlast', 'hamathit', 'preciou', 'camel', 'run', 'bound', 'endu', 'abomin', 'worth', 'hobah', 'name', 'stop', 'kept', 'garment', 'bethlehem', 'charg']
1 ['lehabim', 'third', 'sweet', 'certain', 'canaan', 'tith', 'girgashit', 'accad', 'call', 'content', 'midwif', 'whale', 'rain', 'whomsoev', 'aner', 'thought', 'hundr', 'strip', 'sword', 'becom']
2 ['vision', 'heat', 'help', 'give', 'drew', 'beriah', 'rephaim', 'ram', 'canaan', 'hid', 'wors', 'add', 'dinhabah', 'put', 'rode', 'merri', 'distress', 'refus', 'accept', 'land']
3 ['deceit', 'fulfil', 'finish', 'seventeenth', 'abraham', 'wit', 'elbethel', 'readi', 'amraphel', 'twin', 'risen', 'ever', 'wittingli', 'fame', 'long', 'garment', 'statut', 'white', 'iniqu', 'beheld']
4 ['stead', 'obtain', 'ziphion', 'magician', 'overtook', 'stand', 'breast', 'penuel', 'part', 'zepho', 'kiriathaim', 'alvan', 'ararat', 'break', 'heir', 'noon', 'bank', 'pildash', 'done', 'shearer']
5 ['last', 'onan', 'omar'

In [70]:
cumulative_coherence_umass = 0
cumulative_coherence_uci = 0

# Calculate coherence for each topic extracted by HDP
for topic_num, topic in topics:
    top_terms = [word[6:] for word in topic.split(' + ')]
    cumulative_coherence_umass += compute_coherence_score_umass(top_terms, preprocessed_text_lines)
    # Assuming compute_coherence_score_uci is similarly defined and imported
    cumulative_coherence_uci += compute_coherence_score_uci(top_terms, preprocessed_text_flat, window_size=10)

print("Average Coherence Score UMASS:", cumulative_coherence_umass/len(topics))
print("Average Coherence Score UCI:", cumulative_coherence_uci/len(topics))

Average Coherence Score UMASS: -9.86607105025642
Average Coherence Score UCI: 4.69697982093691


In [35]:
# import matplotlib.pyplot as plt
# from wordcloud import WordCloud
# import numpy as np

# # Function to create word clouds for each topic
# def create_wordclouds(hdp_model):
#     topics = hdp_model.show_topics()
#     for topic_id, topic in topics:
#         word_freq = {word: float(freq) for freq, word in [pair.split('*') for pair in topic.split(' + ')]}
#         wordcloud = WordCloud(width=800, height=400, background_color ='white').generate_from_frequencies(word_freq)
        
#         plt.figure(figsize=(10, 5))
#         plt.imshow(wordcloud, interpolation='bilinear')
#         plt.title(f'Topic {topic_id} Word Cloud')
#         plt.axis('off')
#         plt.show()

# # Function to plot topic distribution across documents
# def plot_topic_distribution(hdp_model, corpus):
#     topic_dist = hdp_model[corpus]
#     topics = [dict(topic) for topic in topic_dist]
#     topic_matrix = np.zeros((len(corpus), len(hdp_model.show_topics())))

#     for doc_id, doc_topics in enumerate(topics):
#         for topic_id, topic_prob in doc_topics.items():
#             topic_matrix[doc_id][topic_id] = topic_prob

#     plt.figure(figsize=(12, 6))
#     plt.imshow(topic_matrix, aspect='auto', cmap='viridis')
#     plt.colorbar(label='Probability')
#     plt.title('Topic Distribution across Documents')
#     plt.xlabel('Topic')
#     plt.ylabel('Document')
#     plt.show()

# # Assuming `hdp_model` and `corpus` are already defined

# # Visualize word clouds for each topic
# create_wordclouds(hdp_model)

# # Visualize topic distribution across documents
# plot_topic_distribution(hdp_model, corpus)


## Latent Semantic Analysis (LSA)
A technique in natural language processing, in particular in vectorial semantics, of analyzing relationships between a set of documents and the terms they contain by producing a set of concepts related to the documents and terms.

###  Bag of Words
A simplifying representation used in natural language processing and information retrieval. In this model, a text is represented as the bag of its words, disregarding grammar and even word order but keeping multiplicity.

In [83]:
# Create a list of tokenized documents
documents = [[word for word in word_counts.keys()] for word_counts in dtm.values()]

# Create a dictionary from the tokenized documents
dictionary = corpora.Dictionary(documents)

# Create a bag-of-words representation of the corpus
corpus = [dictionary.doc2bow(doc) for doc in documents]

# Apply LSA
num_topics = 5
lsa_model = models.LsiModel(corpus, num_topics=num_topics, id2word=dictionary)

# Print the topics
topics = lsa_model.show_topics(num_topics=num_topics, num_words=10)
for topic in topics:
    topic_words = [word.split('\"')[1] for word in topic[1].split(' + ')]
    print(topic[0], topic_words)


0 ['said', 'son', 'shall', 'god', 'lord', 'father', 'brother', 'jacob', 'came', 'land']
1 ['son', 'said', 'daughter', 'year', 'jacob', 'came', 'lord', 'hundr', 'shall', 'begat']
2 ['son', 'came', 'land', 'said', 'joseph', 'brother', 'father', 'god', 'pass', 'egypt']
3 ['shall', 'said', 'god', 'earth', 'jacob', 'everi', 'year', 'live', 'day', 'lord']
4 ['shall', 'god', 'came', 'pass', 'year', 'lord', 'son', 'abraham', 'said', 'earth']


In [84]:
cumulative_coherence_umass = 0
cumulative_coherence_uci = 0

# Calculate coherence for each topic extracted by HDP
for topic_num, topic in topics:
    top_terms = [word.split('\"')[1] for word in topic.split(' + ')]
    cumulative_coherence_umass += compute_coherence_score_umass(top_terms, preprocessed_text_lines)
    # Assuming compute_coherence_score_uci is similarly defined and imported
    cumulative_coherence_uci += compute_coherence_score_uci(top_terms, preprocessed_text_flat, window_size=10)

print("Average Coherence Score UMASS:", cumulative_coherence_umass/len(topics))
print("Average Coherence Score UCI:", cumulative_coherence_uci/len(topics))

Average Coherence Score UMASS: -100.18697587908224
Average Coherence Score UCI: 2.14865509797958


### TF-IDF
A numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. It is often used as a weighting factor in information retrieval and text mining.

In [85]:

# Preprocess the text
preprocessed_text = preprocess_text(eng_genesis_text)

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(preprocessed_text)

# Apply SVD to the TF-IDF matrix
n_components = 10 # Number of topics
lsa_model = TruncatedSVD(n_components=n_components, random_state=42)
lsa_topic_matrix = lsa_model.fit_transform(X)

# Print the top words for each topic and calculate coherence
terms = vectorizer.get_feature_names_out()
for i, topic in enumerate(lsa_model.components_):
    top_terms_idx = topic.argsort()[:-11:-1]
    top_terms = [terms[idx] for idx in top_terms_idx]
    print(f"Topic {i}: {' | '.join(top_terms)}")

Topic 0: said | daughter | name | hand | us | hous | come | earth | everi | esau
Topic 1: shall | behold | daughter | earth | name | brother | go | esau | one | servant
Topic 2: son | wife | isaac | call | everi | daughter | pass | abraham | earth | let
Topic 3: god | man | behold | daughter | wife | brother | hous | lord | took | call
Topic 4: lord | father | hand | wife | came | joseph | everi | isaac | pharaoh | went
Topic 5: father | behold | say | brother | daughter | abraham | earth | took | us | name
Topic 6: land | behold | abraham | brother | man | wife | also | let | name | jacob
Topic 7: jacob | came | brother | behold | hand | say | us | egypt | day | took
Topic 8: came | brother | man | behold | go | pass | abraham | daughter | year | bless
Topic 9: brother | joseph | hand | name | pharaoh | went | day | also | wife | year


In [86]:
preprocessed_text_flat = preprocess_text(eng_genesis_text)
preprocessed_text_lines = [preprocess_text(line) for line in eng_genesis_text.split('\n')]

cumulative_coherence = 0
cumulative_uci = 0
for i, topic in enumerate(lsa_model.components_):
    top_terms_idx = topic.argsort()[:-11:-1]
    top_terms = [terms[idx] for idx in top_terms_idx]
    cumulative_coherence += compute_coherence_score_umass(top_terms, preprocessed_text_lines)
    cumulative_uci += compute_coherence_score_uci(top_terms, preprocessed_text_flat, window_size=10)

print("Average Coherence Score UMASS:", cumulative_coherence/len(lsa_model.components_))
print("Average Coherence Score UCI:", cumulative_uci/len(lsa_model.components_))

Average Coherence Score UMASS: -120.18789376321706
Average Coherence Score UCI: 1.9613278023196685


## Non-Negative Matrix Factorization (NMF)
A group of algorithms in multivariate analysis and linear algebra where a matrix V is factorized into (usually) two matrices W and H, with the property that all three matrices have no negative elements. In topic modeling, the matrix V represents the documents, W represents the topics, and H represents the weights of the topics in the documents.

In [30]:
# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(preprocessed_text)

# Apply NMF to the TF-IDF matrix
n_components = 10  # Number of topics
nmf_model = NMF(n_components=n_components, random_state=42)
nmf_topic_matrix = nmf_model.fit_transform(X)

# Get the feature names (tokens)
feature_names = vectorizer.get_feature_names_out()

# Print topics, tokens, and corresponding vectors
# print("Topics, tokens, and corresponding vectors:")
# for i, topic_vector in enumerate(nmf_model.components_):
#     print(f"Topic {i+1}:")
#     for token, weight in zip(feature_names, topic_vector):
#         print(f"{token}: {weight}")
#     print()

In [31]:
    # Print the topics
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(nmf_model.components_):
        print(f"Topic {topic_idx + 1}:")
        top_idx = topic.argsort()[:-11:-1]
        top_terms = [feature_names[i] for i in top_idx]
        print(", ".join(top_terms))

Topic 1:
said, took, isaac, one, egypt, daughter, made, live, give, begat
Topic 2:
shall, day, abraham, year, earth, went, behold, hand, call, daughter
Topic 3:
son, day, joseph, year, behold, abraham, went, earth, hand, man
Topic 4:
god, joseph, day, man, behold, abraham, year, daughter, earth, went
Topic 5:
lord, joseph, day, abraham, year, hand, name, went, earth, man
Topic 6:
father, joseph, abraham, day, behold, earth, daughter, year, say, name
Topic 7:
land, joseph, abraham, behold, day, man, year, earth, name, wife
Topic 8:
jacob, joseph, day, behold, abraham, hand, went, say, year, earth
Topic 9:
came, abraham, man, behold, joseph, day, daughter, year, earth, say
Topic 10:
brother, joseph, day, year, name, hand, went, abraham, wife, call
