In [13]:
import pymongo
import pymongo.database

import nltk
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models import LdaModel
import pyLDAvis.gensim_models

import pandas as pd

In [2]:
def get_all_preprocessed_content():
    my_client: pymongo.MongoClient = pymongo.MongoClient("mongodb://localhost:27017/")
    my_db: pymongo.database.Database = my_client["final-year-project"]
    content: pymongo.database.Collection = my_db["preprocessed_content"]
    try:
        result = content.find()
        return list(result)
    except Exception as e:
        print(f"An error occurred: {e}")
        return []
    
def update_preprocessed_content_table(preprocessed_content: dict):
    my_client: pymongo.MongoClient = pymongo.MongoClient("mongodb://localhost:27017/")
    my_db: pymongo.database.Database = my_client["final-year-project"]
    preprocessed_content_collection: pymongo.database.Collection = my_db["preprocessed_content"]
    try:
        # Define the filter and update operation
        filter_criteria = {"_id": preprocessed_content["_id"]}
        update_operation = {"$set": preprocessed_content}
        
        print("Going to update")
        result = preprocessed_content_collection.update_one(filter_criteria, update_operation)
        
        if result.matched_count > 0:
            print(f"Updated document with ID: {preprocessed_content['_id']}")
        else:
            print("No matching document found for update.")
        
    except Exception as e:
        print(f"An error occurred: {e}")

<h2>LDA<h2>

In [15]:
preprocessed_content_data = get_all_preprocessed_content()
df = pd.DataFrame(preprocessed_content_data)
preprocessed_text_list = df["preprocessed_text"].to_list()
filtered_tokenized_texts = [[word for word in doc] for doc in preprocessed_text_list]

In [21]:
dictionary = corpora.Dictionary(filtered_tokenized_texts)

dictionary.filter_extremes(no_below=5, no_above=0.5)
corpus = [dictionary.doc2bow(doc) for doc in filtered_tokenized_texts]

# Train the LDA model
lda_model = LdaModel(corpus, num_topics=10, id2word=dictionary, passes=15)

# Print out the topics and top words
print("Topics and top words:")
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

# Assign topics to documents
document_topics = lda_model.get_document_topics(corpus)

# Print the topic distribution for the first document
print("\nTopic distribution for the first document:")
print(document_topics[0])

# Optional: Evaluate the model using coherence score
from gensim.models import CoherenceModel

coherence_model_lda = CoherenceModel(model=lda_model, texts=filtered_tokenized_texts, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model_lda.get_coherence()
print("\nCoherence Score:", coherence_score)

# Optional: Visualize the topics with pyLDAvis
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)

# Save the model for future use
lda_model.save("lda_model.model")

# Load the saved model (if needed)
# lda_model = LdaModel.load("lda_model.model")

Topics and top words:
(0, '0.022*"china" + 0.011*"power" + 0.010*"war" + 0.008*"state" + 0.008*"world"')
(1, '0.019*"bc" + 0.016*"city" + 0.016*"mesopotamia" + 0.014*"sumerians" + 0.012*"sumerian"')
(2, '0.056*"write" + 0.044*"language" + 0.044*"sumerian" + 0.023*"character" + 0.019*"languages"')
(3, '0.012*"uh" + 0.010*"let" + 0.009*"man" + 0.008*"son" + 0.008*"chariot"')
(4, '0.017*"chinese" + 0.011*"china" + 0.007*"years" + 0.006*"wall" + 0.006*"use"')
(5, '0.056*"’" + 0.044*"“" + 0.043*"”" + 0.013*"mean" + 0.012*"years"')
(6, '0.017*"gods" + 0.016*"flood" + 0.014*"god" + 0.009*"earth" + 0.009*"story"')
(7, '0.030*"civilization" + 0.024*"ancient" + 0.018*"culture" + 0.017*"civilizations" + 0.010*"human"')
(8, '0.051*"china" + 0.027*"empire" + 0.025*"chinese" + 0.014*"india" + 0.014*"japan"')
(9, '0.077*"dynasty" + 0.059*"chinese" + 0.033*"shang" + 0.026*"emperor" + 0.022*"history"')

Topic distribution for the first document:
[(5, 0.056423936), (6, 0.388845), (7, 0.55128056)]

Coher

In [None]:
filtered_tokenized_texts = [[word for word in doc] for doc in preprocessed_text_list]

dictionary = corpora.Dictionary(filtered_tokenized_texts)

dictionary.filter_extremes(no_below=5, no_above=0.5)
corpus = [dictionary.doc2bow(doc) for doc in filtered_tokenized_texts]

# Train the LDA model
lda_model = LdaModel(corpus, num_topics=15, id2word=dictionary, passes=60)

# Print out the topics and top words
print("Topics and top words:")
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

# Assign topics to documents
document_topics = lda_model.get_document_topics(corpus)

# Print the topic distribution for the first document
print("\nTopic distribution for the first document:")
print(document_topics[0])

# Optional: Evaluate the model using coherence score
from gensim.models import CoherenceModel

coherence_model_lda = CoherenceModel(model=lda_model, texts=filtered_tokenized_texts, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model_lda.get_coherence()
print("\nCoherence Score:", coherence_score)



# Save the model for future use
lda_model.save("lda_model.model")

# Load the saved model (if needed)
# lda_model = LdaModel.load("lda_model.model")

Topics and top words:
(0, '0.033*"sumerian" + 0.026*"bc" + 0.025*"sumerians" + 0.018*"mesopotamia" + 0.018*"write"')
(1, '0.051*"black" + 0.026*"shia" + 0.024*"egypt" + 0.017*"african" + 0.016*"africans"')
(2, '0.008*"king" + 0.008*"let" + 0.007*"man" + 0.007*"music" + 0.006*"son"')
(3, '0.105*"language" + 0.098*"sumerian" + 0.051*"languages" + 0.036*"akkadian" + 0.026*"speak"')
(4, '0.016*"ancient" + 0.016*"sumerians" + 0.013*"east" + 0.013*"group" + 0.010*"asia"')
(5, '0.008*"society" + 0.008*"cultural" + 0.008*"influence" + 0.007*"human" + 0.007*"trade"')
(6, '0.049*"china" + 0.024*"empire" + 0.020*"india" + 0.017*"war" + 0.012*"world"')
(7, '0.085*"write" + 0.048*"chinese" + 0.045*"character" + 0.033*"script" + 0.023*"egyptian"')
(8, '0.027*"flood" + 0.025*"gods" + 0.022*"god" + 0.018*"ancient" + 0.011*"story"')
(9, '0.081*"dynasty" + 0.063*"chinese" + 0.040*"china" + 0.031*"emperor" + 0.030*"shang"')
(10, '0.080*"civilization" + 0.040*"ancient" + 0.035*"civilizations" + 0.031*"cul

In [26]:
# Optional: Visualize the topics with pyLDAvis
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)

In [29]:
filtered_tokenized_texts = [[word for word in doc] for doc in preprocessed_text_list]

dictionary = corpora.Dictionary(filtered_tokenized_texts)

dictionary.filter_extremes(no_below=5, no_above=0.5)
corpus = [dictionary.doc2bow(doc) for doc in filtered_tokenized_texts]

# Train the LDA model
lda_model = LdaModel(corpus, num_topics=10, id2word=dictionary, passes=80, iterations=500)

# Print out the topics and top words
print("Topics and top words:")
topics = lda_model.print_topics()
for topic in topics:
    print(topic)

# Assign topics to documents
document_topics = lda_model.get_document_topics(corpus)

# Print the topic distribution for the first document
print("\nTopic distribution for the first document:")
print(document_topics[0])

# Optional: Evaluate the model using coherence score
from gensim.models import CoherenceModel

coherence_model_lda = CoherenceModel(model=lda_model, texts=filtered_tokenized_texts, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model_lda.get_coherence()
print("\nCoherence Score:", coherence_score)

Topics and top words:
(0, '0.037*"“" + 0.036*"”" + 0.034*"write" + 0.033*"’" + 0.025*"language" + 0.018*"sumerian" + 0.015*"word" + 0.013*"character" + 0.013*"languages" + 0.012*"chinese"')
(1, '0.055*"civilization" + 0.038*"ancient" + 0.033*"egypt" + 0.023*"bce" + 0.022*"bc" + 0.019*"civilizations" + 0.019*"indus" + 0.018*"valley" + 0.017*"sumer" + 0.016*"mesopotamia"')
(2, '0.018*"gods" + 0.016*"king" + 0.014*"sumerian" + 0.014*"flood" + 0.013*"god" + 0.011*"music" + 0.010*"bc" + 0.008*"kings" + 0.008*"city" + 0.007*"story"')
(3, '0.045*"dynasty" + 0.021*"chinese" + 0.017*"emperor" + 0.016*"shang" + 0.013*"han" + 0.013*"china" + 0.010*"empire" + 0.009*"history" + 0.009*"state" + 0.008*"rule"')
(4, '0.017*"empire" + 0.010*"east" + 0.009*"sumerians" + 0.008*"group" + 0.007*"region" + 0.007*"ancient" + 0.007*"modern" + 0.006*"asia" + 0.006*"middle" + 0.006*"south"')
(5, '0.016*"civilization" + 0.015*"culture" + 0.012*"ancient" + 0.010*"sumerians" + 0.010*"civilizations" + 0.009*"influen

In [30]:
# Optional: Visualize the topics with pyLDAvis
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)

In [32]:
filtered_tokenized_texts = [[word for word in doc] for doc in preprocessed_text_list]

dictionary = corpora.Dictionary(filtered_tokenized_texts)

dictionary.filter_extremes(no_below=5, no_above=0.5)
corpus = [dictionary.doc2bow(doc) for doc in filtered_tokenized_texts]

# Train the LDA model
lda_model = LdaModel(
                    corpus, 
                    num_topics=15, 
                    id2word=dictionary, 
                    alpha="symmetric", 
                    passes=80, 
                    iterations=100
                    )

# Print out the topics and top words
print("Topics and top words:")
topics = lda_model.print_topics()
for topic in topics:
    print(topic)

# Assign topics to documents
document_topics = lda_model.get_document_topics(corpus)

# Print the topic distribution for the first document
print("\nTopic distribution for the first document:")
print(document_topics[0])

# Optional: Evaluate the model using coherence score
from gensim.models import CoherenceModel

coherence_model_lda = CoherenceModel(model=lda_model, texts=filtered_tokenized_texts, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model_lda.get_coherence()
print("\nCoherence Score:", coherence_score)

Topics and top words:
(0, '0.020*"population" + 0.015*"trade" + 0.013*"large" + 0.012*"develop" + 0.012*"land" + 0.012*"agriculture" + 0.011*"cities" + 0.011*"grow" + 0.010*"farm" + 0.009*"increase"')
(1, '0.025*"bc" + 0.022*"shang" + 0.018*"years" + 0.015*"date" + 0.014*"evidence" + 0.014*"period" + 0.012*"culture" + 0.011*"bronze" + 0.011*"bone" + 0.010*"record"')
(2, '0.098*"chinese" + 0.083*"china" + 0.031*"’" + 0.028*"ancient" + 0.015*"history" + 0.015*"years" + 0.015*"culture" + 0.011*"korea" + 0.010*"japan" + 0.009*"japanese"')
(3, '0.050*"write" + 0.045*"language" + 0.035*"sumerian" + 0.021*"character" + 0.021*"languages" + 0.020*"word" + 0.017*"script" + 0.015*"cuneiform" + 0.013*"egyptian" + 0.012*"ancient"')
(4, '0.136*"india" + 0.046*"indian" + 0.029*"pakistan" + 0.024*"black" + 0.016*"indians" + 0.010*"nuclear" + 0.009*"white" + 0.008*"hindu" + 0.008*"mankind" + 0.008*"peasants"')
(5, '0.111*"’" + 0.110*"“" + 0.106*"”" + 0.017*"mean" + 0.016*"‘" + 0.012*"term" + 0.007*"ans