In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pymongo
import pandas as pd

In [2]:
def get_all_preprocessed_content():
    """Fetch every document from the ``preprocessed_content`` collection.

    Connects to the MongoDB server at ``mongodb://localhost:27017/`` and reads
    the ``preprocessed_content`` collection of the ``final-year-project``
    database.

    Returns:
        list: The documents returned by ``find()`` with no filter, or an
        empty list if the query raised an exception (the error is printed,
        not re-raised).
    """
    # The context manager closes the client on exit, so repeated calls from
    # notebook cells do not accumulate open connection pools.
    with pymongo.MongoClient("mongodb://localhost:27017/") as my_client:
        my_db: pymongo.database.Database = my_client["final-year-project"]
        content: pymongo.collection.Collection = my_db["preprocessed_content"]
        try:
            # The cursor is lazy: materialise it while the client is still open.
            return list(content.find())
        except Exception as e:
            print(f"An error occurred: {e}")
            return []
    
def update_preprocessed_content_table(preprocessed_content: dict):
    """Update one document of the ``preprocessed_content`` collection.

    The document is selected by ``preprocessed_content["_id"]`` and every
    key/value pair of ``preprocessed_content`` is written with ``$set``.

    Args:
        preprocessed_content: Mapping that must contain an ``"_id"`` key; its
            items become the fields to set on the matched document.

    Returns:
        None. The outcome (updated / no match / error) is reported via
        ``print``; exceptions, including a missing ``"_id"`` key, are caught
        and printed rather than re-raised.
    """
    # The context manager closes the client on exit, so calling this once per
    # document does not accumulate open connection pools.
    with pymongo.MongoClient("mongodb://localhost:27017/") as my_client:
        my_db: pymongo.database.Database = my_client["final-year-project"]
        preprocessed_content_collection: pymongo.collection.Collection = my_db["preprocessed_content"]
        try:
            # Define the filter and update operation
            filter_criteria = {"_id": preprocessed_content["_id"]}
            update_operation = {"$set": preprocessed_content}

            print("Going to update")
            result = preprocessed_content_collection.update_one(filter_criteria, update_operation)

            # matched_count (not modified_count) is checked, so a document that
            # already holds these values still counts as updated.
            if result.matched_count > 0:
                print(f"Updated document with ID: {preprocessed_content['_id']}")
            else:
                print("No matching document found for update.")

        except Exception as e:
            print(f"An error occurred: {e}")

In [13]:
# Pull every stored document and tabulate it so the token lists can be read by column.
preprocessed_content_data = get_all_preprocessed_content()
df = pd.DataFrame(data=preprocessed_content_data)

# One entry per document; each entry is joined with spaces below, so it is
# treated as an iterable of string tokens.
preprocessed_text_list = df["preprocessed_text"].tolist()

# Re-join the tokens into a single whitespace-separated string per document.
filtered_tokenized_texts = list(map(' '.join, preprocessed_text_list))

In [15]:
# Sanity check: number of joined documents that will be passed to the vectorizer.
len(filtered_tokenized_texts)

11751

In [20]:
# Apply TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(filtered_tokenized_texts)

# Get terms and their TF-IDF scores
feature_names = tfidf_vectorizer.get_feature_names_out()

# Example: Top terms for a specific document
doc_index = 10000  # Change this to analyze other documents

# Densify only the selected row. Converting the whole sparse matrix would
# allocate n_documents x n_terms floats just to read a single document.
# ravel() flattens the row whether the slice comes back as 1-D or 2-D.
doc_tfidf = tfidf_matrix[doc_index].toarray().ravel().tolist()

# Keep only terms that actually occur in the document, then rank by score.
term_scores = [(feature_names[i], score) for i, score in enumerate(doc_tfidf) if score > 0]
sorted_terms = sorted(term_scores, key=lambda x: x[1], reverse=True)

print(f"Top terms for Document {doc_index}:")
for term, score in sorted_terms[:10]:
    print(f"{term}: {score}")

Top terms for Document 10000:
alphabet: 0.5003707744064603
hieroglyphs: 0.2792379782045501
rebus: 0.24237631068128374
puzzle: 0.18224119170775874
egyptian: 0.17310734424750893
eye: 0.155990709020018
language: 0.15234216327618486
image: 0.14405365494937844
letter: 0.1395802003853798
sound: 0.13651591538009447


In [22]:
# Show the raw token list stored at index 10000 of preprocessed_text_list.
preprocessed_text_list[10000]

['civilizations',
 'invent',
 'alphabet',
 'adaptation',
 'latin',
 'alphabet',
 'hebrew',
 'arabic',
 'alphabet',
 'korean',
 'alphabet',
 'indian',
 'alphabets',
 'native',
 'indonesian',
 'alphabets',
 'alphabet',
 'alphabet',
 'ask',
 'civilizations',
 'invent',
 'write',
 'systems',
 'different',
 'question',
 'answer',
 'short',
 'ancient',
 'mesopotamians',
 'egyptians',
 'chinese',
 'mayans',
 'ones',
 'know',
 'invent',
 'write',
 'systems',
 'alphabets',
 'advance',
 'mostlyphonetic',
 'picture',
 'write',
 'rebus',
 'write',
 'fancy',
 'example',
 'use',
 'explain',
 'rebus',
 'principle',
 'work',
 'need',
 'hieroglyphs',
 'language',
 'alphabet',
 'language',
 'hieroglyphs',
 'write',
 'write',
 'language',
 'call',
 'egyptian',
 'speak',
 'egyptian',
 'speak',
 'hieroglyphs',
 'image',
 'word',
 'puzzle',
 'call',
 'rebus',
 'picture',
 'stand',
 'word',
 'sentence',
 'particular',
 'rebus',
 'eye',
 'sink',
 'tear',
 'eye',
 'ham',
 'solve',
 'puzzle',
 'read',
 'previou

In [39]:
import spacy

# Load the SpaCy 'en_core_web_sm' pipeline once; it is kept in the
# module-level name `nlp`, which the code below in this cell calls.
nlp = spacy.load('en_core_web_sm')

def lemmatize_with_plural_check(text):
    """Lemmatize ``text`` with the module-level SpaCy pipeline ``nlp``.

    Args:
        text: String to run through ``nlp``.

    Returns:
        list: ``token.lemma_`` for every token of the parsed document, in
        document order.
    """
    # Plural nouns need no special case: the SpaCy lemma is used for every
    # token regardless of its POS or tag.
    return [token.lemma_ for token in nlp(text)]



# Scratch check: display the lemma SpaCy assigns to the first token of the phrase.
nlp("lemmatized words")[0].lemma_


'lemmatize'

In [41]:
"asd as".split()

['asd', 'as']