In [1]:
from pymongo import MongoClient

# Create a client for the local MongoDB server, then select the database and
# the collection that hold the problem records.
client = MongoClient(host='mongodb://localhost:27017/')
db = client.get_database('search_engine')
collection = db.get_collection('problems')


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the title of every stored problem; the projection asks MongoDB for the
# `title` field only (plus the default `_id`).
documents = collection.find({}, {"title": 1})
titles = []
for doc in documents:
    titles.append(doc['title'])



In [3]:
# Learn the vocabulary / IDF weights from the titles and build the sparse
# TF-IDF matrix (one row per title, in the same order as `titles`).
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(raw_documents=titles)

# Disabled: persist the TF-IDF matrix (converted to a dense nested list) and
# the vocabulary into the `tfidf` collection.
# tfidf_data = {
#     "tfidf_matrix": tfidf_matrix.toarray().tolist(),
#     "feature_names": vectorizer.get_feature_names_out().tolist()
# }
# db['tfidf'].insert_one(tfidf_data)


In [4]:
# Map TF-IDF row index -> MongoDB `_id`. Only `_id` and `title` are requested
# (instead of whole documents), and ids are kept in cursor order.
id_title_pairs = [
    (doc['_id'], doc['title'])
    for doc in collection.find({}, {"title": 1})
]
# The TF-IDF rows were built from `titles`, which came from an earlier,
# separate query. Fail loudly if this query saw different documents or a
# different order, because search results would then point at the wrong ids.
assert [title for _, title in id_title_pairs] == titles, (
    "collection changed since `titles` was loaded; "
    "re-run the title-loading and TF-IDF cells"
)
document_ids = [doc_id for doc_id, _ in id_title_pairs]

In [5]:
def search_problems(query, n=10):
    """Print up to `n` problems whose titles best match `query`.

    Relies on the notebook-level globals `vectorizer`, `tfidf_matrix`,
    `document_ids` and `collection`; row i of `tfidf_matrix` must correspond
    to `document_ids[i]`.

    Args:
        query: Free-text search string.
        n: Maximum number of problems to print. Values <= 0 print nothing.

    Returns:
        None. One line per match is printed, best match first. Problems whose
        title shares no weighted term with the query (score 0) are omitted,
        so fewer than `n` lines may be printed.
    """
    # Guard: with n == 0 the slice `[-0:]` below would select *every* document.
    if n <= 0:
        return

    # Transform the query into a TF-IDF vector
    query_vector = vectorizer.transform([query])

    # One similarity score per document. `@` is always the matrix product,
    # whereas `*` is element-wise for scipy sparse *arrays*.
    scores = (query_vector @ tfidf_matrix.T).toarray()[0]

    # Indices of the `n` highest scores, best first, dropping documents that
    # do not match the query at all.
    top_indices = scores.argsort()[-n:][::-1]
    relevant_indices = [idx for idx in top_indices if scores[idx] > 0]

    # Fetch documents by id for the relevant indices; ids that no longer
    # exist in the collection are skipped.
    relevant_problems = []
    for idx in relevant_indices:
        problem = collection.find_one({'_id': document_ids[idx]})
        if problem:
            relevant_problems.append(problem)

    # Display the relevant problems
    for problem in relevant_problems:
        print(f"Title: {problem['title']}, Link: {problem['link']}, Difficulty: {problem['difficulty']}")


In [6]:
# Demo run: list the problems whose titles are closest to "two sum"
search_problems(query="two sum")

Title: maximum sum of two non-overlapping subarrays, Link: https://leetcode.com/problems/maximum-sum-of-two-nonoverlapping-subarrays/description/, Difficulty: Med.
Title: two sum ii - input array is sorted, Link: https://leetcode.com/problems/two-sum-ii-input-array-is-sorted/description/, Difficulty: Med.
Title: minimum ascii delete sum for two strings, Link: https://leetcode.com/problems/minimum-ascii-delete-sum-for-two-strings/description/, Difficulty: Med.
Title: intersection of two arrays, Link: https://leetcode.com/problems/intersection-of-two-arrays/description/, Difficulty: Easy
Title: intersection of two arrays ii, Link: https://leetcode.com/problems/intersection-of-two-arrays-ii/description/, Difficulty: Easy
Title: median of two sorted arrays, Link: https://leetcode.com/problems/median-of-two-sorted-arrays/description/, Difficulty: Hard
Title: target sum, Link: https://leetcode.com/problems/target-sum/description/, Difficulty: Med.
Title: delete operation for two strings, Lin

In [12]:
# Vectorise a sample query with the fitted vectorizer and print its shape
query_vector = vectorizer.transform(["two sum"])
print(tuple(query_vector.shape))

(1, 735)


In [95]:
# Inspect the TF-IDF matrix dimensions: one row per title passed to
# fit_transform, one column per vocabulary term.
tfidf_matrix.shape

(572, 735)

In [96]:
# Rebuild the search index from a single snapshot of the collection.
# `titles` and `document_ids` are derived from the same materialised list so
# that row i of the refitted TF-IDF matrix always maps to document_ids[i];
# refreshing only `titles` would leave search_problems looking up stale ids.
documents = list(collection.find({}, {"title": 1}))
titles = [doc['title'] for doc in documents]
document_ids = [doc['_id'] for doc in documents]

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(titles)