In [25]:
from transformers import BertModel, BertTokenizer, pipeline
import torch
from pymongo import MongoClient
import certifi
import fitz
from scipy.spatial.distance import cosine
import os
from sentence_transformers import SentenceTransformer
import openai

### Define Tokenizer, Model, and Database

In [27]:
# BERT tokenizer for the 'bert-base-uncased' checkpoint.
# NOTE(review): `tokenizer` is not referenced by any cell visible in this notebook section - confirm it is still needed.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Sentence Transformer model used by generate_embedding() to embed both the stored chunks and the user's query,
# so that the two can be compared with cosine similarity.
model = SentenceTransformer('all-mpnet-base-v2')
# Secrets are read from the environment instead of being written into the notebook, so that sharing or committing
# the notebook does not leak them. Any credentials that were previously embedded in this cell should be treated
# as compromised and rotated.
#   MONGODB_URI    - full MongoDB Atlas connection string (mongodb+srv://user:password@host/...)
#   OPENAI_API_KEY - OpenAI API key
# A missing variable raises KeyError here, before any connection is attempted.
uri = os.environ["MONGODB_URI"]
# certifi provides the CA bundle used to verify the TLS connection to Atlas
ca = certifi.where()
# Here we create the client that will be used to connect to the MongoDB database
client = MongoClient(uri, tlsCAFile=ca)
# Here we select the database and the collection that we want to use
db = client['Marvel']
collection = db['superhero_chunk']
# This is the key for the OpenAI API that we will use to generate the responses to the user's questions
openai.api_key = os.environ["OPENAI_API_KEY"]

In [28]:
def generate_embedding(text):
    """
    Purpose: Turn text into an embedding using the module-level Sentence Transformer `model`.
    text: The input handed straight to `model.encode`.
    Returns: Whatever `model.encode` returns for `text`; callers in this notebook either call `.tolist()` on it
             or iterate over its values.
    """
    embedding = model.encode(text)
    return embedding

### Function to Extract Text and Store Chunks in MongoDB

In [29]:
def extract_text_and_store_with_embedding(directory_path, chunk_size=100):
    """
    Purpose: To extract text from .txt files in a directory and store the text along with its embedding in MongoDB.
    directory_path: The path to the directory containing .txt files. Only entries whose name ends with '.txt' are read.
    chunk_size: The number of whitespace-separated words per chunk (default 100). Must be a positive integer.
    Raises: ValueError if chunk_size is not positive.
    Note: Documents are always inserted, never checked against what is already stored, so calling this twice on the
          same directory stores every chunk twice.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Here we divide the list of words into consecutive chunks of `chunk_size` words (the last one may be shorter)
    def chunk_words(words, chunk_size=chunk_size):
        for i in range(0, len(words), chunk_size):
            yield ' '.join(words[i:i+chunk_size])

    for filename in os.listdir(directory_path):
        # Here we only focus on the .txt files in the directory
        if not filename.endswith('.txt'):
            continue
        txt_path = os.path.join(directory_path, filename)
        # Read the whole file first so the file handle is closed before the (slow) embedding and database work
        with open(txt_path, 'r', encoding='utf-8') as file:
            words = file.read().split()

        # One document per chunk: the text, its embedding as a plain list of floats, and the file it came from
        documents = [
            {
                "chunk": chunk,
                "embedding": generate_embedding(chunk).tolist(),
                "source": os.path.basename(txt_path)
            }
            for chunk in chunk_words(words)
        ]
        # A single bulk insert per file instead of one round trip per chunk.
        # insert_many rejects an empty list, so files without any words are skipped.
        if documents:
            collection.insert_many(documents)
        print(f"Successfully stored chunks from: '{filename}'")

In [30]:
# Sample Execution
# Directory containing the .txt files to ingest. Set the MARVEL_LORE_DIR environment variable to point at your own
# copy of the data; when it is not set, the original author's local path is used.
# NOTE: the ingestion function always inserts and never checks for existing documents, so re-running this cell
# stores the same chunks again.
directory_path = os.environ.get(
    "MARVEL_LORE_DIR",
    "/Users/sveerisetti/Desktop/Duke_Spring/LLM/Assignments/Assignment2/Marvel/Lore2",
)
extract_text_and_store_with_embedding(directory_path)

Processed and stored chunks from 'Avenger_Strange.txt'
Processed and stored chunks from 'AntMan_Quantumania.txt'
Processed and stored chunks from 'wakanda_forever.txt'
Processed and stored chunks from 'Echo.txt'
Processed and stored chunks from 'love_and_thunder.txt'
Processed and stored chunks from 'Avenger_Loki.txt'
Processed and stored chunks from 'doctor_strange_summary.txt'
Processed and stored chunks from 'Avenger_Thor.txt'
Processed and stored chunks from 'Avengers_Echo.txt'
Processed and stored chunks from 'Avenger_GOG3.txt'
Processed and stored chunks from 'Avengers_Marvels.txt'
Processed and stored chunks from 'Avenger_Antman.txt'
Processed and stored chunks from 'Avenger_BlackPanther.txt'
Processed and stored chunks from 'Loki.txt'
Processed and stored chunks from 'GOG_3.txt'
Processed and stored chunks from 'Marvels.txt'


### Find the Most Relevant Chunks for a Given Query Based on Cosine Similarity

In [31]:
def find_most_relevant_chunks(query, top_k=5):
    """
    Purpose: Here we want to find the most relevant chunks from the MongoDB database to a given query.
    query: The input query for which we want to find the most relevant chunks.
    top_k: The maximum number of distinct chunks to return. Zero or a negative number returns an empty list;
           None returns every distinct chunk.
    Returns: A list of (chunk, similarity, source) tuples sorted by similarity, highest first. `source` is None
             for documents that have no 'source' field.
    """
    # Nothing was asked for, so skip the embedding call and the database scan entirely
    if top_k is not None and top_k <= 0:
        return []

    # Here we generate the embedding for the query using the generate_embedding function.
    # It is converted to a list of floats once, up front, because it is the same for every document.
    query_embedding = [float(value) for value in generate_embedding(query)]

    # Only the three fields used below are requested from the database
    docs = collection.find({}, {"_id": 0, "chunk": 1, "embedding": 1, "source": 1})

    # Here we score every stored chunk against the query
    similarities = []
    for doc in docs:
        # The stored embedding is converted to a list of floats so that it has the same format as the query embedding
        chunk_embedding = [float(value) for value in doc['embedding']]
        # scipy's `cosine` is a distance, so similarity = 1 - distance
        similarity = 1 - cosine(chunk_embedding, query_embedding)
        similarities.append((doc['chunk'], similarity, doc.get('source')))

    # Here we sort the similarities list by the similarity score in descending order.
    # The top will be the most similar chunks to the input query.
    similarities.sort(key=lambda x: x[1], reverse=True)

    seen_chunks = set()
    unique_similarities = []
    # Walk the ranking from best to worst, keeping only the first occurrence of each chunk text
    for chunk, similarity, source in similarities:
        if chunk in seen_chunks:
            continue
        seen_chunks.add(chunk)
        unique_similarities.append((chunk, similarity, source))
        # Stop as soon as enough chunks were collected
        if top_k is not None and len(unique_similarities) >= top_k:
            break
    return unique_similarities

### Generate the Enhanced Prompt with the Most Relevant Chunks

In [52]:
def generate_prompt_with_context(relevant_chunks, query):
    """
    Purpose: Builds the prompt for the language model by placing the retrieved chunks in front of the query.
    relevant_chunks: An iterable of (chunk, similarity, source) tuples.
    query: The input query; it is appended after the context, separated by a blank line.
    Returns: The prompt string.
    """
    header = "Based on the following information: "
    # One bullet per chunk, each labelled with the file it came from and its similarity score
    bullet_lines = [
        f"\n- [Source: {origin}, Similarity: {score}]: {text}"
        for text, score, origin in relevant_chunks
    ]
    context = header + "".join(bullet_lines)
    return f"{context}\n\n{query}"

def generate_text_with_gpt35(prompt, max_tokens=3100):
    """
    Purpose: Generates a response to the user's query using the GPT-3.5 model.
    prompt: The prompt to generate the response.
    max_tokens: The maximum number of tokens for the response.
    Returns: The text of the first completion with surrounding whitespace removed, or an empty string when the
             API returns no text content.
    Note: Works with both generations of the `openai` package: the legacy `openai.ChatCompletion` interface
          (openai<1.0) and the `openai.chat.completions` interface (openai>=1.0). Both use the module-level
          `openai.api_key`.
    NOTE(review): the prompt and `max_tokens` together presumably have to fit the model's context window -
          confirm that the 3100 default leaves enough room for the retrieved chunks.
    """
    request = {
        # Define the model of choice
        "model": "gpt-3.5-turbo",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        # Define the maximum number of tokens for the response
        "max_tokens": max_tokens,
        "temperature": 0.7,
        "n": 1,
        "stop": None
    }

    # openai>=1.0 exposes an `OpenAI` client class and removed `openai.ChatCompletion`; its response message is an
    # object read through attributes. Older versions return a dict-like message read through keys.
    if hasattr(openai, "OpenAI"):
        response = openai.chat.completions.create(**request)
        content = response.choices[0].message.content
    else:
        response = openai.ChatCompletion.create(**request)
        content = response.choices[0].message['content']

    # The content can be None when the model returns no text; treat that as an empty answer instead of crashing
    return (content or "").strip()

### Bring it All Together

In [53]:
# Entry point: runs one example question through retrieval and then generation
if __name__ == "__main__":
    # The example question
    query = "How did Ouroboros help in Season 2 of Loki?"
    # Retrieve the stored chunks that are most similar to the question
    relevant_chunks = find_most_relevant_chunks(query)
    if not relevant_chunks:
        # Nothing was retrieved, so there is no context to build a prompt from
        print("No relevant chunks found in the database for the query.")
    else:
        print("Relevant chunks used to supplement the language model:")
        # Show each retrieved chunk with its similarity score and the file it came from
        for chunk, similarity, source in relevant_chunks:
            print(f"Chunk: {chunk}\nSimilarity: {similarity}\nSource: {source}\n")

        # Combine the retrieved chunks and the question into a single prompt
        prompt = generate_prompt_with_context(relevant_chunks, query)
        # Ask the GPT-3.5 model to answer using that prompt
        generated_text = generate_text_with_gpt35(prompt)
        # Show the answer
        print(f"Generated Text: {generated_text}\n")

Relevant chunks used to supplement the language model:
Chunk: Martin and Katharyn Blair Justin Benson & Aaron Moorhead The TVA 's Loom nears catastrophic failure but Loki , Mobius , and Sylvie have a He Who Remains variant . Science/Fiction 2.05 November 2, 2023 Eric Martin Justin Benson & Aaron Moorhead Loki traverses dying timelines in an attempt to find his friends, but Reality is not what it seems. Glorious Purpose 2.06 November 9, 2023 Eric Martin Justin Benson & Aaron Moorhead Loki learns the nature of "glorious purpose" as he rectifies the past in this gripping finale. Videos [ ] Trailers [ ] Marvel Studios’ Loki Season 2
Similarity: 0.5311649889887929
Source: Avenger_Loki.txt

Chunk: onto a Time Variance Authority Mail Cart and into the monitor room where Casey rats him out. Suddenly, Loki's body becomes distorted as he jumps to another point in the TVA where Casey does recognize him. This keeps happening to Loki as he comes to find out he is jumping from the past and present. 