### Import Libraries

In [None]:
import os
import re
import csv
import certifi
import requests
import numpy as np
import pandas as pd
import pymongo
from pymongo import MongoClient
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import openai

# Load the OpenAI API key from the OPENAI_API_KEY environment variable so the
# secret never has to be pasted into the notebook. When the variable is unset
# the key stays an empty string, exactly like the original placeholder.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

### Append Chunks to MongoDB

In [None]:
# MongoClient instances keyed by connection URI, so repeated get_database()
# calls reuse one client instead of opening a new one on every call.
_MONGO_CLIENTS = {}


def get_database():
    """
    Purpose: Establish a connection to the MongoDB database.
    Output: A handle to the 'Chatbot' database.
    """
    # NOTE(review): the fallback URI embeds a username and password in the source.
    # Rotate that credential and supply the URI through the MONGODB_URI
    # environment variable; the fallback only keeps existing runs working.
    uri = os.environ.get(
        "MONGODB_URI",
        "mongodb+srv://sriveerisetti:8TkNOyysCO4S3lBo@chatbot.w3bjnk6.mongodb.net/?retryWrites=true&w=majority&appName=Chatbot",
    )
    client = _MONGO_CLIENTS.get(uri)
    if client is None:
        # certifi supplies the CA bundle passed to MongoClient for the TLS connection.
        client = MongoClient(uri, tlsCAFile=certifi.where())
        _MONGO_CLIENTS[uri] = client
    # The database that contains the information is Chatbot
    return client['Chatbot']

# Loaded SentenceTransformer instances keyed by model name, so the model is
# constructed once rather than on every call (embed_message runs once per chunk).
_EMBEDDING_MODELS = {}


def embed_message(user_message):
    """
    Purpose: Embed a piece of text using the GIST-large model.
    Input: user_message - The text to embed.
    Output: The embedding as a plain Python list.
    """
    model_name = "avsolatorio/GIST-large-Embedding-v0"
    model = _EMBEDDING_MODELS.get(model_name)
    if model is None:
        # First use: build the Hugging Face SentenceTransformer model and keep it.
        model = SentenceTransformer(model_name)
        _EMBEDDING_MODELS[model_name] = model
    # encode() receives a one-element batch, so element 0 of the result is the
    # embedding of user_message; tolist() turns the tensor into nested lists.
    return model.encode([user_message], convert_to_tensor=True).tolist()[0]

def chunk_words(text, chunk_size=150, overlap=25):
    """
    Purpose: Split the text into overlapping, whitespace-delimited word chunks.
    Input: text - The text to split into chunks.
    Input: chunk_size - The number of words in a full interior chunk (150).
    Input: overlap - The number of words a chunk borrows from each neighbour (25).
    Output: A list of chunk strings with words joined by single spaces.
    Raises: ValueError if overlap is negative or chunk_size is not greater than 2 * overlap.

    The window advances by chunk_size - 2 * overlap words each step. The first chunk
    has no left neighbour, so it holds chunk_size - overlap words; interior chunks hold
    chunk_size words; consecutive chunks share 2 * overlap words. Text short enough to
    fit in one step is returned as a single chunk, and empty text returns [].
    """
    step = chunk_size - 2 * overlap
    # A non-positive step would never advance the window (endless loop), and a
    # negative overlap has no meaning for this scheme.
    if overlap < 0 or step <= 0:
        raise ValueError(
            "chunk_size must be greater than 2 * overlap and overlap must be non-negative"
        )

    words = text.split()
    total = len(words)
    chunks = []
    start = 0
    while start + step < total:
        # Reach back `overlap` words into the previous chunk; clamp at 0 so the
        # slice start can never go negative and wrap around to the end of the list.
        actual_start = max(0, start - overlap)
        actual_end = start + chunk_size - overlap
        # If fewer than `overlap` words would remain after this chunk, absorb them
        # into it instead of leaving a tiny trailing fragment.
        if actual_end + overlap > total:
            actual_end = total
        chunks.append(' '.join(words[actual_start:actual_end]))
        # This chunk already runs to the end of the text; a further tail chunk
        # would only repeat words it contains.
        if actual_end == total:
            return chunks
        start += step
    # Whatever is left after the last full step becomes the final chunk.
    if start < total:
        chunks.append(' '.join(words[max(0, start - overlap):]))
    return chunks

def store_text_with_embedding(text, source, collection):
    """
    Purpose: Chunk the text, embed each chunk, and store one document per chunk.
    Input: text - The text to store.
    Input: source - The source label saved alongside every chunk.
    Input: collection - The MongoDB collection in which to store the documents.
    """
    for piece in chunk_words(text):
        # Each document carries the chunk text, its embedding, and where it came from.
        document = {
            "chunk": piece,
            "embedding": embed_message(piece),
            "source": source
        }
        collection.insert_one(document)
    print(f"Content from {source} has been successfully stored in MongoDB.")

def process_text_files(folder_path, collection):
    """
    Purpose: Store the contents of every .txt file in a folder, with embeddings, in the database.
    Input: folder_path - The path to the folder containing the text files.
    Input: collection - The MongoDB collection in which to store the text.
    """
    for entry in os.listdir(folder_path):
        # Anything that is not a .txt file is ignored.
        if not entry.endswith(".txt"):
            continue
        full_path = os.path.join(folder_path, entry)
        with open(full_path, 'r', encoding='utf-8') as handle:
            contents = handle.read()
            # The bare file name is recorded as the source of every chunk.
            store_text_with_embedding(contents, entry, collection)

# Script entry point: chunk, embed, and store every .txt file found in the data folder.
if __name__ == "__main__":
    db = get_database()
    # 'Duke5' is the same collection the retrieval code in this file reads from.
    collection = db['Duke5']
    folder_path = "/content/Combo_Data"
    process_text_files(folder_path, collection)


### Get Relevant Chunks and Try With GPT 3.5 (Just to see if it is working properly)

In [None]:
# MongoClient instances keyed by connection URI, so repeated get_database()
# calls reuse one client instead of opening a new one on every call.
_MONGO_CLIENTS = {}


def get_database():
    """
    Purpose: Establish a connection to the MongoDB database.
    Output: A handle to the 'Chatbot' database.
    """
    # NOTE(review): the fallback URI embeds a username and password in the source.
    # Rotate that credential and supply the URI through the MONGODB_URI
    # environment variable; the fallback only keeps existing runs working.
    uri = os.environ.get(
        "MONGODB_URI",
        "mongodb+srv://sriveerisetti:8TkNOyysCO4S3lBo@chatbot.w3bjnk6.mongodb.net/?retryWrites=true&w=majority&appName=Chatbot",
    )
    client = _MONGO_CLIENTS.get(uri)
    if client is None:
        # certifi supplies the CA bundle passed to MongoClient for the TLS connection.
        client = MongoClient(uri, tlsCAFile=certifi.where())
        _MONGO_CLIENTS[uri] = client
    # The database that contains the information is Chatbot
    return client['Chatbot']

# Loaded SentenceTransformer instances keyed by model name, so the model is
# constructed once rather than on every query.
_EMBEDDING_MODELS = {}


def generate_embedding(user_message):
    """
    Purpose: Embed the user's message using the GIST-large model.
    Input: user_message - The message that the user has entered.
    Output: The embedding as a plain Python list.
    """
    model_name = "avsolatorio/GIST-large-Embedding-v0"
    model = _EMBEDDING_MODELS.get(model_name)
    if model is None:
        # First use: build the Hugging Face SentenceTransformer model and keep it.
        model = SentenceTransformer(model_name)
        _EMBEDDING_MODELS[model_name] = model
    # encode() receives a one-element batch, so element 0 of the result is the
    # embedding of user_message; tolist() turns the tensor into nested lists.
    return model.encode([user_message], convert_to_tensor=True).tolist()[0]

def find_most_relevant_chunks(query, top_k=5):
    """
    Purpose: Find the stored chunks that are most similar to the user's query.
    Input: query - the user's query
    Input: top_k - the maximum number of distinct chunks to return
    Output: a list of (chunk, similarity, source) tuples ordered from most to least
            similar; empty when top_k is not positive or the collection has no documents.
    """
    # Asking for no chunks needs neither the database nor the embedding model.
    if top_k <= 0:
        return []

    # Here we connect to the database and also define the collection within the database
    db = get_database()
    collection = db['Duke5']
    docs = list(collection.find({}))
    if not docs:
        return []

    # We use the generate_embedding function to generate the embedding for the user's query
    query_embedding = np.array(generate_embedding(query)).reshape(1, -1)
    # Stack every stored embedding into one matrix so a single cosine_similarity
    # call scores all chunks, instead of one call per document.
    embeddings = np.array([doc['embedding'] for doc in docs])
    scores = cosine_similarity(embeddings, query_embedding)[:, 0]

    # Document indices from most to least similar; sorted() is stable, so ties
    # keep the order in which the documents were read.
    ranked = sorted(range(len(docs)), key=lambda i: scores[i], reverse=True)

    seen_chunks = set()
    unique_similarities = []
    for i in ranked:
        chunk = docs[i]['chunk']
        # The same text may be stored more than once; keep only its best-scoring copy.
        if chunk in seen_chunks:
            continue
        seen_chunks.add(chunk)
        unique_similarities.append((chunk, scores[i], docs[i].get('source')))
        if len(unique_similarities) >= top_k:
            break
    return unique_similarities

def generate_prompt_with_context(relevant_chunks, query):
    """
    Purpose: Build a prompt made of the retrieved chunks followed by the user's query.
    Input: relevant_chunks - (chunk, similarity, source) tuples for the retrieved chunks
    Input: query - the user's query
    Output: the prompt string
    """
    # One bullet per chunk, each labelled with the source it came from.
    bullet_lines = [
        f"\n- [Source: {source}]: {chunk}"
        for chunk, _score, source in relevant_chunks
    ]
    header = "Based on the following information: "
    # A blank line separates the context from the question itself.
    return header + "".join(bullet_lines) + f"\n\n{query}"

def generate_text_with_gpt35(prompt, max_tokens=3100, temperature=0.7):
    """
    Purpose: Generate text with the gpt-3.5-turbo chat model for the given prompt.
    Input: prompt - the prompt for the model
    Input: max_tokens - the maximum number of tokens to generate
    Input: temperature - controls the randomness of the output, higher values lead to more varied outputs
    Output: the content of the first returned choice, stripped of surrounding whitespace
    """
    # The system message frames the model as an expert on the Duke AI MEng program.
    system_message = {"role": "system", "content": "You are an expert on the Duke Artificial Intelligence Master of Engineering Program"}
    user_message = {"role": "user", "content": prompt}
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[system_message, user_message],
        max_tokens=max_tokens,
        temperature=temperature,
        n=1,
        stop=None
    )
    # Only one choice is requested (n=1), so the first one is the answer.
    first_choice = completion.choices[0]
    return first_choice.message['content'].strip()

def get_response_for_query(query, temperature=0.9):
    """
    Purpose: Answer the user's query, using retrieved chunks as context when any are found.
    Input: query - the user's query
    Input: temperature - controls the randomness of the output, higher values lead to more varied outputs
    Output: the generated answer text
    """
    retrieved = find_most_relevant_chunks(query)
    # With no retrieved chunks the bare query is sent; otherwise the chunks are prepended as context.
    prompt = generate_prompt_with_context(retrieved, query) if retrieved else query
    return generate_text_with_gpt35(prompt, temperature=temperature)

### Increase the Dataset Samples: Take Curated Questions from GPT 3.5 and increase size of dataset

In [None]:
def read_questions_from_csv(file_path):
    """
    Purpose: Read questions from a CSV file.
    Input: file_path - the path to the CSV file containing the questions.
    Output: a list with the first column of every non-empty row, in file order.

    No header handling is done here: if the file starts with a header row, its first
    cell is returned like any other entry.
    """
    with open(file_path, 'r', newline='', encoding='utf-8') as csvfile:
        # csv.reader yields an empty list for a blank line; skip those rows
        # instead of failing on row[0].
        return [row[0] for row in csv.reader(csvfile) if row]

def rephrase_questions(questions, target_total, variations_per_question=2, max_attempts=3):
    """
    Purpose: Rephrase the questions using the gpt-3.5-turbo-instruct model.
    Input: questions - the list of questions to rephrase
    Input: target_total - the target total number of questions (originals plus rephrasings)
    Input: variations_per_question - the maximum number of variations to keep per question
    Input: max_attempts - the maximum number of API requests made for a single question
    Output: a new list holding the original questions followed by the rephrasings. The list
            can be shorter than target_total when the per-question limits are reached first.
    """
    rephrased_questions = []
    # Here we calculate the number of rephrasings needed to reach the target total
    needed = max(target_total - len(questions), 0)

    for question in questions:
        # Once the target is met there is nothing left to request.
        if len(rephrased_questions) >= needed:
            break
        count = 0
        attempts = 0
        # This prompt is used to generate different ways to ask the question
        prompt_text = f"Generate {variations_per_question} different ways to ask the following question: {question}"
        # attempts bounds the loop: without it, a request that keeps failing (or keeps
        # returning nothing usable) would be retried forever.
        while (count < variations_per_question
               and len(rephrased_questions) < needed
               and attempts < max_attempts):
            attempts += 1
            try:
                response = openai.Completion.create(
                    # We use the GPT-3.5-turbo-instruct model to generate the rephrased questions
                    engine="gpt-3.5-turbo-instruct",
                    prompt=prompt_text,
                    max_tokens=100,
                    # Only choices[0] is read below, so a single completion is requested;
                    # the prompt itself asks for all the variations.
                    n=1,
                    stop=None,
                    temperature=0.8
                )
                # The variations are expected one per line of the completion text.
                for line in response.choices[0].text.strip().split('\n'):
                    candidate = line.strip()
                    # Skip blank lines and exact repeats of the original question.
                    if not candidate or candidate == question:
                        continue
                    rephrased_questions.append(candidate)
                    count += 1
                    if count >= variations_per_question or len(rephrased_questions) >= needed:
                        break
            except Exception as e:
                # Best effort: report the failure and use up one attempt for this question.
                print(f"Error processing question: {str(e)}")

    return questions + rephrased_questions

def save_questions_to_csv(questions, output_path):
    """
    Purpose: Write the questions to a CSV file, one question per row.
    Input: questions - the list of questions to save
    Input: output_path - the path of the CSV file to create (an existing file is overwritten)
    """
    with open(output_path, 'w', newline='', encoding='utf-8') as out_handle:
        # Each question becomes a single-column row.
        csv.writer(out_handle).writerows([item] for item in questions)

# Paths for the input and output files
input_file_path = '/content/questions.csv'  
output_file_path = '/content/rephrased_questions.csv'  

# Read the curated questions (the first column of each CSV row)
original_questions = read_questions_from_csv(input_file_path)
# Expand the list toward a target total of 700 entries. Despite its name, the returned
# list holds the original questions followed by the generated rephrasings.
rephrased_questions = rephrase_questions(original_questions, 700)  
# Save the combined list to a CSV file, one question per row
save_questions_to_csv(rephrased_questions, output_file_path)

# Report the size of the combined list (originals plus rephrasings)
print(f"Total questions generated: {len(rephrased_questions)}")


### Creating Question/Answer Pairs for Training using GPT 3.5 Turbo

In [None]:
# MongoClient instances keyed by connection URI, so repeated get_database()
# calls reuse one client instead of opening a new one on every call.
_MONGO_CLIENTS = {}


def get_database():
    """
    Purpose: Establish a connection to the MongoDB database.
    Output: A handle to the 'Chatbot' database.
    """
    # NOTE(review): the fallback URI embeds a username and password in the source.
    # Rotate that credential and supply the URI through the MONGODB_URI
    # environment variable; the fallback only keeps existing runs working.
    uri = os.environ.get(
        "MONGODB_URI",
        "mongodb+srv://sriveerisetti:8TkNOyysCO4S3lBo@chatbot.w3bjnk6.mongodb.net/?retryWrites=true&w=majority&appName=Chatbot",
    )
    client = _MONGO_CLIENTS.get(uri)
    if client is None:
        # certifi supplies the CA bundle passed to MongoClient for the TLS connection.
        client = MongoClient(uri, tlsCAFile=certifi.where())
        _MONGO_CLIENTS[uri] = client
    # The database that contains the information is Chatbot
    return client['Chatbot']

# Loaded SentenceTransformer instances keyed by model name, so the model is
# constructed once rather than once per processed question.
_EMBEDDING_MODELS = {}


def generate_embedding(user_message):
    """
    Purpose: Embed the user's message using the GIST-large model.
    Input: user_message - The message that the user has entered.
    Output: The embedding as a plain Python list.
    """
    model_name = "avsolatorio/GIST-large-Embedding-v0"
    model = _EMBEDDING_MODELS.get(model_name)
    if model is None:
        # First use: build the Hugging Face SentenceTransformer model and keep it.
        model = SentenceTransformer(model_name)
        _EMBEDDING_MODELS[model_name] = model
    # encode() receives a one-element batch, so element 0 of the result is the
    # embedding of user_message; tolist() turns the tensor into nested lists.
    return model.encode([user_message], convert_to_tensor=True).tolist()[0]

def find_most_relevant_chunks(query, top_k=5):
    """
    Purpose: Find the stored chunks that are most similar to the user's query.
    Input: query - the user's query
    Input: top_k - the maximum number of distinct chunks to return
    Output: a list of (chunk, similarity, source) tuples ordered from most to least
            similar; empty when top_k is not positive or the collection has no documents.
    """
    # Asking for no chunks needs neither the database nor the embedding model.
    if top_k <= 0:
        return []

    # Here we connect to the database and also define the collection within the database
    db = get_database()
    collection = db['Duke5']
    docs = list(collection.find({}))
    if not docs:
        return []

    # We use the generate_embedding function to generate the embedding for the user's query
    query_embedding = np.array(generate_embedding(query)).reshape(1, -1)
    # Stack every stored embedding into one matrix so a single cosine_similarity
    # call scores all chunks, instead of one call per document.
    embeddings = np.array([doc['embedding'] for doc in docs])
    scores = cosine_similarity(embeddings, query_embedding)[:, 0]

    # Document indices from most to least similar; sorted() is stable, so ties
    # keep the order in which the documents were read.
    ranked = sorted(range(len(docs)), key=lambda i: scores[i], reverse=True)

    seen_chunks = set()
    unique_similarities = []
    for i in ranked:
        chunk = docs[i]['chunk']
        # The same text may be stored more than once; keep only its best-scoring copy.
        if chunk in seen_chunks:
            continue
        seen_chunks.add(chunk)
        unique_similarities.append((chunk, scores[i], docs[i].get('source')))
        if len(unique_similarities) >= top_k:
            break
    return unique_similarities

def generate_prompt_with_context(relevant_chunks, query):
    """
    Purpose: Build a prompt made of the retrieved chunks followed by the user's query.
    Input: relevant_chunks - (chunk, similarity, source) tuples for the retrieved chunks
    Input: query - the user's query
    Output: the prompt string
    """
    pieces = ["Based on the following information: "]
    # One bullet per chunk, each labelled with the source it came from.
    pieces.extend(
        f"\n- [Source: {origin}]: {passage}"
        for passage, _score, origin in relevant_chunks
    )
    # A blank line separates the context from the question itself.
    pieces.append(f"\n\n{query}")
    return "".join(pieces)


def generate_text_with_gpt35(prompt, max_tokens=3100, temperature=0.7):
    """
    Purpose: Generate text with the gpt-3.5-turbo chat model for the given prompt.
    Input: prompt - the prompt for the model
    Input: max_tokens - the maximum number of tokens to generate
    Input: temperature - controls the randomness of the output, higher values lead to more varied outputs
    Output: the content of the first returned choice, stripped of surrounding whitespace
    """
    # The conversation: a system message framing the model as an expert on the
    # Duke AI MEng program, then the caller's prompt as the user turn.
    conversation = [
        {"role": "system", "content": "You are an expert on the Duke Artificial Intelligence Master of Engineering Program"},
        {"role": "user", "content": prompt},
    ]
    request_options = {
        "model": "gpt-3.5-turbo",
        "messages": conversation,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "n": 1,
        "stop": None,
    }
    completion = openai.ChatCompletion.create(**request_options)
    # Only one choice is requested (n=1), so the first one is the answer.
    reply = completion.choices[0].message['content']
    return reply.strip()

def process_questions(input_file, output_file):
    """
    Purpose: Answer every question in the input CSV and write question/context/answer rows to the output CSV.
    Input: input_file - the path to the input CSV; its header row must contain a 'Question' column.
    Input: output_file - the path of the CSV to write, with columns 'Question', 'Context', 'Answer'.
    """
    # Matches a leading list number such as "12. " so it can be removed from a question.
    numbering = re.compile(r'^\d+\.\s+')

    with open(input_file, newline='', encoding='utf-8') as infile, \
         open(output_file, mode='w', newline='', encoding='utf-8') as outfile:
        # DictReader takes the column names from the first row of the input file
        reader = csv.DictReader(infile)
        writer = csv.DictWriter(outfile, fieldnames=['Question', 'Context', 'Answer'])
        writer.writeheader()

        for row in reader:
            # Here we clean the question by getting rid of the numbering
            clean_question = numbering.sub('', row['Question'])
            # Progress output, so a long run can be followed as it goes
            print(f"Processing question: '{clean_question}'")
            relevant_chunks = find_most_relevant_chunks(clean_question)
            if relevant_chunks:
                # The Context column stores the raw chunk text without source labels,
                # while the prompt sent to the model keeps the labels.
                context = "\n".join(str(chunk) for chunk, _, _ in relevant_chunks)
                prompt = generate_prompt_with_context(relevant_chunks, clean_question)
            else:
                context = "No relevant context found."
                # With nothing retrieved, send the bare question (as get_response_for_query
                # does) rather than a "Based on the following information" header with
                # no information after it.
                prompt = clean_question
            answer = generate_text_with_gpt35(prompt)
            # We write the question, context, and answer to the output file
            writer.writerow({'Question': clean_question, 'Context': context, 'Answer': answer})
            print("Finished processing and writing to CSV.\n")

# Script entry point: answer every question in the rephrased-questions CSV and
# write the question/context/answer rows to qaduke5.csv.
if __name__ == "__main__":
    process_questions('/content/rephrased_questions.csv', '/content/qaduke5.csv')
