### Import all Libraries 

In [57]:
import certifi
from pymongo import MongoClient
import openai
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import csv
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import normalize
from sentence_transformers import SentenceTransformer
import numpy as np

from sklearn.ensemble import RandomForestClassifier


In [44]:
# Read the OpenAI API key from the OPENAI_API_KEY environment variable so the
# secret is never committed with the notebook. A key that was previously
# hard-coded in this cell must be treated as leaked and revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")

def get_database():
    """
    Purpose: Get the database from MongoDB
    Output: the 'OpenSaveAnimal' database handle obtained from a new MongoClient
    Raises: RuntimeError if the MONGODB_URI environment variable is missing or empty
    """
    # The connection string embeds the username and password, so it is read
    # from the environment instead of being written into the source.
    uri = os.environ.get("MONGODB_URI")
    if not uri:
        raise RuntimeError(
            "MONGODB_URI is not set; export the MongoDB connection string "
            "before calling get_database()."
        )
    # certifi's CA bundle is handed to the client for TLS certificate verification
    ca = certifi.where()
    client = MongoClient(uri, tlsCAFile=ca)
    db = client['OpenSaveAnimal']  # Make sure to use the correct database name
    return db

In [None]:
def chunk_words(text, chunk_size=100):
    """
    Purpose: Lazily break a text into consecutive groups of whitespace-separated words.
    Input: text - the text to be split
    Input: chunk_size - the maximum number of words per group
    Output: yields each group as a single space-joined string; the last group may be shorter
    """
    tokens = text.split()
    # Every chunk_size-th index marks the start of a new group
    starts = range(0, len(tokens), chunk_size)
    yield from (' '.join(tokens[start:start + chunk_size]) for start in starts)

def generate_embedding(text, model="text-embedding-ada-002"):
    """
    Purpose: Request an embedding for one piece of text through openai.Embedding.create.
    Input: text - the text for which to generate the embedding
    Input: model - the name of the embedding model to request
    Output: the 'embedding' field of the first item in the response's 'data' list
    """
    # The text is sent as a one-element batch, so the only result sits at index 0
    api_result = openai.Embedding.create(model=model, input=[text])
    first_item = api_result['data'][0]
    return first_item['embedding']

def store_text_with_embedding(text, source, collection):
    """
    Purpose: Chunk the text, embed every chunk, and insert one document per chunk.
    Input: text - the text to store
    Input: source - a label for where the text came from, saved with every chunk
    Input: collection - the collection that receives the documents via insert_one
    """
    for piece in chunk_words(text):
        # One document per chunk: the raw words, their embedding, and the origin label
        record = {
            "chunk": piece,
            "embedding": generate_embedding(piece),
            "source": source,
        }
        collection.insert_one(record)
    print(f"Content from {source} has been successfully stored in MongoDB.")

def process_text_files(folder_path, collection):
    """
    Purpose: Store the contents of every .txt file found directly inside a folder.
    Input: folder_path - the path to the folder containing the text files
    Input: collection - the collection in which to store the text
    """
    for entry in os.listdir(folder_path):
        # Anything that is not a .txt file is ignored
        if not entry.endswith(".txt"):
            continue
        full_path = os.path.join(folder_path, entry)
        with open(full_path, 'r', encoding='utf-8') as handle:
            contents = handle.read()
            # The file name doubles as the source label stored with each chunk
            store_text_with_embedding(contents, entry, collection)

# Ingestion step: connect to MongoDB, select the 'SaveAnimal' collection, and
# store every .txt file from folder_path as embedded chunks via process_text_files.
# NOTE: folder_path is an absolute path on one specific machine and must be
# changed before this cell is run anywhere else.
if __name__ == "__main__":
    db = get_database()
    collection = db['SaveAnimal']  
    folder_path = "/Users/sveerisetti/Desktop/Duke_Spring/Deep_Learning/Projects/Invidual_Project/Notebooks/Animal_Information"
    process_text_files(folder_path, collection)


In [51]:
def find_most_relevant_chunks(query, top_k=5):
    """
    Purpose: Find the most relevant chunks for the specified query using the cosine similarity.
    Input: query - the user's query
    Input: top_k - the number of most relevant chunks to return; zero or a negative
           number returns an empty list, None returns every distinct chunk
    Output: list of (chunk, similarity, source) tuples ordered from highest to lowest
            similarity, containing each distinct chunk text at most once
    """
    # A non-positive top_k can never select anything, so skip the database and
    # embedding round trips entirely.
    if top_k is not None and top_k <= 0:
        return []

    db = get_database()
    collection = db['SaveAnimal']
    # Fetch only the fields that are read below
    docs = list(collection.find({}, {"chunk": 1, "embedding": 1, "source": 1, "_id": 0}))
    if not docs:
        # Nothing stored yet: no need to pay for a query embedding
        return []

    # Here we can use the generate_embedding function to generate an embedding for the query
    query_embedding = np.array(generate_embedding(query)).reshape(1, -1)
    # Stack all chunk embeddings (one row each) so the similarity against the
    # query is computed in a single call instead of once per document.
    chunk_embeddings = np.array([doc['embedding'] for doc in docs])
    scores = cosine_similarity(chunk_embeddings, query_embedding)[:, 0]

    similarities = [
        (doc['chunk'], score, doc.get('source'))
        for doc, score in zip(docs, scores)
    ]
    similarities.sort(key=lambda x: x[1], reverse=True)

    seen_chunks = set()
    unique_similarities = []
    # Walk the ranked list and keep the best-scoring occurrence of each chunk text
    for chunk, similarity, source in similarities:
        if chunk not in seen_chunks:
            seen_chunks.add(chunk)
            unique_similarities.append((chunk, similarity, source))
            if len(unique_similarities) == top_k:
                break
    return unique_similarities

def generate_prompt_with_context(relevant_chunks, query):
    """
    Purpose: Build the prompt text: a context header, one bullet per retrieved chunk, then the query.
    Input: relevant_chunks - iterable of (chunk, similarity, source) tuples
    Input: query - the user's query
    Output: the assembled prompt string
    """
    header = "Based on the following information: "
    # Each chunk becomes its own line tagged with its source; the similarity score is not shown
    bullets = [
        f"\n- [Source: {origin}]: {passage}"
        for passage, _score, origin in relevant_chunks
    ]
    # A blank line separates the context from the question itself
    return f"{header}{''.join(bullets)}\n\n{query}"

def generate_text_with_gpt35(prompt, max_tokens=3100, temperature=0.7):
    """
    Purpose: Send the prompt to the gpt-3.5-turbo chat model and return its reply text.
    Input: prompt - the user message sent to the model
    Input: max_tokens - the maximum number of tokens to generate
    Input: temperature - sampling temperature passed to the API; higher values give more varied output
    Output: the content of the first returned choice with surrounding whitespace stripped
    """
    # A fixed system message frames the model as an endangered-species expert
    conversation = [
        {"role": "system", "content": "You are an expert on endangered species."},
        {"role": "user", "content": prompt},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=conversation,
        max_tokens=max_tokens,
        temperature=temperature,
        n=1,
        stop=None,
    )
    # Only one completion is requested (n=1), so the reply is the first choice
    reply = completion.choices[0].message['content']
    return reply.strip()

def get_response_for_query(query, temperature=0.9):
    """
    Purpose: Answer a query by retrieving related chunks, building a prompt from them, and calling the chat model.
    Input: query - the user's query
    Input: temperature - sampling temperature forwarded to generate_text_with_gpt35
    Output: the text returned by generate_text_with_gpt35
    """
    matches = find_most_relevant_chunks(query)
    # When nothing was retrieved, the bare query is sent without any context
    prompt = generate_prompt_with_context(matches, query) if matches else query
    return generate_text_with_gpt35(prompt, temperature=temperature)

In [52]:
# Example run: ask one long question about orangutans and keep the generated
# answer in `test`. No temperature is passed, so get_response_for_query uses
# its default of 0.9.
test = get_response_for_query(
"The orangutan is a broad category that encompasses several distinct species, each with its unique adaptations, habitats, and challenges. Provide an overview of the diversity within the orangutan species, detailing the subspecies known, including their physical characteristics, geographical distribution, and the conservation challenges, and endangered status. Highlight the differences and similarities among these subspecies to give a comprehensive understanding of the species' ecological and conservation status. Please also provide how the WWF proposes to help the orangutan species and the conservation efforts in place to protect the orangutan species."
) 

In [53]:
# Display the answer produced by the example query above
print(test)

Orangutans are a fascinating and critically endangered species of great apes found in the rainforests of Borneo and Sumatra. There are three distinct species of orangutans, each with its unique physical characteristics, geographical distribution, and conservation challenges.

1. Bornean Orangutans:
- There are three subspecies of Bornean orangutans: Northwest, Northeast, and Central.
- Northwest Bornean orangutans are the most threatened subspecies, with only around 1,500 individuals remaining. Their habitat has been severely affected by logging and hunting.
- Northeast Bornean orangutans are the smallest in size and are found in Sabah and eastern Kalimantan.
- Central Bornean orangutans have the highest population, with at least 35,000 individuals.
- Conservation challenges for Bornean orangutans include habitat loss due to logging, hunting, and conversion of forests to agriculture.

2. Sumatran Orangutans:
- The range of Sumatran orangutans is restricted to the northern part of Sumat