In [15]:
#importing necessary libraries
import os
from dotenv import load_dotenv
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader, StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.pinecone import PineconeVectorStore
import nest_asyncio
import hashlib
import json
load_dotenv()

True

In [12]:
# LlamaParse configuration: PDFs are parsed into markdown ("text" is the other available result type)
parser = LlamaParse(result_type="markdown", api_key=os.getenv("LLAMA_CLOUD_API_KEY"))

# Route every .pdf handed to SimpleDirectoryReader through the parser above
file_extractor = {".pdf": parser}

# Patch asyncio to allow nested event loops
# NOTE(review): presumably needed because the parser runs async code inside the notebook's loop — confirm
nest_asyncio.apply()

In [13]:
# OpenAI client (used for chat completions)
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Pinecone client and the name of the index that holds the document embeddings
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index_name = 'ai-tutor'

# Create the serverless index only when it does not exist yet, so that
# re-running this cell does not fail on an already-created index.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,  # NOTE(review): presumably the embedding model's vector size — confirm
        metric='cosine',
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )


In [16]:
#Hashing file setup
#JSON file (relative path) that maps a file's content hash to the record of an already-processed document
UPLOAD_DOCS_FILE = 'upload_docs.json'


def get_file_hash(file_path):
    """Return the MD5 hex digest of a file's contents.

    The digest serves as a unique identifier for the file. The file is read
    in 8192-byte chunks, so it never has to be held in memory as a whole.

    Args:
        file_path: Path of the file to hash.

    Returns:
        The hexadecimal MD5 digest as a string.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as stream:
        # iter() with a sentinel keeps calling read() until it returns b"" (end of file)
        for block in iter(lambda: stream.read(8192), b""):
            digest.update(block)
    return digest.hexdigest()


#Load the file hashes
def load_uploaded_dos():
    """Read the record of already-uploaded documents.

    Returns:
        The parsed JSON content of UPLOAD_DOCS_FILE, or an empty dict when
        that file does not exist.
    """
    if not os.path.exists(UPLOAD_DOCS_FILE):
        return {}
    with open(UPLOAD_DOCS_FILE, 'r') as record_file:
        uploaded = json.load(record_file)
    return uploaded


#Save the file hashes
def save_uploaded_docs(uploaded_docs):
    """Write the uploaded-documents record to UPLOAD_DOCS_FILE as JSON.

    Any previous content of the file is overwritten.

    Args:
        uploaded_docs: JSON-serializable record to persist.
    """
    with open(UPLOAD_DOCS_FILE, 'w') as record_file:
        json.dump(uploaded_docs, record_file)


#Processing the PDF file
def process_pdf(file_path, index_name, pc):
    """Parse a PDF, upsert it into the Pinecone index, and return the index.

    The file's content hash is looked up in the uploaded-documents record
    first. When it is already present, nothing is parsed or upserted and a
    handle on the existing index is returned instead.

    Args:
        file_path: Path of the PDF to process.
        index_name: Name of the Pinecone index to write to.
        pc: Pinecone client used to open the index.

    Returns:
        The VectorStoreIndex built from the document, or the index returned
        by get_existing_index() when the file was processed before.
    """
    file_hash = get_file_hash(file_path)
    uploaded_docs = load_uploaded_dos()

    # Same content hash seen before: skip the expensive parse/upsert step
    if file_hash in uploaded_docs:
        print(f"File {file_path} has already been processed. Using existing index.")
        return get_existing_index(index_name, pc)

    # New file: parse it and push its embeddings into the Pinecone index
    documents = SimpleDirectoryReader(input_files=[file_path], file_extractor=file_extractor).load_data()
    vector_store = PineconeVectorStore(pinecone_index=pc.Index(index_name))
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

    # Record only JSON-serializable bookkeeping. The index data lives in the
    # Pinecone vector store, and the duplicate check needs the hash key alone,
    # so the index object is not serialized into the record.
    uploaded_docs[file_hash] = {'file_path': file_path}
    save_uploaded_docs(uploaded_docs)

    return index

#Get existing index
def get_existing_index(index_name, pc):
    """Open the named Pinecone index and wrap it in a VectorStoreIndex.

    Args:
        index_name: Name of the Pinecone index to open.
        pc: Pinecone client used to open the index.

    Returns:
        A VectorStoreIndex created from the Pinecone-backed vector store.
    """
    pinecone_store = PineconeVectorStore(pinecone_index=pc.Index(index_name))
    return VectorStoreIndex.from_vector_store(
        vector_store=pinecone_store,
        storage_context=StorageContext.from_defaults(vector_store=pinecone_store),
    )


#Generate a response
def generate_response(query, index):
    """Answer a question using context retrieved from the index.

    The three most similar nodes are retrieved for the query, their text is
    joined with newlines into a context block, and that block is sent with
    the question to the 'gpt-4o-mini' chat model.

    Args:
        query: The question to answer.
        index: Index object providing as_retriever().

    Returns:
        The message content of the first completion choice.
    """
    hits = index.as_retriever(similarity_top_k=3).retrieve(query)
    context = '\n'.join(hit.node.text for hit in hits)
    messages = [
        {"role": "system", "content": "You are a knowledgeable AI tutor. Use the provided context to answer the question."},
        {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}\n\nAnswer:"},
    ]
    completion = client.chat.completions.create(model='gpt-4o-mini', messages=messages)
    return completion.choices[0].message.content


#Entry point intended for a future NextJS API route: processes an uploaded PDF
def handle_pdf(filepath):
    """Process the PDF at `filepath` with the module-level index name and Pinecone client.

    Args:
        filepath: Path of the PDF to process.

    Returns:
        Whatever process_pdf() returns for this file.
    """
    return process_pdf(file_path=filepath, index_name=index_name, pc=pc)


#Entry point intended for the API route as well: returns the answer to a query
def handle_query(query):
    """Answer `query` against the existing index.

    Args:
        query: The question to answer.

    Returns:
        The response produced by generate_response().
    """
    return generate_response(query, get_existing_index(index_name, pc))


#Smoke test: process the sample PDF, then run one query against the index
def test_pdf_processing_and_query():
    """Process the sample PDF and print the answer to a fixed query."""
    sample_pdf = 'test_parsing/deeplearningbook.org_contents_part_basics.html.pdf'
    query = "Can you give me first 3 sentences of the deep learning book?"

    handle_pdf(sample_pdf)

    print('\nTesting query...')
    answer = handle_query(query)
    print(f"Query: {query}")
    print(f"Response: {answer}")



# Run the smoke test when this code executes as the main module.
if __name__ == "__main__":
    test_pdf_processing_and_query()

File test_parsing/deeplearningbook.org_contents_part_basics.html.pdf has already been processed. Using existing index.

Testing query...
Query: Can you give me first 3 sentences of the deep learning book?
Response: This part of the book introduces the basic mathematical concepts needed to understand deep learning. We begin with general ideas from applied math that enable us to define functions of many variables, find the highest and lowest points on these functions, and quantify degrees of belief. Next, we describe the fundamental goals of machine learning.
