In [None]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

In [None]:
# Load the PDF files found in a directory
def load_pdf(data):
    """Load every file matching ``*.pdf`` in the given directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` produces for the matched files,
        with each file read through ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [None]:
# Load the PDF documents from the relative "data/" directory.
extracted_data = load_pdf("data/")

In [None]:
# Create text chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller, overlapping chunks.

    Args:
        extracted_data: Documents to split; passed to ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``. Defaults to
            500, the value that was previously hard-coded.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
            Defaults to 20, the value that was previously hard-coded.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    text_chunks = text_splitter.split_documents(extracted_data)

    return text_chunks

In [None]:
# Split the loaded documents into chunks and report how many were produced.
text_chunks = text_split(extracted_data)
print("length of my chunk:", len(text_chunks))

In [None]:
# Configure the OpenAI embedding model
def initialise_embeddings():
    """Create the embeddings object used to vectorise documents and queries.

    Returns:
        An ``OpenAIEmbeddings`` instance configured for the
        ``text-embedding-3-large`` model.
    """
    # The constructor's keyword is `model`; `model_name` is not a field of
    # OpenAIEmbeddings, so passing it does not select the requested model.
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    return embeddings

In [None]:
# Instantiate the embeddings object once; the vector store setup below reuses it.
embeddings = initialise_embeddings()

In [None]:
# Index the chunks produced by text_split() rather than the raw, unsplit
# documents; otherwise the chunking step is computed but never used.
vectorstores = Chroma.from_documents(documents=text_chunks,
                                     embedding=embeddings)

# The keyword must be `search_kwargs` (plural); the misspelt `search_kwarg`
# does not reach the retriever's search settings, so k=1 never took effect.
retriever = vectorstores.as_retriever(search_kwargs={'k': 1})

In [None]:
# Prompt text for the QA chain. The {context} and {question} placeholders are
# the input variables declared when this string is wrapped in a PromptTemplate.
# NOTE(review): the wording contains typos ("good a reading", lowercase
# "given") -- left untouched here because the text is sent to the model as-is;
# confirm whether it should be corrected.
custom_prompt_template = """You are a climate expert, good a reading documents. given the context from a document, use the information to answer the user's questions

Context: {context}
Questions: {question}

only return the helpful answer below, nothing else
Helpful answer:
"""

In [None]:
def set_custom_prompt():
    """Build the prompt template used for question answering.

    Returns:
        A ``PromptTemplate`` wrapping ``custom_prompt_template`` with the
        ``context`` and ``question`` input variables.
    """
    prompt = PromptTemplate(template=custom_prompt_template, input_variables=['context', 'question'])
    # Hand the template back to the caller; without this the function
    # implicitly returns None.
    return prompt

# Bind the result of set_custom_prompt() to `prompt`; the QA chain setup reads this name later.
prompt = set_custom_prompt()

In [None]:
from langchain_openai import ChatOpenAI

# Chat model that generates the final answer (temperature fixed at 0).
chat_model = ChatOpenAI(model_name='gpt-4o', temperature=0)

# Retrieval QA chain: combines the retriever with the chat model and passes
# the custom prompt through chain_type_kwargs.
qa = RetrievalQA.from_chain_type(
    llm=chat_model,
    retriever=retriever,
    chain_type='stuff',
    chain_type_kwargs={"prompt": prompt},
)

In [None]:
# Run one query through the QA chain and print only the generated answer text.
# NOTE(review): "above" in the query looks like a typo for "about" -- confirm before changing.
response = qa.invoke({"query": "Tell me above US climate policies"})
print(response['result'])