<a href="https://colab.research.google.com/github/yys-4/google-project/blob/main/chat_documents_w_python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install PyPDF2 langchain dspy==0.1.5 dspy-ai[faiss-cpu]

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting dspy==0.1.5
  Downloading dspy-0.1.5-py3-none-any.whl.metadata (692 bytes)
Collecting dspy-ai[faiss-cpu]
  Downloading dspy_ai-2.6.10-py3-none-any.whl.metadata (286 bytes)
Collecting dspy-ai==2.4.5 (from dspy==0.1.5)
  Downloading dspy_ai-2.4.5-py3-none-any.whl.metadata (36 kB)
Collecting backoff~=2.2.1 (from dspy-ai==2.4.5->dspy==0.1.5)
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Collecting joblib~=1.3.2 (from dspy-ai==2.4.5->dspy==0.1.5)
  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting ujson (from dspy-ai==2.4.5->dspy==0.1.5)
  Downloading ujson-5.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.3 kB)
Collecting datasets<3.0.0,~=2.14.6 (from dspy-ai==2.4.5->dspy==0.1.5)
  Downloading datasets-2.14.7-py3-none-any.whl.metadata (19 kB)
Collecting optuna (from dspy-ai==2.4.5->dspy==0.1.5)
  Downloading optuna-4.2.1-py3-none-any.wh

In [17]:
from PyPDF2 import PdfReader

# Location of the source PDF. Kept in one named constant so it is easy to point the notebook
# at a different document.
PDF_PATH = "/content/Copy of Sosialisasi Hibah Penelitian dan PkM Dana RKAT Kampus UPI Purwakarta_organized.pdf"

reader = PdfReader(PDF_PATH)

# Concatenate the text of every page into one string.
# - Pages are joined with a newline so the last word of one page is not fused with the first
#   word of the next (plain `+=` glued them together).
# - `or ""` guards against a page that yields no extractable text.
# - str.join builds the result in one pass instead of re-allocating the string on every `+=`.
complete_text = "\n".join((page.extract_text() or "") for page in reader.pages)

In [18]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter settings: chunk size 1024 with an overlap of 100 between neighbouring chunks,
# lengths measured with the built-in len, separators treated as literal strings (not regexes).
splitter_options = dict(
    chunk_size=1024,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False,
)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Chunk the full document text; `texts` holds the resulting chunked documents.
texts = text_splitter.create_documents([complete_text])


In [19]:
# Plain-string view of the chunks: the page_content of each chunked document, in order.
page_contents = list(map(lambda doc: doc.page_content, texts))

In [20]:
from dspy.retrieve.faiss_rm import FaissRM
# Build the FAISS-backed retrieval model over the chunk strings.
frm = FaissRM(document_chunks=page_contents)

In [21]:
import os
from getpass import getpass

import dspy

# SECURITY: never hardcode the API key in the notebook. Read it from the GOOGLE_API_KEY
# environment variable, and fall back to an interactive, non-echoing prompt when it is not set.
# NOTE(review): a literal key was previously committed in this cell; treat that key as leaked
# and revoke/rotate it in the Google console.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY") or getpass("Google API key: ")

# The Gemini LM used for answer generation.
gemini = dspy.Google(model='gemini-1.5-flash', api_key=GOOGLE_API_KEY, temperature=0.3)

# Register the LM and the retrieval model (frm) as DSPy's global defaults.
dspy.settings.configure(lm=gemini, rm=frm)

In [22]:
# Signature for the answer-generation step: inputs `context` and `question`, output `answer`.
# NOTE: per the DSPy Signature documentation, the class docstring below is used as the task
# instructions given to the LM and each field's `desc` is part of the prompt, so their wording
# is runtime text, not just documentation. Edit them only when a prompt change is intended.
class GenerateAnswer(dspy.Signature):
    """You are an ai chat with document agent, whose purpose is to understand the context provided, and based on that generate the answer as per the query of the user.
       The context that is being provided is from the document, which you have to understand to answer the query of the user.

       If you dont have context matching to the query of the user, you can politely state that the query can't be well answered using the document provided.
    """

    # Input: the supporting text for the question (the RAG module passes the retrieved passages here).
    context = dspy.InputField(desc="may contain relevant facts")
    # Input: the user's question.
    question = dspy.InputField(desc="The query of the user")
    # Output: the generated answer (no extra description is supplied for this field).
    answer = dspy.OutputField()

In [23]:
class RAG(dspy.Module):
    """Two-stage pipeline: retrieve passages for a question, then generate an answer from them."""

    def __init__(self, num_passages=5):
        """Create the retrieval and generation stages.

        Args:
            num_passages: value passed as `k` to `dspy.Retrieve`, i.e. how many
                passages to request per question.
        """
        super().__init__()

        # Stage 1: retriever (uses whichever retrieval model is configured in dspy.settings).
        self.retrieve = dspy.Retrieve(k=num_passages)
        # Stage 2: chain-of-thought predictor driven by the GenerateAnswer signature.
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)

    def forward(self, question):
        """Answer `question` using retrieved passages as context.

        Args:
            question: the user's query.

        Returns:
            A `dspy.Prediction` carrying `context` (the retrieved passages) and
            `answer` (the generated answer).
        """
        passages = self.retrieve(question).passages
        generated = self.generate_answer(context=passages, question=question)
        return dspy.Prediction(context=passages, answer=generated.answer)

In [24]:
# Run the pipeline once: ask for a 100-word summary in Bahasa Indonesia and display the answer.
question = "Give me a summary of the document in 100 words in bahasa indonesia"
r = RAG()
response = r(question)
response.answer

'Dokumen tersebut merangkum sosialisasi hibah penelitian dan PkM dana RKAT Kampus UPI Purwakarta tahun 2024.  Sosialisasi pada 14 Maret 2024 menjelaskan tiga skema pendanaan: penguatan kelompok bidang keilmuan, penelitian kebijakan, dan pengabdian masyarakat. Dana unggulan kamda (penelitian Rp 40jt, PkM Rp 25jt) dapat diakses melalui berbagai skema yang tercantum dalam surat edaran.  Proposal diajukan melalui https://litabmas.upi.edu/ (13-22 Maret 2024). Zulfa Nur Anisa sebagai notulen dan Prof. Dr. Yayan Nurbayan memimpin rapat.'