In [1]:
import json
from pydantic_ai import Agent
from pydantic import BaseModel, Field
from dotenv import load_dotenv

In [2]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API key for the model used by the agent below — confirm.
load_dotenv()

True

In [3]:
#Get all the text files from ChaptersCleaned directory
import os
def get_text_files(directory):
    """Return the names of the ``.txt`` files located directly in ``directory``.

    Args:
        directory: Path of the folder to scan (not searched recursively).

    Returns:
        A list of file names (not full paths), sorted alphabetically.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``directory`` cannot be read.
    """
    # os.listdir returns entries in arbitrary order, so sort to make every
    # downstream loop (and its printed output) reproducible between runs.
    # Directories that merely end in '.txt' are skipped because callers open
    # each returned name as a file.
    return sorted(
        name
        for name in os.listdir(directory)
        if name.endswith('.txt') and os.path.isfile(os.path.join(directory, name))
    )
# Folder holding the cleaned chapter files; the path is relative to the
# working directory the notebook is run from.
directory = "ChaptersCleaned"
# Names of the .txt files found in that folder.
text_files = get_text_files(directory=directory)

In [17]:
# Load every text file into memory, keyed by its file name.
documents = {}
for file in text_files:
    file_path = os.path.join(directory, file)
    # Decode explicitly as UTF-8 so the result does not depend on the platform default.
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    documents[file] = contents

In [18]:
def get_word_count(text):
    """Return how many whitespace-separated words ``text`` contains."""
    # str.split() with no arguments splits on any run of whitespace and
    # drops empty strings, so blank or empty text yields 0.
    words = text.split()
    return len(words)
# Word count of every loaded document, keyed by file name.
word_counts = {file: get_word_count(text) for file, text in documents.items()}

# Report each document's size. Tokens are estimated with the rough
# heuristic of 4 tokens per 3 words.
for file in word_counts:
    count = word_counts[file]
    print(f"{file}: {count} words","Approx llm token count:", count*4/3)

# Same estimate applied to the whole corpus.
total_tokens = sum(word_counts.values()) * 4 / 3
print(f"Total approx token count: {total_tokens}")

everything_about_dogs_administering_medicine.txt: 1917 words Approx llm token count: 2556.0
everything_about_dogs_breeding.txt: 7372 words Approx llm token count: 9829.333333333334
everything_about_dogs_diseases.txt: 78952 words Approx llm token count: 105269.33333333333
everything_about_dogs_distemper.txt: 12921 words Approx llm token count: 17228.0
everything_about_dogs_dog_papers.txt: 214 words Approx llm token count: 285.3333333333333
everything_about_dogs_drugs.txt: 2132 words Approx llm token count: 2842.6666666666665
everything_about_dogs_feeding.txt: 4087 words Approx llm token count: 5449.333333333333
everything_about_dogs_hydrophobia.txt: 16746 words Approx llm token count: 22328.0
everything_about_dogs_medicle_terms.txt: 787 words Approx llm token count: 1049.3333333333333
everything_about_dogs_start_note.txt: 2104 words Approx llm token count: 2805.3333333333335
Total approx token count: 169642.66666666666


In [None]:
#Create an agent which takes in the text document and tells what the document is about in points

# Data model for the agent's structured output: a single free-text field
# holding the point-wise summary. It is passed to the agent as `output_type`.
# NOTE: documented with `#` comments on purpose — pydantic includes a model's
# class docstring in the generated JSON schema, which would change what the
# agent sends to the model.
class DocumentSummary(BaseModel):
    # The Field description is part of the schema; keep its wording in sync with the prompts.
    summary: str = Field(description="Summary of the document in points")

# System prompt for the summarization agent. It asks for a concise,
# point-wise summary that names the document, and states that the summary
# is intended for later use in an Agentic RAG setup.
system_prompt = """You are a helpful agent that summarizes documents. 
You will be given a document and you need to summarize it in points.
Make sure to include the document name in the summary.  
The summary should be concise and to the point. This summary would be used for Agentic RAG which will be used to answer questions about the document."""

#Create the agent
# The agent is configured with the model identifier, the structured output
# type (DocumentSummary) and the system prompt defined above.
summary_agent = Agent(
    model="google-gla:gemini-2.0-flash",
    output_type=DocumentSummary,
    system_prompt=system_prompt
)

In [None]:
import asyncio
#Iterate every document and take first 8000 words approx and send it to the agent and store the summary
summarys = {}
for file, text in documents.items():
    print(f"Processing file: {file} with approx {len(text.split())} words")
    prompt = f"Summarize the following document in points:\n\n{file}\n\n{text}\n\nMake sure to include the document name in the summary."
    #Get the summary from the agent
    while True:
        try:
            result = await summary_agent.run(user_prompt=prompt)
            break  # Exit the loop if the run is successful
        except Exception as e:
            print(f"Error occurred: {e}. Retrying in 2 seconds...")
            await asyncio.sleep(2)  # Non-blocking sleep

            if "exhausted" in str(e).lower():
                print("Waiting for 120 seconds before retrying...")
                #Start a stream timer
                await asyncio.sleep(120)  # Wait for 120 seconds before retrying
                print("Resuming after waiting for 120 seconds.")

    summarys[file] = result.output.summary

Processing file: everything_about_dogs_administering_medicine.txt with approx 1917 words
Processing file: everything_about_dogs_breeding.txt with approx 7372 words
Processing file: everything_about_dogs_diseases.txt with approx 78952 words
Processing file: everything_about_dogs_distemper.txt with approx 12921 words
Processing file: everything_about_dogs_dog_papers.txt with approx 214 words
Processing file: everything_about_dogs_drugs.txt with approx 2132 words
Processing file: everything_about_dogs_feeding.txt with approx 4087 words
Processing file: everything_about_dogs_hydrophobia.txt with approx 16746 words
Processing file: everything_about_dogs_medicle_terms.txt with approx 787 words
Processing file: everything_about_dogs_start_note.txt with approx 2104 words


In [15]:
# Print every stored summary under a header naming its source file.
for file in summarys:
    summary = summarys[file]
    print(f"Summary for {file}:\n{summary}\n")

Summary for everything_about_dogs_administering_medicine.txt:
Here is a summary of the document everything_about_dogs_administering_medicine.txt in points:

*   The document provides guidance on administering medicine to dogs, emphasizing the importance of kindness and strategy.
*   It suggests tricking dogs into taking medicine naturally whenever possible and using force only when necessary.
*   Different methods are described for giving pills, liquids, and powders, including hiding pills in food, using gelatine capsules, and mixing powders with butter or syrup.
*   The document also discusses the use of injections, emphasizing the importance of using a rubber syringe.
*   It highlights the differences in how drugs affect dogs compared to humans, noting that some common substances like salt and castor oil have different effects on dogs.
*   The document advises caution with certain drugs like chloroform, strychnine, and mercury, and suggests that they should only be administered by an