In [4]:
import fitz
import tiktoken
from google import genai
import os

# Shared Gen AI client configured for Vertex AI; count_pdf_tokens uses it
# to ask the service for token counts.
genai_client = genai.Client(
    vertexai=True,
    project="trim-sunlight-412311",
    location="global",
)

def count_pdf_tokens(
    file_path: str, *, model: str = "gemini-2.5-pro"
) -> tuple[int, int, int]:
    """
    Extract the text of a PDF and count its tokens, words, and pages.

    Args:
        file_path: The path to the PDF file.
        model: Model name passed to the token-counting API call.

    Returns:
        A tuple ``(token_count, word_count, page_count)``.
        Returns ``(0, 0, 0)`` if the file doesn't exist, or if opening the
        PDF, extracting its text, or counting tokens raises an exception
        (the error is printed rather than propagated).
    """
    if not os.path.exists(file_path):
        print(f"Error: File not found at '{file_path}'")
        return 0, 0, 0

    try:
        # The context manager closes the document even when extraction
        # fails part-way through, so the file handle is never leaked.
        with fitz.open(file_path) as doc:
            page_count = doc.page_count
            # Join once instead of repeated "+=" to avoid quadratic copying
            # on documents with many pages.
            full_text = "".join(page.get_text() for page in doc)

        token_count = genai_client.models.count_tokens(
            model=model, contents=full_text
        ).total_tokens

        # Words are whitespace-separated runs of the extracted text.
        word_count = len(full_text.split())

        return token_count, word_count, page_count

    except Exception as e:
        # Deliberate best-effort boundary: report the failure and fall back
        # to the all-zero sentinel that callers check for.
        print(f"An error occurred: {e}")
        return 0, 0, 0

# PDF to analyse.
pdf_path = "/home/tathagat/workspace/projects/StoryDarpan/llm-graph-builder/experiments/ramayana-english.pdf"

tokens, words, pages = count_pdf_tokens(pdf_path)

# count_pdf_tokens returns zeros on failure, so a positive token count
# is the signal that there is something to report.
if tokens > 0:
    rule = "-" * 30
    print(
        rule,
        f"Analysis for: {os.path.basename(pdf_path)}",
        f"Total Pages: {pages}",
        f"Total Words: {words}",
        f"Total Tokens: {tokens}",
        rule,
        sep="\n",
    )

------------------------------
Analysis for: ramayana-english.pdf
Total Pages: 1960
Total Words: 418954
Total Tokens: 656550
------------------------------
