In [None]:
import fitz  # PyMuPDF
import requests


def parse_pdf_from_url(url, timeout=30):
    """Download a PDF and return the text of every page except the first.

    Args:
        url: Address of the PDF to download.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get`` so a stalled connection cannot hang the
            notebook indefinitely.

    Returns:
        str: The text of pages 2..N concatenated in page order. Empty if the
        document has a single page.

    Raises:
        Exception: If the server answers with any status code other than 200.
        requests.exceptions.RequestException: On connection errors or timeout.
    """
    # Browser-like headers; presumably the host rejects requests without
    # them -- TODO confirm.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Accept": "application/pdf",
        "Referer": "https://www.bseindia.com/",
    }

    response = requests.get(url=url, headers=headers, timeout=timeout)

    if response.status_code != 200:
        raise Exception(f"Failed to retrieve PDF. Status code: {response.status_code}")

    # Parse straight from the downloaded bytes instead of round-tripping
    # through a fixed "temp.pdf" on disk, and close the document when done.
    with fitz.open(stream=response.content, filetype="pdf") as doc:
        return "".join(
            doc.load_page(page_num).get_text()
            for page_num in range(1, len(doc))  # skipping the first page
        )


# Example usage
url = "https://www.bseindia.com/xml-data/corpfiling/AttachHis/9cee0fdb-07a3-4dc6-af7a-40dce51e1348.pdf"
# Deliberately not wrapped in try/except: every later cell needs `pdf_text`,
# so a failed download must stop the run here instead of surfacing later as a
# NameError (or, worse, silently reusing a stale value from an earlier run).
pdf_text = parse_pdf_from_url(url)

In [73]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# A speaker label sits on its own line: a capitalised name followed by a colon.
speaker_regex = r"\n[A-Z][a-zA-Z\s\.]*?:\s*\n"

# Split only at speaker labels; sizes are measured in characters (len).
splitter = RecursiveCharacterTextSplitter(
    separators=[speaker_regex],
    is_separator_regex=True,
    length_function=len,
    chunk_size=1000,
    chunk_overlap=500,
)

In [74]:
# Split the extracted PDF text into chunks with the splitter configured above.
chunks = splitter.split_text(pdf_text)

In [75]:
# Display the second chunk to eyeball the result of the split.
chunks[1]

"Nehal Shah: \nThank you, operator. Good evening, and welcome, everyone. Thank \nyou for joining us today to discuss TCS' financial results for the first \nquarter of fiscal year FY 2026 that ended on June 30, 2025. This call is \nbeing webcast through our website and an archive, including the \ntranscript, will be available on the site for the duration of this quarter. \nThe financial statements, quarterly fact sheet and press releases are \nalso available on our website. \nOur leadership team is present on this call to discuss our results. We \nhave with us today Mr. K Krithivasan, Chief Executive Officer and \nManaging Director. \nK Krithivasan: \nHello, everyone. \nNehal Shah: \nMs. Aarthi Subramanian, Executive Director, President and Chief \nOperating Officer. \nAarthi Subramanian:Good evening, everyone. \nNehal Shah: \nMr. Samir Seksaria, Chief Financial Officer."

In [None]:
import re
from langchain.docstore.document import Document


def extract_speakers_from_chunk(text: str) -> list:
    """Return the unique speaker names that label lines in ``text``.

    A speaker label is a line that begins with a capital letter, continues
    with letters, dots or horizontal whitespace (2-51 characters in total),
    and is followed by a colon and then whitespace, e.g. "K Krithivasan:" or
    "Nehal Shah:".

    Args:
        text: Transcript chunk to scan.

    Returns:
        list: Speaker names with surrounding whitespace stripped, sorted
        alphabetically so the result is deterministic.
    """
    # Only horizontal whitespace ([^\S\r\n]) is allowed inside a name, so a
    # match can never run across a line break and absorb the tail of the
    # previous line (e.g. "Managing Director. \nK Krithivasan").
    pattern = re.compile(
        r"^([A-Z](?:[a-zA-Z.]|[^\S\r\n]){1,50}):(?=\s)", re.MULTILINE
    )
    return sorted({name.strip() for name in pattern.findall(text)})


def extract_pages_from_chunk(text: str) -> list:
    """Collect the distinct page numbers named by "Page X of Y" markers.

    Matching is case-insensitive. Returns the X values as integers in
    ascending order, with duplicates removed.
    """
    page_numbers = {
        int(marker.group(1))
        for marker in re.finditer(
            r"Page\s+(\d+)\s+of\s+\d+", text, flags=re.IGNORECASE
        )
    }
    return sorted(page_numbers)


# Wrap every chunk in a Document carrying the speakers and page numbers
# detected inside that chunk.
docs = []
for chunk in chunks:
    metadata = {
        "speakers": extract_speakers_from_chunk(chunk),
        "pages_referenced": extract_pages_from_chunk(chunk),
    }
    docs.append(Document(page_content=chunk, metadata=metadata))

In [78]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

collection_name = "transcripts"

client = QdrantClient(url="http://localhost:6333")

# The collection persists on the Qdrant server between kernel restarts, and
# create_collection raises if it already exists -- so only create it when it
# is missing. This keeps the cell re-runnable.
if not client.collection_exists(collection_name):
    client.create_collection(
        collection_name=collection_name,
        # size must equal the embedding dimension; 3072 is assumed to be the
        # output size of the model above -- TODO confirm if the model changes.
        vectors_config=VectorParams(size=3072, distance=Distance.COSINE),
    )

vector_store = QdrantVectorStore(
    client=client,
    collection_name=collection_name,
    embedding=embeddings,
)

In [79]:
from uuid import uuid4

# One freshly generated random id per document, then upload the documents
# under those ids.
uuids = [str(uuid4()) for _ in docs]
_ = vector_store.add_documents(ids=uuids, documents=docs)