In [None]:
transcript_urls = [
    "https://www.tcs.com/content/dam/tcs/investor-relations/financial-statements/2024-25/q4/Management%20Commentary/Transcript%20of%20the%20Q4%202024-25%20Earnings%20Conference%20Call%20held%20at%201900%20hrs%20IST%20on%20Apr%2010,%202025.pdf",
    "https://www.tcs.com/content/dam/tcs/investor-relations/financial-statements/2024-25/q3/Management%20Commentary/Transcript%20of%20the%20Q3%202024-25%20Earnings%20Conference%20Call%20held%20on%20Jan%2009,%202025.pdf",
    "https://www.tcs.com/content/dam/tcs/investor-relations/financial-statements/2024-25/q2/Management%20Commentary/Transcript%20of%20the%20Q2%202024-25%20Earnings%20Conference%20Call%20held%20on%20Oct%2010,%202024.pdf",
    "https://www.tcs.com/content/dam/tcs/investor-relations/financial-statements/2024-25/q1/Management%20Commentary/Transcript%20of%20the%20Q1%202024-25%20Earnings%20Conference%20Call%20held%20on%20Jul%2011,%202024.pdf",
    "https://www.tcs.com/content/dam/tcs/investor-relations/financial-statements/2025-26/q1/Management%20Commentary/Transcript%20of%20the%20Q1%202025-26%20Earnings%20Conference%20Call%20held%20at%201900%20hrs%20IST%20on%20Jul%2010,%202025.pdf",
]

press_releases = [
    "https://www.tcs.com/content/dam/tcs/investor-relations/financial-statements/2025-26/q1/IND%20AS/Press%20Release%20-%20INR.pdf",
    "https://www.tcs.com/content/dam/tcs/investor-relations/financial-statements/2024-25/q1/IND%20AS/Press%20Release%20-%20INR.pdf",
    "https://www.tcs.com/content/dam/tcs/investor-relations/financial-statements/2024-25/q2/IND%20AS/Press%20Release%20-%20INR.pdf",
    "https://www.tcs.com/content/dam/tcs/investor-relations/financial-statements/2024-25/q3/IND%20AS/Press%20Release%20-%20INR.pdf",
    "https://www.tcs.com/content/dam/tcs/investor-relations/financial-statements/2024-25/q4/IND%20AS/Press%20Release%20-%20INR.pdf",
]

In [None]:
import re
import requests
from pathlib import Path


# Function to extract Q and FY from URL
def extract_info(url):
    match = re.search(r"/(\d{4}-\d{2})/(q\d)/", url, re.IGNORECASE)
    if match:
        fy = match.group(1)
        quarter = match.group(2).upper()
        return quarter, f"FY{fy}"
    else:
        raise ValueError(f"Could not parse fiscal year and quarter from URL: {url}")


# Generic download function
def download_pdf(url, save_dir):
    quarter, fy = extract_info(url)
    filename = f"{quarter}_{fy}.pdf"
    folder = Path("..") / save_dir
    folder.mkdir(exist_ok=True)
    filepath = folder / filename

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Accept": "application/pdf",
    }
    response = requests.get(url=url, headers=headers)
    response.raise_for_status()

    with open(filepath, "wb") as f:
        f.write(response.content)

    print(f"Downloaded: {filepath}")


# Download all files
for url in transcript_urls:
    download_pdf(url, "Transcripts")

for url in press_releases:
    download_pdf(url, "Press Releases")

# For transcripts

In [None]:
import fitz  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

speaker_regex = r"\n[A-Z][a-zA-Z\s\.]*?:\s*\n"

splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=500,
    separators=[speaker_regex],
    is_separator_regex=True,
    length_function=len,
)


def extract_text_from_pdf(file_path):
    doc = fitz.open(file_path)
    text_content = ""
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text_content += page.get_text()
    return text_content


# Define the directory containing PDFs
transcripts_dir = Path("..") / "Transcripts"
docs = []

for pdf_file in transcripts_dir.glob("*.pdf"):
    try:
        text = extract_text_from_pdf(pdf_file)
        chunks = splitter.split_text(text)
        for chunk in chunks:
            docs.append(
                Document(
                    page_content=chunk,
                    metadata={"file": str(pdf_file.name).replace(".pdf", "")},
                )
            )
    except Exception as e:
        print(f"Failed to process {pdf_file}: {e}")

In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient, models
from qdrant_client.http.models import Distance, VectorParams
from dotenv import load_dotenv
from uuid import uuid4

load_dotenv()

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
)


collection_name = "transcripts"

client = QdrantClient(url="http://localhost:6333")

client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
)

vector_store = QdrantVectorStore(
    client=client,
    collection_name=collection_name,
    embedding=embeddings,
)


uuids = [str(uuid4()) for _ in range(len(docs))]
vector_store.add_documents(documents=docs, ids=uuids)

In [None]:
vector_store.similarity_search_with_score(
    query="What themes have management repeatedly emphasized in recent calls?", k=3
)

In [None]:
hits = client.query_points(
    collection_name="transcripts",
    query=embeddings.embed_query("What are the key executive statements?"),
    limit=3,
    query_filter=models.Filter(
        should=[
            models.FieldCondition(
                key="metadata.file", match=models.MatchValue(value="Q1_FY2024-25")
            )
        ]
    ),
).points

results = []
for hit in hits:
    results.append({"content": hit.payload["page_content"], "score": hit.score})

# For Press Releases
(contains tables and headings that can imorove chunks if retained)

In [29]:
import pymupdf4llm
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
)
from langchain.docstore.document import Document

from pathlib import Path


def extract_md_text(pdf_file):
    md_text = pymupdf4llm.to_markdown(pdf_file)
    return md_text


text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=500)

reports_dir = Path("..") / "Press Releases"
report_docs = []

for pdf_file in reports_dir.glob("*.pdf"):
    try:
        md_text = extract_md_text(pdf_file)
        md_chunks = text_splitter.split_text(md_text)
        for chunk in md_chunks:
            report_docs.append(
                Document(
                    page_content=chunk,
                    metadata={"file": str(pdf_file.name).replace(".pdf", "")},
                )
            )
    except Exception as e:
        print(f"Failed to process {pdf_file}: {e}")

In [34]:
from langchain_openai import OpenAIEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient, models
from qdrant_client.http.models import Distance, VectorParams
from dotenv import load_dotenv
from uuid import uuid4

load_dotenv()

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
)


collection_name = "reports"

client = QdrantClient(url="http://localhost:6333")

In [37]:
client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
)

vector_store = QdrantVectorStore(
    client=client,
    collection_name=collection_name,
    embedding=embeddings,
)


uuids = [str(uuid4()) for _ in range(len(report_docs))]
vector_store.add_documents(documents=report_docs, ids=uuids)

['a49e3cee-70f6-4e31-868f-930aaea52c36',
 'c6130ea5-04ec-4caa-b9b0-1cd61a218b12',
 '308fae41-ec5e-4d9b-bd6e-0478bd389ec3',
 '9d3f6f03-2fad-4dfb-950b-ce80e925033a',
 '2ac3588f-ec1f-4521-9ebf-c0a277e20337',
 '66443645-76bf-46d3-89cc-77638b7c4d90',
 '3b582f89-f067-4e88-877d-9f65ba739288',
 '4faa17dc-5329-4dcc-8b37-b64dc4aec5d5',
 'd3dff70d-ec2a-4dc6-85e5-d66e328ea9e0',
 'cb6dd731-dcc0-4d13-a7ab-1d073b956a9d',
 '0c7d9ced-4d9c-4f29-b779-bd1879d24c38',
 '59021c0f-665b-44c6-8290-666aeae80e02',
 'e4adba24-636f-4b77-a740-e91af19f58a4',
 '4bb15eab-4029-4391-a954-dde4077cdb38',
 '8232d345-d78e-4a62-83e7-98b6b0fff71d',
 '0a1b3d8e-1e2d-4a1d-8c84-bce500dac66c',
 '2334af0f-1b8a-4860-9df4-4b5318bd7431',
 'f37f7fd2-eda9-4b58-a2c2-3cf955f49fd4',
 '0b61768d-dd6a-45a5-ad97-8b298bf1eb59',
 'a2f9fbd7-6dad-45bf-96a8-b5a161f0015d',
 'f64ade72-4e66-4e20-9585-8805bf123733',
 '3a0ca0a2-c008-4724-9531-e8c21cecc8c5',
 '1b23e29d-6df0-4d92-8a05-886d6a194a1a',
 '82efc78c-45e2-47b2-942e-ca06a1ba04ca',
 '78ea88c7-6dee-

# Generating snapshots

In [43]:
client.create_full_snapshot(wait=True)

SnapshotDescription(name='full-snapshot-2025-08-02-21-54-03.snapshot', creation_time='2025-08-02T21:54:03', size=5056000, checksum='0746f0578996824575432a6cbd49a358c697cda50964e57ee723cdadef2cb5f0')