In [7]:
transcript_urls = [
    "https://www.tcs.com/content/dam/tcs/investor-relations/financial-statements/2024-25/q4/Management%20Commentary/Transcript%20of%20the%20Q4%202024-25%20Earnings%20Conference%20Call%20held%20at%201900%20hrs%20IST%20on%20Apr%2010,%202025.pdf",
    "https://www.tcs.com/content/dam/tcs/investor-relations/financial-statements/2024-25/q3/Management%20Commentary/Transcript%20of%20the%20Q3%202024-25%20Earnings%20Conference%20Call%20held%20on%20Jan%2009,%202025.pdf",
    "https://www.tcs.com/content/dam/tcs/investor-relations/financial-statements/2024-25/q2/Management%20Commentary/Transcript%20of%20the%20Q2%202024-25%20Earnings%20Conference%20Call%20held%20on%20Oct%2010,%202024.pdf",
    "https://www.tcs.com/content/dam/tcs/investor-relations/financial-statements/2024-25/q1/Management%20Commentary/Transcript%20of%20the%20Q1%202024-25%20Earnings%20Conference%20Call%20held%20on%20Jul%2011,%202024.pdf",
    "https://www.tcs.com/content/dam/tcs/investor-relations/financial-statements/2025-26/q1/Management%20Commentary/Transcript%20of%20the%20Q1%202025-26%20Earnings%20Conference%20Call%20held%20at%201900%20hrs%20IST%20on%20Jul%2010,%202025.pdf",
]

press_releases = [
    "https://www.tcs.com/content/dam/tcs/investor-relations/financial-statements/2025-26/q1/IND%20AS/Press%20Release%20-%20INR.pdf",
    "https://www.tcs.com/content/dam/tcs/investor-relations/financial-statements/2024-25/q1/IND%20AS/Press%20Release%20-%20INR.pdf",
    "https://www.tcs.com/content/dam/tcs/investor-relations/financial-statements/2024-25/q2/IND%20AS/Press%20Release%20-%20INR.pdf",
    "https://www.tcs.com/content/dam/tcs/investor-relations/financial-statements/2024-25/q3/IND%20AS/Press%20Release%20-%20INR.pdf",
    "https://www.tcs.com/content/dam/tcs/investor-relations/financial-statements/2024-25/q4/IND%20AS/Press%20Release%20-%20INR.pdf",
]

In [None]:
import re
import requests
from pathlib import Path


# Function to extract Q and FY from URL
def extract_info(url):
    match = re.search(r"/(\d{4}-\d{2})/(q\d)/", url, re.IGNORECASE)
    if match:
        fy = match.group(1)
        quarter = match.group(2).upper()
        return quarter, f"FY{fy}"
    else:
        raise ValueError(f"Could not parse fiscal year and quarter from URL: {url}")


# Generic download function
def download_pdf(url, save_dir):
    quarter, fy = extract_info(url)
    filename = f"{quarter}_{fy}.pdf"
    folder = Path("..") / save_dir
    folder.mkdir(exist_ok=True)
    filepath = folder / filename

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Accept": "application/pdf",
    }
    response = requests.get(url=url, headers=headers)
    response.raise_for_status()

    with open(filepath, "wb") as f:
        f.write(response.content)

    print(f"Downloaded: {filepath}")


# Download all files
for url in transcript_urls:
    download_pdf(url, "Transcripts")

for url in press_releases:
    download_pdf(url, "Press Releases")

Downloaded: ..\Transcripts\Q4_FY2024-25.pdf
Downloaded: ..\Transcripts\Q3_FY2024-25.pdf
Downloaded: ..\Transcripts\Q2_FY2024-25.pdf
Downloaded: ..\Transcripts\Q1_FY2024-25.pdf
Downloaded: ..\Transcripts\Q1_FY2025-26.pdf
Downloaded: ..\Press Releases\Q1_FY2025-26.pdf
Downloaded: ..\Press Releases\Q1_FY2024-25.pdf
Downloaded: ..\Press Releases\Q2_FY2024-25.pdf
Downloaded: ..\Press Releases\Q3_FY2024-25.pdf
Downloaded: ..\Press Releases\Q4_FY2024-25.pdf


In [14]:
import fitz  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

speaker_regex = r"\n[A-Z][a-zA-Z\s\.]*?:\s*\n"

splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=500,
    separators=[speaker_regex],
    is_separator_regex=True,
    length_function=len,
)


def extract_text_from_pdf(file_path):
    doc = fitz.open(file_path)
    text_content = ""
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text_content += page.get_text()
    return text_content


# Define the directory containing PDFs
transcripts_dir = Path("..") / "Transcripts"
docs = []

for pdf_file in transcripts_dir.glob("*.pdf"):
    try:
        text = extract_text_from_pdf(pdf_file)
        chunks = splitter.split_text(text)
        for chunk in chunks:
            docs.append(
                Document(
                    page_content=chunk,
                    metadata={"file": str(pdf_file.name).replace(".pdf", "")},
                )
            )
    except Exception as e:
        print(f"Failed to process {pdf_file}: {e}")

In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient, models
from qdrant_client.http.models import Distance, VectorParams
from dotenv import load_dotenv
from uuid import uuid4

load_dotenv()

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
)


collection_name = "transcripts"

client = QdrantClient(url="http://localhost:6333")

client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
)

vector_store = QdrantVectorStore(
    client=client,
    collection_name=collection_name,
    embedding=embeddings,
)


uuids = [str(uuid4()) for _ in range(len(docs))]
vector_store.add_documents(documents=docs, ids=uuids)

['c1e39649-194e-430f-be35-9899ce110735',
 '3ac03173-b7a0-43ab-bc66-e360e0e22d07',
 '624e41bb-d6ed-4d2f-aae3-e5e22c5823a9',
 'de63682a-2a53-4970-b62d-d787e1b1da2a',
 'ec132fdd-ccfc-424c-a4c2-b30e02392ee4',
 '1cc3ed7a-d02a-4459-977e-c9f9c26a10d6',
 '726b3699-0572-4406-b344-069322038aa8',
 '44ca8dfd-3b5c-421d-98c0-4db77d99629b',
 '9057fa94-1373-4d53-b63e-23a79ff35001',
 'fd0ca2ba-c73b-4d39-b668-0b6a97140bc4',
 '3f4f7995-ed09-46ae-a49e-5eac42183346',
 '8abbfe4f-7d0e-4a37-b79d-bb55d8634c79',
 '6620d349-6293-40b1-8fca-09f939ee0cd4',
 'f0387998-2903-4e6b-827d-51aa936b4c5d',
 'efa04240-5251-42b4-8730-23602890e55e',
 '3f500aa8-a951-4abd-acc6-7222c1e345e3',
 '94dc5fab-2d4b-447a-a3b9-a99057d7845b',
 '710d9dfe-2a0b-4fdb-bdfe-8c603f175cae',
 '03200e9b-7019-4aa6-a65e-b13960c79b2e',
 'b43c8ede-f1ae-4805-a539-377fa758787a',
 '3430d0c5-a70e-465e-94e0-052e124e624f',
 '38ecf175-e80f-4142-8f9d-95ee5f4a0599',
 '4b83d86a-e191-4e24-9ec9-c5371e863236',
 'a63cd569-4c58-43b6-a22f-e9ece43050f5',
 '01110888-65d4-

In [28]:
vector_store.similarity_search_with_score(
    query="What themes have management repeatedly emphasized in recent calls?", k=3
)

[(Document(metadata={'file': 'Q1_FY2024-25', '_id': '8abbfe4f-7d0e-4a37-b79d-bb55d8634c79', '_collection_name': 'transcripts'}, page_content="Kumar Rakesh: \nGreat. \nK Krithivasan: \nTherefore, we have to stabilize and scale in the service, before we are able to \ngive some metrics. \nKumar Rakesh: \nSure. Makes sense. Thanks. One question on the BFSI side. So, it was \nencouraging to see that BFSI has now returned to growth. So, is that a \nreflection of absence of last two quarters of furlough? Or you are seeing some \nrecovery on the ground in terms of client engagement as well? \nK Krithivasan: \nI would say there is some positive movement because as I was telling \nsomebody else, our BFSI North America has done relatively well, which is not \nreally impacted by furlough. So, this is BFSI performance to a great extent is, \nI would say, North America performance as well. \nKumar Rakesh: \nGot it. Thanks a lot. \n \n           Tata Consultancy Services Earnings Conference Call \nJu

In [None]:
hits = client.query_points(
    collection_name="transcripts",
    query=embeddings.embed_query("What are the key executive statements?"),
    limit=3,
    query_filter=models.Filter(
        should=[
            models.FieldCondition(
                key="metadata.file", match=models.MatchValue(value="Q1_FY2024-25")
            )
        ]
    ),
).points

results = []
for hit in hits:
    results.append({"content": hit.payload["page_content"], "score": hit.score})

In [52]:
results

[{'text': "K Krithivasan: \nHi, good evening, good morning, everyone. \nNehal Shah: \nMr. Samir Seksaria, Chief Financial Officer. \nSamir Seksaria: \nHello, everyone. \nNehal Shah: \nAnd Mr. Milind Lakkad, Chief HR Officer. \nMilind Lakkad: \nHi, everyone. \nNehal Shah: \nOur management team will give a brief overview of the company's performance \nfollowed by a Q&A session. As you are aware, we don't provide specific \nrevenue or earnings guidance. And anything said on this call, which reflects \n \n           Tata Consultancy Services Earnings Conference Call \nJuly 11, 2024, 19:00 hrs IST  \n \n \n | 2   \n \nour outlook for the future or which could be construed as a forward-looking \nstatement must be reviewed in conjunction with the risks that the company \nfaces. We have outlined these risks in second slide of the quarterly fact sheet \navailable on our website and e-mail out to those who have subscribed on our \nmailing list.  \nWith that, I would like to turn the call over to