# Testing YFinance

In [23]:
import yfinance as yf

ticker = yf.Lookup("TCS").get_all(count=1).index[0]

In [27]:
ticker

'TCS.NS'

In [48]:
yf.Ticker(ticker).history(period="1qtr", interval="1mo")

TCS.NS: Period '1qtr' is invalid, must be one of: 1d, 5d, 1mo, 3mo, 6mo, 1y, 2y, 5y, 10y, ytd, max


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


# Fetching reports

In [66]:
import fitz  # PyMuPDF
import requests


def parse_pdf_from_url(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Accept": "application/pdf",
        "Referer": "https://www.bseindia.com/",
    }

    response = requests.get(url=url, headers=headers)

    if response.status_code == 200:
        with open("temp.pdf", "wb") as f:
            f.write(response.content)

        doc = fitz.open("temp.pdf")

        text_content = ""
        for page_num in range(1, len(doc)):  # skipping the first page
            page = doc.load_page(page_num)
            text_content += page.get_text()

        return text_content

    else:
        raise Exception(f"Failed to retrieve PDF. Status code: {response.status_code}")


# Example usage
url = "https://www.bseindia.com/xml-data/corpfiling/AttachHis/9cee0fdb-07a3-4dc6-af7a-40dce51e1348.pdf"
try:
    pdf_text = parse_pdf_from_url(url)
except Exception as e:
    print(e)

In [67]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

speaker_regex = r"\n[A-Z][a-zA-Z\s\.]*?:\s*\n"

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=500,
    separators=[speaker_regex],
    is_separator_regex=True,
    length_function=len,
)

In [68]:
chunks = splitter.split_text(pdf_text)

In [69]:
chunks[1]

"Nehal Shah: \nThank you, operator. Good evening, and welcome, everyone. Thank \nyou for joining us today to discuss TCS' financial results for the first \nquarter of fiscal year FY 2026 that ended on June 30, 2025. This call is \nbeing webcast through our website and an archive, including the \ntranscript, will be available on the site for the duration of this quarter. \nThe financial statements, quarterly fact sheet and press releases are \nalso available on our website. \nOur leadership team is present on this call to discuss our results. We \nhave with us today Mr. K Krithivasan, Chief Executive Officer and \nManaging Director. \nK Krithivasan: \nHello, everyone. \nNehal Shah: \nMs. Aarthi Subramanian, Executive Director, President and Chief \nOperating Officer. \nAarthi Subramanian:Good evening, everyone. \nNehal Shah: \nMr. Samir Seksaria, Chief Financial Officer."

In [70]:
import re
from langchain.docstore.document import Document


def extract_speakers_from_chunk(text: str) -> list:
    # Match lines that look like "K Krithivasan:", "Nehal Shah:", etc.
    pattern = re.compile(r"(?:^|\n)([A-Z][a-zA-Z\s\.]{1,50}):(?=\s)", re.MULTILINE)
    return list(set(pattern.findall(text)))


def extract_pages_from_chunk(text: str) -> list:
    """Extracts all unique page numbers from a chunk."""
    pattern = re.compile(r"Page\s+(\d+)\s+of\s+\d+", re.IGNORECASE)
    return sorted(set(map(int, pattern.findall(text))))


docs = []
for chunk in chunks:
    speakers = extract_speakers_from_chunk(chunk)
    pages = extract_pages_from_chunk(chunk)
    docs.append(
        Document(
            page_content=chunk,
            metadata={"speakers": speakers, "pages_referenced": pages},
        )
    )

In [72]:
# from langchain_google_genai import GoogleGenerativeAIEmbeddings

# embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")
from langchain_openai import OpenAIEmbeddings

from dotenv import load_dotenv

load_dotenv()

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
)

In [73]:
len(embeddings.embed_query("hi"))

1536

In [74]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams


collection_name = "transcripts"

client = QdrantClient(url="http://localhost:6333")

client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
)

vector_store = QdrantVectorStore(
    client=client,
    collection_name=collection_name,
    embedding=embeddings,
)

In [75]:
from uuid import uuid4

uuids = [str(uuid4()) for _ in range(len(docs))]
_ = vector_store.add_documents(documents=docs, ids=uuids)

In [81]:
vector_store.similarity_search_with_score(
    query="Moderator: \nThank you very much.",
)

[(Document(metadata={'speakers': ['Moderator', 'Note'], 'pages_referenced': [], '_id': '86dd72d6-4d7f-4111-8dc0-0070665fbfb3', '_collection_name': 'transcripts'}, page_content='Moderator: \nThank you, members of the management. On behalf of TCS, that \nconcludes this conference call. Thank you for joining us, and you may \nnow disconnect your lines. \n \nNote:  \nThis transcript has been edited for readability and does not purport to \nbe a verbatim record of the proceedings.'),
  0.5302013),
 (Document(metadata={'speakers': ['Ravi Menon', 'Thanks so much. And best of luck. \nModerator', 'K Krithivasan'], 'pages_referenced': [], '_id': 'b04489a3-379a-4b60-9be3-6d894ab4d12a', '_collection_name': 'transcripts'}, page_content="Ravi Menon: \nOne last question. On the high-tech side, that's where we had seen the \nspending cut start. Now that seems to be trending up. And this is where \nwe've also seen some of your peers have to pass on significant \nproductivity improvements. Looks like yo

In [76]:
s1 = """We are expanding to Asia and Europe, with a focus on sustainable practices. Our new product line will be launched in Q3 2024, targeting eco-conscious consumers. We are also investing in AI-driven analytics to enhance customer experience and operational efficiency."""
s2 = """What will be the initiatives to bring in more customers?"""
s3 = """Growth strategy and green efforts"""

In [77]:
vector1 = embeddings.embed_query(s1)
vector2 = embeddings.embed_query(s2)
vector3 = embeddings.embed_query(s3)

In [78]:
# cosine distances between vectors
import numpy as np


def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)


similarity_12 = cosine_similarity(vector1, vector2)
similarity_13 = cosine_similarity(vector1, vector3)
similarity_23 = cosine_similarity(vector2, vector3)

print(f"Cosine similarity between vector1 and vector2: {similarity_12}")
print(f"Cosine similarity between vector1 and vector3: {similarity_13}")
print(f"Cosine similarity between vector2 and vector3: {similarity_23}")

Cosine similarity between vector1 and vector2: 0.3829831452294622
Cosine similarity between vector1 and vector3: 0.4764788957875816
Cosine similarity between vector2 and vector3: 0.3798601238376507
