In [1]:
%pip install --upgrade langchain langchain-openai langchain-community langchain-text-splitters


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import pickle
import time
from typing import Iterable, List

import streamlit as st
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.globals import set_debug

# Maintain backwards-compatible name used in older tutorials
OpenAI = ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_community.vectorstores import FAISS


class SimpleRetrievalQAWithSourcesChain:
    """Minimal retrieval-augmented QA chain that also reports its sources.

    The chain retrieves documents for a question, formats them into a context
    block, asks the LLM to answer from that context, and returns both the
    answer text and the de-duplicated list of document sources.

    Type annotations are written as strings so they are not evaluated when the
    class is defined; only the default prompt needs ``ChatPromptTemplate`` at
    runtime.
    """

    def __init__(
        self,
        llm: "ChatOpenAI",
        retriever,
        prompt: "ChatPromptTemplate | None" = None,
    ) -> None:
        """Store the collaborators and fall back to a default prompt.

        Args:
            llm: Object exposing ``invoke(messages)``.
            retriever: Object exposing ``invoke(question)`` or the legacy
                ``get_relevant_documents(question)``.
            prompt: Object exposing ``format_messages(context=..., question=...)``.
                When falsy, a built-in system/human prompt is used.
        """
        self.llm = llm
        self.retriever = retriever
        self.prompt = prompt or ChatPromptTemplate.from_messages(
            [
                (
                    "system",
                    "You are a careful research assistant. Answer using only the"
                    " supplied context. Cite sources in parentheses using the"
                    " provided source names. If context is empty, say you cannot"
                    " answer.",
                ),
                (
                    "human",
                    "Context:\n{context}\n\nQuestion: {question}\n\nAnswer in"
                    " 4-6 sentences and include a Sources line listing the"
                    " source names you used.",
                ),
            ]
        )

    @staticmethod
    def _source_of(doc: "Document") -> "str | None":
        """Return the doc's ``source`` metadata, else its ``url``, else a falsy value."""
        return doc.metadata.get("source") or doc.metadata.get("url")

    @staticmethod
    def _format_docs(docs: "Iterable[Document]") -> str:
        """Render documents as ``Source: <label>`` blocks separated by blank lines.

        Documents without source metadata are labelled ``Chunk-<n>`` using
        their 1-based position.
        """
        chunks = []
        for idx, doc in enumerate(docs, start=1):
            label = SimpleRetrievalQAWithSourcesChain._source_of(doc) or f"Chunk-{idx}"
            chunks.append(f"Source: {label}\n{doc.page_content}")
        return "\n\n".join(chunks)

    @staticmethod
    def _collect_sources(docs: "Iterable[Document]") -> "List[str]":
        """Return the distinct sources of ``docs`` in first-seen order."""
        seen: "List[str]" = []
        for doc in docs:
            candidate = SimpleRetrievalQAWithSourcesChain._source_of(doc)
            if candidate and candidate not in seen:
                seen.append(candidate)
        return seen

    @staticmethod
    def _response_text(response) -> str:
        """Extract plain text from an LLM response.

        Handles string content, list content made of strings and/or dicts that
        carry a ``"text"`` entry, and objects without a ``content`` attribute
        (stringified as-is).
        """
        if not hasattr(response, "content"):
            return str(response)
        content = response.content
        if isinstance(content, str):
            return content
        if isinstance(content, list):
            parts = []
            for block in content:
                if isinstance(block, str):
                    parts.append(block)
                elif isinstance(block, dict) and isinstance(block.get("text"), str):
                    parts.append(block["text"])
            return "".join(parts)
        return str(content)

    @classmethod
    def from_chain_type(cls, llm: "ChatOpenAI", retriever, **kwargs):
        """Build a chain, accepting the keyword layout of older chain factories.

        A prompt may be given as ``prompt=...`` or nested under
        ``chain_type_kwargs={"prompt": ...}``; other keyword arguments are
        ignored. ``chain_type_kwargs=None`` is tolerated.
        """
        chain_type_kwargs = kwargs.get("chain_type_kwargs") or {}
        prompt = kwargs.get("prompt") or chain_type_kwargs.get("prompt")
        return cls(llm=llm, retriever=retriever, prompt=prompt)

    def invoke(self, inputs):
        """Answer a question from retrieved context.

        Args:
            inputs: The question string, or a dict with a ``"question"`` or
                ``"query"`` entry.

        Returns:
            ``{"answer": <stripped answer text>, "sources": [<source>, ...]}``.

        Raises:
            ValueError: If a dict input has no non-empty question/query.
            TypeError: If ``inputs`` is neither a string nor a dict.
        """
        if isinstance(inputs, str):
            question = inputs
        elif isinstance(inputs, dict):
            question = inputs.get("question") or inputs.get("query")
            if not question:
                raise ValueError("Provide a 'question' or 'query'.")
        else:
            raise TypeError("Inputs must be a string or dict containing 'question'.")

        # Prefer the current `invoke` entry point; `get_relevant_documents` is
        # only used for legacy retrievers that do not offer `invoke`.
        if hasattr(self.retriever, "invoke"):
            retrieved = self.retriever.invoke(question)
        else:
            retrieved = self.retriever.get_relevant_documents(question)

        # Materialize once: the documents are walked twice below, which would
        # silently drop the sources if the retriever returned a one-shot iterator.
        docs = list(retrieved) if retrieved is not None else []

        context = self._format_docs(docs)
        messages = self.prompt.format_messages(context=context, question=question)
        response = self.llm.invoke(messages)
        answer_text = self._response_text(response)
        return {"answer": answer_text.strip(), "sources": self._collect_sources(docs)}

    def __call__(self, inputs, **_):
        """Call-style alias for :meth:`invoke`.

        Extra keyword arguments (e.g. ``return_only_outputs``) are accepted and
        ignored for compatibility.
        """
        return self.invoke(inputs)







In [None]:
# Load the OpenAI API key without hardcoding it in the notebook.
# A key already present in the environment is left untouched; otherwise the
# user is prompted for it (input is not echoed and is not saved in the notebook).
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

In [4]:
# Create the chat model used to answer questions (`OpenAI` is the ChatOpenAI alias
# defined in the imports cell).
llm = OpenAI(
    max_tokens=500,
    temperature=0.9,
)

# (1) Load data

In [5]:
# News articles to index; each URL is expected to yield one loaded document.
article_urls = [
    "https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html",
    "https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html",
]
loaders = UnstructuredURLLoader(urls=article_urls)

# Fetch the pages and show how many documents came back.
data = loaders.load()
len(data)

2

# (2) Split data to create chunks

In [6]:
# Splitter configured with chunk_size=1000 and chunk_overlap=200, so
# neighbouring chunks share some text.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)

# `data` already holds documents, so split_documents is used instead of split_text.
docs = text_splitter.split_documents(data)

In [7]:
# Number of chunks produced by the splitter.
len(docs)

18

In [8]:
# Inspect the first chunk: its page text and the 'source' URL kept in its metadata.
docs[0]

Document(metadata={'source': 'https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html'}, page_content='English\n\nHindi\n\nGujarati\n\nSpecials\n\nHello, Login\n\nHello, Login\n\nLog-inor Sign-Up\n\nMy Account\n\nMy Profile\n\nMy Portfolio\n\nMy Watchlist\n\nMy Alerts\n\nMy Messages\n\nPrice Alerts\n\nMy Profile\n\nMy PRO\n\nMy Portfolio\n\nMy Watchlist\n\nMy Alerts\n\nMy Messages\n\nPrice Alerts\n\nLogout\n\nLoans up to ₹50 LAKHS\n\nFixed Deposits\n\nCredit CardsLifetime Free\n\nCredit Score\n\nChat with Us\n\nDownload App\n\nFollow us on:\n\nNetwork 18\n\nGo Ad-Free\n\nMy Alerts\n\n>->MC_ENG_DESKTOP/MC_ENG_NEWS/MC_ENG_MARKETS_AS/MC_ENG_ROS_NWS_MKTS_AS_ATF_728\n\nMoneycontrol\n\nGo PRO NowPRO\n\nMoneycontrol PRO\n\nAdvertisement\n\nRemove Ad\n\nBusiness\n\nMarkets\n\nStocks\n\nEconomy\n\nCompanies\n\nTrends\n\nIPO\n\nOpinion\n\nEV Special\n\nHomeNewsBusinessMarketsWall Street rises as Tesla soars on AI optimism\n\nTrending Topic

# (3) Create embeddings for these chunks and save them to FAISS index

In [9]:
# Embedding model used to vectorise each chunk.
embeddings = OpenAIEmbeddings()

# Embed every chunk and build the FAISS vector index from the results.
vectorindex_openai = FAISS.from_documents(documents=docs, embedding=embeddings)

In [10]:
# Persist the vector index to a local folder so it can be reloaded later.
vector_store_dir = "faiss_index"
vectorindex_openai.save_local(folder_path=vector_store_dir)
print(f"Saved FAISS index to {vector_store_dir}/")

Saved FAISS index to faiss_index/


In [11]:
# Reload the FAISS index from disk so this cell also works in a fresh session
# (i.e. without re-running the embedding and save cells).
vector_store_dir = "faiss_index"

# load_local needs an embeddings object; recreate it when the embeddings cell
# has not been run in this kernel.
if "embeddings" not in globals():
    embeddings = OpenAIEmbeddings()

if os.path.isdir(vector_store_dir):
    # NOTE(security): allow_dangerous_deserialization=True lets load_local
    # unpickle files from this folder. Only load an index you created yourself.
    vectorIndex = FAISS.load_local(
        vector_store_dir,
        embeddings,
        allow_dangerous_deserialization=True,
    )
else:
    # Nothing saved on disk yet: fall back to the index built earlier in this session.
    vectorIndex = vectorindex_openai



# (4) Retrieve similar embeddings for a given question and call LLM to retrieve final answer

In [12]:
# Retriever over the vector index, configured with k=4 results per question.
top_k_retriever = vectorIndex.as_retriever(search_kwargs={"k": 4})

# Wire the retriever and the LLM together; the default prompt of the chain is used.
chain = SimpleRetrievalQAWithSourcesChain.from_chain_type(llm=llm, retriever=top_k_retriever)
chain

<__main__.SimpleRetrievalQAWithSourcesChain at 0x256242ee310>

In [13]:
query = "what is the price of Tiago iCNG?"
# Alternative question to try:
# query = "what are the main features of punch iCNG?"

# Turn on LangChain debug tracing only for this call so the prompt sent to the
# LLM is visible here, then switch it off again so the global flag does not
# leak verbose output into every later cell.
set_debug(True)
try:
    result = chain.invoke({"question": query})
finally:
    set_debug(False)

result

[32;1m[1;3m[llm/start][0m [1m[llm:ChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "System: You are a careful research assistant. Answer using only the supplied context. Cite sources in parentheses using the provided source names. If context is empty, say you cannot answer.\nHuman: Context:\nSource: https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html\nThe company also said it has also introduced the twin-cylinder technology on its Tiago and Tigor models.\n\nThe Tiago iCNG is priced between Rs 6.55 lakh and Rs 8.1 lakh, while the Tigor iCNG comes at a price range of Rs 7.8 lakh to Rs 8.95 lakh.\n\nTata Motors Passenger Vehicles Ltd Head-Marketing, Vinay Pant said these introductions put together will make the company's CNG line up \"appealing, holistic, and stronger than ever\".\n\nPTI\n\nfirst published: Aug 4, 2023 02:17 pm\n\nDiscover the latest Business News, Sensex, and Nifty updates. Obtain Perso

{'answer': 'The price of Tiago iCNG ranges from Rs 6.55 lakh to Rs 8.1 lakh, as mentioned in the article from Moneycontrol on Tata Motors launching the Punch iCNG variant (Source: Moneycontrol). This pricing makes the Tiago iCNG an affordable option for customers looking for a CNG vehicle. Additionally, Tata Motors has introduced twin-cylinder technology on the Tiago model, enhancing its performance and fuel efficiency (Source: Moneycontrol). This move by Tata Motors aims to offer a diverse range of CNG vehicles to cater to different customer preferences and needs.',
 'sources': ['https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html']}