In [2]:
import os
from langchain_community.document_loaders import WebBaseLoader
import bs4
import openai
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.vectorstores import Chroma
from langchain_experimental.text_splitter import SemanticChunker
from dotenv import load_dotenv




In [3]:
# Load variables from a local .env file into the process environment.
# Presumably this is where the OpenAI credentials used by the cells below
# come from — confirm against the project's .env.
load_dotenv()

True

#### INDEXING ####

In [4]:
# Only elements carrying one of these CSS classes are kept by the parser.
# NOTE(review): these class names look like blog-post markup; confirm they
# actually occur on the target page, otherwise little or no text is indexed.
content_filter = bs4.SoupStrainer(
    attrs={"class": ["post-content", "post-title", "post-header"]}
)

# Loader for the single page that the index is built from.
loader = WebBaseLoader(
    web_paths=["https://www.moneycontrol.com/"],
    bs_kwargs={"parse_only": content_filter},
)

In [5]:
# Fetch and parse the page(s) configured on the loader. This performs a
# network request, so re-running the cell may return different content.
docs = loader.load()

In [6]:
# Embedding model the chunker uses to decide where to break the text.
chunking_embeddings = OpenAIEmbeddings()

# Split the loaded documents into chunks ready for indexing.
text_splitter = SemanticChunker(chunking_embeddings)
splits = text_splitter.split_documents(docs)

In [7]:
# Embed every chunk and index it in a Chroma vector store.
# No persist directory is passed, so the index presumably lives only for this
# kernel session — confirm if persistence is expected.
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

# Retriever interface over the store, using the library's default search settings.
retriever = vectorstore.as_retriever()


#### RETRIEVAL and GENERATION ####

In [8]:

# Fetch the RAG prompt template from the LangChain Hub by its owner/name handle.
# NOTE(review): no revision is pinned here, so upstream edits to the prompt
# would presumably change results between runs — confirm and pin if needed.
prompt = hub.pull("jclemens24/rag-prompt")
def format_docs(docs):
    """Concatenate the text of the given documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values in their original order, separated by a
        blank line. An empty iterable yields an empty string.
    """
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)
# Chat model that writes the final answer from the filled-in prompt.
llm = ChatOpenAI(
    model_name="gpt-4o-mini",
)

In [9]:
# The incoming query feeds both prompt inputs: it is passed to the retriever
# (whose hits are flattened to text by format_docs) and also forwarded
# unchanged as the question.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# inputs -> prompt -> chat model -> plain string
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

In [13]:
# Run the whole chain for one query; the returned string is the cell output.
# NOTE(review): the input is a bare topic rather than a question — confirm
# that the pulled prompt handles that as intended.
rag_chain.invoke("Nifty 50")

'Nifty 50 is a stock market index that represents the weighted average of 50 of the largest and most actively traded stocks on the National Stock Exchange of India (NSE). It is considered a benchmark for the Indian equity market and helps investors gauge the overall performance of the market.'