In [1]:
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
from dotenv import load_dotenv, find_dotenv

In [3]:
# Load environment variables (e.g. the GROQ_API_KEY read a few cells below)
# from the .env file located by find_dotenv().
load_dotenv(find_dotenv())

True

In [4]:
import os

In [6]:
# Fail fast with an actionable message when the Groq key is missing.
# os.getenv() returns None for an unset variable, and assigning None into
# os.environ raises an opaque "TypeError: str expected, not NoneType".
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set. Add it to your .env file or export it "
        "before running this notebook."
    )
# Re-export so libraries that read the key from the environment can see it.
os.environ["GROQ_API_KEY"] = groq_api_key

In [7]:
import bs4

### **Data ingestion**

In [10]:
# Only parse elements carrying these CSS classes, so the rest of the page
# markup never reaches the document text (judging by the class names and the
# scraped output, these are the leadership listing and the per-leader bio
# modals).
leadership_strainer = bs4.SoupStrainer(
    class_=("new-leadership-section", "modal fade leader-modal lead-modal")
)

# Single-page loader for the leadership page; the strainer is forwarded to
# BeautifulSoup through bs_kwargs.
loader = WebBaseLoader(
    web_paths=("https://www.datanetiix.com/leadership.php",),
    bs_kwargs={"parse_only": leadership_strainer},
)

In [11]:
# Fetch and parse the configured page (network request). Only one URL is
# configured, and the following cells read the result as docs[0].
docs=loader.load()

In [12]:
docs[0].metadata

{'source': 'https://www.datanetiix.com/leadership.php'}

In [15]:
print(docs[0].page_content)








President/Co-Founder


Sasi Kannan
President/Co-Founder







Founder/CEO


Bala Sriraghavan
Founder/CEO







Principal, CISO


Rafael Linares
Principal, CISO







Sr. Director - Delivery/Offshore Head


Sornalingam Kadirvelan Ravi (Ravi)
Sr. Director - Delivery/Offshore Head









Sr. VP – Enterprise Sales


Phillip Van Poole
Sr. VP – Enterprise Sales







Director - Sales & Marketing


Anantha Krishnan, MBA
Director - Sales & Marketing










Phillip Van Poole
Sr. VP – Enterprise Sales
×



Phillip Van Poole boasts a remarkable career spanning over 32 years in sales and marketing management, with roots tracing back to his early education in the vibrant entertainment industry of New York City. Mentored by Mrs. Jan Burger at Youth Activists Now (YAN), Van Poole's journey commenced at age 11, where he showcased exceptional business acumen within the recording industry. His tenure at Atlantic Records marked a significant milestone, becoming the youngest National Direc

In [16]:
# Groq-hosted chat model used as the generation step of the RAG chain.
# Presumably authenticates via the GROQ_API_KEY environment variable set
# earlier in the notebook — confirm against the langchain_groq docs.
llm=ChatGroq(model="llama-3.3-70b-versatile")

In [17]:
# Identifier of the embedding model passed to HuggingFaceBgeEmbeddings below.
model_name="BAAI/bge-small-en"

In [18]:
# Keyword arguments forwarded to the embedding model; requests the CPU device.
model_kwargs={"device": "cpu"}

In [19]:
# Keyword arguments forwarded at encode time; asks for normalized embeddings
# (presumably unit-length vectors — confirm in the embeddings class docs).
encode_kwargs={"normalize_embeddings": True}

In [20]:
# Build the embedding function from the three configuration values defined in
# the cells above.
# NOTE(review): the captured cell output shows a warning pointing at this
# constructor call — likely LangChain's deprecation notice for
# HuggingFaceBgeEmbeddings. Confirm, and consider migrating to the
# replacement class the warning names.
hf_embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

  hf_embeddings=HuggingFaceBgeEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm





In [21]:
docs[0].page_content

"\n\n\n\n\n\n\nPresident/Co-Founder\n\n\nSasi Kannan\nPresident/Co-Founder\n\n\n\n\n\n\n\nFounder/CEO\n\n\nBala Sriraghavan\nFounder/CEO\n\n\n\n\n\n\n\nPrincipal, CISO\n\n\nRafael Linares\nPrincipal, CISO\n\n\n\n\n\n\n\nSr. Director - Delivery/Offshore Head\n\n\nSornalingam Kadirvelan Ravi (Ravi)\nSr. Director - Delivery/Offshore Head\n\n\n\n\n\n\n\n\n\nSr. VP – Enterprise Sales\n\n\nPhillip Van Poole\nSr. VP – Enterprise Sales\n\n\n\n\n\n\n\nDirector - Sales & Marketing\n\n\nAnantha Krishnan, MBA\nDirector - Sales & Marketing\n\n\n\n\n\n\n\n\n\n\nPhillip Van Poole\nSr. VP – Enterprise Sales\n×\n\n\n\nPhillip Van Poole boasts a remarkable career spanning over 32 years in sales and marketing management, with roots tracing back to his early education in the vibrant entertainment industry of New York City. Mentored by Mrs. Jan Burger at Youth Activists Now (YAN), Van Poole's journey commenced at age 11, where he showcased exceptional business acumen within the recording industry. His tenu

In [22]:
# Split the scraped page into overlapping chunks for embedding. Sizes are
# measured by the splitter's length function (characters unless overridden —
# TODO confirm for the installed version); the 200-unit overlap keeps text
# that straddles a boundary present in both neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

In [23]:
len(splits)

16

In [24]:
# The chunks were already produced two cells above; splitting the same
# documents again only repeats that work. Validate the result instead, so an
# empty scrape fails here with a clear message rather than surfacing later
# in the pipeline.
assert splits, "No chunks were produced - check that the loader returned page content."

In [25]:
# Embed every chunk with hf_embeddings and index the resulting vectors in a
# FAISS vector store.
vectorstore = FAISS.from_documents(documents=splits,embedding=hf_embeddings)

In [26]:
# Expose the vector store through the retriever interface used by the chain.
# No search arguments are passed, so library defaults apply (LangChain
# typically returns the top 4 matches — TODO confirm for this version).
retriever=vectorstore.as_retriever()

In [27]:
len(docs[0].page_content)

9993

### **Data Retrieval Pipeline**

In [28]:
# Download the shared RAG prompt from LangChain Hub (network call). No commit
# is pinned here, so the template can change between runs; the commit hash
# that was actually fetched appears in the prompt's metadata when displayed.
prompt = hub.pull("rlm/rag-prompt")

In [29]:
import pprint

In [30]:
prompt.messages

[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})]

In [31]:
pprint.pprint(prompt.messages)

[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})]


In [32]:
prompt

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})])

In [33]:
def format_docs(docs):
    """Concatenate the text of retrieved documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string
            attribute (e.g. the documents returned by the retriever).

    Returns:
        The ``page_content`` values in their original order, separated by a
        blank line; an empty string when ``docs`` yields nothing.
    """
    separator = "\n\n"
    chunk_texts = [document.page_content for document in docs]
    return separator.join(chunk_texts)

In [34]:
from langchain_core.runnables import RunnablePassthrough

In [35]:
# Assemble the RAG pipeline with LCEL:
#   1. fan the incoming question out to the retriever (whose hits are
#      flattened to text by format_docs) and also pass it through unchanged,
#   2. fill the prompt's {context} and {question} slots,
#   3. call the chat model,
#   4. reduce the model's message to a plain string.
rag_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}
rag_chain = rag_inputs | prompt | llm | StrOutputParser()

### **Data generation**

In [37]:
# Run the chain end to end: the input string is used both as the retrieval
# query and as the {question} value in the prompt.
rag_chain.invoke("who is the co founder alone")

"The co-founder mentioned is Sasi Kannan, who is the President/Co-Founder. There is no other co-founder mentioned, only a founder, Bala Sriraghavan. I don't know if there are any other co-founders beyond Sasi Kannan."