# Data Loading

In [None]:
#Install html2text
%pip install html2text==2024.2.26
#Installing bs4 package
%pip install bs4==0.0.2 
#Installing lxml
%pip install lxml==5.2.2 
# Install the Sentence Transformers library
%pip install sentence_transformers==2.7.0 
# Install FAISS-CPU
%pip install faiss-cpu #==1.8.0
%pip install faiss-gpu
%pip install langchain_huggingface
%pip install ragas

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Fetch the Wikipedia article and convert its HTML into text.
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.document_transformers import Html2TextTransformer

# URL of the Wikipedia page on the 2023 Cricket World Cup
url = "https://en.wikipedia.org/wiki/2023_Cricket_World_Cup"

# Download the page
loader = AsyncHtmlLoader(url)
data = loader.load()

# Convert the downloaded HTML documents
html2text = Html2TextTransformer()
data_transformed = html2text.transform_documents(data)

# Show only the beginning of the article: printing the whole page floods the
# cell output and bloats the saved notebook. Raise PREVIEW_CHARS to see more.
PREVIEW_CHARS = 1000
print(data_transformed[0].page_content[:PREVIEW_CHARS])

# Data Splitting

In [None]:
# Two-stage split: first along the page's HTML headings, then into
# overlapping character chunks.
from langchain_text_splitters import (
    HTMLHeaderTextSplitter,
    RecursiveCharacterTextSplitter,
)

# Wikipedia page to split
url = "https://en.wikipedia.org/wiki/2023_Cricket_World_Cup"

# (tag, label) pairs for heading levels h1 through h4
headers_to_split_on = [(f"h{level}", f"Header {level}") for level in range(1, 5)]

# Splitter configured with the heading tags above
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# Split the content found at `url` on those headings
html_header_splits = html_splitter.split_text_from_url(url)

# Second pass over the header-level splits with chunk_size=1000 and
# chunk_overlap=200
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
chunks = text_splitter.split_documents(html_header_splits)

# Data Conversion (Embeddings)

In [None]:
# Import HuggingFaceEmbeddings from the dedicated langchain_huggingface
# package (installed in the first cell). The copy exposed by
# langchain_community.embeddings is deprecated, and the Storage section
# imports the class from langchain_huggingface as well, so both sections use
# the same class.
from langchain_huggingface import HuggingFaceEmbeddings

# Instantiate the embeddings model. The model_name can be changed as desired
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-l6-v2")

# Create one embedding for the text of every chunk
chunk_embedding = embeddings.embed_documents([chunk.page_content for chunk in chunks])

# Storage

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model settings: all-mpnet-base-v2, loaded on the CPU, with
# embedding normalization switched off.
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = dict(device="cpu")
encode_kwargs = dict(normalize_embeddings=False)

# Embeddings object passed to the vector store in the next cell
hf = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

In [None]:
# FAISS vector store class
from langchain_community.vectorstores import FAISS

# Build the vector store from the chunks, embedding them with `hf`
db = FAISS.from_documents(documents=chunks, embedding=hf)

# Synthetic test dataset generation

In [None]:
# This repeats the load-and-convert steps of the Data Loading section and
# rebinds `url`, `loader`, `data`, `html2text` and `data_transformed`.
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.document_transformers import Html2TextTransformer

# Source article: Wikipedia page on the 2023 Cricket World Cup
url = "https://en.wikipedia.org/wiki/2023_Cricket_World_Cup"

# Download the page
loader = AsyncHtmlLoader(url)
data = loader.load()

# Convert the downloaded HTML documents; the result feeds the test-set
# generator below
html2text = Html2TextTransformer()
data_transformed = html2text.transform_documents(data)

# ragas test-set generator, its question evolution types, and the
# OpenAI-backed LangChain models
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# The generator and the critic both use gpt-4o-mini.
# NOTE(review): no API key is passed here; presumably OPENAI_API_KEY has to be
# set in the environment. Confirm before running.
generator_llm = ChatOpenAI(model="gpt-4o-mini")
critic_llm = ChatOpenAI(model="gpt-4o-mini")
# NOTE(review): this rebinds `embeddings`, which the Data Conversion section
# assigned to a HuggingFaceEmbeddings instance.
embeddings = OpenAIEmbeddings()

# Assemble the generator from the three models
generator = TestsetGenerator.from_langchain(generator_llm, critic_llm, embeddings)

# Request 20 test samples from the converted article, weighted
# 0.5 simple / 0.25 reasoning / 0.25 multi_context
testset = generator.generate_with_langchain_docs(
    data_transformed,
    test_size=20,
    distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25},
)

# Evaluations

# Build a RAG agent with CrewAI

In [None]:
# Install CrewAI and its companion tools package into the kernel's environment.
# NOTE(review): neither package is version-pinned, unlike most installs in the
# first cell. Consider pinning for reproducibility.
%pip install crewai crewai_tools

In [None]:
from crewai import Crew, Task, Agent, LLM
from crewai_tools import RagTool

In [None]:
# CrewAI LLM handle: model "openai/gpt-4" with max_tokens set to 1024
llm = LLM(
    model="openai/gpt-4",
    max_tokens=1024,
)

In [None]:
# Settings passed to RagTool below: OpenAI is the provider for both the LLM
# (gpt-4) and the embedding model (text-embedding-ada-002).
config = dict(
    llm=dict(
        provider="openai",
        config=dict(model="gpt-4"),
    ),
    embedding_model=dict(
        provider="openai",
        config=dict(model="text-embedding-ada-002"),
    ),
)

In [None]:

# Create the RAG tool from `config`, with chunk_size=1200 and
# chunk_overlap=200
rag_tool = RagTool(
    config=config,
    chunk_size=1200,
    chunk_overlap=200,
)

# Add the PDF to the tool as a "pdf_file" source; the path is relative to the
# working directory
rag_tool.add(
    "Structured_AI_development.pdf",
    data_type="pdf_file",
)
# NOTE(review): a commented-out alternative path was left here:
# "model_cards/facebook/bart-large-mnli/README.md". It is presumably another
# document that was tried; confirm whether it is still needed.