## LANGCHAIN

## IMPORTS

In [17]:
from langchain_community.document_loaders import TextLoader,PyPDFLoader,WebBaseLoader,ArxivLoader,WikipediaLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

## STEP-1: DATA INGESTION USING DOCUMENT LOADERS

In [5]:
# text loader
# Read document.txt with an explicit encoding so the result does not depend on
# the platform's default locale (the text contains non-ASCII punctuation such
# as curly quotes).
# NOTE: assumes document.txt is saved as UTF-8 -- adjust the encoding if not.
loader_text = TextLoader('document.txt', encoding='utf-8')
text_document = loader_text.load()
print(text_document)

[Document(metadata={'source': 'document.txt'}, page_content='Off-spin bowling is one of the most subtle and intellectually demanding arts in the game of cricket. Unlike fast bowling, which relies on pace and physical force, off-spin is built on control, deception, and deep tactical awareness. An off-spinner aims to outthink the batter, using flight, turn, drift, and variations in pace to create opportunities for dismissal. Though it may appear gentle compared to express pace, off-spin has been responsible for some of the greatest moments and match-winning performances in cricket history.\n\nOff-spin is delivered by a right-arm bowler who imparts spin on the ball using the fingers, causing it to turn from the off side to the leg side when bowling to a right-handed batter. This direction of spin is known as “off-break.” The bowler typically uses the index finger as the main source of rotation, rolling it down the side of the ball at release. The grip is crucial: the ball rests lightly in

In [55]:
# pdf loader
# Point PyPDFLoader at the local PDF, load it, and display what came back.
pdf_path = 'document.pdf'
loader_pdf = PyPDFLoader(pdf_path)
pdf_document = loader_pdf.load()
print(pdf_document)



In [10]:
# web based loader
# Fetch the page at the given URL through WebBaseLoader and display the
# loaded result.
page_url = 'https://en.wikipedia.org/wiki/Agentic_AI'
loader_web = WebBaseLoader(page_url)
web_document = loader_web.load()
print(web_document)



In [14]:
# arxiv loader
# Query arXiv by identifier (at most 2 documents requested) and display the
# loaded result.
arxiv_query = '1706.03762'
loader_arxiv = ArxivLoader(
    query=arxiv_query,
    load_max_docs=2,
)
arxiv_document = loader_arxiv.load()
print(arxiv_document)

[Document(metadata={'Published': '2023-08-02', 'Title': 'Attention Is All You Need', 'Authors': 'Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin', 'Summary': 'The dominant sequence transduction models are based on complex recurrent or convolutional neural networks in an encoder-decoder configuration. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles by over 2 BLEU. On the WMT 2014 English-to-French translation 

In [16]:
# wikipedia loader
# Look up a single Wikipedia entry for the topic and display the loaded
# result.
wiki_topic = "Mumbai"
loader_wikipedia = WikipediaLoader(
    query=wiki_topic,
    load_max_docs=1,
)
wikipedia_document = loader_wikipedia.load()
print(wikipedia_document)

[Document(metadata={'title': 'Mumbai', 'summary': "Mumbai ( muum-BY; Marathi: Mumbaī, pronounced [ˈmumbəi] ), also known as Bombay ( bom-BAY; its official name until 1995), is the capital city of the Indian state of Maharashtra. Mumbai is the financial capital and the most populous city proper of India with an estimated population of 12.5 million (1.25 crore). Mumbai is the centre of the Mumbai Metropolitan Region, which is among the most populous metropolitan areas in the world with a population of over 23 million (2.3 crore). Mumbai lies on the Konkan coast on the west coast of India and has a deep natural harbour. In 2008, Mumbai was named a alpha world city. Mumbai has the highest number of billionaires out of any city in Asia.\nThe seven islands that constitute Mumbai were earlier home to communities of Marathi language-speaking Koli people. For centuries, the seven islands of Bombay were under the control of successive indigenous rulers before being ceded to the Portuguese Empire

## STEP-2: DATA SPLITTING USING SPLITTERS

In [52]:
# recursive character text-splitter
# One splitter instance is shared by all three calls below, so every result
# uses the same chunk_size=500 / chunk_overlap=50 settings.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50
)

# split_documents() takes a list of Document objects; here it is the list
# produced by the PDF loader cell (pdf_document).
chunked_pdf = text_splitter.split_documents(pdf_document)

# Read the raw text of document.txt so it can be split directly as a string.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale.
# NOTE: assumes document.txt is saved as UTF-8 -- adjust the encoding if not.
with open('document.txt', encoding='utf-8') as f:
    off_spin_bowling_info = f.read()

# create_documents() works on a list of strings and wraps each chunk in a
# Document.
chunked_text = text_splitter.create_documents([off_spin_bowling_info])

# split_text() works on a single string and yields the chunks as plain
# strings.
chunked_text_2 = text_splitter.split_text(off_spin_bowling_info)