# Load Data Source

In [1]:
## Data Ingestion steps

from langchain_community.document_loaders import TextLoader
loader = TextLoader("c:/Users/vamsh/OneDrive/Desktop/Langchain/Langchain/rag/speech.txt")
text_documents = loader.load()
text_documents

[Document(metadata={'source': 'c:/Users/vamsh/OneDrive/Desktop/Langchain/Langchain/rag/speech.txt'}, page_content='The world must be made safe for democracy. Its peace must be planted upon the tested foundations of political liberty. We have no selfish ends to serve. We desire no conquest, no dominion. We seek no indemnities for ourselves, no material compensation for the sacrifices we shall freely make. We are but one of the champions of the rights of mankind. We shall be satisfied when those rights have been made as secure as the faith and the freedom of nations can make them.\n\nJust because we fight without rancor and without selfish object, seeking nothing for ourselves but what we shall wish to share with all free peoples, we shall, I feel confident, conduct our operations as belligerents without passion and ourselves observe with proud punctilio the principles of right and of fair play we profess to be fighting for.\n\nâ€¦\n\nIt will be all the easier for us to conduct ourselves

In [2]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
os.environ["OPENAI_API_KEY"]=os.getenv("OPENAI_API_KEY")
os.environ["LANGCHAIN_TRACKING"]="true"
os.environ["LANGCHAIN_API_KEY"]=os.getenv("LANGCHAIN_API_KEY")


In [4]:
from langchain_community.document_loaders import WebBaseLoader
from bs4 import SoupStrainer


loader = WebBaseLoader(web_path="https://en.wikipedia.org/wiki/Radio",
                       bs_kwargs={"parse_only": SoupStrainer("p")})

text_documents = loader.load()

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [5]:
text_documents

[Document(metadata={'source': 'https://en.wikipedia.org/wiki/Radio'}, page_content='Radio is the technology of communicating using radio waves.[1][2][3] Radio waves are electromagnetic waves of frequency between 3\xa0hertz (Hz) and 300\xa0gigahertz (GHz). They are generated by an electronic device called a transmitter connected to an antenna which radiates oscillating electrical energy, often characterized as a wave. They can be received by other antennas connected to a radio receiver; this is the fundamental principle of radio communication. In addition to communication, radio is used for radar, radio navigation, remote control, remote sensing, and other applications.\nIn radio communication, used in radio and television broadcasting, cell phones, two-way radios, wireless networking, and satellite communication, among numerous other uses, radio waves are used to carry information across space from a transmitter to a receiver, by modulating the radio signal (impressing an information s

In [6]:
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader("c:/Users/vamsh/OneDrive/Desktop/Langchain/Langchain/rag/attention.pdf")
docs = loader.load()
docs

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-08-03T00:07:29+00:00', 'author': '', 'keywords': '', 'moddate': '2023-08-03T00:07:29+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'c:/Users/vamsh/OneDrive/Desktop/Langchain/Langchain/rag/attention.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗ †\nUniversity of Toronto\naid

# Transformation

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents1 = text_splitter.split_documents(docs)
documents1

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-08-03T00:07:29+00:00', 'author': '', 'keywords': '', 'moddate': '2023-08-03T00:07:29+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'c:/Users/vamsh/OneDrive/Desktop/Langchain/Langchain/rag/attention.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗ †\nUniversity of Toronto\naid

# Vector Embeddings and Vector Store 

In [9]:
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

db = Chroma.from_documents(documents1[:20], OpenAIEmbeddings())

In [16]:
## Vector database

query="An attention function can be described as mapping a query "
result = db.similarity_search(query)
result[0].page_content

'3.2 Attention\nAn attention function can be described as mapping a query and a set of key-value pairs to an output,\nwhere the query, keys, values, and output are all vectors. The output is computed as a weighted sum\n3'

In [18]:
#FAISS vector Database

from langchain_community.vectorstores import FAISS
db1 = FAISS.from_documents(documents1[:20], OpenAIEmbeddings())
db1

<langchain_community.vectorstores.faiss.FAISS at 0x22d60657500>

In [19]:
query="An attention function can be described as mapping a query "
result = db1.similarity_search(query)
result[0].page_content

'3.2 Attention\nAn attention function can be described as mapping a query and a set of key-value pairs to an output,\nwhere the query, keys, values, and output are all vectors. The output is computed as a weighted sum\n3'