# LOAD

In [1]:
## Data Ingestion
# Read the plain-text speech transcript from the working directory into
# a list of LangChain Document objects.
from langchain_community.document_loaders import TextLoader

loader = TextLoader(file_path="speech.txt")
text_document = loader.load()

# Bare last expression: display the loaded documents as the cell output.
text_document


[Document(page_content="Mr. Speaker, Mr. Vice President, Members of Congress, my fellow Americans:\n\nTonight marks the eighth year that I’ve come here to report on the State of the Union. And for this final one, I’m going to try to make it a little shorter. (Applause.) I know some of you are antsy to get back to Iowa. (Laughter.) I've been there. I'll be shaking hands afterwards if you want some tips. (Laughter.)\n\nAnd I understand that because it’s an election season, expectations for what we will achieve this year are low. But, Mr. Speaker, I appreciate the constructive approach that you and the other leaders took at the end of last year to pass a budget and make tax cuts permanent for working families. So I hope we can work together this year on some bipartisan priorities like criminal justice reform -- (applause) -- and helping people who are battling prescription drug abuse and heroin abuse. (Applause.) So, who knows, we might surprise the cynics again.\n\nBut tonight, I want to

In [2]:
import os
from dotenv import load_dotenv

# Pull variables from a local .env file (if present) into the process
# environment so the OpenAI client can find the API key.
load_dotenv()

# os.getenv returns None when the variable is missing, and assigning None into
# os.environ raises an opaque "TypeError: str expected, not NoneType".
# Validate first so a missing key fails with an actionable message instead.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Define it in the environment or in a .env "
        "file next to this notebook before running the embedding cells."
    )
os.environ["OPENAI_API_KEY"] = openai_api_key


In [3]:
# Web based loader

from langchain_community.document_loaders import WebBaseLoader
import bs4

# Only parse the HTML elements whose CSS class marks the post title, header
# or body; everything else on the page is skipped by BeautifulSoup.
content_filter = bs4.SoupStrainer(
    class_=("post-title", "post-content", "post-header")
)

# Fetch the blog post and load the filtered content as Document objects.
loader = WebBaseLoader(
    web_path="https://lilianweng.github.io/posts/2023-06-23-agent/",
    bs_kwargs={"parse_only": content_filter},
)

# Note: this rebinds `loader` and `text_document`, replacing the values
# produced by the TextLoader cell above.
text_document = loader.load()
text_document

[Document(page_content='\n\n      LLM Powered Autonomous Agents\n    \nDate: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng\n\n\nBuilding agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.\nAgent System Overview#\nIn a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:\n\nPlanning\n\nSubgoal and decomposition: The agent breaks down large tasks into smaller, manageable subgoals, enabling efficient handling of complex tasks.\nReflection and refinement: The agent can do self-criticism and self-reflection over past actions, learn from mistakes and refine them for future steps, thereby improving the quality of final re

In [4]:
# PDF reader
# Parse the local PDF into a list of Document objects for the TRANSFORM step.
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("langchain-assist.pdf")
pdf_document = loader.load()

# Display the parsed documents as the cell output.
pdf_document

[Document(page_content='                                                                                                               e-ISSN: 2582 -5208 \nInternatio nal  Research Journal  of  Modernization in Engineering Technology  and  Science  \n( Peer -Reviewed, Open Access, Fully Refereed International Journal )  \nVolume:05/Issue:07/July -2023                   Impact Factor - 7.868                                    www .irjmets.com                        \nwww.irjmets.com                               @International Research Journal of Modernization in Engineering, Technology and Science  \n [2796 ] \nLANGCHAIN -POWERED VIRTUAL ASSISTANT FOR PDF  \nCOMMUNICATION  \nNR Tejaswini*1, Vidya S*2, Dr. T Vijaya Kumar*3 \n*1Student , Master Of Computer Applications , Bangalore Institute Of Technology , Bangalore , India . \n*2Assistant  Professor , Master Of Computer Applications , Bangalore Institute Of  \nTechnology , Bangalore , India . \n*3Professor , Master Of Computer Applicati

# TRANSFORM

In [5]:
# Divide the pdf document into chunks

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Target chunk length of 1000 with a 200 overlap between neighbouring chunks,
# so text cut at a chunk boundary still appears intact in one of the two chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)

# Split every loaded PDF document; the result is a flat list of chunk Documents.
documents = text_splitter.split_documents(documents=pdf_document)
documents

[Document(page_content='e-ISSN: 2582 -5208 \nInternatio nal  Research Journal  of  Modernization in Engineering Technology  and  Science  \n( Peer -Reviewed, Open Access, Fully Refereed International Journal )  \nVolume:05/Issue:07/July -2023                   Impact Factor - 7.868                                    www .irjmets.com                        \nwww.irjmets.com                               @International Research Journal of Modernization in Engineering, Technology and Science  \n [2796 ] \nLANGCHAIN -POWERED VIRTUAL ASSISTANT FOR PDF  \nCOMMUNICATION  \nNR Tejaswini*1, Vidya S*2, Dr. T Vijaya Kumar*3 \n*1Student , Master Of Computer Applications , Bangalore Institute Of Technology , Bangalore , India . \n*2Assistant  Professor , Master Of Computer Applications , Bangalore Institute Of  \nTechnology , Bangalore , India .', metadata={'source': 'langchain-assist.pdf', 'page': 0}),
 Document(page_content='*2Assistant  Professor , Master Of Computer Applications , Bangalore Ins

In [6]:
# Vector Embeddings and Vector Store

# Chroma vector database
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# Embed only the first 20 chunks with OpenAI embeddings and index them in Chroma.
db = Chroma.from_documents(
    documents=documents[:20],
    embedding=OpenAIEmbeddings(),
)


In [9]:

# Retrieve the chunks in the Chroma store that are most similar to the question.
query = "Who are the authors of this research paper?"
result = db.similarity_search(query=query)

# Display the matching Documents as the cell output.
result

[Document(page_content='e-ISSN: 2582 -5208 \nInternatio nal  Research Journal  of  Modernization in Engineering Technology  and  Science  \n( Peer -Reviewed, Open Access, Fully Refereed International Journal )  \nVolume:05/Issue:07/July -2023                   Impact Factor - 7.868                                    www .irjmets.com                        \nwww.irjmets.com                               @International Research Journal of Modernization in Engineering, Technology and Science  \n [2800 ] \nHere we got our output for our 1st and 2nd query which is “What is Cloud Computing?” and “What are the \nArchitectural styles based on independent components?” our Large Language Model went through file and gave \nan accurate result on the query given.  \n \nFigure 7 . The output we got for Different Question', metadata={'page': 4, 'source': 'langchain-assist.pdf'}),
 Document(page_content='e-ISSN: 2582 -5208 \nInternatio nal  Research Journal  of  Modernization in Engineering Technology

In [10]:
# FAISS Vector database
from langchain_community.vectorstores import FAISS

# Index the same first 20 chunks in FAISS, using a fresh OpenAIEmbeddings instance.
db_1 = FAISS.from_documents(
    documents=documents[:20],
    embedding=OpenAIEmbeddings(),
)


In [11]:
# Run the same question against the FAISS index.
query = "Who are the authors of this research paper?"
result = db_1.similarity_search(query=query)

# Display the matching Documents as the cell output.
result

[Document(page_content='e-ISSN: 2582 -5208 \nInternatio nal  Research Journal  of  Modernization in Engineering Technology  and  Science  \n( Peer -Reviewed, Open Access, Fully Refereed International Journal )  \nVolume:05/Issue:07/July -2023                   Impact Factor - 7.868                                    www .irjmets.com                        \nwww.irjmets.com                               @International Research Journal of Modernization in Engineering, Technology and Science  \n [2800 ] \nHere we got our output for our 1st and 2nd query which is “What is Cloud Computing?” and “What are the \nArchitectural styles based on independent components?” our Large Language Model went through file and gave \nan accurate result on the query given.  \n \nFigure 7 . The output we got for Different Question', metadata={'source': 'langchain-assist.pdf', 'page': 4}),
 Document(page_content='e-ISSN: 2582 -5208 \nInternatio nal  Research Journal  of  Modernization in Engineering Technology