In [1]:
# Reference: "Langchain in your pocket" by Mehul Gupta.
# Environment used for this notebook (install before running):
#pip install chromadb==0.4.18 langchain==0.0.349 openai==1.3.8 tiktoken==0.5.2 jq youtube-transcript-api pytube unstructured pypdfium2
import sys
import os
# Make the local `api_key` module importable; this must run before the import below.
# NOTE(review): absolute, user-specific path -- the notebook will not run on another
# machine unless this directory exists there.
sys.path.append('/home/vino/api_keys')
from api_key import OPENAI_API_KEY, OWM_API_KEY
# Export the OpenAI key through the environment so the OpenAI-backed classes used
# below can be constructed without passing a key explicitly.
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

import warnings
# NOTE(review): silences every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

In [2]:
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import JSONLoader, YoutubeLoader, PyPDFium2Loader,TextLoader 
from langchain.document_loaders.csv_loader import CSVLoader

In [3]:
### Loading a text file using TextLoader.
### Preprocessing: split the loaded document into chunks.
### Embeddings using OpenAIEmbeddings. We can use other models, such as BERT as well
### Embeddings are stored in Chroma, which is a vector db.
### Using the RetrievalQA chain, build a retriever-backed question-answering chain.
### 4 types of document chain: stuff, map_reduce, refine, map_rerank

#### RAG

In [4]:
# Document loader: read the sample text file into `data`, the loader's output
# that the splitting cell consumes next.
loader = TextLoader("./data/dummy_data.txt")
data = loader.load()

In [5]:
# Document transformer: cut the loaded documents into chunks of (nominally)
# 1000 characters with no overlap between neighbouring chunks.
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(data)

In [6]:
# Embedding model used to vectorise the chunks, plus the completion model that
# answers questions. temperature=0 is used for the LLM.
embeddings = OpenAIEmbeddings()
llm = OpenAI(
    model_name="gpt-3.5-turbo-instruct",
    temperature=0,
)

In [7]:
# Vector DB: embed every chunk and index it in a Chroma store.
# NOTE(review): no collection_name is passed, so Chroma's default collection is
# used; other Chroma.from_documents calls in this kernel that also omit it may
# end up writing to the same collection -- confirm if results look mixed.
docsearch = Chroma.from_documents(texts, embeddings)

In [8]:
# Retriever: wrap the vector store as a retriever and plug it, together with
# the LLM, into a "stuff" RetrievalQA chain.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
)

In [9]:
# Ask the chain a question about the indexed document.
# NOTE(review): in the recorded output the model replies that it cannot see an
# attached file -- the wording "attached file" seems to be taken literally;
# consider asking about the document's content instead.
qa.run('explain the attached file')

Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1


" I'm sorry, I cannot see an attached file. Could you please provide more information or clarify your question?"

In [10]:
### query other file formats

In [11]:
# #CSV
# loader = CSVLoader(file_path='./data/dummy_data.csv')
# data = loader.load()

In [12]:
# #PDF
# loader = PyPDFium2Loader("./data/dummy_data.pdf")
# data = loader.load()

In [13]:
# #YouTube
# loader = YoutubeLoader.from_youtube_url(
# "https://www.youtube.com/watch?v=D0S2YOVyFUE",
#     add_video_info=True,
#     language=["en", "id"],
#     translation="en",
# )
# data = loader.load()

In [14]:
# #JSON
# loader = JSONLoader(
#     file_path='dummy_data.json',
#     text_content=False,
#     jq_schema='.[].name')

# data = loader.load()

#### Multi-Document RAG

In [15]:
from langchain.agents.agent_types import AgentType
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.agents import AgentExecutor, initialize_agent
from langchain.memory import ConversationBufferMemory
from langchain.tools import BaseTool, StructuredTool, tool

In [16]:
# Embedding model and completion LLM for the multi-document section.
# `llm` is read as a global by retriever_qa_creation and passed to the agent.
embeddings = OpenAIEmbeddings()
llm = OpenAI(
    model_name="gpt-3.5-turbo-instruct",
    temperature=0,
)

In [17]:
def retriever_qa_creation(file_name, chunk_size=100, chunk_overlap=0):
    """Build a RetrievalQA chain that answers questions from one text file.

    The file is loaded with TextLoader, split with CharacterTextSplitter,
    embedded with OpenAIEmbeddings and indexed in a Chroma collection that is
    private to this call. The chain uses the notebook-level ``llm``.

    Args:
        file_name: Path of the text file to index.
        chunk_size: Target chunk size passed to CharacterTextSplitter.
        chunk_overlap: Overlap between consecutive chunks.

    Returns:
        A "stuff" RetrievalQA chain whose retriever searches only the chunks
        of ``file_name``.
    """
    import uuid  # local import: only needed to name the collection

    documents = TextLoader(file_name).load()

    # CharacterTextSplitter cuts only at its separator, so individual chunks can
    # come out longer than chunk_size (LangChain logs a notice when they do).
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_documents(documents)

    embeddings = OpenAIEmbeddings()

    # Without an explicit collection_name every in-memory Chroma store created in
    # this kernel uses the same default collection, so a retriever built for one
    # file could return chunks from another. A unique name per call keeps each
    # file's index isolated (and avoids duplicates when the cell is re-run).
    collection_name = f"rag-{uuid.uuid4().hex}"
    db = Chroma.from_documents(texts, embeddings, collection_name=collection_name)

    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=db.as_retriever())

    return qa

In [18]:
retriever_qa1 = retriever_qa_creation('./data/dummy_data.txt')
retriever_qa2 = retriever_qa_creation('./data/dummy_data2.txt')

Created a chunk of size 259, which is longer than the specified 100
Created a chunk of size 4976, which is longer than the specified 100
Created a chunk of size 701, which is longer than the specified 100
Created a chunk of size 534, which is longer than the specified 100


In [19]:
# Agent tool backed by retriever_qa1 (the chain built from ./data/dummy_data.txt).
# LangChain's @tool takes the tool description from the docstring, so the
# docstring text is part of the agent's behaviour and is kept unchanged.
@tool
def medium_tips(query: str) -> str:
    """search to extract tips and tricks to write blogs on Medium"""
    answer = retriever_qa1.run(query)
    return answer

In [20]:
# Agent tool backed by retriever_qa2 (the chain built from ./data/dummy_data2.txt).
# LangChain's @tool takes the tool description from the docstring, so the
# docstring text is part of the agent's behaviour and is kept unchanged.
@tool
def blog_tips(query: str) -> str:
    """explains the pros and cons of writing blogs as a data scientist"""
    answer = retriever_qa2.run(query)
    return answer

In [21]:
# Tools the agent may choose between, one per indexed document.
tools = [
    medium_tips,
    blog_tips,
]

In [22]:
# Conversation memory stored under "chat_history", then a conversational
# ReAct agent that can call the tools above and logs its reasoning (verbose).
memory = ConversationBufferMemory(memory_key="chat_history")
agent_chain = initialize_agent(
    tools,
    llm,
    agent=AgentType.CONVERSATIONAL_REACT_DESCRIPTION,
    verbose=True,
    memory=memory,
)

In [23]:
# First question for the agent; the cell output is the agent's final answer.
agent_chain.run(
    {"input": "Why should Data Scientists blog??"}
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Thought: Do I need to use a tool? Yes
Action: blog_tips
Action Input: Why should Data Scientists blog?[0m
Observation: [33;1m[1;3m Data scientists should blog to share their knowledge and insights with others in the field, to showcase their skills and expertise, and to contribute to the larger data science community. Blogging can also help data scientists improve their communication and writing skills, as well as stay updated on the latest trends and developments in the field. Additionally, blogging can help data scientists build their personal brand and establish themselves as thought leaders in the industry.[0m
Thought:[32;1m[1;3m Do I need to use a tool? No
AI: Blogging can also be a great way for data scientists to network and connect with others in the field, as well as potential employers or clients. It can also be a valuable addition to a data scientist's portfolio and resume.[0m

[1m> Finished chain.[0m


"Blogging can also be a great way for data scientists to network and connect with others in the field, as well as potential employers or clients. It can also be a valuable addition to a data scientist's portfolio and resume."

In [24]:
# Second question for the same agent (the conversation memory is shared).
agent_chain.run(
    {"input": "who is vinothkumar"}
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: Do I need to use a tool? Yes
Action: medium_tips
Action Input: who is vinothkumar[0m
Observation: [36;1m[1;3m Vinothkumar is a person who has a Doctor of Philosophy in Computational Science and has experience in data-driven problem solving in both academic and industrial settings. They are proficient in managing large-scale production data and performing analytics, and have skills in statistics, machine learning, deep learning, and other technical areas. They have also published research articles and mentored others in the development of machine learning models for engineering applications.[0m
Thought:[32;1m[1;3m Do I need to use a tool? No
AI: Vinothkumar is a highly skilled data scientist with a strong background in computational science and experience in various technical areas such as statistics and machine learning. They have also published research articles and mentored others in the development of machin

'Vinothkumar is a highly skilled data scientist with a strong background in computational science and experience in various technical areas such as statistics and machine learning. They have also published research articles and mentored others in the development of machine learning models.'