Tagging and Extraction Using OpenAI functions

In [None]:
import os
import openai

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file
openai.api_key = os.environ['OPENAI_API_KEY']

In [3]:

from typing import List
from pydantic import BaseModel, Field
from langchain_core.utils.function_calling import convert_to_openai_function

In [4]:
#creating a pydantic model for tagging
class Tagging(BaseModel):
    """Tag the piece of text with particular info."""
    sentiment: str = Field(description="sentiment of text, should be `pos`, `neg`, or `neutral`")
    language: str = Field(description="language of text (should be ISO 639-1 code)")

In [5]:
convert_to_openai_function(Tagging)

{'name': 'Tagging',
 'description': 'Tag the piece of text with particular info.',
 'parameters': {'type': 'object',
  'properties': {'sentiment': {'type': 'string'},
   'language': {'type': 'string'}},
  'required': ['sentiment', 'language']}}

In [11]:
#Now, lets do actual tagging - we need a prompt and language model
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

In [12]:
#creating a model - we want to be deterministing
model = ChatOpenAI(temperature=0)

In [13]:
#create tagging functions
tagging_functions = [convert_to_openai_function(Tagging)]

In [14]:
#Prompt to tell the language model how to do it
prompt = ChatPromptTemplate.from_messages([
    ("system", "Think carefully, and then tag the text as instructed"),
    ("user", "{input}")
])

In [15]:
#create model with functions - bind - add functions to model - add function call to foroce it to tag everytime
model_with_functions = model.bind(
    functions=tagging_functions,
    function_call={"name": "Tagging"}
)

In [16]:
#create tagging chain
tagging_chain = prompt | model_with_functions

In [17]:
tagging_chain.invoke({"input": "I love langchain"})

AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{"sentiment":"positive","language":"English"}', 'name': 'Tagging'}}, response_metadata={'token_usage': {'completion_tokens': 10, 'prompt_tokens': 75, 'total_tokens': 85}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-13a35c7b-6b0d-4a35-9786-9db20b68c283-0', usage_metadata={'input_tokens': 75, 'output_tokens': 10, 'total_tokens': 85})

In [18]:
tagging_chain.invoke({"input": "non mi piace questo cibo"})

AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{"sentiment":"negative","language":"Italian"}', 'name': 'Tagging'}}, response_metadata={'token_usage': {'completion_tokens': 10, 'prompt_tokens': 78, 'total_tokens': 88}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-44fcd720-575a-45dc-9fb1-fdedb5be8757-0', usage_metadata={'input_tokens': 78, 'output_tokens': 10, 'total_tokens': 88})

In [19]:
#Now we want to add output parser to pick and chose the information we want instead of everything
#It takes the AI message, parses out the JSON, and gives us values of arguments
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser

In [20]:
tagging_chain = prompt | model_with_functions | JsonOutputFunctionsParser()

In [21]:
tagging_chain.invoke({"input": "non mi piace questo cibo"})

{'sentiment': 'negative', 'language': 'Italian'}

Extraction - Extraction is similar to tagging, but used for extracting multiple pieces of information.

In [22]:
from typing import Optional
class Person(BaseModel):
    """Information about a person."""
    name: str = Field(description="person's name")
    age: Optional[int] = Field(description="person's age")

In [23]:
class Information(BaseModel):
    """Information to extract."""
    people: List[Person] = Field(description="List of info about people")

In [24]:
convert_to_openai_function(Information)

Argument people of type typing.List[__main__.Person] from function Information could not be not be converted to a JSON schema.


{'name': 'Information',
 'description': 'Information to extract.',
 'parameters': {'type': 'object', 'properties': {}, 'required': ['people']}}

In [25]:
#setup extraction models
extraction_functions = [convert_pydantic_to_openai_function(Information)]
extraction_model = model.bind(functions=extraction_functions, function_call={"name": "Information"})

In [26]:
extraction_model.invoke("Joe is 30, his mom is Martha")

AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{"people":[{"name":"Joe","age":30},{"name":"Martha","age":null}]}', 'name': 'Information'}}, response_metadata={'token_usage': {'completion_tokens': 21, 'prompt_tokens': 95, 'total_tokens': 116}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-86c16b45-5567-4316-a554-02e22cad6b14-0', usage_metadata={'input_tokens': 95, 'output_tokens': 21, 'total_tokens': 116})

In [27]:
prompt = ChatPromptTemplate.from_messages([
    ("system", "Extract the relevant information, if not explicitly provided do not guess. Extract partial info"),
    ("human", "{input}")
])

In [28]:
extraction_chain = prompt | extraction_model

In [29]:
extraction_chain.invoke({"input": "Joe is 30, his mom is Martha"})

AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{"people":[{"name":"Joe","age":30},{"name":"Martha","age":null}]}', 'name': 'Information'}}, response_metadata={'token_usage': {'completion_tokens': 21, 'prompt_tokens': 112, 'total_tokens': 133}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-987266cd-5241-43c3-9288-16c0482c0a19-0', usage_metadata={'input_tokens': 112, 'output_tokens': 21, 'total_tokens': 133})

In [30]:
extraction_chain = prompt | extraction_model | JsonOutputFunctionsParser()

In [31]:
extraction_chain.invoke({"input": "Joe is 30, his mom is Martha"})

{'people': [{'name': 'Joe', 'age': 30}, {'name': 'Martha', 'age': None}]}

In [32]:
#this will only look for key information 
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser

In [33]:
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="people")

In [34]:
extraction_chain.invoke({"input": "Joe is 30, his mom is Martha"})

[{'name': 'Joe', 'age': 30}, {'name': 'Martha', 'age': None}]

Doing it for real
We can apply tagging to a larger body of text.

For example, let's load this blog post and extract tag information from a sub-set of the text.

In [36]:
from langchain.document_loaders import WebBaseLoader
loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
documents = loader.load()

In [37]:
doc = documents[0]

In [38]:
page_content = doc.page_content[:10000]

In [39]:
print(page_content[:1000])







LLM Powered Autonomous Agents | Lil'Log







































Lil'Log






















Posts




Archive




Search




Tags




FAQ




emojisearch.app









      LLM Powered Autonomous Agents
    
Date: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng


 


Table of Contents



Agent System Overview

Component One: Planning

Task Decomposition

Self-Reflection


Component Two: Memory

Types of Memory

Maximum Inner Product Search (MIPS)


Component Three: Tool Use

Case Studies

Scientific Discovery Agent

Generative Agents Simulation

Proof-of-Concept Examples


Challenges

Citation

References





Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general

In [40]:
#We will use the text of the article to tag and extract
#create a model
class Overview(BaseModel):
    """Overview of a section of text."""
    summary: str = Field(description="Provide a concise summary of the content.")
    language: str = Field(description="Provide the language that the content is written in.")
    keywords: str = Field(description="Provide keywords related to the content.")

In [42]:
#setup the chain
overview_tagging_function = [
    convert_pydantic_to_openai_function(Overview)
]
tagging_model = model.bind(
    functions=overview_tagging_function,
    function_call={"name":"Overview"}
)
tagging_chain = prompt | tagging_model | JsonOutputFunctionsParser()

In [43]:
tagging_chain.invoke({"input": page_content})

{'summary': 'The article discusses building autonomous agents powered by LLM (large language model) as the core controller, focusing on planning, memory, and tool use components.',
 'language': 'English',
 'keywords': 'LLM, autonomous agents, planning, memory, tool use, task decomposition, self-reflection'}

In [47]:
#Creating an extraction model to get all the papers listed in the article
class Paper(BaseModel):
    """Information about papers mentioned."""
    title: str
    author: Optional[str]


class Info(BaseModel):
    """Information to extract"""
    papers: List[Paper]

In [48]:
paper_extraction_function = [
    convert_pydantic_to_openai_function(Info)
]
extraction_model = model.bind(
    functions=paper_extraction_function, 
    function_call={"name":"Info"}
)
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="papers")

In [49]:
extraction_chain.invoke({"input": page_content})

[{'title': 'LLM Powered Autonomous Agents', 'author': 'Lilian Weng'}]

In [50]:
#In the above LLM was confused as the instruction wasnt clear, and it returned only the paper info
template = """A article will be passed to you. Extract from it all papers that are mentioned by this article. 

Do not extract the name of the article itself. If no papers are mentioned that's fine - you don't need to extract any! Just return an empty list.

Do not make up or guess ANY extra information. Only extract what exactly is in the text."""

prompt = ChatPromptTemplate.from_messages([
    ("system", template),
    ("human", "{input}")
])

In [51]:
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="papers")

In [52]:
extraction_chain.invoke({"input": page_content})

[{'title': 'Chain of thought (CoT; Wei et al. 2022)', 'author': None},
 {'title': 'Tree of Thoughts (Yao et al. 2023)', 'author': None},
 {'title': 'LLM+P (Liu et al. 2023)', 'author': None},
 {'title': 'ReAct (Yao et al. 2023)', 'author': None},
 {'title': 'Reflexion (Shinn & Labash 2023)', 'author': None},
 {'title': 'Chain of Hindsight (CoH; Liu et al. 2023)', 'author': None},
 {'title': 'Algorithm Distillation (AD; Laskin et al. 2023)', 'author': None}]

In [58]:
extraction_chain.invoke({"input": "hi"})

[]

In [59]:
#In the above, the article is only considering the first 10k characters, now we want to take page content, split it up, 
#whole article is too big for token window
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0)

In [60]:
#create splits
splits = text_splitter.split_text(doc.page_content)

In [61]:
len(splits)

15

In [62]:
#now lets create  chain - pass individual splits into extraction chain, and join all results together
#function to join lists of lsits and flattens them
def flatten(matrix):
    flat_list = []
    for row in matrix:
        flat_list += row
    return flat_list

In [57]:
flatten([[1, 2], [3, 4]])

[1, 2, 3, 4]

In [63]:
#this list is all text 
print(splits[0])

LLM Powered Autonomous Agents | Lil'Log







































Lil'Log






















Posts




Archive




Search




Tags




FAQ




emojisearch.app









      LLM Powered Autonomous Agents
    
Date: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng


 


Table of Contents



Agent System Overview

Component One: Planning

Task Decomposition

Self-Reflection


Component Two: Memory

Types of Memory

Maximum Inner Product Search (MIPS)


Component Three: Tool Use

Case Studies

Scientific Discovery Agent

Generative Agents Simulation

Proof-of-Concept Examples


Challenges

Citation

References





Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general probl

In [70]:
#Some method of preparing splits to pass on to the chain
#Wrapper in langchain that takes function and converts into runnable objects
#necessary when the functions first element in the chain can be effectively piped together
from langchain.schema.runnable import RunnableLambda

In [72]:
#define preprocessing function - that takes the string, splits and constructs a lists of dictionary, 
#where each dictionary is an input corresponding to the split
#page content of document as input
#this will be input to the chain
prep = RunnableLambda(
    lambda x: [{"input": doc} for doc in text_splitter.split_text(x)]
)

In [73]:
prep.invoke("hi")

[{'input': 'hi'}]

In [75]:
#list of elements and map this chain over them - list of lists
#no need to wrap it in a runnable lambda
#this is taking the text, splitting into 15 splits, and parallelly extracting (5 splits at a time)
#when all the calls are done, they're then passed to flatten function
chain = prep | extraction_chain.map() | flatten

In [76]:
chain.invoke(doc.page_content)

[{'title': 'AutoGPT', 'author': None},
 {'title': 'GPT-Engineer', 'author': None},
 {'title': 'BabyAGI', 'author': None},
 {'title': 'Chain of thought', 'author': 'Wei et al. 2022'},
 {'title': 'Tree of Thoughts', 'author': 'Yao et al. 2023'},
 {'title': 'LLM+P', 'author': 'Liu et al. 2023'},
 {'title': 'ReAct', 'author': 'Yao et al. 2023'},
 {'title': 'Reflexion', 'author': 'Shinn & Labash 2023'},
 {'title': 'Chain of Hindsight', 'author': 'Liu et al. 2023'},
 {'title': 'Algorithm Distillation', 'author': 'Laskin et al. 2023'},
 {'title': 'Laskin et al. 2023', 'author': None},
 {'title': 'Miller 1956', 'author': None},
 {'title': 'Duan et al. 2017', 'author': None},
 {'title': 'Google Blog', 'author': None},
 {'title': 'MRKL (Karpas et al. 2022)', 'author': None},
 {'title': 'TALM (Tool Augmented Language Models; Parisi et al. 2022)',
  'author': None},
 {'title': 'Toolformer (Schick et al. 2023)', 'author': None},
 {'title': 'HuggingGPT (Shen et al. 2023)', 'author': None},
 {'title'