# Tagging and Extraction Using OpenAI functions

In [1]:
from Models import OllamaLLM, GroqChatLLM
llm = GroqChatLLM()

In [2]:
%%script true
import os
import openai

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file
openai.api_key = os.environ['OPENAI_API_KEY']

In [3]:
import os
from typing import List
from pydantic import BaseModel, Field
from langchain_core.utils.function_calling import convert_to_openai_function

In [4]:
class Tagging(BaseModel):
    """Tag the piece of text with particular info."""
    sentiment: str = Field(description="sentiment of text, should be `pos`, `neg`, or `neutral`")
    language: str = Field(description="language of text (should be ISO 639-1 code)")

In [5]:
convert_to_openai_function(Tagging)

{'name': 'Tagging',
 'description': 'Tag the piece of text with particular info.',
 'parameters': {'properties': {'sentiment': {'description': 'sentiment of text, should be `pos`, `neg`, or `neutral`',
    'type': 'string'},
   'language': {'description': 'language of text (should be ISO 639-1 code)',
    'type': 'string'}},
  'required': ['sentiment', 'language'],
  'type': 'object'}}

In [6]:
from langchain.prompts import ChatPromptTemplate
#from langchain.chat_models import ChatOpenAI

In [7]:
model = llm #ChatOpenAI(temperature=0)

In [8]:
tagging_functions = [convert_to_openai_function(Tagging)]

In [9]:
prompt = ChatPromptTemplate.from_messages([
    ("system", "Think carefully, and then tag the text as instructed"),
    ("user", "{input}")
])

In [10]:
model_with_functions = model.bind(
    functions=tagging_functions,
    function_call={"name": "Tagging"}   # forces function call
)

In [11]:
tagging_chain = prompt | model_with_functions

In [12]:
tagging_chain.invoke({"input": "I love langchain"})

AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{"sentiment": "pos", "language": "en"}', 'name': 'Tagging'}}, response_metadata={'token_usage': {'completion_tokens': 17, 'prompt_tokens': 294, 'total_tokens': 311, 'completion_time': 0.068, 'prompt_time': 0.078581052, 'queue_time': 0.004961501000000007, 'total_time': 0.146581052}, 'model_name': 'llama-3.1-70b-versatile', 'system_fingerprint': 'fp_b6828be2c9', 'finish_reason': 'function_call', 'logprobs': None}, id='run-2efa6334-26fd-4e9c-94cc-2844ad1745c6-0', usage_metadata={'input_tokens': 294, 'output_tokens': 17, 'total_tokens': 311})

In [13]:
tagging_chain.invoke({"input": "non mi piace questo cibo"})

AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{"sentiment": "neg", "language": "it"}', 'name': 'Tagging'}}, response_metadata={'token_usage': {'completion_tokens': 17, 'prompt_tokens': 297, 'total_tokens': 314, 'completion_time': 0.068, 'prompt_time': 0.0787968, 'queue_time': 0.004884349999999996, 'total_time': 0.1467968}, 'model_name': 'llama-3.1-70b-versatile', 'system_fingerprint': 'fp_b6828be2c9', 'finish_reason': 'function_call', 'logprobs': None}, id='run-81794ef4-5ff4-4560-80b9-26ebd5e1600f-0', usage_metadata={'input_tokens': 297, 'output_tokens': 17, 'total_tokens': 314})

In [14]:
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser

In [15]:
tagging_chain = prompt | model_with_functions | JsonOutputFunctionsParser()

In [16]:
tagging_chain.invoke({"input": "non mi piace questo cibo"})

{'sentiment': 'neg', 'language': 'it'}

## Extraction

Extraction is similar to tagging, but used for extracting multiple pieces of information.

In [17]:
from typing import Optional
class Person(BaseModel):
    """Information about a person."""
    name: str = Field(description="person's name")
    age: Optional[int] = Field(description="person's age")

In [18]:
class Information(BaseModel):
    """Information to extract."""
    people: List[Person] = Field(description="List of info about people")

In [19]:
convert_to_openai_function(Information)

{'name': 'Information',
 'description': 'Information to extract.',
 'parameters': {'properties': {'people': {'description': 'List of info about people',
    'items': {'description': 'Information about a person.',
     'properties': {'name': {'description': "person's name", 'type': 'string'},
      'age': {'anyOf': [{'type': 'integer'}, {'type': 'null'}],
       'description': "person's age"}},
     'required': ['name', 'age'],
     'type': 'object'},
    'type': 'array'}},
  'required': ['people'],
  'type': 'object'}}

In [20]:
extraction_functions = [convert_to_openai_function(Information)]
extraction_model = model.bind(
    functions=extraction_functions, 
    function_call={"name": "Information"}
    )

In [21]:
extraction_model.invoke("Joe is 30, his mom is Martha")

AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{"people": [{"name": "Joe", "age": 30}, {"name": "Martha", "age": null}]}', 'name': 'Information'}}, response_metadata={'token_usage': {'completion_tokens': 31, 'prompt_tokens': 292, 'total_tokens': 323, 'completion_time': 0.124, 'prompt_time': 0.073016449, 'queue_time': 0.0052133330000000005, 'total_time': 0.197016449}, 'model_name': 'llama-3.1-70b-versatile', 'system_fingerprint': 'fp_5c5d1b5cfb', 'finish_reason': 'function_call', 'logprobs': None}, id='run-49fa1ca8-fd22-47c7-9ca1-9e2b74f46e51-0', usage_metadata={'input_tokens': 292, 'output_tokens': 31, 'total_tokens': 323})

In [22]:
prompt = ChatPromptTemplate.from_messages([
    ("system", "Extract the relevant information, if not explicitly provided do not guess. Extract partial info. Leave blank for no info."),
    ("human", "{input}")
])

In [23]:
extraction_chain = prompt | extraction_model

In [24]:
extraction_chain.invoke({"input": "Joe is 30, his mom is Martha"})

AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{"people": [{"name": "Joe", "age": 30}, {"name": "Martha", "age": null}]}', 'name': 'Information'}}, response_metadata={'token_usage': {'completion_tokens': 32, 'prompt_tokens': 315, 'total_tokens': 347, 'completion_time': 0.128817995, 'prompt_time': 0.07905088, 'queue_time': 0.005068171999999996, 'total_time': 0.207868875}, 'model_name': 'llama-3.1-70b-versatile', 'system_fingerprint': 'fp_b6828be2c9', 'finish_reason': 'function_call', 'logprobs': None}, id='run-118c68bb-0385-4e56-946a-c3c065d71d1e-0', usage_metadata={'input_tokens': 315, 'output_tokens': 32, 'total_tokens': 347})

In [25]:
extraction_chain = prompt | extraction_model | JsonOutputFunctionsParser()

In [26]:
extraction_chain.invoke({"input": "Joe is 30, his mom is Martha"})

{'people': [{'name': 'Joe', 'age': 30}, {'name': 'Martha', 'age': None}]}

In [27]:
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser

In [28]:
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="people")

In [29]:
extraction_chain.invoke({"input": "Joe is 30, his mom is Martha"})

[{'name': 'Joe', 'age': 30}, {'name': 'Martha', 'age': None}]

## Doing it for real

We can apply tagging to a larger body of text.

For example, let's load this blog post and extract tag information from a sub-set of the text.

In [None]:
from langchain.document_loaders import WebBaseLoader
loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
documents = loader.load()

In [34]:
doc = documents[0]

In [35]:
page_content = doc.page_content[:10000]

In [42]:
print(page_content[:1000].replace('\n\n', ''))

LLM Powered Autonomous Agents | Lil'LogLil'Log
Posts
Archive
Search
Tags
FAQ
emojisearch.app      LLM Powered Autonomous Agents
    
Date: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng
 
Table of ContentsAgent System OverviewComponent One: PlanningTask DecompositionSelf-Reflection
Component Two: MemoryTypes of MemoryMaximum Inner Product Search (MIPS)
Component Three: Tool UseCase StudiesScientific Discovery AgentGenerative Agents SimulationProof-of-Concept Examples
ChallengesCitationReferencesBuilding agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general


In [43]:
class Overview(BaseModel):
    """Overview of a section of text."""
    summary: str = Field(description="Provide a concise summary of the content.")
    language: str = Field(description="Provide the language that the content is written in.")
    keywords: str = Field(description="Provide keywords related to the content.")

In [44]:
overview_tagging_function = [
    convert_to_openai_function(Overview)
]
tagging_model = model.bind(
    functions=overview_tagging_function,
    function_call={"name":"Overview"}
)
tagging_chain = prompt | tagging_model | JsonOutputFunctionsParser()

In [45]:
tagging_chain.invoke({"input": page_content})

{'summary': 'LLM-powered autonomous agents have the potential to be powerful general problem solvers. They consist of three key components: planning, memory, and tool use. Planning involves breaking down large tasks into smaller subgoals and self-reflection to improve past actions. Memory includes short-term and long-term memory, with long-term memory allowing the agent to retain and recall information over extended periods. Tool use enables the agent to call external APIs for extra information. Various techniques and frameworks, such as Chain of Thought, Tree of Thoughts, ReAct, Reflexion, and Chain of Hindsight, have been developed to improve the planning and self-reflection capabilities of LLM-powered autonomous agents.',
 'language': 'English',
 'keywords': 'LLM-powered autonomous agents, planning, memory, tool use, Chain of Thought, Tree of Thoughts, ReAct, Reflexion, Chain of Hindsight'}

In [46]:
class Paper(BaseModel):
    """Information about papers mentioned."""
    title: str
    author: Optional[str]


class Info(BaseModel):
    """Information to extract"""
    papers: List[Paper]

In [55]:
paper_extraction_function = [
    convert_to_openai_function(Info)
]
extraction_model = model.bind(
    functions=paper_extraction_function, 
    function_call={"name":"Info"}
)
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="papers")

In [56]:
extraction_chain.invoke({"input": page_content})

[{'title': 'Chain of Thought', 'author': 'Wei et al. 2022'},
 {'title': 'Tree of Thoughts', 'author': 'Yao et al. 2023'},
 {'title': 'LLM+P', 'author': 'Liu et al. 2023'},
 {'title': 'ReAct', 'author': 'Yao et al. 2023'},
 {'title': 'Reflexion', 'author': 'Shinn & Labash 2023'},
 {'title': 'Chain of Hindsight', 'author': 'Liu et al. 2023'},
 {'title': 'Algorithm Distillation', 'author': 'Laskin et al. 2023'}]

In [57]:
template = """A article will be passed to you. Extract from it all papers that are mentioned by this article. 

Do not extract the name of the article itself. If no papers are mentioned that's fine - you don't need to extract any! Just return an empty list.

Do not make up or guess ANY extra information. Only extract what exactly is in the text."""

prompt = ChatPromptTemplate.from_messages([
    ("system", template),
    ("human", "{input}")
])

In [58]:
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="papers")

In [59]:
extraction_chain.invoke({"input": page_content})

[{'title': 'Chain of thought (CoT; Wei et al. 2022)'},
 {'title': 'Tree of Thoughts (Yao et al. 2023)'},
 {'title': 'LLM+P (Liu et al. 2023)'},
 {'title': 'ReAct (Yao et al. 2023)'},
 {'title': 'Reflexion (Shinn & Labash 2023)'},
 {'title': 'Chain of Hindsight (CoH; Liu et al. 2023)'},
 {'title': 'Algorithm Distillation (AD; Laskin et al. 2023)'},
 {'title': 'RL^2 (Duan et al. 2017)'},
 {'title': 'ED (expert distillation)'},
 {'title': 'UCB'}]

In [60]:
extraction_chain.invoke({"input": "hi, how are you"})

[]

In [61]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0)

In [62]:
splits = text_splitter.split_text(doc.page_content)

In [63]:
len(splits)

15

In [64]:
def flatten(matrix):
    flat_list = []
    for row in matrix:
        flat_list += row
    return flat_list

In [65]:
flatten([[1, 2], [3, 4]])

[1, 2, 3, 4]

In [68]:
print(splits[0].replace('\n\n', ''))

LLM Powered Autonomous Agents | Lil'LogLil'Log
Posts
Archive
Search
Tags
FAQ
emojisearch.app      LLM Powered Autonomous Agents
    
Date: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng
 
Table of ContentsAgent System OverviewComponent One: PlanningTask DecompositionSelf-Reflection
Component Two: MemoryTypes of MemoryMaximum Inner Product Search (MIPS)
Component Three: Tool UseCase StudiesScientific Discovery AgentGenerative Agents SimulationProof-of-Concept Examples
ChallengesCitationReferencesBuilding agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.
Agent System Overview#
In a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by severa

In [69]:
from langchain.schema.runnable import RunnableLambda

In [70]:
prep = RunnableLambda(
    lambda x: [{"input": doc} for doc in text_splitter.split_text(x)]
)

In [71]:
prep.invoke("hi")

[{'input': 'hi'}]

In [78]:
chain = prep | extraction_chain | flatten

In [79]:
chain.invoke(doc.page_content)

['title',
 'author',
 'title',
 'author',
 'title',
 'author',
 'title',
 'author',
 'title',
 'author',
 'title',
 'author',
 'title',
 'author',
 'title',
 'author',
 'title',
 'author',
 'title',
 'author',
 'title',
 'author',
 'title',
 'author',
 'title',
 'author',
 'title',
 'author',
 'title',
 'author',
 'title',
 'author',
 'title',
 'author',
 'title',
 'author']