# Tagging and Extraction Using OpenAI functions

In [1]:
from typing import List
from pydantic import BaseModel, Field
from langchain.utils.openai_functions import convert_pydantic_to_openai_function
import openai, os
from dotenv import load_dotenv, dotenv_values

In [2]:
# Location of the .env file holding the OpenAI settings. Set the DOTENV_PATH
# environment variable to point somewhere else; the default is the original
# author's local path, so existing runs behave as before.
DOTENV_PATH = os.getenv(
    "DOTENV_PATH",
    "/Users/swang294/Library/CloudStorage/OneDrive-JNJ/Dev/Doc_analyzer/.env",
)

# Load the file into the process environment so the os.getenv calls below see it.
load_dotenv(DOTENV_PATH)
openai.api_key = os.getenv('OPENAI_API_KEY')
openai_deployment_name = os.getenv('OPENAI_DEPLOYMENT_NAME')
openai_embedding_model_name = os.getenv('OPENAI_EMBEDDING_MODEL_NAME')
openai.api_type = os.getenv('OPENAI_API_TYPE')
openai.api_base = os.getenv('OPENAI_API_BASE')
openai.api_version = os.getenv('OPENAI_API_VERSION')
MODEL_NAME = os.getenv('MODEL_NAME')

# Also keep the file's entries as a plain dict; later cells read config[...] directly.
config = dotenv_values(DOTENV_PATH)

# Display the configured model name as the cell output.
config['MODEL_NAME']

In [3]:
# Show the API version that was read from OPENAI_API_VERSION.
openai.api_version

'2023-07-01-preview'

## Tagging

Previously, we used functions to extract specific API parameters from a natural-language input.
 
Here, we show that functions are very flexible. 

We use them to easily tag a piece of text with particular info.


In [4]:
class Tagging(BaseModel):
    """Tag the piece of text with particular info."""
    # The class docstring and each Field description are copied into the
    # generated function schema, so their wording is what the model sees.
    sentiment: str = Field(description="sentiment of text, should be `pos`, `neg`, or `neutral`")
    language: str = Field(description="language of text (should be ISO 639-1 code)")

In [5]:
# Inspect the OpenAI function schema generated from the Tagging model.
convert_pydantic_to_openai_function(Tagging)

  warn_deprecated(


{'name': 'Tagging',
 'description': 'Tag the piece of text with particular info.',
 'parameters': {'type': 'object',
  'properties': {'sentiment': {'description': 'sentiment of text, should be `pos`, `neg`, or `neutral`',
    'type': 'string'},
   'language': {'description': 'language of text (should be ISO 639-1 code)',
    'type': 'string'}},
  'required': ['sentiment', 'language']}}

In [6]:
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI

# Chat model configured from the values read out of the .env file.
model = ChatOpenAI(
    model=config["MODEL_NAME"],
    # `engine` is not a declared ChatOpenAI field. Passed as a bare keyword it is
    # moved into model_kwargs with a warning; supplying it through model_kwargs
    # directly sends the same parameter without the warning.
    model_kwargs={"engine": config["MODEL_NAME"]},
    openai_api_key=config["OPENAI_API_KEY"],
    temperature=0,
    openai_api_base=config["OPENAI_API_BASE"],
    streaming=True,
)

# Function schema(s) the model may call for tagging.
tagging_functions = [convert_pydantic_to_openai_function(Tagging)]
prompt = ChatPromptTemplate.from_messages([
    ("system", "Think carefully, and then tag the text as instructed."),
    ("human", "{input}"),
])
# Passing function_call forces the model to call this specific function.
model_with_functions = model.bind(functions=tagging_functions, function_call={"name":"Tagging"})
tagging_chain = prompt | model_with_functions
tagging_chain.invoke({"input": "I love LangChain"})

  warn_deprecated(
                    engine was transferred to model_kwargs.
                    Please confirm that engine is what you intended.


AIMessage(content='', additional_kwargs={'function_call': {'name': 'Tagging', 'arguments': '{\n  "sentiment": "pos",\n  "language": "en"\n}'}})

In [17]:
# Same chain on a non-English sentence, to exercise the `language` tag.
tagging_chain.invoke({"input": "non mi piace questo cibo"})

AIMessageChunk(content='', additional_kwargs={'function_call': {'name': 'Tagging', 'arguments': '{\n  "sentiment": "neg",\n  "language": "it"\n}'}}, example=False)

We can use an output parser to automatically extract this

In [19]:
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
# Append a parser so the chain yields the function-call arguments as a dict
# rather than a message object. Note: this rebinds `tagging_chain`.
tagging_chain = prompt | model_with_functions | JsonOutputFunctionsParser()
tagging_chain.invoke({"input": "This app is hard to use"})

{'sentiment': 'neg', 'language': 'en'}

## Extraction

Extraction is similar to tagging, but used for extracting multiple pieces of information.

In [7]:
from typing import Optional
class Person(BaseModel):
    """Information about a person."""
    name: str = Field(description="person's name")
    # Explicit default keeps `age` optional under both pydantic v1 and v2;
    # v2 treats an Optional annotation without a default as a required field.
    age: Optional[int] = Field(default=None, description="person's age")

In [8]:
class Information(BaseModel):
    """Information to extract."""
    # Wrapper model: the function call returns every Person under the `people` key.
    people: List[Person] = Field(description="List of info about people")

In [9]:
# Bind the Information schema and force the model to call it.
extraction_functions = [convert_pydantic_to_openai_function(Information)]
extraction_model = model.bind(functions=extraction_functions, function_call={"name":"Information"})
# No prompt is involved here; in the recorded output the model filled in an
# age for Martha even though the text does not state one.
extraction_model.invoke("Joe is 30. Joe's mom is Martha")

AIMessage(content='', additional_kwargs={'function_call': {'name': 'Information', 'arguments': '{\n  "people": [\n    {\n      "name": "Joe",\n      "age": 30\n    },\n    {\n      "name": "Martha",\n      "age": 0\n    }\n  ]\n}'}})

Similarly, we can use a separate output parser to pluck the `people` key out of the "Information" function call, since that's the information we really care about.

In [10]:
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser
# Keep only the value stored under `people` in the function-call arguments.
extraction_chain = extraction_model | JsonKeyOutputFunctionsParser(key_name="people")
extraction_chain.invoke("Joe is 30. Joe's mom is Martha")

[{'name': 'Joe', 'age': 30}, {'name': 'Martha', 'age': 0}]

In [11]:
from langchain.prompts import ChatPromptTemplate
# Rebinds `prompt` (previously the tagging prompt) with extraction
# instructions that tell the model not to guess missing values.
prompt = ChatPromptTemplate.from_messages([
    ("system", "Extract the relevant information, if not explicitly provided do not guess. Extract partial info"),
    ("human", "{input}")
])

In [12]:
# Rebuild the extraction chain so the prompt runs ahead of the model.
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="people")

In [13]:
# The chain now starts with a prompt, so the input is a dict filling `{input}`.
extraction_chain.invoke({"input": "Joe is 30. Joe's mom is Martha"})

[{'name': 'Joe', 'age': 30}, {'name': 'Martha'}]

In [29]:
# Repeat of the previous call with the same input.
extraction_chain.invoke({"input": "Joe is 30. Joe's mom is Martha"})

[{'name': 'Joe', 'age': 30}, {'name': 'Martha'}]

## Doing it for real

We can apply tagging to a larger body of text.

For example, let's load this blog post and extract tag information from a sub-set of the text.

In [14]:
from langchain.document_loaders import WebBaseLoader
# Fetch the blog post over HTTPS.
# NOTE(review): the recorded run of this cell failed with an SSL certificate
# verification error, which leaves `documents` undefined for the cells that
# follow — the local CA bundle / proxy setup probably needs fixing before a
# clean re-run.
loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
documents = loader.load()

SSLError: HTTPSConnectionPool(host='lilianweng.github.io', port=443): Max retries exceeded with url: /posts/2023-06-23-agent/ (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1006)')))

In [15]:
# Work with the first loaded document.
doc = documents[0]

In [16]:
# Truncate to the first 10,000 characters of the page text for the single-call examples.
page_content = doc.page_content[:10000]

In [17]:
class Overview(BaseModel):
    """Overview of a section of text."""
    summary: str = Field(description="Provide a concise summary of the content.")
    # Field name fixed from the misspelled `langugae`: the field name becomes
    # the key in the function schema and in the parsed result.
    language: str = Field(description="Provide the language that the content is written in.")
    keywords: str = Field(description="Provide keywords related to the content.")

In [18]:
# Bind the Overview schema and force the model to call it.
overview_tagging_function = [convert_pydantic_to_openai_function(Overview)]
tagging_model = model.bind(functions=overview_tagging_function, function_call={"name":"Overview"})
# NOTE(review): `prompt` is whatever was bound last — in file order that is
# the extraction prompt, not the tagging prompt. Confirm this is intended.
tagging_chain = prompt | tagging_model | JsonOutputFunctionsParser()
tagging_chain.invoke({"input": page_content})

{'summary': 'This article discusses the concept of building autonomous agents powered by LLM (large language model) as their core controller. It explores the components of such agents, including planning, memory, and tool use. It also covers various techniques for task decomposition and self-reflection in autonomous agents.',
 'language': 'English',
 'keywords': 'LLM, autonomous agents, planning, memory, tool use, task decomposition, self-reflection'}

Now let's try to extract all papers mentioned in this article

In [19]:
class Paper(BaseModel):
    """Information about papers mentioned."""
    title: str
    # Explicit default keeps `author` optional under both pydantic v1 and v2;
    # v2 treats an Optional annotation without a default as a required field.
    author: Optional[str] = None


class Info(BaseModel):
    """Information to extract"""
    # Wrapper model: the function call returns every Paper under the `papers` key.
    papers: List[Paper]

In [20]:
# Rebinds `extraction_model` (earlier bound to Information) to the Info schema.
paper_extraction_function = [convert_pydantic_to_openai_function(Info)]
extraction_model = model.bind(functions=paper_extraction_function, function_call={"name":"Info"})

In [21]:
# Chain: prompt -> forced Info function call -> keep only the `papers` list.
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="papers")

In [22]:
# First attempt with the generic prompt; the recorded output lists the
# article itself rather than the papers it cites.
extraction_chain.invoke({"input": page_content})

[{'title': 'LLM Powered Autonomous Agents', 'author': 'Lilian Weng'}]

In [23]:
# Stricter system prompt: extract only cited papers, never the article itself,
# and return an empty list when nothing is mentioned.
template = """A article will be passed to you. Extract from it all papers that are mentioned by this article. 

Do not extract the name of the article itself. If no papers are mentioned that's fine - you don't need to extract any! Just return an empty list.

Do not make up or guess ANY extra information. Only extract what exactly is in the text."""

# Rebinds `prompt` to use the template above as the system message.
prompt = ChatPromptTemplate.from_messages([
    ("system", template),
    ("human", "{input}")
])

In [24]:
# Rebuild the chain so it picks up the newly bound `prompt`.
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="papers")

In [25]:
# Re-run on the truncated article text with the stricter prompt.
extraction_chain.invoke({"input": page_content})

[{'title': 'Chain of thought (CoT; Wei et al. 2022)', 'author': 'Wei et al.'},
 {'title': 'Tree of Thoughts (Yao et al. 2023)', 'author': 'Yao et al.'},
 {'title': 'LLM+P (Liu et al. 2023)', 'author': 'Liu et al.'},
 {'title': 'ReAct (Yao et al. 2023)', 'author': 'Yao et al.'},
 {'title': 'Reflexion (Shinn & Labash 2023)', 'author': 'Shinn & Labash'},
 {'title': 'Chain of Hindsight (CoH; Liu et al. 2023)',
  'author': 'Liu et al.'},
 {'title': 'Algorithm Distillation (AD; Laskin et al. 2023)',
  'author': 'Laskin et al.'}]

In [26]:
# Sanity check: text that mentions no papers should produce an empty list.
extraction_chain.invoke({"input": "hi"})

[]

In [27]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter with the library's default chunk size and no overlap between chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0)

In [28]:
# Split the full (untruncated) page text into chunks.
splits = text_splitter.split_text(doc.page_content)

In [29]:
# Number of chunks produced by the splitter.
len(splits)

14

In [30]:
from langchain.schema.runnable import RunnableLambda

In [31]:
def flatten(matrix):
    """Concatenate the rows of `matrix` into one flat list (one level deep).

    Each row is iterated in order and its items are appended to the result;
    nested structures inside a row are left untouched.
    """
    return [item for row in matrix for item in row]

In [32]:
# Pipeline: split the incoming text into chunks, run the extraction chain on
# every chunk, then merge the per-chunk result lists into one list.
chain = (
    RunnableLambda(
        # One {"input": ...} dict per chunk, matching the prompt's placeholder.
        lambda text: [{"input": chunk} for chunk in text_splitter.split_text(text)]
    )
    | extraction_chain.map()
    | flatten
)

In [33]:
# Run the split -> extract -> flatten pipeline over the whole article text.
chain.invoke(doc.page_content)

[{'title': 'Chain of thought', 'author': 'Wei et al. 2022'},
 {'title': 'Tree of Thoughts', 'author': 'Yao et al. 2023'},
 {'title': 'LLM+P', 'author': 'Liu et al. 2023'},
 {'title': 'ReAct', 'author': 'Yao et al. 2023'},
 {'title': 'Reflexion', 'author': 'Shinn & Labash 2023'},
 {'title': 'Reflexion framework', 'author': 'Shinn & Labash'},
 {'title': 'Chain of Hindsight', 'author': 'Liu et al.'},
 {'title': 'Algorithm Distillation', 'author': 'Laskin et al.'},
 {'title': 'Algorithm Distillation', 'author': 'Laskin et al. 2023'},
 {'title': 'ED (expert distillation)', 'author': ''},
 {'title': 'RL^2', 'author': 'Duan et al. 2017'},
 {'title': 'LSH: Locality-Sensitive Hashing', 'author': ''},
 {'title': 'ANNOY: Approximate Nearest Neighbors Oh Yeah', 'author': ''},
 {'title': 'HNSW: Hierarchical Navigable Small World', 'author': ''},
 {'title': 'FAISS: Facebook AI Similarity Search', 'author': ''},
 {'title': 'ScaNN: Scalable Nearest Neighbors', 'author': ''},
 {'title': 'MRKL: Modular 