In this notebook, I am learning how to use LangChain and Kor to extract structured data from text, and ask the LLM questions about it to see how accurate is the result. I am also using NeMo Guardrails to ensure the LLM only answers on-topic questions.

In [12]:
from kor.extraction import create_extraction_chain
from kor.nodes import Object, Text, Number
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI

import pandas as pd
import requests
import time
import json
from datetime import datetime

from bs4 import BeautifulSoup
from markdownify import markdownify as md

from langchain.callbacks import get_openai_callback

OPENAI_API_KEY = "..."
#os.environ["OPENAI_API_KEY"] = "..."

In [19]:
# TEST EXAMPLE
llm = ChatOpenAI(
     model_name="gpt-3.5-turbo",
    temperature=0,
    max_tokens=2000,
    openai_api_key=OPENAI_API_KEY
)

table_schema = Object(
    # This what will appear in your output. It's what the fields below will be nested under.
    # It should be the parent of the fields below. Usually it's singular (not plural)
    id="person",

    # Natural language description about your object
    description="Personal information about a person",

    # Fields you'd like to capture from a piece of text about your object.
    attributes=[
        Text(
            id="first_name",
            description="The first name of a person.",
        ),
        Text(
            id="last_name",
            description="The last name of a person.",
        )
    ],

    # Examples help go a long way with telling the LLM what you need
    examples=[
        ("Alice Stoya and Bob Marlin are best friends", [{"first_name": "Alice", "last_name": "Stoya"}, {"first_name": "Bob", "last_name": "Marlin"}])
    ]
)

Next, we create a chain that can extract information

In [20]:
chain = create_extraction_chain(llm, table_schema)

In [24]:
example = """
    My name is Chau.
    My sister's name is Duong.
    My wife's name is Trang.
    My father's name is Thuong.
    My mother's name is Chi.
    All of our last name except for my mother's is Tran.
    My mother doesn't have a last name.
""" # intentionally leaving my mother's last name as blank
# a helper function to print output
def print_output(output):
    print(json.dumps(output,sort_keys=True, indent=3))
table = chain.predict_and_parse(text=(example))["data"]
print_output(table)

{
   "person": [
      {
         "first_name": "Chau",
         "last_name": "Tran"
      },
      {
         "first_name": "Duong",
         "last_name": "Tran"
      },
      {
         "first_name": "Trang",
         "last_name": "Tran"
      },
      {
         "first_name": "Thuong",
         "last_name": "Tran"
      },
      {
         "first_name": "Chi",
         "last_name": ""
      }
   ]
}


Example of working with lists and nested objects

In [25]:
parts = Object(
    id="parts",
    description="A single part of a car",
    attributes=[
        Text(id="part", description="The name of the part"),
        Number(id="quantity", description="How many parts the car currently has")
    ],
    examples=[
        (
            "My cars has 4 tires, 1 steering wheel, and 4 seats",
            [
                {"part": "tire", "quantity": 4},
                {"part": "steering wheel", "quantity": 1},
                {"part": "seat", "quantity": 4}
            ],
        )
    ]
)

cars_schema = Object(
    id="car",
    description="Information about a car",
    examples=[
        (
            "the BMW is white and has an 4 tires, missing the steering wheel, and have 3 seats",
            [
                {"brand": "BMW", "color": "white", "parts" : {"tire": 4, "steering wheel": 0, "seat": 3}}
            ],
        )
    ],
    attributes=[
        Text(
            id="brand",
            description="The make or brand of the car"
        ),
        Text(
            id="color",
            description="The color of the car"
        ),
        parts
    ]
)

In [27]:
# To do nested objects you need to specify encoder_or_encoder_class="json"
example1 = "My dream car is an orange Ford Bronco, with V8 engine, 4 tires, and 5 seats, and a steering wheel. However, all I can afford right now is a Ford Camry with V4 engine, missing steering wheel, 3 working tires, and 2 seats"
# Changed the encoder to json
chain = create_extraction_chain(llm, cars_schema, encoder_or_encoder_class="json")
output = chain.predict_and_parse(text=text)['data']

print_output(output) # it looks like the table won't store two different cars

{
   "car": {
      "brand": "Ford Bronco",
      "color": "orange",
      "parts": {
         "engine": "V8",
         "seat": 5,
         "steering wheel": 1,
         "tire": 4
      }
   }
}
