In this notebook, I am learning how to use LangChain and Kor to extract structured data from text, and then asking the LLM questions about it to see how accurate the results are. I am also using NeMo Guardrails to ensure the LLM only answers on-topic questions.

In [82]:
import json
import os
import time
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup
from kor.extraction import create_extraction_chain
from kor.nodes import Object, Text, Number
from langchain.callbacks import get_openai_callback
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from markdownify import markdownify as md

# Read the API key from the environment so the secret is never saved with the notebook.
# The "..." fallback is only a placeholder: export OPENAI_API_KEY before starting the kernel.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "...")

In [11]:
# Test example: the chat model used by the extraction chains below.
# temperature=0 requests the least random output the model can give.
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name="gpt-3.5-turbo",
    max_tokens=2000,
    temperature=0,
)

table_schema = Object(
    # Key the extracted records are nested under in the output.
    # It names a single record, so keep it singular.
    id="person",
    # Plain-language description of what one record represents.
    description="Personal information about a person",
    # Fields to capture from the text for each person.
    attributes=[
        Text(id="first_name", description="The first name of a person."),
        Text(id="last_name", description="The last name of a person."),
    ],
    # A worked example (input sentence paired with the expected records)
    # goes a long way towards telling the LLM what is wanted.
    examples=[
        (
            "Alice Stoya and Bob Marlin are best friends",
            [
                {"first_name": "Alice", "last_name": "Stoya"},
                {"first_name": "Bob", "last_name": "Marlin"},
            ],
        )
    ],
)

Next, we create a chain that uses the LLM and the schema above to extract this information from text.

In [12]:
# Combine the LLM with the person schema into a chain that extracts matching records from text
chain = create_extraction_chain(llm, table_schema)

In [13]:
# Sample text: five family members, with the shared last name stated once near the end
example = """
    My name is Chau.
    My sister's name is Duong.
    My wife's name is Trang.
    My father's name is Thuong.
    My mother's name is Chi.
    All of our last name except for my mother's is Tran.
    My mother doesn't have a last name.
""" # my mother's last name is intentionally left blank
# Helper: pretty-print an extraction result as JSON
def print_output(output):
    """Print `output` as JSON with keys sorted and a 3-space indent."""
    rendered = json.dumps(output, indent=3, sort_keys=True)
    print(rendered)
# Run the chain on the sample text, keep only the parsed "data" payload, and show it
table = chain.predict_and_parse(text=example)["data"]
print_output(table)

{
   "person": [
      {
         "first_name": "Chau",
         "last_name": "Tran"
      },
      {
         "first_name": "Duong",
         "last_name": "Tran"
      },
      {
         "first_name": "Trang",
         "last_name": "Tran"
      },
      {
         "first_name": "Thuong",
         "last_name": "Tran"
      },
      {
         "first_name": "Chi",
         "last_name": ""
      }
   ]
}


Example of working with lists and nested objects

In [133]:
# Schema for one car part; many=True because a car lists several of them
parts = Object(
    id="parts",
    description="A single part of a car",
    many=True,
    attributes=[
        Text(id="part", description="The name of the part"),
        Text(id="type", description="The type of the part"),
        Number(id="quantity", description="How many parts the car currently has"),
    ],
    # A part without a stated type is recorded with type None
    examples=[
        (
            "My cars has 4 off-road tires, 1 steering wheel, and 4 leather seats",
            [
                {"part": "tire", "type": "off-road", "quantity": 4},
                {"part": "steering wheel", "type": None, "quantity": 1},
                {"part": "seat", "type": "leather", "quantity": 4},
            ],
        )
    ],
)

# Schema for a car: three text fields plus the nested `parts` object defined earlier.
# many=True because a single passage can describe several cars.
cars_schema = Object(
    id="car",
    description="Information about a car",
    attributes=[
        Text(
            id="brand",
            description="The make or brand of the car"
        ),
        Text(
            id="name",
            description="The name of the car"
        ),
        Text(
            id="color",
            description="The color of the car"
        ),
        parts
    ],
    # Few-shot examples. Each expected output must mirror its input text exactly:
    # a part described as missing is omitted, and a part with no stated type gets type None.
    examples=[
        (
            "the BMW i8 is white and has an 4 all-season tires, missing the steering wheel, and have 6 leather seats. The Ford Camry is blue and has 4 seats, V4 engine, and 4 doors",
            [
                {"brand": "BMW", "name": "i8", "color": "white", "parts":[{"part": "tire", "type": "all-season", "quantity": 4}, {"part": "seat", "type": "leather", "quantity": 6}]},
                {"brand": "Ford", "name": "Camry", "color": "blue", "parts":[{"part": "seat", "type": None, "quantity": 4}, {"part": "engine", "type": "V4", "quantity": 1}, {"part": "door", "type": None, "quantity": 4}]}
            ],
        ),
        (
            "My dream car is an orange Ford Bronco, with an V8 engine, 4 off-road tires, and 5 seats, and a steering wheel. However, all I can afford right now is a white Ford Camry with V4 engine, missing steering wheel, 3 working tires, and 2 faux-leathered seats. If I earn 1 billion dollars, I will definitely buy an orange McLaren P1 with V12 engine, 2 butterfly doors, and a spoiler",
            [
                # The Bronco entry lists the 4 off-road tires and 5 seats stated in the text,
                # so the example does not teach the model to drop parts or alter quantities.
                {"brand": "Ford", "name": "Bronco", "color": "orange", "parts":[{"part": "engine", "type": "V8", "quantity": 1}, {"part": "tire", "type": "off-road", "quantity": 4}, {"part": "seat", "type": None, "quantity": 5}, {"part": "steering wheel", "type": None, "quantity": 1}]},
                {"brand": "Ford", "name": "Camry", "color": "white", "parts":[{"part": "engine", "type": "V4", "quantity": 1}, {"part": "tire", "type": None, "quantity": 3}, {"part": "seat", "type": "faux-leathered", "quantity": 2}]},
                {"brand": "McLaren", "name": "P1", "color": "orange", "parts":[{"part": "engine", "type": "V12", "quantity": 1}, {"part": "door", "type": "butterfly", "quantity": 2}, {"part": "spoiler", "type": None, "quantity": 1}]}
            ],
        )
    ],
    many=True
)

In [178]:
# Sample text describing three cars, each with its own list of parts
example1 = "In 5 years, I plan to buy Range Rover Discovery, with an V8 engine, 4 off-road tires, 4 surround cameras, and 7 Ebony DuoLeather seats. However, all I can afford right now is a white Toyota Camry with V4 engine, missing steering wheel, 2 working tires, and 2 faux-leather seats. My wife on the other hand wants to buy orange Audi A6 car with V8 engine, 4 heated seats , and a custom-made spoiler"

# Nested objects need the JSON encoder, so pass encoder_or_encoder_class="json"
chain = create_extraction_chain(
    llm,
    cars_schema,
    encoder_or_encoder_class="json",
)
output = chain.predict_and_parse(text=example1)["data"]

print_output(output)

{
   "car": [
      {
         "brand": "Toyota",
         "color": "white",
         "name": "Camry",
         "parts": [
            {
               "part": "engine",
               "quantity": 1,
               "type": "V4"
            },
            {
               "part": "tire",
               "quantity": 2,
               "type": null
            },
            {
               "part": "seat",
               "quantity": 2,
               "type": "faux-leather"
            }
         ]
      },
      {
         "brand": "Audi",
         "color": "orange",
         "name": "A6",
         "parts": [
            {
               "part": "engine",
               "quantity": 1,
               "type": "V8"
            },
            {
               "part": "seat",
               "quantity": 4,
               "type": "heated"
            },
            {
               "part": "spoiler",
               "quantity": 1,
               "type": "custom-made"
            }
         ]
     

Since we are dealing with nested objects, I want to create two DataFrames from the output object that act like related tables in a relational database: one for cars and one for parts, linked by `car_id`.

In [179]:
def convert_dict_to_df(dict_object):
    """Split a nested extraction result into two relational-style DataFrames.

    Parameters
    ----------
    dict_object : dict
        Mapping with the extracted records stored as a list of dicts under a
        key, e.g. ``{"car": [{"brand": ..., "parts": [{...}, ...]}, ...]}``.
        If several keys are present, the last one is used.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        * The parent table: one row per record, index named ``car_id``, with
          the nested ``parts`` column removed.
        * The child table: one row per part, index named ``part_id``, with a
          ``car_id`` column pointing back at the parent row.

    An empty ``dict_object`` yields two empty tables instead of raising.
    """
    # Use the last value in the mapping; an empty mapping simply has no records.
    records = list(dict_object.values())[-1] if dict_object else []

    car_df = pd.DataFrame(records)
    car_df.index.name = 'car_id'

    part_rows = []
    # Look the nested column up by name: its position depends on which fields
    # the LLM returned and in what order, so a fixed column index is unsafe.
    if 'parts' in car_df.columns:
        for car_id, car_parts in car_df['parts'].items():
            # A record without any parts appears as None/NaN rather than a list.
            if not isinstance(car_parts, (list, tuple)):
                continue
            part_rows.extend({**part, 'car_id': car_id} for part in car_parts)
        car_df = car_df.drop(columns='parts')

    # Build the child table once from all rows instead of concatenating in a loop.
    part_df = pd.DataFrame(part_rows) if part_rows else pd.DataFrame(columns=['car_id'])
    part_df.index.name = 'part_id'
    return car_df, part_df

# Split the nested extraction result into a car table and a part table linked by car_id
car_table, part_table = convert_dict_to_df(output)

In [180]:
# One row per extracted car, indexed by car_id; the nested 'parts' column has been dropped
car_table

Unnamed: 0_level_0,brand,name,color
car_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Toyota,Camry,white
1,Audi,A6,orange
2,Range Rover,Discovery,


In [181]:
# One row per extracted part, indexed by part_id; car_id links each part to its row in car_table
part_table

Unnamed: 0_level_0,part,type,quantity,car_id
part_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,engine,V4,1,0
1,tire,,2,0
2,seat,faux-leather,2,0
3,engine,V8,1,1
4,seat,heated,4,1
5,spoiler,custom-made,1,1
6,engine,V8,1,2
7,tire,off-road,4,2
8,camera,surround,4,2
9,seat,Ebony DuoLeather,7,2
