In [1]:
%pip install --upgrade openai --quiet
%pip install --upgrade nlpia2-wikipedia --quiet
%pip install --upgrade tenacity --quiet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
from dotenv import load_dotenv
# Load variables from the .env file into the process environment
# (presumably including OPENAI_API_KEY, which is read when the client is built — confirm).
load_dotenv('../.vscode/.env')

True

Named Entity Recognition for Ingredients

In [2]:
import json
import logging
import os
import re
from typing import Optional

import openai
import wikipedia
from IPython.display import display, Markdown
from tenacity import retry, wait_random_exponential, stop_after_attempt

# INFO-level logging with a timestamp and level prefix on every record.
logging.basicConfig(level=logging.INFO, format=' %(asctime)s - %(levelname)s - %(message)s')

# Name of the OpenAI chat model.
OPENAI_MODEL = 'gpt-3.5-turbo-0613'

# API client; the key comes from the OPENAI_API_KEY environment variable
# (falls back to an empty string when the variable is unset).
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))

In [3]:
# Entity labels offered to the model; each comment describes what the label covers.
labels = [
    "person",      # people, including fictional characters
    "fac",         # buildings, airports, highways, bridges
    "org",         # organizations, companies, agencies, institutions
    "gpe",         # geopolitical entities like countries, cities, states
    "loc",         # non-gpe locations
    "product",     # vehicles, foods, apparel, appliances, software, toys
    "event",       # named sports, scientific milestones, historical events
    "work_of_art", # titles of books, songs, movies
    "law",         # named laws, acts, or legislations
    "language",    # any named language
    "date",        # absolute or relative dates or periods
    "time",        # time units smaller than a day
    "percent",     # percentage (e.g., "twenty percent", "18%")
    "money",       # monetary values, including unit
    "quantity",    # measurements, e.g., weight or distance
]

In [18]:
def system_message(labels):
    """
    Build the system prompt for the NER task.

    Args:
        labels: iterable of label strings; they are joined with ", " and
            listed in the prompt as the only allowed entity types.

    Returns:
        The prompt text, starting with a newline.
    """
    allowed_types = ", ".join(labels)
    role_line = (
        "You are an expert in Natural Language Processing. "
        "Your task is to identify common Named Entities (NER) in a given text."
    )
    types_line = (
        "The possible common Named Entities (NER) types are exclusively: ("
        + allowed_types
        + ")."
    )
    return "\n" + role_line + "\n" + types_line

In [19]:
def assisstant_message():
    """
    Return the few-shot example shown to the model before the real task.

    The example pairs an ingredient list with the expected JSON answer
    (a "product" list and a "quantity" list). The answer must agree with the
    example text, otherwise the model is taught a wrong extraction: the text
    says "unsalted butter" and "3/4 cup", so the answer uses those exact
    spellings.

    Returns:
        The example prompt text, starting with a newline and ending with "--".
    """
    # Plain string (no f-prefix): there is nothing to interpolate, so the JSON
    # braces can be written literally.
    return """
EXAMPLE:
    Text: '1 pie crust (store-bought or homemade), 6 cups thinly sliced apples (such as Granny Smith or a combination of tart and sweet apples), 3/4 cup granulated sugar, 2 tbsp all-purpose flour, 1 tsp ground cinnamon, 1/4 tsp ground nutmeg, 1/4 tsp salt, 1 tsp vanilla extract, 1/2 cup unsalted butter (cold and cut into small pieces), 3/4 cup all-purpose flour (for crumb topping), 1/2 cup brown sugar (packed, for crumb topping), 1/4 tsp baking powder (for crumb topping), 1/4 tsp salt (for crumb topping), Optional: Vanilla ice cream or whipped cream for serving'
    {
        "product": ["pie crust","apples", "granulated sugar", "all-purpose flour", "ground cinnamon", "ground nutmeg", "salt", "vanilla extract", "unsalted butter","brown sugar","baking powder", "vanilla ice cream", "whipped cream"],
        "quantity": ["1", "6 cups", "3/4 cup", "2 tbsp","1 tsp","1/4 tsp", "1/4 tsp", "1 tsp", "1/2 cup", "3/4 cup", "1/2 cup", "1/4 tsp", "1/4 tsp"]
    }
--"""

In [20]:
def user_message(text):
    """
    Wrap the text to analyse in the TASK section of the prompt.

    Args:
        text: the text the model should extract entities from.

    Returns:
        The prompt fragment, starting and ending with a newline.
    """
    prompt_lines = ["", "TASK:", f"    Text: {text}", ""]
    return "\n".join(prompt_lines)

In [21]:
@retry(wait=wait_random_exponential(min=1, max=10), stop=stop_after_attempt(5))
def find_link(entity: str) -> Optional[str]:
    """
    Finds a Wikipedia link for a given entity.

    Args:
        entity: free-text entity name to look up.

    Returns:
        The URL of the first search hit, or None when the entity is blank,
        the search returns nothing, or the Wikipedia library raises one of
        its own exceptions (which is logged).

    Note:
        Exceptions other than WikipediaException propagate out of the body
        and are therefore retried by the decorator (up to 5 attempts).
    """
    # Nothing to search for; skip the network round-trip.
    if not entity or not entity.strip():
        return None

    try:
        titles = wikipedia.search(entity)
        if titles:
            # naively consider the first result as the best
            # The title comes straight from search, so disable auto-suggest:
            # otherwise the library may swap it for a different suggested title.
            page = wikipedia.page(titles[0], auto_suggest=False)
            return page.url
    except (wikipedia.exceptions.WikipediaException) as ex:
        logging.error(f'Error occurred while searching for Wikipedia link for entity {entity}: {str(ex)}')

    return None

In [22]:
def find_all_links(label_entities:dict) -> dict:
    """
    Finds all Wikipedia links for the dictionary entities in the whitelist label list.

    Args:
        label_entities: mapping of label -> iterable of entity strings.

    Returns:
        Mapping of entity -> value returned by find_link for that entity,
        keyed in first-seen order. Entities under labels outside the
        whitelist are ignored.
    """
    whitelist = ['event', 'gpe', 'org', 'person', 'product', 'work_of_art',"quantity"]

    links = {}
    for label, entities in label_entities.items():
        if label not in whitelist:
            continue
        for entity in entities:
            # The same entity often appears several times (e.g. repeated
            # ingredients); look each one up only once to avoid redundant
            # network calls.
            if entity not in links:
                links[entity] = find_link(entity)
    return links

In [28]:
def enrich_entities(text: str, label_entities: dict) -> str:
    """
    Enriches text with knowledge base links.

    Each entity that find_all_links resolves to a link is rewritten in the
    text as a Markdown link ``[entity](link)``.

    Args:
        text: the original text.
        label_entities: mapping of label -> list of entity strings, as
            produced by the model's tool call.

    Returns:
        The text with linked entities; the unchanged text when no entity
        could be linked.
    """
    entity_link_dict = find_all_links(label_entities)
    logging.info(f"entity_link_dict: {entity_link_dict}")

    # Drop entities without a link so the output never contains "[x](None)".
    linked = {entity: link for entity, link in entity_link_dict.items() if entity and link}
    if not linked:
        return text

    # Replace in a single pass over the ORIGINAL text, longest entity first.
    # Sequential str.replace calls would also rewrite matches inside links that
    # were inserted by an earlier replacement (e.g. "sugar" inside "white sugar").
    alternatives = sorted(linked, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(entity) for entity in alternatives))
    return pattern.sub(lambda match: f"[{match.group(0)}]({linked[match.group(0)]})", text)

In [29]:
def generate_functions(labels: list) -> list:
    """
    Build the tool definition passed to the chat completion call.

    The tool is named "enrich_entities" and accepts one optional property per
    label, each an array of strings. Properties are listed explicitly because
    "additionalProperties" is False: any key that is not declared here is not
    allowed by the schema.

    Args:
        labels: iterable of entity label strings.

    Returns:
        A one-element list containing the function-tool definition.
    """
    entity_properties = {
        # A fresh schema dict per label so the entries share no mutable state.
        label: {"type": "array", "items": {"type": "string"}}
        for label in labels
    }

    return [
        {
            "type": "function",
            "function": {
                "name": "enrich_entities",
                "description": "Enrich Text with Knowledge Base Links",
                "parameters": {
                    "type": "object",
                    "properties": entity_properties,
                    "additionalProperties": False,
                },
            },
        }
    ]

In [30]:
@retry(wait=wait_random_exponential(min=1, max=10), stop=stop_after_attempt(5))
def run_openai_task(labels, text):
    """
    Ask the model to extract entities from ``text`` and run the matching local function.

    The model is forced (via tool_choice) to call the "enrich_entities" tool;
    its JSON arguments are decoded and passed, together with ``text``, to the
    local function of the same name.

    Args:
        labels: entity labels offered to the model.
        text: the text to analyse.

    Returns:
        dict with "model_response" (the raw chat completion) and
        "function_response" (the value returned by the local function).

    Note:
        Any exception raised in the body triggers a retry (up to 5 attempts
        with random exponential backoff).
    """
    messages = [
          {"role": "system", "content": system_message(labels=labels)},
          {"role": "assistant", "content": assisstant_message()},
          {"role": "user", "content": user_message(text=text)}
      ]

    # Use the notebook's configured client and model constant so the API key
    # and model are set in exactly one place.
    response = client.chat.completions.create(
        model=OPENAI_MODEL,
        messages=messages,
        tools=generate_functions(labels),
        tool_choice={"type": "function", "function" : {"name": "enrich_entities"}},
        temperature=0,
        frequency_penalty=0,
        presence_penalty=0,
    )

    response_message = response.choices[0].message

    tool_calls = response_message.tool_calls
    if not tool_calls:
        # Fail with a clear message instead of an opaque "NoneType is not subscriptable".
        raise ValueError("Model response did not contain a tool call")
    tool_call = tool_calls[0]

    available_functions = {"enrich_entities": enrich_entities}
    function_name = tool_call.function.name

    function_to_call = available_functions[function_name]
    logging.info(f"function_to_call: {function_to_call}")

    function_args = json.loads(tool_call.function.arguments)
    logging.info(f"function_args: {function_args}")

    function_response = function_to_call(text, function_args)

    return {"model_response": response,
            "function_response": function_response}

In [31]:
# Sample ingredient list to run through the pipeline.
text = """6 cups thinly sliced apples, 1 tablespoon lemon juice (Optional), ¾ cup white sugar, 2 tablespoons all-purpose flour, ½ teaspoon ground cinnamon, ⅛ teaspoon ground nutmeg, ½ cup raisins (Optional), ½ cup chopped walnuts (Optional), 1 (9 inch) pie shell, ½ cup all-purpose flour, ½ cup packed brown sugar, 3 tablespoons butter"""
# Calls the OpenAI chat completions API; returns a dict with the model response and the function response.
result = run_openai_task(labels, text)

 2024-04-07 12:38:09,688 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
 2024-04-07 12:38:09,692 - INFO - function_to_call: <function enrich_entities at 0x121198c20>
 2024-04-07 12:38:09,692 - INFO - function_args: {'quantity': ['6 cups', '1 tablespoon', '¾ cup', '2 tablespoons', '½ teaspoon', '⅛ teaspoon', '½ cup', '1 (9 inch)', '½ cup', '½ cup', '3 tablespoons'], 'product': ['apples', 'lemon juice', 'white sugar', 'all-purpose flour', 'ground cinnamon', 'ground nutmeg', 'raisins', 'chopped walnuts', 'pie shell', 'all-purpose flour', 'packed brown sugar', 'butter']}


In [13]:
# Render the input text next to the function response as Markdown.
display(Markdown(f"""**Text:** {text}   
                     **Enriched_Text:** {result['function_response']}"""))

**Text:** 6 cups thinly sliced apples, 1 tablespoon lemon juice (Optional), ¾ cup white sugar, 2 tablespoons all-purpose flour, ½ teaspoon ground cinnamon, ⅛ teaspoon ground nutmeg, ½ cup raisins (Optional), ½ cup chopped walnuts (Optional), 1 (9 inch) pie shell, ½ cup all-purpose flour, ½ cup packed brown sugar, 3 tablespoons butter   
                     **Enriched_Text:** 6 cups thinly sliced [apples](https://en.wikipedia.org/wiki/Apple), 1 tablespoon [lemon juice](https://en.wikipedia.org/wiki/Lemon) (Optional), ¾ cup [white sugar](https://en.wikipedia.org/wiki/White_sugar), 2 tablespoons [all-purpose flour](https://en.wikipedia.org/wiki/Flour), ½ teaspoon [ground cinnamon](https://en.wikipedia.org/wiki/Cinnamon_challenge), ⅛ teaspoon [ground nutmeg](https://en.wikipedia.org/wiki/Nutmeg), ½ cup [raisins](https://en.wikipedia.org/wiki/Raisin) (Optional), ½ cup [chopped walnuts](https://en.wikipedia.org/wiki/Kharcho) (Optional), 1 (9 inch) [pie shell](https://en.wikipedia.org/wiki/Scotch_pie), ½ cup [all-purpose flour](https://en.wikipedia.org/wiki/Flour), ½ cup [packed brown sugar](https://en.wikipedia.org/wiki/Sugar), 3 tablespoons [butter](https://en.wikipedia.org/wiki/Butter)

In [14]:
# Show the full result dict (raw model response plus function response).
result

{'model_response': ChatCompletion(id='chatcmpl-8uvdXw4SumgkSVdDwzcyWOSLuh6wU', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=None, role='assistant', function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_DhsUzxFOrsrxuBEdJVlgYDmF', function=Function(arguments='{\n    "product": ["apples", "lemon juice", "white sugar", "all-purpose flour", "ground cinnamon", "ground nutmeg", "raisins", "chopped walnuts", "pie shell", "packed brown sugar", "butter"]\n}', name='enrich_entities'), type='function')]))], created=1708578615, model='gpt-3.5-turbo-0613', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=56, prompt_tokens=466, total_tokens=522)),
 'function_response': '6 cups thinly sliced [apples](https://en.wikipedia.org/wiki/Apple), 1 tablespoon [lemon juice](https://en.wikipedia.org/wiki/Lemon) (Optional), ¾ cup [white sugar](https://en.wikipedia.org/wiki/White_sugar), 2 tablespoons 

In [15]:
# arr1: product entities extracted for the sample recipe.
# arr2: product list mirroring the few-shot example answer.
# NOTE(review): 'unsalted water' in arr2 looks like a typo for 'unsalted butter';
# it is kept as-is here because changing it changes the printed match count.
arr1 = ['apples', 'lemon juice', 'white sugar', 'all-purpose flour', 'ground cinnamon', 'ground nutmeg', 'raisins', 'chopped walnuts', 'pie shell', 'packed brown sugar', 'butter']
arr2 = ["pie crust", "apples", "granulated sugar", "all-purpose flour", "ground cinnamon", "ground nutmeg", "salt", "vanilla extract", "unsalted water", "brown sugar", "baking powder", "vanilla ice cream", "whipped cream"]

def count_partial_matches(arr1, arr2):
    """
    Count pairs of strings from the two collections where one contains the other.

    For every (left, right) pair with a substring relationship, the shorter of
    the two (``right`` on a length tie) is recorded, unless ``left`` or
    ``right`` is already a substring of a previously recorded string.

    Args:
        arr1: first collection of strings.
        arr2: second collection of strings.

    Returns:
        The number of recorded matches.
    """
    recorded = []
    for left in arr1:
        for right in arr2:
            overlaps = left in right or right in left
            if not overlaps:
                continue

            # Find the first recorded entry that already covers this pair.
            covering = None
            for seen in recorded:
                if left in seen or right in seen:
                    covering = seen
                    break
            if covering:
                continue

            shorter = right if len(left) >= len(right) else left
            recorded.append(shorter)
    return len(recorded)

# Count the substring-style matches between the two product lists and print the total.
match_count = count_partial_matches(arr1, arr2)
print(match_count)

5
