### Preliminary approach with base LLMs

#### Use in-context learning for sentence completion

#### The following LLMs were used here:
- Llama3.1-8b
- Gemma2-9b
- Gemma2-27b
- Gemma2-2b

In [1]:
import os
import ollama
import openai
import json
import random
import dotenv
import regex as re

from tqdm import tqdm
from typing import Union, List, Any
from yaml import safe_load_all, safe_load

dotenv.load_dotenv()

True

In [2]:
def get_client(service="ollama") -> Union[ollama.Client, openai.Client]:
    """Build an API client for the requested LLM backend.

    Args:
        service: Backend name, either ``'ollama'`` or ``'openai'``.

    Returns:
        An ``ollama.Client`` whose host is read from the ``OLLAMA_ENDPOINT``
        environment variable, or an ``openai.Client`` whose API key is read
        from ``OPENAI_API_KEY``.

    Raises:
        AssertionError: If ``service`` is not one of the two supported names.
        KeyError: If the environment variable for the chosen backend is unset.
    """
    supported = ['ollama', 'openai']
    assert service in supported, "'service' should be either 'ollama' or 'openai'"

    if service == "openai":
        api_key = os.environ['OPENAI_API_KEY']
        return openai.Client(api_key=api_key)

    if service == "ollama":
        endpoint = os.environ['OLLAMA_ENDPOINT']
        return ollama.Client(host=endpoint)
    
def call_llm(messages: list[dict],
             model:str='llama3.1',
             temperature:float=0.2,
             top_k:Union[None, int]=None) -> str:
    """Send a chat conversation to an LLM and return the reply text.

    The backend is chosen from the model name: any name containing ``'gpt'``
    goes to OpenAI, everything else goes to Ollama.

    Args:
        messages: Chat messages, each a dict with ``role`` and ``content``.
        model: Model name passed through to the backend.
        temperature: Sampling temperature passed through to the backend.
        top_k: Top-k sampling option. Only forwarded on the Ollama path; the
            OpenAI request does not include it.

    Returns:
        The content of the assistant message in the backend's response.
    """
    # Guard clause: everything that is not an OpenAI model is served by Ollama.
    if 'gpt' not in model:
        sampling_options = {
            "top_k": top_k,
            "temperature": temperature
        }
        ollama_reply = get_client('ollama').chat(
            model=model,
            messages=messages,
            options=sampling_options,
        )
        return ollama_reply['message']['content']

    completion = get_client('openai').chat.completions.create(
        messages=messages,
        model=model,
        temperature=temperature,
    )
    return completion.choices[0].message.content

def parse_json(input_string:str) -> dict:
    """Extract the first brace-delimited object from a string and parse it.

    Args:
        input_string: Raw text (typically an LLM reply) that contains a
            ``{...}`` object, possibly surrounded by other text such as a
            Markdown code fence.

    Returns:
        The parsed object.

    Raises:
        IndexError: If ``input_string`` contains no balanced ``{...}`` span.
    """
    # Recursive pattern matching a balanced-brace span; `(?R)` requires the
    # third-party `regex` module, which this file imports under the name `re`.
    # To understand this regex: https://regex101.com/r/to8x5X/1
    pattern = re.compile(r'\{(?:[^{}]|(?R))*\}')

    first_object = pattern.search(input_string)
    if first_object is None:
        # IndexError is kept so the failure type matches what indexing an
        # empty `findall` result used to raise; only the message is improved.
        raise IndexError(f"no JSON object found in: {input_string!r}")

    # Parsed with YAML's safe loader rather than `json.loads`: replies often
    # use single-quoted, Python-dict-style strings, which strict JSON rejects
    # but YAML flow mappings accept.
    return safe_load(first_object.group(0))

In [3]:
def load_jsonl(path):
    """Read a JSON Lines file into a list of decoded records.

    Args:
        path: Path of the file to read; one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # encoding, so non-ASCII text decodes the same way everywhere.
    with open(path, encoding="utf-8") as input_file:
        # Blank lines (e.g. a trailing newline at end of file) are skipped;
        # json.loads would otherwise raise on them.
        return [json.loads(line) for line in input_file if line.strip()]

def load_data(root_path):
    """Load every JSON Lines file directly under a directory.

    Args:
        root_path: Directory to scan (not recursive).

    Returns:
        A dict mapping each file name (without the directory part) to the
        list of records returned by ``load_jsonl`` for that file.
    """
    dataset = {}

    # Sorted so the resulting dict has a deterministic key order regardless
    # of the order in which the OS lists the directory.
    for filename in sorted(os.listdir(root_path)):
        filepath = os.path.join(root_path, filename)

        # Sub-directories cannot be opened as files; skip them instead of
        # failing the whole load.
        if not os.path.isfile(filepath):
            continue

        dataset[filename] = load_jsonl(filepath)

    return dataset

In [4]:
# Load the entries listed in data/L1 into a dict keyed by file name.
dataset = load_data('data/L1')

In [5]:
def get_random_example(inputs, labels, n=3):
    """Pick ``n`` random (input, label) pairs from two parallel sequences.

    Positions are drawn with ``random.choices``, i.e. with replacement, so the
    same pair can be returned more than once.

    Args:
        inputs: Sequence of input records.
        labels: Sequence of labels, indexed in parallel with ``inputs``.
        n: Number of pairs to draw.

    Returns:
        A ``(random_inputs, random_labels)`` tuple of two lists of length
        ``n``; element ``i`` of each list comes from the same position.
    """
    index_pool = [*range(len(inputs))]
    positions = random.choices(index_pool, k=n)

    random_inputs = []
    for position in positions:
        random_inputs.append(inputs[position])

    random_labels = []
    for position in positions:
        random_labels.append(labels[position])

    return random_inputs, random_labels

In [6]:
# Candidate model names; this list is not referenced by the other visible cells.
models = ['llama3.1', 'gemma2:2b', 'gemma2']

# Base names of the evaluation split and of the split used for few-shot examples.
# NOTE(review): 'attrebute' is presumably the spelling of the files on disk —
# confirm before "correcting" it, since these strings are used as dataset keys.
test_from = 'attrebute_val'
train_from = 'attrebute_train'

# Each split has a '<name>.data' file (inputs) and a '<name>.solution' file
# (labels); `dataset` is keyed by those file names.
test_inputs = dataset[f'{test_from}.data']
test_labels = dataset[f'{test_from}.solution']
train_inputs = dataset[f'{train_from}.data']
train_labels = dataset[f'{train_from}.solution']

In [7]:
# System message sent with every request: asks the model for JSON-only output.
system_prompt = """You are an helpful assistant who only answers in JSON and nothing else.
You do not answer in Markdown, and not use " or ' within strings"""

# User message template. Placeholders filled via str.format:
#   {example_1}..{example_3} - rendered few-shot examples (see examples_prompt)
#   {title}, {store}, {details_Manufacturer} - fields of the product to label
# The template ends at "Attribute-values:" so the model continues from there.
user_prompt = """Task: Attribute-Value Prediction From E-Commerce Product Descriptions
Example 1:
{example_1}

Example 2:
{example_2}

Example 3:
{example_3}

Based on the above examples:

I want you to help me find attribute-value from the following description:
Product data:
Title: {title}
Store: {store}
Manufacturer: {details_Manufacturer}
Attribute-values:
"""

# Template for a single few-shot example: the same product fields as above
# followed by the known answer in {attribute_values}.
examples_prompt = """Product data:
Title: {title}
Store: {store}
Manufacturer: {details_Manufacturer}
Attribute-values:
{attribute_values}"""

In [8]:
def prepare_prompts(question):
    """Render the few-shot user prompt for one product.

    Draws random (input, label) pairs from the module-level ``train_inputs``
    and ``train_labels``, renders each with ``examples_prompt``, and inserts
    them together with the fields of ``question`` into ``user_prompt``.

    Args:
        question: Mapping with the product fields referenced by
            ``user_prompt`` (``title``, ``store``, ``details_Manufacturer``);
            extra keys are ignored by ``str.format``.

    Returns:
        The fully formatted user prompt string.
    """
    # Label fields shown to the model, in the order they appear in the prompt.
    label_keys = ('details_Brand', 'L0_category', 'L1_category',
                  'L2_category', 'L3_category', 'L4_category')

    shot_inputs, shot_labels = get_random_example(train_inputs, train_labels)

    # Keys are example_1, example_2, ... to match user_prompt's placeholders.
    rendered = {
        f'example_{position}': examples_prompt.format(
            # The answer is shown as the str() of a Python dict.
            attribute_values=str({key: shot_label[key] for key in label_keys}),
            **shot_input,
        )
        for position, (shot_input, shot_label)
        in enumerate(zip(shot_inputs, shot_labels), start=1)
    }

    return user_prompt.format(**rendered, **question)

In [9]:
# Try the pipeline on the first evaluation record before running the full loop.
test_x = test_inputs[0]
test_y = test_labels[0]

response = call_llm(
    model='gemma2',
    temperature=0,
    messages=[
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': prepare_prompts(test_x)},
    ],
)

In [10]:
import pandas as pd

In [18]:
from yaml import YAMLError

# One entry per evaluation record, in the same order as test_inputs.
predictions = []

for test_x, test_y in tqdm(zip(test_inputs, test_labels), total=len(test_inputs)):
    response=call_llm(
        messages=[
            dict(role='system', content=system_prompt),
            dict(role='user', content=prepare_prompts(test_x))
        ],
        model='gemma2',
        temperature=0
    )

    # A single malformed reply must not abort a run over the whole split:
    # parse_json raises IndexError when the reply has no {...} object and the
    # YAML loader raises YAMLError when the object cannot be parsed.
    try:
        parsed = parse_json(response)
    except (IndexError, YAMLError):
        # None keeps predictions aligned index-for-index with test_inputs.
        parsed = None

    predictions.append(parsed)

  0%|          | 34/95035 [00:52<40:26:29,  1.53s/it]


KeyboardInterrupt: 

In [10]:
test_x  

{'indoml_id': 0,
 'title': 'Pendleton, Eco-Wise Washable Wool Blanket, Black Watch, King',
 'store': 'Pendleton',
 'details_Manufacturer': 'Pendleton Woolen Mills'}

In [11]:
test_y

{'indoml_id': 0,
 'details_Brand': 'Pendleton',
 'L0_category': 'Home & Kitchen',
 'L1_category': 'Bedding',
 'L2_category': 'Blankets & Throws',
 'L3_category': 'Bed Blankets',
 'L4_category': 'na'}

In [12]:
response

"```json\n{'details_Brand': 'Pendleton', 'L0_category': 'Home & Garden', 'L1_category': 'Home', 'L2_category': 'Bedding', 'L3_category': 'Blankets', 'L4_category': 'Wool Blankets'}\n```"

In [13]:
parse_json(response)

{'details_Brand': 'Pendleton',
 'L0_category': 'Home & Garden',
 'L1_category': 'Home',
 'L2_category': 'Bedding',
 'L3_category': 'Blankets',
 'L4_category': 'Wool Blankets'}