In [None]:
from unsloth import FastLanguageModel
from peft import LoraConfig
from jinja2 import Template
from datasets import load_dataset

import torch

# Settings consumed by FastLanguageModel.from_pretrained in the next cell.
# NOTE(review): 150 tokens looks shorter than the few-shot prompt assembled later
# in this notebook -- confirm it is sufficient if this model is fed that prompt.
max_seq_length = 150 # Choose any! We auto support RoPE Scaling internally!
# Pinned to bfloat16 here; the alternatives (None = auto-detect, float16 for T4/V100) are listed inline.
dtype = torch.bfloat16 # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

In [None]:
# Load the google/gemma-2b checkpoint and its tokenizer through Unsloth, using the
# sequence length, dtype and 4-bit flag configured in the previous cell.
# NOTE(review): none of the cells visible here use `model` / `tokenizer` (inference
# below goes through Ollama) -- confirm this load is still needed.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "google/gemma-2b",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

### Direct Inference Using prompts

In [88]:
import json
import os
import ollama
import regex as re
from random import choices
from tqdm import tqdm
from yaml import safe_load


In [86]:
def load_jsonl(path):
    """Read a JSON Lines file into a list of decoded records.

    Args:
        path: Path of the file to read; one JSON document per line.

    Returns:
        list: The decoded value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly instead of relying on the platform default
    # encoding, which breaks on non-ASCII product titles on some systems.
    with open(path, encoding='utf-8') as input_file:
        # Blank lines (e.g. a stray empty line at the end of the file) are
        # skipped; json.loads would raise on them.
        return [json.loads(line) for line in input_file if line.strip()]

def load_data(root_path):
    """Load every JSON Lines file located directly inside ``root_path``.

    Args:
        root_path: Directory to scan (not recursive).

    Returns:
        dict: Maps each file name (extension included) to the list returned
        by ``load_jsonl`` for that file. Empty if the directory holds no files.

    Raises:
        OSError: If ``root_path`` cannot be listed or a file cannot be opened.
    """
    dataset = {}

    # sorted() keeps the key order independent of the filesystem's listing order
    for filename in sorted(os.listdir(root_path)):
        filepath = os.path.join(root_path, filename)

        # Sub-directories (such as Jupyter's .ipynb_checkpoints) cannot be
        # opened as files, so they are skipped instead of crashing the load.
        if not os.path.isfile(filepath):
            continue

        dataset[filename] = load_jsonl(filepath)

    return dataset


def parse_json(input_string) -> dict:
    """Extract and decode the first brace-delimited object found in ``input_string``.

    The pattern recurses into itself (``(?R)``, a feature of the ``regex``
    module imported as ``re``) so nested ``{...}`` pairs are matched as one
    span. The first matched span is decoded with ``safe_load``.

    Raises:
        IndexError: If the text contains no balanced ``{...}`` span.
    """
    balanced_braces = re.compile(r'\{(?:[^{}]|(?R))*\}')
    candidates = balanced_braces.findall(input_string)
    first_object = candidates[0]
    return safe_load(first_object)


In [10]:
# Read the files under data/L1 into a dict keyed by file name.
dataset = load_data('data/L1')

In [11]:
# Candidate model tags; only 'gemma2:2b' is referenced by the cells visible here.
models = ['llama3.1', 'gemma2:2b', 'gemma2']

# Key prefixes of the three splits inside `dataset`. The spelling is kept as is
# because it has to match the keys produced by load_data.
test_from = 'attrebute_val'
train_from = 'attrebute_train'
target_from = 'attrebute_test'

# Each split is stored under '<prefix>.data' (inputs) and '<prefix>.solution'
# (labels); the target split is only read for its inputs.
test_inputs = dataset[f'{test_from}.data']
test_labels = dataset[f'{test_from}.solution']
train_inputs = dataset[f'{train_from}.data']
train_labels = dataset[f'{train_from}.solution']
target_inputs = dataset[f'{target_from}.data']

In [67]:
def get_random_examples(k=3, inputs=None, labels=None):
    """Draw ``k`` random training pairs and format them as few-shot examples.

    Args:
        k: Number of examples to draw. The prompt template in this notebook
            has placeholders for ``example_1`` to ``example_3``, so use at
            least 3 when the result is passed to it.
        inputs: Sequence of product records to draw from. Defaults to the
            notebook-level ``train_inputs``.
        labels: Sequence of label records aligned index-for-index with
            ``inputs``. Defaults to the notebook-level ``train_labels``.

    Returns:
        dict: Keys ``'example_1'`` .. ``'example_<k>'``; each value is the
        JSON-encoded product followed by its JSON-encoded categories.

    Raises:
        IndexError: If ``inputs`` is empty and ``k`` is greater than zero.
    """
    if inputs is None:
        inputs = train_inputs
    if labels is None:
        labels = train_labels

    # Honour the requested k (it used to be ignored in favour of a literal 3).
    # choices() samples with replacement, so a record may be drawn twice.
    indexes = choices(range(len(inputs)), k=k)

    examples = {}

    for position, idx in enumerate(indexes, start=1):
        product_json = json.dumps(inputs[idx])
        categories_json = json.dumps(labels[idx])

        examples[f'example_{position}'] = f"""Product:\n{product_json}\nCategories:\n{categories_json}"""

    return examples

In [131]:
# template_full_prompt = """<|system|>You are a product expert who can tell what categories a product belongs to. You only answer in JSON<|end|>
# <|user|>
# You are a product expert who can tell what categories a product belongs to. You only answer in JSON

# I want you to help me with giving category labels to products based on the product's title, manufacturer, and store name.
# details_Brand, L0_category, L1_category, L2_category, L3_category, L4_category: These are the attribute that has to be predicted.
# Here are 3 examples of what I want you to do:
# 1.
# {example_1}

# 2.
# {example_2}

# 3.
# {example_3}

# Based on the previous examples and instructions, complete the following:
# Product:
# {product}
# Categories:
# <|end|>
# <|assistant|>"""

# Few-shot prompt filled in with str.format; the placeholders are {example_1},
# {example_2}, {example_3} and {product}. The worked example inside the text is
# written without surrounding braces, so str.format does not read it as a placeholder.
# The text is wrapped in <start_of_turn>/<end_of_turn> markers and ends on an
# open model turn.
# NOTE(review): the prompt wording contains typos ("attribute that has to be",
# "Dishwasher detergence"); left untouched because editing the text changes
# what the model sees -- fix deliberately and re-run the evaluation.
template_full_prompt = """<start_of_turn>user
You are a product expert who can tell what categories a product belongs to. You only answer in JSON

I want you to help me with giving category labels to products based on the product's title, manufacturer, and store name.
details_Brand, L0_category, L1_category, L2_category, L3_category, L4_category: These are the attribute that has to be predicted.
L0 being the most general category label possible for the product
L1 being a more specific subcategory of L0
L2 being a more specific subcategory of L1
L3 being a more specific subcategory of L2 ('na' if this does not apply)
L4 being a more specific subcategory of L3 ('na' if this does not apply or if L3 is also 'na')

How to reason:
Example Product:
"indoml_id": 275, "title": "Cascade Actionpacs Dishwasher Detergent, Fresh Scent, 110 Count", "store": "Cascade", "details_Manufacturer": "Cascade"

Categories:
"indoml_id": 275, "details_Brand": "Cascade", "L0_category": "Health & Household", "L1_category": "Household Supplies", "L2_category": "Dishwashing", "L3_category": "Dishwasher Detergent", "L4_category": "na"

L0 is Health & Household because a Dishwasher Detergent is used with dishwashers, which is a health & household appliance
L1 is Household Supplies as dishwashers are household appliances
L2 is Dishwashing as Dishwashers wash dishes
L3 is Dishwasher Detergent as the product Dishwasher detergence
L4 is na as it does not apply

Here are 3 more examples of what I want you to do:
1.
{example_1}

2.
{example_2}

3.
{example_3}

Based on the previous examples and instructions, complete the following:
Product:
{product}
Categories:
<end_of_turn>
<start_of_turn>model"""

In [132]:
# Pick a single record of the validation split (45th from the end) as a smoke
# test and display its input next to the expected labels.
idx = -45
test_inputs[idx], test_labels[idx]

({'indoml_id': 94990,
  'title': 'VIGO Janus Glass Vessel Bathroom Sink and Waterfall Faucet with Pop Up, Chrome',
  'store': 'VIGO',
  'details_Manufacturer': 'Vigo Industries'},
 {'indoml_id': 94990,
  'details_Brand': 'VIGO',
  'L0_category': 'Tools & Home Improvement',
  'L1_category': 'Kitchen & Bath Fixtures',
  'L2_category': 'Bathroom Fixtures',
  'L3_category': 'Bathroom Sinks',
  'L4_category': 'Vessel Sinks'})

In [133]:
# Query gemma2:2b with the few-shot prompt for the selected record.
# The template already carries its own <start_of_turn>/<end_of_turn> markers,
# so raw=True tells Ollama to send it verbatim; without it the model's chat
# template is applied on top and the turn markers end up duplicated.
# The few-shot examples are drawn at random, so the reply varies between runs.
output = ollama.generate(
    model='gemma2:2b',
    prompt=template_full_prompt.format(
        product=json.dumps(test_inputs[idx]), **get_random_examples()),
    raw=True,
)

In [134]:
# Extract the first {...} object from the model's reply text and decode it.
parse_json(output['response'])

{'indoml_id': 94990,
 'details_Brand': 'Vigo Industries',
 'L0_category': 'Home & Kitchen',
 'L1_category': 'Bathroom Fixtures',
 'L2_category': 'Sinks',
 'L3_category': 'Vessel Sinks',
 'L4_category': 'Bathroom Sinks'}