In [1]:
# autoreload your package
%load_ext autoreload
%autoreload 2

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

print("Loading model and tokenizer...")
model_name = "databricks/dolly-v2-3b"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    use_cache=True,
    torch_dtype=torch.float16,
    attn_implementation="eager",
).to("cuda:0")
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, use_cache=True)
print("Loaded model and tokenizer")

Loading model and tokenizer...
Loaded model and tokenizer


# Continue

In [3]:
from prob_jsonformer.format import highlight_values
from prob_jsonformer.main import Jsonformer

ecomm = {
    "type": "object",
    "properties": {
        "store": {
            "type": "object",
            "properties": {
                "name": {"type": "string"},
                "location": {"type": "string"},
                "p_enum": {
                    "type": "p_enum",
                    "enum": ["ski", "snowboard", "walk", "pretend"],
                },
                "inventory": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "productId": {"type": "string"},
                            "name": {"type": "string"},
                            "description": {"type": "string"},
                            "category": {"type": "string"},
                            "price": {"type": "number"},
                            "inStock": {"type": "boolean"},
                            "rating": {"type": "number"},
                            "images": {"type": "array", "items": {"type": "string"}},
                        },
                    },
                },
            },
        }
    },
}


builder = Jsonformer(
    model=model,
    tokenizer=tokenizer,
    json_schema=ecomm,
    prompt="write a description about mike's ski shop which sells premium skis and snowboards",
    max_string_token_length=20,
)

print("Generating...")
output = builder()

highlight_values(output)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generating...


KeyError: 'values'

In [None]:
car = {
    "type": "object",
    "properties": {
        "make": {"type": "string"},
        "model": {"type": "p_enum", "enum": ["Mazda", "Kea"]},
        "new": {"type": "p_enum", "enum": ["true", "false"]},
        "rating": {"type": "p_enum", "enum": ["1", "2", "3", "4"]},
        "year": {"type": "number"},
        "colors_available": {
            "type": "array",
            "items": {"type": "string"},
        },
    },
}

builder = Jsonformer(
    model=model,
    tokenizer=tokenizer,
    json_schema=car,
    prompt="generate an example car",
)

print("Generating...")
output = builder()

highlight_values(output)

In [None]:
complex_car = {
    "type": "object",
    "properties": {
        "car": {
            "type": "object",
            "properties": {
                "make": {"type": "string"},
                "model": {"type": "string"},
                "year": {"type": "number"},
                "colors": {
                    "type": "p_enum",
                    "enum": ["red", "green", "blue", "black", "white"],
                },
                "as_new": {"type": "p_enum", "enum": ["true", "false"]},
                "rating": {"type": "p_enum", "enum": ["1", "2", "3", "4"]},
                "features": {
                    "type": "object",
                    "properties": {
                        "audio": {
                            "type": "object",
                            "properties": {
                                "brand": {"type": "string"},
                                "speakers": {"type": "number"},
                                "hasBluetooth": {"type": "boolean"},
                            },
                        },
                        "safety": {
                            "type": "object",
                            "properties": {
                                "airbags": {"type": "number"},
                                "parkingSensors": {"type": "boolean"},
                                "laneAssist": {"type": "boolean"},
                            },
                        },
                        "performance": {
                            "type": "object",
                            "properties": {
                                "engine": {"type": "string"},
                                "horsepower": {"type": "number"},
                                "topSpeed": {"type": "number"},
                            },
                        },
                    },
                },
            },
        },
        "owner": {
            "type": "object",
            "properties": {
                "firstName": {"type": "string"},
                "lastName": {"type": "string"},
                "age": {"type": "number"},
            },
        },
    },
}
builder = Jsonformer(
    model=model,
    tokenizer=tokenizer,
    json_schema=complex_car,
    prompt="generate an example Rolls Royce Phantom",
)

print("Generating...")
output = builder()

highlight_values(output)

## Readme example

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "databricks/dolly-v2-3b"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
from prob_jsonformer import Jsonformer

json_schema = {
    "type": "object",
    "properties": {
        # we can return the probability of each choice, even if they are multiple tokens
        "age_probs": {"type": "p_enum", "values": [str(s) for s in range(10, 20)]},
        # we can return the probabilistic weighted mean of a range
        "age_wmean": {"type": "p_integer", "minimum": 10, "maximum": 20},
        # the prob of true and false
        "is_student_probs": {"type": "p_enum", "values": ["true", "false"]},
        "is_student": {"type": "boolean"},
        # we've merged patches for enum, integer, null, union - currently mising from jsonformer
        "name": {"type": "string", "maxLength": 4},
        "age": {"type": "integer"},
        "unit_time": {"type": "number"},
        "courses": {"type": "array", "items": {"type": "string"}},
        "trim": {"type": ["string", "null"]},
        "color": {
            "type": "enum",
            "values": ["red", "green", "blue", "brown", "white", "black"],
        },
    },
}
prompt = "Generate a young person's information based on the following schema:"
jsonformer = Jsonformer(model, tokenizer, json_schema, prompt)
generated_data = jsonformer()

generated_data

{'age_probs': [{'prob': 0.62353515625, 'choice': '10'},
  {'prob': 0.349609375, 'choice': '12'},
  {'prob': 0.01123809814453125, 'choice': '11'},
  {'prob': 0.00760650634765625, 'choice': '16'},
  {'prob': 0.0025482177734375, 'choice': '13'},
  {'prob': 0.0025081634521484375, 'choice': '15'},
  {'prob': 0.0018062591552734375, 'choice': '14'},
  {'prob': 0.00104522705078125, 'choice': '18'},
  {'prob': 0.00011551380157470703, 'choice': '17'},
  {'prob': 5.042552947998047e-05, 'choice': '19'}],
 'age_wmean': 15.544570922851562,
 'is_student_probs': [{'prob': 0.962890625, 'choice': 'true'},
  {'prob': 0.037322998046875, 'choice': 'false'}],
 'is_student': False,
 'name': 'John',
 'age': 17,
 'unit_time': 0.5,
 'courses': ['C++'],
 'trim': None,
 'color': 'green'}