In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all modules (except
# those excluded with %aimport) before each cell executes, so edits to the
# local package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load the Dolly v2 3B checkpoint and its tokenizer once; the cells below
# reuse the resulting `model` and `tokenizer` globals.
print("Loading model and tokenizer...")
model_name = "databricks/dolly-v2-3b"

# Fall back to CPU when no CUDA device is visible instead of failing inside
# `.to("cuda:0")`. Half precision is kept for the GPU path only; float32 is
# used on CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    use_cache=True,
    torch_dtype=torch.float16 if device != "cpu" else torch.float32,
    attn_implementation='eager',
).to(device)

# `use_cache` is a model/generation setting and not a documented tokenizer
# argument, so it is no longer passed to the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
print("Loaded model and tokenizer")

  from .autonotebook import tqdm as notebook_tqdm


Loading model and tokenizer...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded model and tokenizer


# Schema-guided generation examples

In [4]:
from prob_jsonformer.format import highlight_values
from prob_jsonformer.main import Jsonformer

# Schema for a single entry of the store's inventory array.
product_schema = {
    "type": "object",
    "properties": {
        "productId": {"type": "string"},
        "name": {"type": "string"},
        "description": {"type": "string"},
        "category": {"type": "string"},
        "price": {"type": "number"},
        "inStock": {"type": "boolean"},
        "rating": {"type": "number"},
        "images": {"type": "array", "items": {"type": "string"}},
    },
}

# Schema for the store object. Property order is kept exactly as in the
# original nested literal.
store_schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "location": {"type": "string"},
        # In the recorded output this field comes back as a list of
        # {prob, choice} entries, one per enum option.
        "choice_probs": {
            "type": "choice_probs",
            "enum": ["ski", "snowboard", "walk", "pretend"],
        },
        "inventory": {"type": "array", "items": product_schema},
    },
}

# Top-level schema: a single "store" object.
ecomm = {"type": "object", "properties": {"store": store_schema}}

ecomm_prompt = "write a description about mike's ski shop which sells premium skis and snowboards"
builder = Jsonformer(
    model=model,
    tokenizer=tokenizer,
    json_schema=ecomm,
    prompt=ecomm_prompt,
    # presumably caps the token length of each generated string - TODO confirm
    max_string_token_length=20,
)

print("Generating...")
output = builder()

# Pretty-print the generated structure with the generated values highlighted.
highlight_values(output)

Generating...
{
  store: {
    name: [32m"Mike's Ski Shop"[0m,
    location: [32m"Somewhere"[0m,
    choice_probs: [
      {
        prob: [32m0.01739501953125[0m,
        choice: [32m"pretend"[0m
      },
      {
        prob: [32m0.002094268798828125[0m,
        choice: [32m"snowboard"[0m
      },
      {
        prob: [32m0.0007467269897460938[0m,
        choice: [32m"walk"[0m
      },
      {
        prob: [32m0.97998046875[0m,
        choice: [32m"ski"[0m
      }
    ],
    inventory: [
      {
        productId: [32m"1"[0m,
        name: [32m"Snowboard X-15"[0m,
        description: [32m"Snowboard for all levels"[0m,
        category: [32m"Snowboards"[0m,
        price: [32m20.0[0m,
        inStock: [32mTrue[0m,
        rating: [32m5.0[0m,
        images: [
          [32m"https://s3.amazonaws.com/mikesskisport/images/Snow"[0m
        ]
      },
      {
        productId: [32m"2"[0m,
        name: [32m"Mike's Ski Shop Exclusive"[0m,
        

In [None]:
# Flat car schema mixing plain fields with `choice_probs` fields.
car_properties = {
    "make": {"type": "string"},
    # NOTE(review): "Kea" may be a typo for "Kia" - left unchanged, confirm intent.
    "model": {"type": "choice_probs", "enum": ["Mazda", "Kea"]},
    "new": {"type": "choice_probs", "enum": ["true", "false"]},
    # Enum options are the strings "1" through "4".
    "rating": {"type": "choice_probs", "enum": [str(score) for score in range(1, 5)]},
    "year": {"type": "number"},
    "colors_available": {"type": "array", "items": {"type": "string"}},
}
car = {"type": "object", "properties": car_properties}

builder = Jsonformer(
    model=model,
    tokenizer=tokenizer,
    json_schema=car,
    prompt="generate an example car",
)

print("Generating...")
output = builder()

# Pretty-print the generated structure with the generated values highlighted.
highlight_values(output)

In [None]:
# Sub-schemas for the three feature groups nested under car.features.
audio_schema = {
    "type": "object",
    "properties": {
        "brand": {"type": "string"},
        "speakers": {"type": "number"},
        "hasBluetooth": {"type": "boolean"},
    },
}
safety_schema = {
    "type": "object",
    "properties": {
        "airbags": {"type": "number"},
        "parkingSensors": {"type": "boolean"},
        "laneAssist": {"type": "boolean"},
    },
}
performance_schema = {
    "type": "object",
    "properties": {
        "engine": {"type": "string"},
        "horsepower": {"type": "number"},
        "topSpeed": {"type": "number"},
    },
}
features_schema = {
    "type": "object",
    "properties": {
        "audio": audio_schema,
        "safety": safety_schema,
        "performance": performance_schema,
    },
}

# The car object: free-form fields, three `choice_probs` fields, then the
# nested feature groups. Property order matches the original literal.
car_schema = {
    "type": "object",
    "properties": {
        "make": {"type": "string"},
        "model": {"type": "string"},
        "year": {"type": "number"},
        "colors": {
            "type": "choice_probs",
            "enum": ["red", "green", "blue", "black", "white"],
        },
        "as_new": {"type": "choice_probs", "enum": ["true", "false"]},
        "rating": {"type": "choice_probs", "enum": ["1", "2", "3", "4"]},
        "features": features_schema,
    },
}

owner_schema = {
    "type": "object",
    "properties": {
        "firstName": {"type": "string"},
        "lastName": {"type": "string"},
        "age": {"type": "number"},
    },
}

# Top-level schema: a car plus its owner.
complex_car = {
    "type": "object",
    "properties": {"car": car_schema, "owner": owner_schema},
}

builder = Jsonformer(
    model=model,
    tokenizer=tokenizer,
    json_schema=complex_car,
    prompt="generate an example Rolls Royce Phantom",
)

print("Generating...")
output = builder()

# Pretty-print the generated structure with the generated values highlighted.
highlight_values(output)

## README example

In [None]:
from prob_jsonformer import Jsonformer
from transformers import AutoModelForCausalLM, AutoTokenizer

# Self-contained setup for the README snippet: load the tokenizer and model
# with library defaults (no dtype or device override is passed here).
# Running this cell rebinds the `model` and `tokenizer` globals.
model_name = "databricks/dolly-v2-3b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [1]:
# Properties of the README example schema, one entry per field type being
# demonstrated. Order matches the original literal.
person_properties = {
    "name": {"type": "string", "maxLength": 4},
    # Enum options are the strings "10" through "29".
    "age_probs": {"type": "choice_probs", "enum": list(map(str, range(10, 30)))},
    "age_wmean": {"type": "range_mean", "minimum": 10, "maximum": 30},
    "is_student_probs": {"type": "choice_probs", "enum": ["true", "false"]},
    "is_student": {"type": "boolean"},
    "age": {"type": "integer"},
    "unit_time": {"type": "number"},
    "courses": {"type": "array", "items": {"type": "string"}},
    "trim": {"type": ["string", "null"]},
    "color": {
        "type": "enum",
        "values": ["red", "green", "blue", "brown", "white", "black"],
    },
}
json_schema = {"type": "object", "properties": person_properties}

prompt = "Generate a young person's information based on the following schema:"
jsonformer = Jsonformer(
    model=model,
    tokenizer=tokenizer,
    json_schema=json_schema,
    prompt=prompt,
    temperature=0,
)
generated_data = jsonformer()

# The bare last expression makes the notebook display the generated dict.
generated_data

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'name': 'John',
 'age': 20,
 'age_probs': [{'prob': 0.856144905090332, 'choice': '12'},
  {'prob': 0.045701637864112854, 'choice': '10'},
  {'prob': 0.030096691101789474, 'choice': '20'},
  {'prob': 0.01899518258869648, 'choice': '11'},
  {'prob': 0.013291668146848679, 'choice': '16'},
  {'prob': 0.013288195244967937, 'choice': '14'},
  {'prob': 0.011642636731266975, 'choice': '18'},
  {'prob': 0.005356263369321823, 'choice': '15'},
  {'prob': 0.0035301733296364546, 'choice': '13'},
  {'prob': 0.0010820770403370261, 'choice': '21'},
  {'prob': 0.0003798121470026672, 'choice': '19'},
  {'prob': 0.0002950581256300211, 'choice': '17'},
  {'prob': 7.64212163630873e-05, 'choice': '22'},
  {'prob': 4.703202284872532e-05, 'choice': '23'},
  {'prob': 2.3594444428454153e-05, 'choice': '25'},
  {'prob': 1.987080577237066e-05, 'choice': '24'},
  {'prob': 1.821534169721417e-05, 'choice': '26'},
  {'prob': 9.411132850800641e-06, 'choice': '28'},
  {'prob': 7.120665941329207e-07, 'choice': '27'},
 