In [1]:
import json
import outlines
from transformers import AutoTokenizer
import torch
from textwrap import dedent

In [2]:
model_name = "microsoft/Phi-3-medium-4k-instruct"

In [3]:
model = outlines.models.transformers(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [4]:
generator = outlines.generate.text(model)

In [5]:
example_text = dedent("""
    John Smith here, I recently purchased a pair of socks but unfortunately they do not match! 
    My order was D123454
""").strip()

In [6]:
example_text

'John Smith here, I recently purchased a pair of socks but unfortunately they do not match! \nMy order was D123454'

## Generating data



In [40]:
import re

prompt_messages = [{
    "role": "user",
    "content": dedent("""
    You are an agent designed to create simulated customer complaints. The
    complaints are essentially short text messages that describe a customer,
    their problem, and provide an order number.

    You will build the complaint in parts based on the user request. The
    complaint will be about a product from a specified department, but you
    will not mention the department name directly.

    For example, if you are asked about something from the 'kitchen' department 
    you might mention an 'knife' but you won't mention the department.
    """)
},{ 
    "role": "agent",
    "content": dedent("""
    I understand the task, and will wait for the you to instruct me on
    next steps.
    """)
}]

intro_prompt = {
    "role": "user",
    "content": "Start the message with a short intro stating the customer's name."
}
possible_intros = [
    r'(Hi! This is [A-z][a-z]{3,10} [A-z][a-z]{3,10})\.',
    r'(Hey, my name is [A-z][a-z]{3,10} [A-z][a-z]{3,10})\.',
]
name_section = rf"({'|'.join(possible_intros)})"

generator_name_section = outlines.generate.regex(model, name_section)

prompt_messages.append(intro_prompt)
prompt_1 = tokenizer.apply_chat_template(prompt_messages, tokenize=False)
prompt_1

Compiling FSM index for all state transitions: 100%|██████████████████| 47/47 [00:00<00:00, 139.47it/s]


"<|user|>\n\nYou are an agent designed to create simulated customer complaints. The\ncomplaints are essentially short text messages that describe a customer,\ntheir problem, and provide an order number.\n\nYou will build the complaint in parts based on the user request. The\ncomplaint will be about a product from a specified department, but you\nwill not mention the department name directly.\n\nFor example, if you are asked about something from the 'kitchen' department \nyou might mention an 'knife' but you won't mention the department.\n<|end|>\n<|assistant|>\n<|user|>\nStart the message with a short intro stating the customer's name.<|end|>\n<|assistant|>\n"

In [42]:
intro_result = generator_name_section(prompt_1)
intro_result

'Hi! This is Emily Thompson.'

In [43]:
prompt_messages.append({
    "role": "agent",
    "content": intro_result,
})

In [49]:
department="electronics"
complaint_prompt = {
    "role":"user", 
    "content": dedent(f"""
                    Good! Now write a short description of the problem with an item from the {department} department,
                    but don't mention the actual name of the department the product comes from!
                    """)
}
prompt_messages.append(complaint_prompt)

prompt_2 = tokenizer.apply_chat_template(prompt_messages, tokenize=False)

complaint_section = r'I recently ordered [\w\s,.!\n]{120,240}'
generator_complaint = outlines.generate.regex(model, complaint_section)



Compiling FSM index for all state transitions: 100%|█████████████████| 260/260 [00:03<00:00, 67.92it/s]


In [51]:
complaint_result = generator_complaint(prompt_2)
prompt_messages.append({
    "role": "agent",
    "content": complaint_result
})

In [55]:
possible_order_numbers = [
     r'(My order was (A|D|Z)[0-9]{6})',
     r'(This is order (A|D|Z)[0-9]{2}-[0-9]{4})',
     r'(The order number is (A|D|Z)[0-9]{2}-[0-9]{4})'
 ]

order_number_section = rf"({'|'.join(possible_order_numbers)})"

order_number_message = {
    "role": "user",
    "content": dedent("""
    Finally, add a statement about the order number which starts with letter 'A', 'D' or 'Z' and consists of 6 digits after.
    """)
}
prompt_messages.append(order_number_message)

prompt_3 = tokenizer.apply_chat_template(prompt_messages, tokenize=False)
generator_order_number = outlines.generate.regex(model, order_number_section);

Compiling FSM index for all state transitions: 100%|██████████████████| 54/54 [00:00<00:00, 212.10it/s]


In [57]:
order_number = generator_order_number(prompt_3)
order_number

'My order was Z432101'

In [8]:
r'['+name_section+r']'+complaint_section

'[((Hi! This is [A-z][a-z]{3,10} [A-z][a-z]{3,10})|([A-z][a-z]{3,10} [A-z][a-z]{3,10} here)|(Hey, my name is [A-z][a-z]{3,10} [A-z][a-z]{3,10}))][\\w\\s,.!\\n]{1,240}'

In [9]:
re.match(name_section + complaint_section + order_number_section, example_text)[0]

'John Smith here, I recently purchased a pair of socks but unfortunately they do not match! \nMy order was D123454'

In [10]:
messages = [
    {"role": "user",
     "content": """
     Generate a text message complaint from a user about an item you would find in the electronics department,
     the user provides their name and order number
     """}
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False)
generator(prompt, max_tokens=40)

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
You are not running the flash-attention implementation, expect numerical differences.


'Subject: Urgent Issue with My Recent Electronics Order - #12345\n\nDear Electronics Department Team,\n\nI hope this message finds you well. I'

In [11]:
generator_struct = outlines.generate.regex(model, complaint_section)

Compiling FSM index for all state transitions: 100%|█████████████████| 241/241 [00:03<00:00, 67.21it/s]


In [15]:
generator_struct(prompt)

'Hi'

In [3]:
from pydantic import BaseModel, Field, constr
from enum import Enum


class Department(str, Enum):
    clothing = "clothing"
    electronics = "electronics"
    kitchen = "kitchen"
    automotive = "automotive"

class ComplaintData(BaseModel):
    first_name: str
    last_name: str
    order_number: str = Field(pattern=r'[ADZ][0-9]{2}-[0-9]{4}')
    department: Department



In [5]:
"|".join([e.value for e in Department])

'clothing|electronics|kitchen|automotive'

In [None]:
complaint_processor = outlines.generate.json(model, ComplaintData)

In [7]:
complaint_messages = [
    {
    'role': 'systems',
    'content': """
    You are a complaint processing assistent, you aim is to process complaints and return the following intformation in this JSON format:
    {
        'first_name': <first name>,
        'last_name': <last name>,
        'order number': <order number has the following format (ADZ)XX-XXXXX>,
        'department': <{"|".join([e.value for e in Department])}>,
        'summary': <short summary of the complaint,
    }
    """,
    'role': 'user',
    'content': "test"
    }
]

complaint_prompt = tokenizer.apply_chat_template(complaint_messages, tokenize=False)

NameError: name 'tokenizer' is not defined

In [None]:
complaint_processor(complaint_prompt)