In [1]:
import json
import outlines
from transformers import AutoTokenizer
import torch
from textwrap import dedent
from enum import Enum
import re
import random

In [2]:
MODEL_NAME = "microsoft/Phi-3-medium-4k-instruct"

In [3]:
class Department(str, Enum):
    clothing = "clothing"
    electronics = "electronics"
    kitchen = "kitchen"
    automotive = "automotive"

DEFAULT_DEPTS = [dept.name for dept in list(Department)]
DEFAULT_DEPTS

['clothing', 'electronics', 'kitchen', 'automotive']

## Step 1 - Draft Structure

See the `intro_structure` property to get started

In [11]:
from copy import deepcopy
class ComplaintGenerator:

    def __init__(self, model_name, departments=DEFAULT_DEPTS):
        self.model_name = model_name
        self.departments = departments
        self._model = None
        self._tokenizer = None
        self._intro_generator = None
        self._complaint_generator = None
        self._order_number_generator = None
    ####################################
    # Structured Generation Section
    #
    @property
    def intro_structure(self):
        possible_intros = [
            r'(Hi! This is [A-Z][a-z]{3,10} [A-z][a-z]{3,10})\.',
            r'(Hi, my name is [A-Z][a-z]{3,10} [A-z][a-z]{3,10})\.',
        ]
        return rf"({'|'.join(possible_intros)})"        

    @property
    def complaint_structure(self):
        return r'I recently ordered [\w\s,.!\n]{120,240}\.'

    @property
    def order_number_structure(self):
        possible_order_numbers = [
             r'(My order was (A|D|Z)[0-9]{6})',
             r'(This is order (A|D|Z)[0-9]{2}-[0-9]{4})',
             r'(The order number is (A|D|Z)[0-9]{2}-[0-9]{4})'
         ]
        return rf"({'|'.join(possible_order_numbers)})"
    #
    #
    ####################################

    
    @property
    def intro_generator(self):
        if self._intro_generator is None:
            self._intro_generator = outlines.generate.regex(
                self.model, self.intro_structure
            )
        return self._intro_generator
        
    @property
    def complaint_generator(self):
        if self._complaint_generator is None:
            self._complaint_generator = outlines.generate.regex(self.model, self.complaint_structure)
        return self._complaint_generator

    @property
    def order_number_generator(self):
        if self._order_number_generator is None:
            self._order_number_generator = outlines.generate.regex(
                self.model, 
                self.order_number_structure)
        return self._order_number_generator
    
    @property
    def model(self):
        print("getting model")
        if self._model is None:
            print("loading model")
            self._model = outlines.models.transformers(
                self.model_name,
                model_kwargs={
                    #'torch_dtype': torch.bfloat16,
                    'trust_remote_code': True
                },)
        return self._model

    @property
    def tokenizer(self):
        if self._tokenizer is None:
            print("loading tokenizer")
            self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        return self._tokenizer
        
    def generate_complaint(self):
        prompt_messages = self._start_messages()
        prompt_messages.append(self._intro_prompt())
        prompt_intro = self.tokenizer.apply_chat_template(
            prompt_messages,
            tokenize=False
        )
        print("Generating intro")
        intro_result = self.intro_generator(prompt_intro)
        prompt_messages.append({
            "role": "agent",
            "content": intro_result
        })
        print("Generating Complaint")
        department = random.choice(self.departments)
        prompt_messages.append(self._complaint_prompt(department))
        prompt_complaint = self.tokenizer.apply_chat_template(
            prompt_messages,
            tokenize=False
        )
        complaint_result = self.complaint_generator(prompt_complaint)
        prompt_messages.append({
            "role": "agent",
            "content": complaint_result
        })
        prompt_messages.append(self._order_number_prompt())
        prompt_order_number = self.tokenizer.apply_chat_template(
            prompt_messages,
            tokenize=False
        )
        print("Generating order number")
        order_number_result = self.order_number_generator(prompt_order_number)

        final_message = intro_result + complaint_result + order_number_result
        return {
            "message": final_message,
            "order_number": self.parse_order_number(order_number_result),
            "department": department
        }
    
    def parse_order_number(self, message):
        """
        We want to extract the order number so that we can 
        send it back with the response to use for validation later.
        """
        number_only = r'((A|D|Z)[0-9]{6})|((A|D|Z)[0-9]{2}-[0-9]{4})'
        order_number = re.search(number_only, message)[0]
        if not ("-" in order_number):
            order_number = f"{order_number[0:3]}-{order_number[3:]}"
        return order_number
        
    def _start_messages(self):
        """
        These are the starting prompt messages, since we'll be
        appending to these messages, we'd like to return a 
        copy of them.
        """
        prompt_messages = [{
            "role": "user",
            "content": dedent("""
            You are an agent designed to create simulated customer complaints. The
            complaints are essentially short text messages that describe a customer,
            their problem, and provide an order number.
        
            You will build the complaint in parts based on the user request. The
            complaint will be about a product from a specified department, but you
            will not mention the department name directly.
        
            For example, if you are asked about something from the 'kitchen' department 
            you might mention an 'knife' but you won't mention the department.
            """)
        },{ 
            "role": "agent",
            "content": dedent("""
            I understand the task, and will wait for the you to instruct me on
            next steps.
            """)
        }]
        return(deepcopy(prompt_messages))

    def _intro_prompt(self):
        intro_prompt = {
            "role": "user",
            "content": "Start the message with a short intro stating the customer's name."
        }
        return(deepcopy(intro_prompt))

    def _complaint_prompt(self, department):
        complaint_message = {
            "role":"user", 
            "content": dedent(f"""
                            Good! Now write a short description of the problem with an item from the {department} department,
                            but don't mention the actual name of the department the product comes from!
                            """)
        }
        return deepcopy(complaint_message)

    def _order_number_prompt(self):
        order_number_message = {
            "role": "user",
            "content": dedent("""
            Finally, add a statement about the order number which starts with letter 'A', 'D' or 'Z' and consists of 6 digits after.
            """)
        }
        return deepcopy(order_number_message)
        
    

    
    

In [6]:
complainer = ComplaintGenerator(MODEL_NAME)
# complainer.generate_complaint()

## Step 2 - Verify Structure 

We can now test that this structure indeed matches the real data we have. To start we're only going to test the `intro_structure` property. We always want to start with real data:

In [4]:
with open("../examples.json",'r') as fin:
    complaint_data = json.loads(fin.read())

The we make sure that our structure indeed does match all of the messages in our data set:

In [23]:
all([re.search(complainer.intro_structure, complaint['message'])
     for complaint in complaint_data])

True

In [None]:
example_generation = complainer.generate()

## Step 3 - Generate Structure

Rather than run the model right now, we'll use an example generated earlier

In [18]:
# Normally we would do the following...
# example_generation = complainer.generate()
example_generation = {
 'message': 'Hi, my name is Emily andbuyerser.I recently ordered a laptop with an extended warranty, but upon arrival, I noticed a malfunctioning trackpad. Despite numerous attempts at troubleshooting, the issue persists, greatly hindering my everyday use.This is order A12-3456',
 'order_number': 'A12-3456',
 'department': 'electronics'
}

## Step 4 - Inspect Output

Uh oh! Look at the name output! `Emily andbuyerser` is not a name that I would expect and doesn't match the expected output!

Now it's *your turn* to fix it!

When you've found the bug you can continue on to the next sections:

- Finish the `complaint_structure`, repeating this process
- Finish the `order_number_structure`, repeating this process
- If you have time, generate some new complaints!

In [7]:
complaints = [complainer.generate_complaint() for _ in range(50)]
complaints

loading tokenizer


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Generating intro
getting model
loading model


`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attenton` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You are not running the flash-attention implementation, expect numerical differences.


Generating Complaint
getting model
Generating order number
getting model
Generating intro
Generating Complaint
Generating order number
Generating intro
Generating Complaint
Generating order number
Generating intro
Generating Complaint
Generating order number
Generating intro
Generating Complaint
Generating order number
Generating intro
Generating Complaint
Generating order number
Generating intro
Generating Complaint
Generating order number
Generating intro
Generating Complaint
Generating order number
Generating intro
Generating Complaint
Generating order number
Generating intro
Generating Complaint
Generating order number
Generating intro
Generating Complaint
Generating order number
Generating intro
Generating Complaint
Generating order number
Generating intro
Generating Complaint
Generating order number
Generating intro
Generating Complaint
Generating order number
Generating intro
Generating Complaint
Generating order number
Generating intro
Generating Complaint
Generating order numb

[{'message': 'Hi, my name is Olivia Brown.I recently ordered a knife set from your wellness range, and it arrived earlier this week. Unfortunately, my satisfaction with the product has been less than ideal.My order was A123456',
  'order_number': 'A12-3456',
  'department': 'kitchen'},
 {'message': 'Hi, my name is John Smith.I recently ordered a dress for an upcoming event, which was alleged to meet my expectations both in fit and style. However, upon arrival, it became apparent that the fabric was of subpar quality, leading to a less than satisfactory appearance.The order number is A12-3456',
  'order_number': 'A12-3456',
  'department': 'clothing'},
 {'message': 'Hi, my name is Sarah Johnson.I recently ordered the ultimate ChefMaster 8 Drawer Cooktop. However, upon delivery, I discovered that one of the burners is malfunctioning.My order was A458739',
  'order_number': 'A45-8739',
  'department': 'kitchen'},
 {'message': 'Hi, my name is Jane Doeandcommn.I recently ordered a stylish b

In [8]:
import json
with open("examples.json", 'w') as fout:
    fout.write(json.dumps(complaints))

In [21]:
complaint_data[0]

{'message': 'Hi, my name is Olivia Brown.I recently ordered a knife set from your wellness range, and it arrived earlier this week. Unfortunately, my satisfaction with the product has been less than ideal.My order was A123456',
 'order_number': 'A12-3456',
 'department': 'kitchen'}