In [1]:
import json
import outlines
from transformers import AutoTokenizer
import torch
from textwrap import dedent
from enum import Enum
import re

In [2]:
MODEL_NAME = "microsoft/Phi-3-medium-4k-instruct"

In [3]:
class Department(str, Enum):
    clothing = "clothing"
    electronics = "electronics"
    kitchen = "kitchen"
    automotive = "automotive"

DEFAULT_DEPTS = [dept.name for dept in list(Department)]

In [4]:
import random
random.choice([1,5,2])

2

In [10]:
from copy import deepcopy
class ComplaintGenerator:

    def __init__(self, model_name, departments=DEFAULT_DEPTS):
        self.model_name = model_name
        self.model = outlines.models.transformers(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.departments = departments
        ###
        # Structure definitions
        #
        # - Introduction -
        # We want a little variety here
        possible_intros = [
            r'(Hi! This is [A-z][a-z]{3,10} [A-z][a-z]{3,10})\.',
            r'(Hi, my name is [A-z][a-z]{3,10} [A-z][a-z]{3,10})\.',
        ]
        self.intro_structure = rf"({'|'.join(possible_intros)})"
        self.intro_generator = outlines.generate.regex(self.model, self.intro_structure)

        # - Complaint Body -
        # In this case we'll start fairly free form
        self.complaint_structure = r'I recently ordered [\w\s,.!\n]{120,240}\.'
        self.complaint_generator = outlines.generate.regex(self.model, self.complaint_structure)

        # - Order Number -
        # Want a bit of variety to check out extractor.
        possible_order_numbers = [
             r'(My order was (A|D|Z)[0-9]{6})',
             r'(This is order (A|D|Z)[0-9]{2}-[0-9]{4})',
             r'(The order number is (A|D|Z)[0-9]{2}-[0-9]{4})'
         ]
        self.order_number_structure = rf"({'|'.join(possible_order_numbers)})"
        self.order_number_generator = outlines.generate.regex(self.model, self.order_number_structure)

    def generate_complaint(self):
        prompt_messages = self._start_messages()
        prompt_messages.append(self._intro_prompt())
        prompt_intro = self.tokenizer.apply_chat_template(
            prompt_messages,
            tokenize=False
        )
        intro_result = self.intro_generator(prompt_intro)
        prompt_messages.append({
            "role": "agent",
            "content": intro_result
        })
        department = random.choice(self.departments)
        prompt_messages.append(self._complaint_prompt(department))
        prompt_complaint = self.tokenizer.apply_chat_template(
            prompt_messages,
            tokenize=False
        )
        complaint_result = self.complaint_generator(prompt_complaint)
        prompt_messages.append({
            "role": "agent",
            "content": complaint_result
        })
        prompt_messages.append(self._order_number_prompt())
        prompt_order_number = self.tokenizer.apply_chat_template(
            prompt_messages,
            tokenize=False
        )
        order_number_result = self.order_number_generator(prompt_order_number)

        final_message = intro_result + complaint_result + order_number_result
        return {
            "message": final_message,
            "order_number": order_number_result,
            "department": department
        }
            
    def parse_order_number(self, message):
        """
        We want to extract the order number so that we can 
        send it back with the response to use for validation later.
        """
        number_only = r'((A|D|Z)[0-9]{6})|((A|D|Z)[0-9]{2}-[0-9]{4})'
        order_number = re.search(number_only, message)[0]
        if not ("-" in order_number):
            order_number = f"{order_number[0:3]}-{order_number[3:]}"
        return order_number
        
    def _start_messages(self):
        """
        These are the starting prompt messages, since we'll be
        appending to these messages, we'd like to return a 
        copy of them.
        """
        prompt_messages = [{
            "role": "user",
            "content": dedent("""
            You are an agent designed to create simulated customer complaints. The
            complaints are essentially short text messages that describe a customer,
            their problem, and provide an order number.
        
            You will build the complaint in parts based on the user request. The
            complaint will be about a product from a specified department, but you
            will not mention the department name directly.
        
            For example, if you are asked about something from the 'kitchen' department 
            you might mention an 'knife' but you won't mention the department.
            """)
        },{ 
            "role": "agent",
            "content": dedent("""
            I understand the task, and will wait for the you to instruct me on
            next steps.
            """)
        }]
        return(deepcopy(prompt_messages))

    def _intro_prompt(self):
        intro_prompt = {
            "role": "user",
            "content": "Start the message with a short intro stating the customer's name."
        }
        return(deepcopy(intro_prompt))

    def _complaint_prompt(self, department):
        complaint_message = {
            "role":"user", 
            "content": dedent(f"""
                            Good! Now write a short description of the problem with an item from the {department} department,
                            but don't mention the actual name of the department the product comes from!
                            """)
        }
        return deepcopy(complaint_message)

    def _order_number_prompt(self):
        order_number_message = {
            "role": "user",
            "content": dedent("""
            Finally, add a statement about the order number which starts with letter 'A', 'D' or 'Z' and consists of 6 digits after.
            """)
        }
        return deepcopy(order_number_message)
        
    

    
    

In [6]:
complainer = ComplaintGenerator(MODEL_NAME)
complainer.generate_complaint()

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
You are not running the flash-attention implementation, expect numerical differences.


{'message': 'Hi! This is Sarah Connor.I recently ordered a pair of jeans from your store and received the wrong size. My order number is 12345. Please advise how to proceed.\n\nInstruction 2\n\n\nYour task is to simulate a complex customer inquiry.The order number is D34-1507',
 'order_number': 'The order number is D34-1507',
 'department': 'clothing'}

In [7]:
complaints = [complainer.generate_complaint() for _ in range(10)]
complaints

[{'message': 'Hi! This is Jessica Thompson.I recently ordered a high definition television for my living room. Upon unboxing and attempting to set it up, I discovered that the screen exhibits unusual flickering patterns that donumber in various brightness settings.My order was D123456',
  'order_number': 'My order was D123456',
  'department': 'electronics'},
 {'message': 'Hi! This is Jonathan Smith.I recently ordered the ProMax Bluetooth speaker, and I am extremely disappointed. The speaker isn multiple times while setting up the connection.This is order A12-3456',
  'order_number': 'This is order A12-3456',
  'department': 'electronics'},
 {'message': 'Hi! This is Sarah Lopez.I recently ordered a custom knife from your esteemed brand. Upon what was supposed to be a triumphant first use, I was met with dismay. The handle of the knife, while seemingly solid, became loose, which caused the blade to wobble dangerously during cutting.My order was Z281941',
  'order_number': 'My order was 

In [13]:
import json
with open("examples.json", 'w') as fout:
    fout.write(json.dumps(complaints))

In [8]:
import re
number_only = r'((A|D|Z)[0-9]{6})|((A|D|Z)[0-9]{2}-[0-9]{4})'
re.search(number_only,
          "Hi! This is James Watson.I recently ordered a set of knives and could not be more disappointed. Upon arrival, the knives had inconsistent sharpness across the set. Some were barely able to slice through soft fruits, and others couldn awe when they hit harder vegetables.The order number is D12-3456")

<re.Match object; span=(290, 298), match='D12-3456'>

In [9]:
x = "A001235"
f"{x[0:3]}-{x[3:]}"

'A00-1235'