In [42]:
import pandas as pd
import nbformat
import random
from openai import OpenAI
from typing import List
from dotenv import load_dotenv
import os
import json

# Load variables from a local .env file into the process environment
# before anything below reads them.
load_dotenv()

# Shared OpenAI client; the key is read from the environment, never hard-coded.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Taxonomy table; later cells read the "L1 Taxonomy", "L1 Taxonomy Description",
# "L2 Taxonomy" and "L2 Taxonomy Description" columns from its rows.
taxonomies = pd.read_csv("taxonomies.csv")

# Candidate conversation lengths (in turns) and the matching sampling weights
# passed to random.choices when a notebook is created.
conversation_lengths = [2, 3]
conversation_weights = [0.5, 0.5]

# Every instruction ID the model may pick from, written as "<group>:<name>".
# Grouped by prefix for readability; the concatenation order is the list order.
AVAILABLE_INSTRUCTIONS = (
    # change_case:*
    [
        "change_case:all_caps",
        "change_case:lowercase",
        "change_case:alternating",
        "change_case:first_letter_cap",
        "change_case:capital_word_frequency",
        "change_case:lowercase_word_frequency",
        "change_case:all_caps_target",
        "change_case:lowercase_target",
        "change_case:alternating_target",
        "change_case:first_letter_cap_target",
    ]
    # detectable_content:*
    + [
        "detectable_content:number_placeholders",
        "detectable_content:postscript",
    ]
    # detectable_format:*
    + [
        "detectable_format:json_format",
        "detectable_format:multiple_sections",
        "detectable_format:numbered_list",
        "detectable_format:number_bullet_lists",
        "detectable_format:title",
    ]
    # keywords:*
    + [
        "keywords:existence",
        "keywords:frequency",
        "keywords:forbidden_words",
        "keywords:letter_frequency",
    ]
    # punctuation:*
    + ["punctuation:no_comma"]
    # length_constraints:* and length:*
    + [
        "length_constraints:number_characters",
        "length_constraints:number_words",
        "length:max_word_count",
    ]
    # startend:*
    + [
        "startend:start_checker",
        "startend:end_checker",
        "startend:wrap_checker",
        "startend:quotation",
    ]
)


In [43]:
def generate_scenario_with_gpt(l1_taxonomy: str, l1_description: str, l2_taxonomy: str, l2_description: str) -> str:
    """Ask the chat model for a one-paragraph scenario for a taxonomy pair.

    The four arguments are interpolated verbatim into the user prompt, which
    asks the model to answer after a ``**Scenario:** -`` tag.

    Args:
        l1_taxonomy: Name of the L1 taxonomy.
        l1_description: Description of the L1 taxonomy.
        l2_taxonomy: Name of the L2 taxonomy.
        l2_description: Description of the L2 taxonomy.

    Returns:
        The model's reply with surrounding whitespace stripped. The reply is
        returned as-is; it is not checked for the ``**Scenario:**`` tag.

    Raises:
        ValueError: If the model returns no text (null or blank content).
    """
    prompt = f"""Generate a realistic scenario for a complex instruction following task with the following taxonomy:

L1 Taxonomy: {l1_taxonomy}
L1 Description: {l1_description}
L2 Taxonomy: {l2_taxonomy}
L2 Description: {l2_description}

The scenario should be specific, realistic, and provide clear context for instruction following. Keep it concise but detailed enough to understand the context.

Give one single paragraph after the tag **Scenario:** - and make it clear and concise and mention directly what the user would ask the LLM for.

**Scenario:** - 
"""

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that generates realistic scenarios for tasks that a user would have in a conversation with an AI assistant that involve the provided L1 and L2 taxonomies."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=200,
        temperature=0.7
    )

    # The API may return null content (e.g. a refusal), so guard before
    # stripping instead of failing with an opaque AttributeError on None.
    content = response.choices[0].message.content
    scenario = (content or "").strip()
    if not scenario:
        raise ValueError(
            f"Scenario generation returned no text for taxonomy {l1_taxonomy!r} / {l2_taxonomy!r}."
        )
    return scenario

In [44]:
def _parse_instruction_ids(raw_reply: str) -> List[str]:
    """Extract known, de-duplicated instruction IDs from a model reply.

    Tolerates text around the JSON array (code fences, a wrapping object) by
    parsing the span from the first ``[`` to the last ``]``. Returns an empty
    list when no array can be parsed or none of its items is a known ID.
    """
    start, end = raw_reply.find("["), raw_reply.rfind("]")
    if start == -1 or end < start:
        return []
    try:
        parsed = json.loads(raw_reply[start:end + 1])
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return []
    if not isinstance(parsed, list):
        return []

    known_ids = set(AVAILABLE_INSTRUCTIONS)
    unique_ids: List[str] = []
    for item in parsed:
        # Drop non-strings, IDs the model invented, and repeats (order preserved).
        if isinstance(item, str) and item in known_ids and item not in unique_ids:
            unique_ids.append(item)
    return unique_ids


def select_relevant_instructions(l1_taxonomy: str, l2_taxonomy: str, scenario: str, num_instructions: int = 6) -> List[str]:
    """Ask the chat model to pick the instruction IDs most relevant to a scenario.

    Args:
        l1_taxonomy: Name of the L1 taxonomy, interpolated into the prompt.
        l2_taxonomy: Name of the L2 taxonomy, interpolated into the prompt.
        scenario: Scenario text, interpolated into the prompt.
        num_instructions: How many IDs to request and the maximum number
            returned. Defaults to 6.

    Returns:
        Up to ``num_instructions`` distinct IDs, all drawn from
        ``AVAILABLE_INSTRUCTIONS``. If the reply cannot be parsed, or contains
        no known ID, a random sample of ``AVAILABLE_INSTRUCTIONS`` is returned
        instead.
    """
    prompt = f"""Given the following taxonomy and scenario, select {num_instructions} most relevant instruction IDs from the available list:

L1 Taxonomy: {l1_taxonomy}
L2 Taxonomy: {l2_taxonomy}
Scenario: {scenario}

Available instructions: {json.dumps(AVAILABLE_INSTRUCTIONS)}

Return only the instruction IDs as a JSON array, no explanation needed.

**Instructions:** - 
"""

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that selects relevant instructions for scenarios. Wrap in double quotes and in a json array."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=150,
        temperature=0.3
    )

    # The API may return null content; treat that like an unparseable reply.
    raw_reply = response.choices[0].message.content or ""
    selected_instructions = _parse_instruction_ids(raw_reply)
    if selected_instructions:
        return selected_instructions[:num_instructions]

    # Fallback to random selection when nothing usable came back.
    fallback_size = min(num_instructions, len(AVAILABLE_INSTRUCTIONS))
    return random.sample(AVAILABLE_INSTRUCTIONS, fallback_size)


In [47]:
# Skeleton shown in every "[turn_metadata]" cell.
_TURN_METADATA_MD = """**[turn_metadata]**

```
{
  "metadata": [
    "add"
  ],
  "instructions": [
    {
      "instruction_id": "",
      "kwarg1_name": "kwarg1_value",
      "kwarg2_name": "kwarg2_value"
    }
  ]
}
```
"""

# The first turn's metadata cell has always ended with two extra lines of
# indentation whitespace; kept so regenerated notebooks do not change.
_FIRST_TURN_METADATA_MD = _TURN_METADATA_MD + " " * 38 + "\n" + " " * 38


def create_notebook(row, index):
    """Write a template conversation notebook for one taxonomy row.

    Picks a conversation length at random, generates a scenario for the row's
    taxonomy, asks for matching instruction IDs, and writes a notebook made of
    a metadata cell followed by two user / turn_metadata / assistant turn
    skeletons.

    Args:
        row: Mapping-like row providing the keys 'L1 Taxonomy',
            'L1 Taxonomy Description', 'L2 Taxonomy' and
            'L2 Taxonomy Description'.
        index: Value appended to the file name to tell notebooks apart.

    Returns:
        The relative path of the written ``.ipynb`` file.
    """
    convo_length = random.choices(conversation_lengths, conversation_weights)[0]

    # Generate the scenario exactly once and reuse it: generation is sampled
    # (temperature 0.7), so a second call would pick instructions for a
    # different scenario than the one shown in the notebook, at twice the cost.
    scenario = generate_scenario_with_gpt(row['L1 Taxonomy'], row['L1 Taxonomy Description'], row['L2 Taxonomy'], row['L2 Taxonomy Description'])
    instructions = select_relevant_instructions(row['L1 Taxonomy'], row['L2 Taxonomy'], scenario)

    metadata_md = f"""# Metadata

**Domain:** - Complex Instruction Following

**L1 Taxonomy:** - {row['L1 Taxonomy']}

**L1 Taxonomy Description:** - {row['L1 Taxonomy Description']}

**L2 Taxonomy:** - {row['L2 Taxonomy']}

**L2 Taxonomy Description:** - {row['L2 Taxonomy Description']}

**Conversation Length:** - {convo_length} Turn Tasks

{scenario}

**Instruction:** - 
```
{instructions}
```

"""

    # NOTE(review): the skeleton always contains two turns, even when
    # convo_length is 3 - confirm whether extra turns are meant to be added by hand.
    cells = [
        nbformat.v4.new_markdown_cell(metadata_md),
        nbformat.v4.new_markdown_cell("**[user]**\n\n// Please begin your conversation from here (Delete this comment post reading)"),
        nbformat.v4.new_markdown_cell(_FIRST_TURN_METADATA_MD),
        nbformat.v4.new_markdown_cell("**[assistant]**"),
        nbformat.v4.new_markdown_cell("**[user]**"),
        nbformat.v4.new_markdown_cell(_TURN_METADATA_MD),
        nbformat.v4.new_markdown_cell("**[assistant]**"),
    ]

    nb = nbformat.v4.new_notebook(cells=cells)

    # Create the output folder on first use so a fresh checkout does not fail.
    output_dir = "notebooks-output"
    os.makedirs(output_dir, exist_ok=True)

    # Computed outside the f-string: reusing the enclosing quote character
    # inside an f-string is a syntax error before Python 3.12.
    turn_label = "multi-turns" if convo_length > 1 else "single-turn"
    filename = f"{output_dir}/{turn_label}-,,,{row['L1 Taxonomy']},misc-{index}.ipynb"
    with open(filename, "w", encoding="utf-8") as f:
        nbformat.write(nb, f)
    return filename

In [53]:
# DataFrame.sample() with no arguments draws a single random row, so this
# builds exactly one notebook per run.
# NOTE(review): the saved cell output lists several files, which a one-row
# sample cannot produce - it looks stale; confirm the intended sample size.
sampled_rows = taxonomies.sample(n=1)
notebook_files = [
    create_notebook(row, idx)
    for idx, row in sampled_rows.iterrows()
]

notebook_files[:5]

['notebooks-output/multi-turns-,,,Rewriting,misc-43.ipynb',
 'notebooks-output/multi-turns-,,,Rewriting,misc-42.ipynb',
 'notebooks-output/multi-turns-,,,Open-Domain Question Answering,misc-46.ipynb',
 'notebooks-output/multi-turns-,,,Open-Domain Question Answering,misc-48.ipynb',
 'notebooks-output/multi-turns-,,,Summarization,misc-37.ipynb']