In [28]:
import json
import nbformat
import os

def get_cell_text(cell):
    """Safely join multiline or string cell contents."""
    return ''.join(cell['source']) if isinstance(cell['source'], list) else cell['source']

def is_user_cell(cell):
    return cell['cell_type'] == 'markdown' and '**[user]**' in get_cell_text(cell)

def is_assistant_cell(cell):
    return cell['cell_type'] == 'markdown' and '**[assistant]**' in get_cell_text(cell)

def is_metadata_cell(cell):
    return cell['cell_type'] == 'markdown' and '**[turn_metadata]**' in get_cell_text(cell)

def extract_json_from_metadata_cell(source_text):
    try:
        start = source_text.find("```")
        end = source_text.rfind("```")
        if start == -1 or end == -1 or start == end:
            return {}
        json_str = source_text[start+3:end].strip()
        return json.loads(json_str)
    except json.JSONDecodeError as e:
        print("⚠️ JSON decode error:", e)
        return {}

def process_notebook(file_path, dialogue_id=None):
    with open(file_path, "r", encoding="utf-8") as f:
        nb = nbformat.read(f, as_version=4)

    turns = []
    cells = nb['cells']
    i = 0

    while i < len(cells) - 2:
        user_cell = cells[i]
        print(user_cell)
        metadata_cell = cells[i + 1]    
        print(metadata_cell)
        assistant_cell = cells[i + 2]
        print(assistant_cell)

        print(is_user_cell(user_cell))
        print(is_assistant_cell(assistant_cell))
        print(is_metadata_cell(metadata_cell))

        if is_user_cell(user_cell) and is_assistant_cell(assistant_cell) and is_metadata_cell(metadata_cell):
            prompt = get_cell_text(user_cell).replace('**[user]**', '').strip()
            response = get_cell_text(assistant_cell).replace('**[assistant]**', '').strip()
            metadata_text = get_cell_text(metadata_cell)
            instruction_data = extract_json_from_metadata_cell(metadata_text)
            print(instruction_data)
            print(prompt)
            print(response)
            print(metadata_text)

            turns.append({
                "prompt": prompt,
                "response": response,
                "instructions": [
                    {
                        "instruction_id_list": instruction_data.get("instruction_id_list", []),
                        "kwargs": instruction_data.get("kwargs", [])
                    }
                ]
            })

            i += 3  # Advance to next triplet
        else:
            i += 1  # Skip malformed or extra cells

    return {
        "turns": turns,
        "dialogue_metadata": {
            "id": dialogue_id or os.path.basename(file_path),
            "length": len(turns)
        }
    }


In [29]:
markdown_sample = process_notebook('./samples/sample-1.ipynb')

{'cell_type': 'markdown', 'source': '**[turn_metadata]**\n\n```\n{\n  "instruction_id_list": [\n    "startend:start_checker",\n    "startend:end_checker",\n    "detectable_format:multiple_sections",\n    "change_case:first_letter_cap_target",\n    "keywords:frequency",\n    "punctuation:no_comma",\n    "length_constraints:number_characters",\n    "startend:quotation"\n  ],\n  "kwargs": [\n    {\n      "start_phrase": "Report Start:"\n    },\n    {\n      "end_phrase": "Report End."\n    },\n    {\n      "section_splitter": "###",\n      "relation": "at least",\n      "num_sections": 4\n    },\n    {\n      "target_string": "error"\n    },\n    {\n      "keyword": "system",\n      "relation": "at least",\n      "frequency": 3\n    },\n    {\n      "relation": "less than",\n      "num_chars": 1000\n    }\n  ]\n}\n```', 'metadata': {'id': '83XNTJZMkvy0'}}
True
True
True
{'instruction_id_list': ['startend:start_checker', 'startend:end_checker', 'detectable_format:multiple_sections', 'chang

In [30]:
markdown_sample

   'instructions': [{'instruction_id_list': ['startend:start_checker',
      'startend:end_checker',
      'detectable_format:multiple_sections',
      'change_case:first_letter_cap_target',
      'keywords:frequency',
      'punctuation:no_comma',
      'length_constraints:number_characters',
      'startend:quotation'],
     'kwargs': [{'start_phrase': 'Report Start:'},
      {'end_phrase': 'Report End.'},
      {'section_splitter': '###', 'relation': 'at least', 'num_sections': 4},
      {'target_string': 'error'},
      {'keyword': 'system', 'relation': 'at least', 'frequency': 3},
      {'relation': 'less than', 'num_chars': 1000}]}]},
   'instructions': [{'instruction_id_list': ['startend:start_checker',
      'startend:end_checker',
      'detectable_format:numbered_list',
      'change_case:first_letter_cap_target',
      'keywords:frequency',
      'punctuation:no_comma',
      'length_constraints:number_characters',
      'startend:quotation'],
     'kwargs': [{'start_phrase'

In [14]:
import nbformat

with open("samples/sample-1.ipynb", "r", encoding="utf-8") as f:
    nb = nbformat.read(f, as_version=4)

for i, cell in enumerate(nb['cells']):
    if cell['cell_type'] == 'markdown':
        print(f"\n--- Cell {i} ---")
        print("".join(cell['source'])[:300])  # Print first 300 characters



--- Cell 0 ---
**[user]**


Generate a comprehensive system diagnostic report from the following raw log entries.  
- Begin with Report Start: and end with Report End.  
- Capitalize the first letter of the wor

--- Cell 1 ---
**[turn_metadata]**

```
{
  "instruction_id_list": [
    "startend:start_checker",
    "startend:end_checker",
    "detectable_format:multiple_sections",
    "change_case:first_letter_cap_target",
    "keywords:frequency",
    "punctuation:no_comma",
    "length_constraints:number_characters",
    

--- Cell 2 ---
**[assistant]**

"Report Start:

### Overview

The system began its boot sequence at 08:15:23 on 2025-05-16. Initial operations showed standard activity. A minor issue appeared early but recovery was rapid.

### Metrics

The system booted and rebooted successfully
Latency detected in module A
Two Er

--- Cell 3 ---
**[user]**


--- Cell 4 ---
**[turn_metadata]**

```
{
  "instruction_id_list": [
    "startend:start_checker",
    "startend:end_checker"