# Create COT, Detection, and Rewriting Tuning Datasets

In [11]:
import json

# {'instruction': 'Can you recount a detailed memory of the first time you saw a sonogram of a baby, describing what you observed and felt during the experience?',
#  'source': 'gpt4o',
#  'type': 'Literary Fiction',
#  'postedit': 'The soft hum of machinery filled the room with an unfamiliar yet gentle melody. I sat beside Lila, squeezing her hand as she lay on the table while the technician swirled the wand over her rounded belly. The screen beside us flickered to life, a grainy black and white. Slowly, an image began to form; the unmistakable curve of a tiny head, the flutter of something I later learned was a heartbeat. I leaned forward, almost holding my breath, mesmerized by the play of light and shadows that composed this new life. Lila\'s grip tightened and upon glancing at her, I found her eyes glossy, along with a smile tugging at the corners of her mouth. I turned back to the screen, emotions swirling—awe, fear, hope. In that moment, the abstract idea of "our baby" transformed into something real, tangible. The room\'s dimness seemed to fold around us, making space for this silent, wordless connection. There, in the ghostly shapes on the monitor, I saw not just a child, but a future.',
#  'id': 'W18_batch2',
#  'creativity_scores': ['5', '7'],
#  'fine_grained_edits': [{'categorization': 'Awkward Word Choice and Phrasing',
#    'originalText': 'The room was dimly lit, with the soft hum of machinery filling the silence',
#    'editedText': 'The soft hum of machinery filled the room with an unfamiliar yet gentle melody.'},
#   {'categorization': 'Lack of Specificity and Detail',
#    'originalText': 'I sat beside Lila, squeezing her hand, as the technician swirled the wand over her belly',
#    'editedText': 'I sat beside Lila, squeezing her hand as she lay on the table while the technician swirled the wand over her rounded belly.'},
#   {'categorization': 'Unnecessary Exposition',
#    'originalText': "The screen flickered to life, a grainy black and white, like an ancient TV trying to find it's signal",
#    'editedText': 'The screen beside us flickered to life, a grainy black and white.'},
#   {'categorization': 'Poor Sentence Structure',
#    'originalText': "Lila's grip tightened, and I glanced at her, finding her eyes glossy, a smile tugging at the corners of her mouth",
#    'editedText': "Lila's grip tightened and upon glancing at her, I found her eyes glossy, along with a smile tugging at the corners of her mouth."},
#   {'categorization': 'Unnecessary Exposition',
#    'originalText': '—a future that felt both incredibly close and impossibly distant',
#    'editedText': ''}],
#  'data-split': 'train',
#  'preedit': 'The room was dimly lit, with the soft hum of machinery filling the silence. I sat beside Lila, squeezing her hand, as the technician swirled the wand over her belly. The screen flickered to life, a grainy black and white, like an ancient TV trying to find it\'s signal. Slowly, an image began to form; the unmistakable curve of a tiny head, the flutter of something I later learned was a heartbeat. I leaned forward, almost holding my breath, mesmerized by the play of light and shadows that composed this new life. Lila\'s grip tightened, and I glanced at her, finding her eyes glossy, a smile tugging at the corners of her mouth. I turned back to the screen, emotions swirling—awe, fear, hope. In that moment, the abstract idea of "our baby" transformed into something real, tangible. The room\'s dimness seemed to fold around us, making space for this silent, wordless connection. There, in the ghostly shapes on the monitor, I saw not just a child, but a future—a future that felt both incredibly close and impossibly distant.',
#  'editor': 'W18',
#  'creativity_pre_score': 5,
#  'creativity_post_score': 7,
#  'creativity_z_score_pre': 3.6285013258392107,
#  'creativity_z_score_post': 5.328722422859342,
#  'creativity_z_score_pre_int': 4,
#  'creativity_z_score_post_int': 5,
#  'creativity_z_score_diff': 1,
#  'editor_split': 'test'}

with open("prompts/lamp_cot_input.txt", "r") as f:
    lamp_cot_input_prompt = f.read()

with open("prompts/lamp_cot_output.txt", "r") as f:
    lamp_cot_output_prompt = f.read()

with open("prompts/lamp_detection_input.txt", "r") as f:
    lamp_detection_input_prompt = f.read()

with open("prompts/lamp_rewriting_input.txt", "r") as f:
    lamp_rewriting_input_prompt = f.read()

def prepare_cot_sample(sample):
    input_text = lamp_cot_input_prompt.replace("[[INPUT_PARAGRAPH]]", sample["preedit"])

    problematic_spans_STR = """"""
    for i, edit in enumerate(sample["fine_grained_edits"]):
        problematic_spans_STR += f"Span {i+1}: `{edit['originalText']}` (Category: `{edit['categorization']}`)\n"

    proposed_rewriting_STR = """"""
    for i, edit in enumerate(sample["fine_grained_edits"]):
        proposed_rewriting_STR += f"Span {i+1}: `{edit['originalText']}` -> `{edit['editedText']}`\n"

    output_text = lamp_cot_output_prompt.replace("[[PROBLEMATIC_SPANS]]", problematic_spans_STR).replace("[[PROPOSED_REWRITING]]", proposed_rewriting_STR).replace("[[FINAL_PARAGRAPH]]", sample["postedit"])
    return {"id": sample["id"], "input_text": input_text, "output_text": output_text}

def prepare_detection_sample(sample):
    input_text = lamp_detection_input_prompt.replace("[[INPUT_PARAGRAPH]]", sample["preedit"])
    output_text = json.dumps({"edits": [{"span": edit["originalText"], "category": edit["categorization"]} for edit in sample["fine_grained_edits"]]})
    return {"id": sample["id"], "input_text": input_text, "output_text": output_text}

def prepare_rewriting_samples(sample):
    samples = []
    for edit_idx, edit in enumerate(sample["fine_grained_edits"]):
        input_text = lamp_rewriting_input_prompt.replace("[[INPUT_PARAGRAPH]]", sample["preedit"]).replace("[[PROBLEMATIC_SPAN]]", edit["originalText"]).replace("[[PROBLEMATIC_CATEGORY]]", edit["categorization"])
        output_text = json.dumps({"rewrite": edit["editedText"]})
        samples.append({"id": f"{sample['id']}_edit_{edit_idx}", "input_text": input_text, "output_text": output_text})
    return samples

with open("data/LAMP-train-val-test.json", "r") as f:
    lamp_data = json.load(f)

lamp_train = [d for d in lamp_data if d["data-split"] == "train"]

with open("data/lamp_train_cot_format.json", "w") as f:
    json.dump([prepare_cot_sample(sample) for sample in lamp_train], f, indent=4)

with open("data/lamp_train_detection_format.json", "w") as f:
    json.dump([prepare_detection_sample(sample) for sample in lamp_train], f, indent=4)

lamp_rewriting_samples = []
for sample in lamp_train:
    lamp_rewriting_samples += prepare_rewriting_samples(sample)

with open("data/lamp_train_rewriting_format.json", "w") as f:
    json.dump(lamp_rewriting_samples, f, indent=4)

print("Created the three datasets.")
print(f"COT has {len(lamp_train)} samples.")
print(f"Detection has {len(lamp_train)} samples.")
print(f"Rewriting has {len(lamp_rewriting_samples)} samples.")

Created the three datasets.
COT has 1000 samples.
Detection has 1000 samples.
Rewriting has 7234 samples.


In [12]:
# load cot dataset
with open("data/lamp_train_cot_format.json", "r") as f:
    lamp_train_cot = json.load(f)

print(lamp_train_cot[0]["output_text"])


Part 1: Idenfying Problematic Spans

Span 1: `The room was dimly lit, with the soft hum of machinery filling the silence` (Category: `Awkward Word Choice and Phrasing`)
Span 2: `I sat beside Lila, squeezing her hand, as the technician swirled the wand over her belly` (Category: `Lack of Specificity and Detail`)
Span 3: `The screen flickered to life, a grainy black and white, like an ancient TV trying to find it's signal` (Category: `Unnecessary Exposition`)
Span 4: `Lila's grip tightened, and I glanced at her, finding her eyes glossy, a smile tugging at the corners of her mouth` (Category: `Poor Sentence Structure`)
Span 5: `—a future that felt both incredibly close and impossibly distant` (Category: `Unnecessary Exposition`)


Part 2: Proposing Rewriting for Problematic Spans

Span 1: `The room was dimly lit, with the soft hum of machinery filling the silence` -> `The soft hum of machinery filled the room with an unfamiliar yet gentle melody.`
Span 2: `I sat beside Lila, squeezing her

# Prepare Evaluation Benchmark

In [15]:
import json

# lamp_fn = "data/LAMP-train-val-test.json"

# with open(lamp_fn, "r") as f:
#     lamp_data = json.load(f)
# lamp_test = [d for d in lamp_data if d["data-split"] == "test"]
# instruction2lamp_sample = {sample['instruction']: sample for sample in lamp_test}

gold_prefs = "data/gold_preference_600.json"
with open(gold_prefs, "r") as f:
    gold_prefs = json.load(f)

eval_samples = []
for sample_id, gold_pref in enumerate(gold_prefs):
    sample_id = f"gold_{sample_id}"
    candidates = [
        {"id": f"{sample_id}_ai_draft", "system": "ai_draft", "text": gold_pref["AI-generated"]},
        {"id": f"{sample_id}_human_edited", "system": "human_edited", "text": gold_pref["Human-edited"]},
        {"id": f"{sample_id}_fs_edited", "system": "fs_edited", "text": gold_pref["AI-edited"]},
    ]
    sample = {"id": sample_id, "instruction": gold_pref["instruction"], "candidates": candidates}
    eval_samples.append(sample)


with open("data/lamp_editing_benchmark.json", "w") as f:
    json.dump(eval_samples, f, indent=4)


# Generate Additional Candidates

In [4]:
from utils import load_env_vars
from utils_diff import make_colored_text
import tqdm

load_env_vars()

from llms import generate, generate_json

## COT Model

In [5]:
cot_model_full_name, cot_model_short_name = "ft:gpt-4o-2024-08-06:salesforce-research:lamp-4o-cot:Aqlv1wPq", "lamp-4o-cot"

with open("data/lamp_editing_benchmark.json", "r") as f:
    eval_samples = json.load(f)

with open("prompts/lamp_cot_input.txt", "r") as f:
    lamp_cot_input_prompt = f.read()

candidate_key = f"edited_{cot_model_short_name}"

todos = [sample for sample in eval_samples if not any(candidate["system"] == candidate_key for candidate in sample["candidates"])]
for sample in tqdm.tqdm_notebook(todos):
    system2candidate = {candidate["system"]: candidate for candidate in sample["candidates"]}
    ai_draft = system2candidate["ai_draft"]["text"]

    sample_candidate_key = f"{sample['id']}_{candidate_key}"
    cot_response = generate([{"role": "user", "content": lamp_cot_input_prompt}], model=cot_model_full_name, variables={"INPUT_PARAGRAPH": ai_draft}, max_tokens=2000, step="lamp-editing-cot")
    # find the line that starts with "Part 3"
    part_3_idx = cot_response.find("Part 3")
    final_rewrite = cot_response[part_3_idx:]

    final_rewrite = "\n".join(final_rewrite.split("\n")[1:]).strip()
    # print(make_colored_text(ai_draft, final_rewrite))
    sample["candidates"].append({"id": sample_candidate_key, "system": candidate_key, "text": final_rewrite})

    with open("data/lamp_editing_benchmark.json", "w") as f:
        json.dump(eval_samples, f, indent=4)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for sample in tqdm.tqdm_notebook(todos):


  0%|          | 0/132 [00:00<?, ?it/s]

# Run Detect and Rewriting Pipeline

In [7]:
detection_model, rewriting_model, system_name = "ft:gpt-4o-2024-08-06:salesforce-research:lamp-4o-detection:Aqna3sJ6", "ft:gpt-4o-2024-08-06:salesforce-research:lamp-4o-rewriting:AqpAqwih", "lamp-4o-dnr"

with open("prompts/lamp_detection_input.txt", "r") as f:
    lamp_detection_input_prompt = f.read()

with open("prompts/lamp_rewriting_input.txt", "r") as f:
    lamp_rewriting_input_prompt = f.read()

def run_dnr_pipeline(ai_draft, detection_model, rewriting_model):
    # 1. Detect
    detection_response = generate_json([{"role": "user", "content": lamp_detection_input_prompt}], model=detection_model, variables={"INPUT_PARAGRAPH": ai_draft}, step="lamp-editing-dnr-detection")

    edits = detection_response["edits"]
    filtered_edits = [edit for edit in edits if edit["span"] in ai_draft and ai_draft.count(edit["span"]) == 1]

    # 2. Rewrite
    for edit in filtered_edits:
        output_rewrite = generate_json([{"role": "user", "content": lamp_rewriting_input_prompt}], model=rewriting_model, variables={"INPUT_PARAGRAPH": ai_draft, "PROBLEMATIC_SPAN": edit["span"], "PROBLEMATIC_CATEGORY": edit["category"]}, step="lamp-editing-dnr")
        edit["rewrite"] = output_rewrite["rewrite"]

    # 3. Apply te edits
    edited_text = ai_draft
    for edit in filtered_edits:
        edited_text = edited_text.replace(edit["span"], edit["rewrite"])

    return edited_text, filtered_edits


with open("data/lamp_editing_benchmark.json", "r") as f:
    eval_samples = json.load(f)

todos = [sample for sample in eval_samples if not any(candidate["system"] == system_name for candidate in sample["candidates"])]

for sample in tqdm.tqdm_notebook(todos):
    system2candidate = {candidate["system"]: candidate for candidate in sample["candidates"]}
    ai_draft = system2candidate["ai_draft"]["text"]

    sample_candidate_key = f"{sample['id']}_{system_name}"
    edited_text, edits = run_dnr_pipeline(ai_draft, detection_model, rewriting_model)
    sample["candidates"].append({"id": sample_candidate_key, "system": system_name, "text": edited_text, "edits": edits})

    with open("data/lamp_editing_benchmark.json", "w") as f:
        json.dump(eval_samples, f, indent=4)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for sample in tqdm.tqdm_notebook(todos):


  0%|          | 0/29 [00:00<?, ?it/s]

# Run Reward-based Scoring

In [9]:
from utils import load_env_vars

load_env_vars()

from llms import generate_json
import tqdm, json

with open("prompts/reward_calc.txt", "r") as f:
    reward_calc_prompt = f.read()

REWARD_MODEL = "ft:gpt-4o-2024-08-06:salesforce-research:lamp-4o-p:AYKM53Ac"

def calculate_reward(candidate):
    output_response = generate_json([{"role": "user", "content": reward_calc_prompt}], model=REWARD_MODEL, variables={"PARAGRAPH": candidate["text"]}, step="lamp-editing-r-eval")
    candidate["score"] = output_response["score"]
    print(candidate["id"], candidate["score"])

with open("data/lamp_editing_benchmark.json", "r") as f:
    eval_samples = json.load(f)

todos = [candidate for sample in eval_samples for candidate in sample["candidates"] if "score" not in candidate]
for i, todo in enumerate(tqdm.tqdm_notebook(todos)):
    calculate_reward(todo)

    if i%10 == 0 or i == len(todos)-1:
        with open("data/lamp_editing_benchmark.json", "w") as f:
            json.dump(eval_samples, f, indent=4)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i, todo in enumerate(tqdm.tqdm_notebook(todos)):


  0%|          | 0/613 [00:00<?, ?it/s]

gold_70_lamp-4o-dnr 6
gold_70_edited_lamp-4o-cot 7
gold_71_lamp-4o-dnr 5
gold_71_edited_lamp-4o-cot 6
gold_72_lamp-4o-dnr 6
gold_72_edited_lamp-4o-cot 6
gold_73_edited_lamp-4o-cot 6
gold_73_lamp-4o-dnr 7
gold_74_edited_lamp-4o-cot 7
gold_74_lamp-4o-dnr 7
gold_75_edited_lamp-4o-cot 6
gold_75_lamp-4o-dnr 7
gold_76_edited_lamp-4o-cot 6
gold_76_lamp-4o-dnr 6
gold_77_edited_lamp-4o-cot 7
gold_77_lamp-4o-dnr 6
gold_78_edited_lamp-4o-cot 8
gold_78_lamp-4o-dnr 5
gold_79_edited_lamp-4o-cot 7
gold_79_lamp-4o-dnr 7
gold_80_edited_lamp-4o-cot 5
gold_80_lamp-4o-dnr 4
gold_81_edited_lamp-4o-cot 5
gold_81_lamp-4o-dnr 6
gold_82_edited_lamp-4o-cot 7
gold_82_lamp-4o-dnr 6
gold_83_edited_lamp-4o-cot 6
gold_83_lamp-4o-dnr 6
gold_84_ai_draft 3
gold_84_human_edited 6
gold_84_fs_edited 5
gold_84_edited_lamp-4o-cot 5
gold_84_lamp-4o-dnr 4
gold_85_ai_draft 6
gold_85_human_edited 8
gold_85_fs_edited 9
gold_85_edited_lamp-4o-cot 8
gold_85_lamp-4o-dnr 7
gold_86_ai_draft 7
gold_86_human_edited 8
gold_86_fs_edited 

# Print Results


In [2]:
import pandas as pd, json, numpy as np
from collections import defaultdict

with open("data/lamp_editing_benchmark.json", "r") as f:
    eval_samples = json.load(f)

candidates_scored = [candidate for sample in eval_samples for candidate in sample["candidates"] if "score" in candidate]

system_scores = defaultdict(list)
for candidate in candidates_scored:
    system_scores[candidate["system"]].append(candidate["score"])

results = [{"system": system, "N": len(scores), "score": np.mean(scores)} for system, scores in system_scores.items()]

pd.DataFrame(results).sort_values(by="score", ascending=False).round(3)

Unnamed: 0,system,N,score
1,human_edited,84,6.964
3,edited_lamp-4o-cot,69,6.362
4,lamp-4o-dnr,70,6.186
2,fs_edited,84,5.929
0,ai_draft,84,5.619
