In [None]:
import json

def clean_text(text):
    """Undo literal backslash escapes in *text* and trim surrounding whitespace.

    The two-character sequences backslash + single quote, backslash + double
    quote, and backslash + ``n`` are replaced (in that order) by a plain
    single quote, a plain double quote, and a real newline respectively.
    Leading and trailing whitespace is then stripped from the result.
    """
    # (escaped sequence, replacement) pairs, applied in this order.
    substitutions = (
        ("\\'", "'"),
        ('\\"', '"'),
        ("\\n", "\n"),
    )
    unescaped = text
    for escaped, plain in substitutions:
        unescaped = unescaped.replace(escaped, plain)
    return unescaped.strip()

def clean_all_text_fields(entry):
    """Clean the prompt and the known response texts of *entry* in place.

    ``entry["prompt"]`` is always (re)written with its cleaned value; a
    missing or ``None`` prompt becomes ``""``. For each of the response keys
    ``"directive"``, ``"neutral"`` and ``"nightline_aligned"`` that is present
    under ``entry["responses"]`` as a dict, its ``"text"`` field is
    (re)written with the cleaned value; a missing or ``None`` text becomes
    ``""``.

    Args:
        entry: A dict for one dataset record. It is mutated.

    Returns:
        The same ``entry`` object, for convenient use in comprehensions.
    """
    # `or ""` (rather than a .get default) also covers an explicit JSON null,
    # which json.loads turns into None and which has no .replace().
    entry["prompt"] = clean_text(entry.get("prompt") or "")

    responses = entry.get("responses")
    if not isinstance(responses, dict):
        # No responses object (missing or null): nothing further to clean.
        return entry

    for key in ("directive", "neutral", "nightline_aligned"):
        response = responses.get(key)
        if not isinstance(response, dict):
            # Absent or null response: leave it untouched instead of crashing.
            continue
        response["text"] = clean_text(response.get("text") or "")
    return entry

def _write_jsonl(path, items):
    """Write *items* to *path* as UTF-8 JSON Lines (one JSON object per line)."""
    with open(path, "w", encoding="utf-8") as out:
        for item in items:
            out.write(json.dumps(item, ensure_ascii=False) + "\n")


def _response_text(entry, key):
    """Return the text of response *key* in *entry*, or None if there is no such response.

    A response that exists but has a missing or null ``"text"`` yields ``""``.
    """
    responses = entry.get("responses")
    if not isinstance(responses, dict):
        return None
    response = responses.get(key)
    if not isinstance(response, dict):
        return None
    return response.get("text") or ""


# Load the raw dataset (upload manually or use the path if already in Colab).
input_file = "nightline_cleaned_final(1).jsonl"
with open(input_file, "r", encoding="utf-8") as f:
    # Blank lines (for example a trailing empty line at end of file) are
    # skipped, because json.loads("") raises JSONDecodeError.
    stripped_lines = (line.strip() for line in f)
    raw_entries = [json.loads(line) for line in stripped_lines if line]

# Clean the dataset
cleaned_full_dataset = [clean_all_text_fields(entry) for entry in raw_entries]

# Save CLEANED FINAL TOTAL NIGHTLINE DATA
_write_jsonl("CLEANED_FINAL_TOTAL_NIGHTLINE_DATA.jsonl", cleaned_full_dataset)

# Extract and save CLEANED FINAL INSTRUCTION DATA:
# one prompt/response pair per entry that has a "nightline_aligned" response.
instruction_data = []
for entry in cleaned_full_dataset:
    aligned_text = _response_text(entry, "nightline_aligned")
    if aligned_text is None:
        continue
    instruction_data.append({"prompt": entry["prompt"], "response": aligned_text})

_write_jsonl("CLEANED_FINAL_INSTRUCTION_DATA.jsonl", instruction_data)

# Extract and save CLEANED FINAL DPO DATA:
# the aligned response is "chosen"; the directive and neutral responses each
# contribute one "rejected" pair (in that order) when all texts are non-empty.
dpo_data = []
for entry in cleaned_full_dataset:
    prompt = entry["prompt"]
    aligned = _response_text(entry, "nightline_aligned")
    if not (prompt and aligned):
        continue
    for rejected_key in ("directive", "neutral"):
        rejected = _response_text(entry, rejected_key)
        if rejected:
            dpo_data.append({"prompt": prompt, "chosen": aligned, "rejected": rejected})

_write_jsonl("CLEANED_FINAL_DPO_DATA.jsonl", dpo_data)

print("✅ Done cleaning and exporting:")
print(f"• CLEANED_FINAL_TOTAL_NIGHTLINE_DATA.jsonl → {len(cleaned_full_dataset)} entries")
print(f"• CLEANED_FINAL_INSTRUCTION_DATA.jsonl → {len(instruction_data)} entries")
print(f"• CLEANED_FINAL_DPO_DATA.jsonl → {len(dpo_data)} pairs")
