In [1]:
from dotenv import load_dotenv
import os
%load_ext autoreload
%autoreload 2

# Load settings from a .env file up front: later cells read the "hf_token"
# and "openai" values through os.getenv.
load_dotenv()  # loads .env from current directory

True

In [3]:
from huggingface_hub import login
# Authenticate with the Hugging Face Hub using the "hf_token" environment variable.
# NOTE(review): os.getenv returns None when the variable is unset — confirm how
# login() behaves in that case before running this non-interactively.
login(os.getenv("hf_token"))

In [4]:
from datasets import load_dataset

# Load the AllSides dataset; the next cell indexes the result by split name ("train").
ds = load_dataset("dragonslayer631/allsides")

In [7]:
# Keep only "train" examples whose text has more than 250 whitespace-separated tokens.
# NOTE(review): this rebinds `ds` from the object returned by load_dataset to the
# filtered train split, so the cell is not idempotent — re-running it without
# reloading the dataset indexes the already-filtered split with "train".
ds = ds["train"].filter(lambda example: len("".join(example["text"]).split()) > 250)

Filter: 100%|██████████| 47876/47876 [00:06<00:00, 7433.03 examples/s]


In [None]:
import json
import ast
import re
def _parse_json_with_quote_repairs(text: str):
    """Parse ``text`` as JSON, repairing quote characters only when needed.

    Args:
        text: Candidate JSON document, possibly using single or smart quotes.

    Returns:
        The decoded Python object.

    Raises:
        json.JSONDecodeError: If the text is not valid JSON even after the
            quote repairs have been applied.
    """
    # Try the text untouched first. The regex repairs below are heuristics and
    # will corrupt input that is already valid: a pair of apostrophes such as
    # "don't ... it's" looks like a single-quoted string to them.
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # Replace single quotes around keys with double quotes
    repaired = re.sub(r"'(\w+)'(?=\s*:)", r'"\1"', text)

    # Replace smart quotes with standard quotes
    repaired = repaired.replace("“", '"').replace("”", '"')

    # Convert the remaining single-quoted runs (values, nested keys) to double
    # quotes, leaving alone any run that contains a colon.
    repaired = re.sub(
        r"'([^']*)'",
        lambda m: f'"{m.group(1)}"' if ':' not in m.group(1) else m.group(0),
        repaired,
    )
    return json.loads(repaired)


def fix_malformed_json(text: str) -> str:
    """Return ``text`` as pretty-printed JSON, repairing quote styles if needed.

    Valid JSON is parsed as-is; otherwise single-quoted keys/values and smart
    quotes are rewritten to standard double quotes before parsing again.

    Args:
        text: Candidate JSON document.

    Returns:
        The document re-serialized with ``indent=2``, or a message starting
        with ``"Error: Could not parse fixed JSON."`` when it cannot be parsed.
    """
    try:
        parsed = _parse_json_with_quote_repairs(text)
    except json.JSONDecodeError as e:
        return f"Error: Could not parse fixed JSON. {e}"
    return json.dumps(parsed, indent=2)


def concat_list_string(s):
    """Join a stringified list of text fragments into a single string.

    Args:
        s: Usually a string. When it is the literal representation of a list
            (or tuple) of strings, the fragments are joined with single spaces.

    Returns:
        * ``s`` unchanged when it is not the literal of a list/tuple of strings;
        * the joined text re-serialized as pretty-printed JSON when the joined
          text is (repairable) JSON;
        * the plain joined text otherwise.
    """
    try:
        # Convert string to actual list
        decoded = ast.literal_eval(s)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Not a Python literal (e.g. ordinary article prose): pass it through.
        return s

    is_sequence = isinstance(decoded, (list, tuple))
    if not is_sequence or not all(isinstance(part, str) for part in decoded):
        return s

    # Join all elements into one string
    joined = ' '.join(decoded)

    # Only swap in the JSON rendering when the joined text really is JSON;
    # otherwise keep the text itself rather than an error message, because the
    # result is used as article content by the callers.
    try:
        parsed = _parse_json_with_quote_repairs(joined)
    except (json.JSONDecodeError, RecursionError):
        return joined
    return json.dumps(parsed, indent=2)

In [None]:
def create_chatgpt_summary(word_count, instruction, example):
    """Build the request body asking gpt-4o-mini to summarize one article.

    Args:
        word_count: Target summary length, interpolated into the prompt.
        instruction: Extra guidance inserted after the first prompt sentence.
        example: Mapping with a "text" entry; the value is passed through
            ``concat_list_string`` before being placed in the prompt.

    Returns:
        dict: Payload with "model", "store", "input" and a "text" section
        whose format declares a JSON schema with one required string
        property named "article".
    """
    article = concat_list_string(example["text"])
    prompt = f"Generate an exactly {word_count} word summary of the following article. {instruction}. \n {article}"

    # Structured-output contract: an object holding a single string field.
    response_schema = {
        "$schema": "https://json-schema.org/draft/2020-12/schema",
        "type": "object",
        "properties": {"article": {"type": "string"}},
        "required": ["article"],
        "additionalProperties": False,
    }
    response_format = {
        "name": "response_type",
        "schema": response_schema,
        "type": "json_schema",
    }

    return {
        "model": "gpt-4o-mini",
        "store": True,
        "input": prompt,
        "text": {"format": response_format},
    }
  

In [96]:
def create_chatgpt_tags(tag_count, instruction, example):
    """Build the request body asking gpt-4o-mini for topic tags of one article.

    Args:
        tag_count: Upper bound on the number of topics, interpolated into the prompt.
        instruction: Extra guidance inserted after the first prompt sentence.
        example: Mapping with a "text" entry; the value is passed through
            ``concat_list_string`` before being placed in the prompt.

    Returns:
        dict: Payload with "model", "store", "input" and a "text" section
        whose format declares a JSON schema with one required property
        "response" that is an array of strings.
    """
    article = concat_list_string(example["text"])
    prompt = f"Generate a list of at most {tag_count} topics for the following article. {instruction}. \n {article}"

    # Structured-output contract: an object holding a list of strings.
    topics_property = {"type": "array", "items": {"type": "string"}}
    response_schema = {
        "$schema": "https://json-schema.org/draft/2020-12/schema",
        "type": "object",
        "properties": {"response": topics_property},
        "required": ["response"],
        "additionalProperties": False,
    }
    response_format = {
        "name": "response_type",
        "schema": response_schema,
        "type": "json_schema",
    }

    return {
        "model": "gpt-4o-mini",
        "store": True,
        "input": prompt,
        "text": {"format": response_format},
    }

In [60]:
# Smoke test: build a 100-word summary request for the first filtered example
# and display the resulting payload.
create_chatgpt_summary(100, "test", ds[0])

{'model': 'gpt-4o-mini',
 'store': True,
 'input': 'Generate an exactly 100 word summary of the following article. test. \n When Rep. Ilhan Omar ( D-MN ) tweeted about “ art and architecture ” during the Notre Dame fire , I defended her against the criticism it drew from Christians who accused her of pointedly ignoring the spiritual significance of the building . I thought the tweet was perfectly normal and acceptable , and I don ’ t really expect a Muslim woman to find any spiritual significance in a building meant for Christian worship . To my mind , that outrage — if we can call it an outrage — was overblown .\nBut “ Easter worshippers ” is a different thing entirely . Responding to the slaughter of hundreds of Christians during Easter attacks by radical Islamic terrorists , a number of prominent Democratic politicians chose to issue statements that glaringly omit any direct mention of the faith identity of the victims .\nBarack Obama said this : “ The attacks on tourists and Easter

In [84]:
def create_sample(example, word_count, tags=False):
    """Wrap one dataset example into a batch request line for /v1/responses.

    Args:
        example: Mapping with at least "id" and "text" entries.
        word_count: Summary length in words; when ``tags`` is truthy the same
            number is used as the maximum number of topics instead.
        tags: Build a topic-tag request rather than a summary request.

    Returns:
        dict: "custom_id" ("<id>_<word_count>" for summaries, "<id>_tags" for
        tags), "method", "url" and the request "body".
    """
    if tags:
        body = create_chatgpt_tags(word_count, "Each topic must be at most 1-2 words long", example)
        id_suffix = "tags"
    else:
        body = create_chatgpt_summary(word_count, "Keep the authorial voice, perspective, and tone", example)
        id_suffix = word_count

    return {
        "custom_id": f"{example['id']}_{id_suffix}",
        "method": "POST",
        "url": "/v1/responses",
        "body": body,
    }

In [85]:
# Build a 100-word summary request (custom_id / method / url / body) for every
# filtered example.
ds_samples_100 = ds.map(lambda ex: create_sample(example=ex, word_count=100, tags=False))

Map: 100%|██████████| 45215/45215 [00:08<00:00, 5621.73 examples/s]


In [86]:
# Keep only the four batch-request fields and write rows 0-9 to a JSON Lines file.
# NOTE(review): only 10 of the mapped examples are exported — presumably a
# trial run; widen the range for the full batch.
ds_samples_100.select_columns(['custom_id', 'method', 'url', 'body']).select(range(10)).to_json("allsides_summary_100.jsonl")

Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 360.09ba/s]


80368

In [87]:
from openai import OpenAI
# API key comes from the "openai" environment variable.
client = OpenAI(api_key=os.getenv("openai"))

# Upload the request file for batch processing. The context manager closes the
# file handle once the upload call returns instead of leaving it open for the
# lifetime of the kernel.
with open("allsides_summary_100.jsonl", "rb") as batch_file:
    batch_input_file = client.files.create(
        file=batch_file,
        purpose="batch"
    )

print(batch_input_file)

FileObject(id='file-34pLwo2isj5rUmMUoMvB34', bytes=80368, created_at=1744573667, filename='allsides_summary_100.jsonl', object='file', purpose='batch', status='processed', expires_at=None, status_details=None)


In [88]:
# Submit the uploaded file as a batch against the /v1/responses endpoint with a
# 24h completion window.
# NOTE(review): the returned Batch object is only displayed as cell output, not
# bound to a name, so its id has to be copied by hand to look the batch up later.
batch_input_file_id = batch_input_file.id
client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/responses",
    completion_window="24h",
    metadata={
        "description": "create 100 word summaries"
    }
)

Batch(id='batch_67fc14e5fb0c8190abb3b71e27bc0093', completion_window='24h', created_at=1744573669, endpoint='/v1/responses', input_file_id='file-34pLwo2isj5rUmMUoMvB34', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1744660069, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'create 100 word summaries'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))

In [89]:
# Fetch and print the status of a batch by id.
# NOTE(review): this id is hard-coded and is not the id shown in the output of
# the preceding batches.create cell; the recorded output below is a batch with
# status 'failed' ("Invalid type for 'body': expected an object, but got a
# string instead."). Confirm which batch is meant to be inspected.
batch = client.batches.retrieve(batch_id='batch_67fc0e1b74f481909a51a22732be077e')
print(batch)

Batch(id='batch_67fc0e1b74f481909a51a22732be077e', completion_window='24h', created_at=1744571931, endpoint='/v1/responses', input_file_id='file-AribzymYegBKiFzKt5DPxE', object='batch', status='failed', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=Errors(data=[BatchError(code='invalid_type', line=1, message="Invalid type for 'body': expected an object, but got a string instead.", param='body'), BatchError(code='invalid_type', line=2, message="Invalid type for 'body': expected an object, but got a string instead.", param='body'), BatchError(code='invalid_type', line=3, message="Invalid type for 'body': expected an object, but got a string instead.", param='body'), BatchError(code='invalid_type', line=4, message="Invalid type for 'body': expected an object, but got a string instead.", param='body'), BatchError(code='invalid_type', line=5, message="Invalid type for 'body': expected an object, but got a string instead.", param='body'), BatchError(cod

In [91]:
# Read the exported JSONL back in to inspect how the requests were serialized.
ds2 = load_dataset("json", data_files="allsides_summary_100.jsonl")

In [95]:
# Display the third exported request (index 2) to check the shape of its "body" field.
ds2["train"][2]

{'custom_id': '404bfca6572e65b2_100',
 'method': 'POST',
 'url': '/v1/responses',
 'body': {'input': 'Generate an exactly 100 word summary of the following article. Keep the authorial voice, perspective, and tone. \n Trump Administration Advertisement Supported by Iran’s supreme leader, Ayatollah Ali Khamenei, promised retaliation. The U.S. moved to send more troops to the Middle East. And a deluge of threats on social media. transcript Suleimani was plotting imminent and sinister attacks on American diplomats and military personnel. But we caught him in the act. We took action last night to stop a war. We did not take action to start a war. President Trump said Friday afternoon that the airstrike that killed Maj. Gen. Qassim Suleimani, the powerful Iranian commander, was ordered “to stop a war” and prevented attacks on Americans. “Suleimani was plotting imminent and sinister attacks on American diplomats and military personnel, but we caught him in the act and terminated him,” he said