In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to local helpers (such as utils.ds_utils,
# imported below) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from utils.ds_utils import load_dataset_from_huggingface
# Load the working dataset through the project helper.
# NOTE(review): presumably a Hugging Face `datasets.Dataset` with at least `id`
# and `text` columns -- later cells call .filter, .select_columns and .to_pandas
# on it; confirm against utils.ds_utils.
ds = load_dataset_from_huggingface()

In [None]:
def _exceeds_min_words(example):
    """Return True when the example's text has more than 250 whitespace-separated words."""
    # "".join leaves a plain string unchanged and concatenates a list of strings
    # with no separator, exactly as the word count has always been computed here.
    word_total = len("".join(example["text"]).split())
    return word_total > 250


# Keep only the examples long enough to be worth summarising.
ds = ds.filter(_exceeds_min_words)

In [None]:
# Only the identifier and the article body are used by the request builders below.
kept_columns = ['id', 'text']
ds_subset = ds.select_columns(kept_columns)
df = ds_subset.to_pandas()


In [None]:
import json

def create_sample_file(word_count, df, output_file, chunk_size=200, output_dir="samples"):
    """Write Batch-API request files asking for a topic list per article.

    One chat-completions request is built for every row of ``df`` and the
    requests are written as JSON Lines, ``chunk_size`` requests per file, to
    ``{output_dir}/{output_file}_{offset}.jsonl`` where ``offset`` is the index
    of the first request in that file (0, chunk_size, 2*chunk_size, ...).

    Args:
        word_count: Upper bound on the number of topics requested in the prompt
            (despite the name, it is interpolated as "at most N topics").
        df: DataFrame with an ``id`` and a ``text`` column.
        output_file: File-name stem for the generated ``.jsonl`` files.
        chunk_size: Maximum number of requests per output file.
        output_dir: Directory receiving the files; created if it is missing.

    Returns:
        None. An empty ``df`` produces no files.
    """
    import os

    tasks = []
    for _, row in df.iterrows():
        article = row['text']
        tasks.append({
            # The Batch API requires custom_id to be a string, so coerce
            # integer ids instead of emitting a JSON number.
            "custom_id": str(row['id']),
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "gpt-4o-mini",
                "temperature": 0.1,
                "messages": [
                    {
                        "role": "system",
                        "content": "You are a helpful assistant tasked with creating summaries of user given text. These summaries preserve the tone, voice and perspective of the original text."
                    },
                    {
                        "role": "user",
                        "content": f"""Generate a list of at most {word_count} topics 
                                        for the following article. Keep the authorial voice, 
                                        perspective, and tone. \n\n\n {article}"""
                    }
                ],
            }
        })

    if tasks:
        # Without this the first open() fails on a fresh checkout.
        os.makedirs(output_dir, exist_ok=True)

    for start in range(0, len(tasks), chunk_size):
        # "/" is kept literal so paths match the "samples/*.jsonl" glob used later.
        with open(f"{output_dir}/{output_file}_{start}.jsonl", 'w') as file:
            for task in tasks[start:start + chunk_size]:
                file.write(json.dumps(task) + '\n')


In [None]:
from openai import OpenAI
import os
import time

# The API key is read from the environment variable literally named "openai";
# the lookup yields None when that variable is not set.
openai_api_key = os.environ.get("openai")
client = OpenAI(api_key=openai_api_key)

def upload(file, endpoint="/v1/responses", poll_delay=10):
    """Upload a JSONL request file and start an OpenAI batch job for it.

    Args:
        file: Path of the ``.jsonl`` request file to submit.
        endpoint: Batch endpoint used for every request in the file. It has to
            agree with the ``url`` field of the requests: ``create_samples``
            emits "/v1/responses" requests, ``create_sample_file`` emits
            "/v1/chat/completions" requests.
        poll_delay: Seconds to wait before checking the job status once.

    Returns:
        True when the job's status is anything other than 'failed' after the
        delay (the job may still be validating or in progress); in that case
        ``file,<file id>,<batch id>`` is appended to ``uploaded.csv``.
        False when the job has already failed; the job object is printed.
    """
    # Upload to OpenAI file API. The context manager closes the handle even if
    # the upload raises; the original left it open.
    with open(file, "rb") as payload:
        batch_file = client.files.create(
            file=payload,
            purpose="batch"
        )

    # Start batch job
    batch_job = client.batches.create(
        input_file_id=batch_file.id,
        endpoint=endpoint,
        completion_window="24h"
    )

    # Give the service a moment to validate the input so that an immediately
    # rejected file is reported as a failure here.
    time.sleep(poll_delay)
    retrieved = client.batches.retrieve(batch_id=batch_job.id)
    print(retrieved.status)

    if retrieved.status == 'failed':
        print(retrieved)
        return False

    with open("uploaded.csv", "a") as f:
        f.write(f"{file},{batch_file.id},{batch_job.id}\n")
    return True


In [None]:
def create_samples(df, word_count, chunk_size=200, output_dir="samples"):
    """Write Batch-API request files asking for a fixed-length summary per article.

    One "/v1/responses" request is built for every row of ``df``; each request
    asks for a JSON object with a single string field ``article``. Requests are
    written as JSON Lines, ``chunk_size`` per file, to
    ``{output_dir}/{word_count}_{offset}.jsonl`` where ``offset`` is the index
    of the first request in that file.

    Args:
        df: DataFrame with an ``id`` and a ``text`` column.
        word_count: Summary length requested in the prompt; also appended to
            each ``custom_id`` as ``<id>_<word_count>``.
        chunk_size: Maximum number of requests per output file.
        output_dir: Directory receiving the files; created if it is missing.

    Returns:
        None. An empty ``df`` produces no files.
    """
    import os

    # Structured-output description shared by every request.
    response_format = {
        "name": "response_type",
        "schema": {
            "$schema": "https://json-schema.org/draft/2020-12/schema",
            "type": "object",
            "properties": {
                "article": {
                    "type": "string"
                }
            },
            "required": ["article"],
            "additionalProperties": False
        },
        "type": "json_schema"
    }

    tasks = []
    for _, row in df.iterrows():
        article_id = row['id']  # renamed from `id` to stop shadowing the builtin
        article = row['text']
        tasks.append({
            "custom_id": f"{article_id}_{word_count}",  # custom_id must be a string
            "method": "POST",
            "url": "/v1/responses",
            "body": {
                "model": "gpt-4o-mini",
                "input" : f"Generate an exactly {word_count} word summary of the following article. Keep the authorial voice, perspective, and tone. Write from the perspective of the original article, not as a third party summarizing the article. \n {article}",
                "text": {
                    "format": response_format
                }
            }
        })

    if tasks:
        # Without this the first open() fails on a fresh checkout.
        os.makedirs(output_dir, exist_ok=True)

    for start in range(0, len(tasks), chunk_size):
        # "/" is kept literal so paths match the "samples/*.jsonl" glob used later.
        with open(f"{output_dir}/{word_count}_{start}.jsonl", 'w') as f:
            for task in tasks[start:start + chunk_size]:
                f.write(f"{json.dumps(task)}\n")

In [None]:
# In-memory bookkeeping for the upload/harvest loop. Both lists start empty
# every time this cell runs.
#   uploaded          -> paths of request files already submitted
#   processed_batches -> ids of batches whose results were already handled
uploaded, processed_batches = [], []

In [None]:
import glob
import time

# Tunables for the upload/harvest loop.
POLL_INTERVAL_SECONDS = 60 * 60  # pause after each upload round before checking results
UPLOADS_PER_ROUND = 5            # at most this many request files are submitted per round
TARGET_WORD_COUNT = '5'          # only batches whose custom_id suffix equals this are harvested
SUMMARY_PATH = "summary_5.psv"   # pipe-separated output: <article id>|<summary>

files_to_see = True

# Each round: submit up to UPLOADS_PER_ROUND pending files, wait, then write the
# results of every newly completed batch.
# NOTE(review): the loop stops as soon as no file is left to upload, so batches
# that complete after the final wait are never harvested here. A file whose
# upload keeps failing is retried every round indefinitely.
while files_to_see:
    files = sorted(glob.glob("samples/*.jsonl"))
    files = [x for x in files if x not in uploaded]

    if not files:
        files_to_see = False
        break

    for f in files[:UPLOADS_PER_ROUND]:
        success = upload(f)
        if success:
            uploaded.append(f)

    time.sleep(POLL_INTERVAL_SECONDS)

    # Iterate the list response itself instead of `.data`: the SDK then fetches
    # follow-up pages, so batches beyond the first page are not missed.
    completed = [elt.id for elt in client.batches.list() if elt.status == "completed"]

    new_batches = [x for x in completed if x not in processed_batches]
    for batch in new_batches:
        batch_job = client.batches.retrieve(batch)
        result_file_id = batch_job.output_file_id
        if result_file_id is None:
            # No output file is attached to this batch, so there is nothing to download.
            print(f"Batch {batch} has no output file; skipping")
            processed_batches.append(batch)
            continue

        result = client.files.content(result_file_id).content
        result = result.decode('utf-8')
        # Split on "\n" only (not splitlines) so unusual line-break characters
        # inside a JSON string cannot cut a record in two; skip blank lines.
        results_list = [json.loads(r) for r in result.split("\n") if r.strip()]

        if results_list:
            # The first record decides whether the batch is a TARGET_WORD_COUNT
            # summary batch (custom_id of the form "<id>_<word_count>").
            custom_id = results_list[0]['custom_id'].split("_")
            if len(custom_id) == 2 and custom_id[1] == TARGET_WORD_COUNT:
                # Open once per batch; utf-8 so non-ASCII summaries do not depend
                # on the platform default encoding.
                with open(SUMMARY_PATH, "a", encoding="utf-8") as out:
                    for item in results_list:
                        try:
                            idx = item['custom_id'].split("_")[0]
                            summary = item['response']['body']['output'][0]['content'][0]['text']
                            article = json.loads(summary)['article']
                        except (KeyError, IndexError, TypeError, ValueError) as exc:
                            # One errored or malformed record must not abort the
                            # batch (and make it crash again on the next poll).
                            print(f"Skipping {item.get('custom_id')} in batch {batch}: {exc!r}")
                            continue
                        out.write(f"{idx}|{article}\n")

        processed_batches.append(batch)
