In [1]:
%%capture
%pip install openai

In [2]:
import  json
import  os
import  time
from    dotenv import load_dotenv
from    openai import OpenAI
import  numpy as np
import  pandas as pd
# Set up OpenAI API key
# load_dotenv() copies variables from a local .env file (if one exists) into
# the process environment. Without this call a key stored only in .env is
# never loaded and os.environ.get("API_KEY") returns None. Variables already
# present in the environment are left untouched.
load_dotenv()
client = OpenAI(
    api_key=os.environ.get("API_KEY")
)

### Prepare `.jsonl` files

In [3]:
# Load the evaluation set and keep only the abstracts that need a title.
samples = pd.read_csv('./data/test.csv', encoding='utf-8')
samples = samples["abstract"]

# Split into two near-equal, order-preserving chunks (one batch job each).
# The row positions are split instead of the Series itself: handing a pandas
# object to np.array_split triggers a deprecation FutureWarning (visible in
# this cell's previously saved output).
split_positions = np.array_split(np.arange(len(samples)), 2)
parts = [samples.iloc[positions] for positions in split_positions]

print(parts[0].shape)

(2851,)


  return bound(*args, **kwds)


In [4]:
# Prompt prefix prepended to every abstract.
instruction = "Write a short, formal and structured title for this scientific research work, return ONLY the title: "

def write_to_jsonl(samples, partname):
    """Write one Batch API request per abstract to ./batches/batch_input_<partname>.jsonl.

    Args:
        samples: Iterable of abstract texts; iteration order defines the
            numeric suffix of each request's ``custom_id`` (``sample_<i>``).
        partname: Label inserted into the output file name.

    Each line of the file is a JSON object describing a POST to
    /v1/chat/completions with the instruction and the abstract as a single
    user message.
    """
    # Create the target directory up front so a fresh checkout does not fail
    # with FileNotFoundError (mirrors what retrieve_result does for ./output/).
    os.makedirs("./batches/", exist_ok=True)
    with open(f"./batches/batch_input_{partname}.jsonl", "w", encoding="utf-8") as f:
        for i, abstract in enumerate(samples):
            prompt = f"{instruction}\n\n{abstract}"
            entry = {
                "custom_id": f"sample_{i}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": "gpt-4.1-mini",
                    "messages": [{"role": "user", "content": prompt}],
                    "temperature": 1.0,
                    "max_tokens": 48
                }
            }
            f.write(json.dumps(entry) + "\n")

### Upload file via API

In [5]:
def upload(filename):
    """Upload a batch input file from ./batches/ through the Files API.

    Args:
        filename: Name of the file inside ./batches/.

    Returns:
        The id of the uploaded file as reported by the API.
    """
    # The context manager closes the handle once the upload call returns;
    # previously the file object was opened inline and never closed.
    with open("./batches/" + filename, "rb") as batch_file:
        batch_input_file = client.files.create(
            file=batch_file,
            purpose="batch"
        )

    return batch_input_file.id

### Submit the batch job

In [6]:
def submit_batch_job(file_id):
    """Create a batch job for an uploaded input file.

    Args:
        file_id: Id of a file previously uploaded with purpose "batch".

    Returns:
        The id of the newly created batch job.
    """
    batch_job = client.batches.create(
        input_file_id=file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h"
    )
    # Read the field through the public attribute (as upload() does) rather
    # than reaching into the object's __dict__.
    return batch_job.id

### Monitor status 

In [7]:
def fetch_status(batch_job_id):
    """Check a batch job once and report whether its output is ready.

    Args:
        batch_job_id: Id of the batch job to look up.

    Returns:
        ``(output_file_id, True)`` when the job status is "completed",
        ``(None, False)`` while the job is still in a non-terminal state.

    Raises:
        RuntimeError: If the job ended without completing (status "failed",
            "expired" or "cancelled"). The second exception argument carries
            the job's ``errors`` field.
    """
    batch = client.batches.retrieve(batch_job_id)
    status = batch.status
    if status == "completed":
        output_file_id = batch.output_file_id
        print(f"Batch job ID: {batch_job_id}, Output file ID: {output_file_id}")
        return output_file_id, True
    # Every terminal non-success state must raise: a caller that polls until
    # "completed" would otherwise spin forever on an expired/cancelled job.
    if status in ("failed", "expired", "cancelled"):
        raise RuntimeError(f"[{batch_job_id}] batch ended with status '{status}'", batch.errors)
    return None, False

### Retrieve result

In [8]:
def retrieve_result(output_file_id):
    """Download a batch output file and save it as ./output/<output_file_id>.jsonl.

    Args:
        output_file_id: Id of the output file to fetch, or None.

    Returns:
        Path of the written file, or None (after printing a notice) when
        ``output_file_id`` is None.
    """
    if output_file_id is not None:
        # Make sure the destination directory is there before downloading.
        os.makedirs("./output/", exist_ok=True)
        destination = f"./output/{output_file_id}.jsonl"
        payload = client.files.content(output_file_id)
        with open(destination, "w") as out:
            out.write(payload.text)
        return destination

    print("output _file_id was null")
    return None

In [9]:

def _sample_index(custom_id):
    """Return N for a custom_id of the form 'sample_N', or None for any other shape."""
    prefix, _, number = str(custom_id).rpartition("_")
    if prefix == "sample" and number.isdecimal():
        return int(number)
    return None


def to_df(filename):
    """Load generated titles from a batch output .jsonl file into a DataFrame.

    Args:
        filename: Path of a batch output file with one JSON object per line.

    Returns:
        A DataFrame with a single column named 'abstract' holding the message
        content of the first choice of each response, with surrounding double
        quotes stripped. Rows are ordered by the numeric suffix of
        ``custom_id`` ('sample_<i>') so they line up with the order in which
        the requests were written; if any line lacks such an id, file order
        is kept.
    """
    rows = []
    with open(filename, "r") as file:
        for line in file:
            if not line.strip():
                continue  # tolerate blank lines instead of failing in json.loads
            entry = json.loads(line)
            title = entry["response"]['body']["choices"][0]["message"]['content'].strip("\"")
            rows.append((_sample_index(entry.get("custom_id")), title))

    # Batch output lines are not guaranteed to come back in input order, so
    # restore the request order from custom_id rather than trusting position.
    if rows and all(index is not None for index, _ in rows):
        rows.sort(key=lambda row: row[0])
    return pd.DataFrame([title for _, title in rows], columns=['abstract'])

In [10]:

def run(part, partname, poll_interval=30):
    """Run the full batch pipeline for one chunk of abstracts.

    Writes the requests to a .jsonl file, uploads it, submits a batch job,
    polls until the job completes, downloads the output and parses it.

    Args:
        part: Iterable of abstracts for this chunk.
        partname: Label used in file names and progress messages.
        poll_interval: Seconds to wait between status checks.

    Returns:
        The DataFrame produced by ``to_df`` for the downloaded output file.

    Raises:
        RuntimeError: If the job completed but no output file was saved.
        Exception: Whatever ``fetch_status`` raises for an unsuccessful job.
    """
    write_to_jsonl(part, partname)
    print(f"[{partname}] File written to .jsonl")

    file_id = upload(f"batch_input_{partname}.jsonl")
    print(f"[{partname}] File uploaded, file_id: {file_id}")

    bj_id = submit_batch_job(file_id)
    print(f"[{partname}] Batch job submitted, batch_job_id: {bj_id}")

    while True:
        output_file_id, status = fetch_status(bj_id)
        if status:
            break
        # Pause between polls; a tight loop would send requests back-to-back
        # for as long as the batch takes to finish.
        time.sleep(poll_interval)
    print(f"[{partname}] Processing completed, output_file_id: {output_file_id}")

    filename = retrieve_result(output_file_id)
    if filename is None:
        # retrieve_result returns None when there is no output file id;
        # fail with a clear message instead of passing None to to_df.
        raise RuntimeError(f"[{partname}] batch {bj_id} completed without an output file")
    print("Saved output to", filename)

    return to_df(filename)

In [None]:
# Run one batch job per chunk, in order, and stack the results.
# The frames are gathered first and concatenated once, rather than repeatedly
# concatenating onto an initially empty DataFrame inside the loop.
frames = [run(part, i) for i, part in enumerate(parts)]
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

[0] File written to .jsonl
[0] File uploaded, file_id: file-9pAJVGCd5bhauCEvWxWC5U
[0] Batch job submitted, batch_job_id: batch_67ffbe8926208190867cfee9f45d5539
Batch job ID: batch_67ffbe8926208190867cfee9f45d5539, Output file ID: file-6WGi2mLG1jkyWXyHXA7hGB
[0] Processing completed, output_file_id: file-6WGi2mLG1jkyWXyHXA7hGB
Saved output to ./output/file-6WGi2mLG1jkyWXyHXA7hGB.jsonl
[1] File written to .jsonl
[1] File uploaded, file_id: file-Qa2NQJ2hGV6bKVESm5vDb1
[1] Batch job submitted, batch_job_id: batch_67ffcc63ad9c8190b9f7888cbeab323a
Batch job ID: batch_67ffcc63ad9c8190b9f7888cbeab323a, Output file ID: file-NqqasA1b2JiJzbSnmcFXLz
[1] Processing completed, output_file_id: file-NqqasA1b2JiJzbSnmcFXLz
Saved output to ./output/file-NqqasA1b2JiJzbSnmcFXLz.jsonl


In [14]:
# to_df stores the generated titles in a column called "abstract";
# give it its proper name before exporting.
title_column_map = {"abstract": "title"}
df.rename(columns=title_column_map, inplace=True)

# NOTE(review): the file name mentions gpt-4o-mini while the batch requests
# ask for "gpt-4.1-mini" -- confirm which label is intended.
df.to_csv("./output/gpt-4o-mini-output.csv", index=False)