This notebook generates the JSONL input file for the Batch API by combining each raw sentence with the prompt.

Input:
- 0_investment_sentences.csv: sentences related to investment

Output:
- 1_batch_api_input.jsonl: jsonl input file

In [5]:
import pandas as pd
import json
import os
from tqdm import tqdm
import tiktoken

Note: If you are using Google Colab, run the following two cells.

In [1]:
# Mount Google Drive at /content/drive. The google.colab package is only
# importable inside a Colab runtime, so skip this cell when running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Switch the working directory to this folder on the mounted Drive so that the
# relative file names used in the cells below resolve inside it.
os.chdir('/content/drive/MyDrive/Colab Notebooks/acct4_ta_s1')

# === Step 1: Load Data and Output Path ===

In [6]:
# Columns the later cells read from the CSV.
REQUIRED_COLUMNS = ("sentence_id", "sentence")
# Number of leading rows to keep for this run; set to None to use every row.
SAMPLE_LIMIT = 100

ccall_df = pd.read_csv("0_investment_sentences.csv")

# Fail here with a clear message instead of a bare KeyError inside the write loop.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in ccall_df.columns]
if missing_columns:
    raise KeyError(f"0_investment_sentences.csv is missing column(s): {missing_columns}")

if SAMPLE_LIMIT is not None:
    ccall_df = ccall_df[:SAMPLE_LIMIT]

# Create output directory
# NOTE(review): nothing in this notebook writes into this directory -- the
# JSONL file below is created in the working directory itself. Kept in case a
# later step expects the folder to exist; confirm and remove if not.
os.makedirs("1_batch_api_input", exist_ok=True)
batch_api_file = "1_batch_api_input.jsonl"

# === Step 2: Write JSONL INPUT FILE ===

In [7]:
def build_batch_job(sentence_id, statement):
    """Build one Batch API request asking the model to assess `statement`.

    Parameters
    ----------
    sentence_id
        Value from the 'sentence_id' column. It becomes the request's
        ``custom_id`` and is echoed inside the JSON template of the prompt.
    statement
        Value from the 'sentence' column; interpolated into the prompt as-is.

    Returns
    -------
    dict
        A request with ``custom_id``, ``method``, ``url`` and ``body`` keys,
        ready to be serialised as one JSONL line.
    """
    # NOTE(review): the prompt text is intentionally left byte-for-byte as it
    # was, because any edit changes what is sent to the model. "a investment"
    # and "Satetement" look like typos -- confirm before correcting them.
    #####################################################
    ######  PROMPT
    prompt = f"""
Identify if the following statement contains a investment plan to be executed. If yes, identify the investment target and amount.
Criteria: 1. Exclude the investment that has been executed.
Satetement: {statement}
Rules: Answer using JSON in the following format: {{\"id\": {sentence_id}, \"invest_plan\" : 0 or 1, \"invest_target\" : <invest target>, \"invest_amount\" : <dollar amount>}}.
"""
    #####################################################
    return {
        "custom_id": f"{sentence_id}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o",
            "messages": [
                {"role": "system", "content": "You are a research assistant who follows instructions strictly."},
                {"role": "user", "content": prompt}
            ],
            "response_format": {"type": "json_object"},
            "temperature": 0
        }
    }


# One request per distinct sentence_id, in order of first appearance, using the
# first sentence recorded for that id. Selecting those rows once avoids
# re-filtering the whole frame for every id. A row whose sentence_id is missing
# keeps its own sentence here (an equality filter can never match NaN).
first_rows = ccall_df.drop_duplicates(subset="sentence_id", keep="first")

with open(batch_api_file, 'w', encoding='utf-8') as f:
    id_sentence_pairs = zip(
        first_rows["sentence_id"].to_numpy(),
        first_rows["sentence"].to_numpy(),
    )
    # zip() has no length, so tqdm needs the total to draw the progress bar.
    for sentence_id, statement in tqdm(id_sentence_pairs, total=len(first_rows)):
        job = build_batch_job(sentence_id, statement)
        f.write(json.dumps(job, ensure_ascii=False) + '\n')

print(f"Batch API JSONL file prepared: {batch_api_file}")

100%|██████████| 100/100 [00:00<00:00, 1765.87it/s]

Batch API JSONL file prepared: 1_batch_api_input.jsonl





# === Step 3: Compute total tokens ===

In [8]:
# Price used for the estimate, in USD per one million input tokens.
# NOTE(review): presumably the gpt-4o Batch API input rate at the time this was
# written -- verify against current pricing before relying on the figure.
INPUT_USD_PER_MILLION_TOKENS = 1.25

total_jobs = 0
total_tokens = 0
enc = tiktoken.encoding_for_model("gpt-4o")

with open(batch_api_file, 'r', encoding='utf-8') as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline added by an editor).
        if not line.strip():
            continue
        job = json.loads(line)
        total_jobs += 1
        # Only the text of each message is encoded.
        # NOTE(review): any chat-formatting overhead the API adds per message
        # is not included, so treat the total as a likely lower bound.
        tokens = sum(len(enc.encode(message["content"])) for message in job["body"]["messages"])
        total_tokens += tokens
print(f"Total jobs: {total_jobs}")
print(f"Total tokens: {total_tokens}")
print("\nEstimated costs:")
estimated_input_cost = total_tokens * INPUT_USD_PER_MILLION_TOKENS / 1_000_000
print(f"Input cost: ${estimated_input_cost:.4f}")

Total jobs: 100
Total tokens: 13867

Estimated costs:
Input cost: $0.0173
