In [22]:
import dotenv
dotenv.load_dotenv()

from openai import AsyncOpenAI
client = AsyncOpenAI()
import os
import json

In [23]:
# Load all texts from text folder
texts = []
for file in os.listdir("text"):
    with open(os.path.join("text", file), "r") as f:
        texts.append(f.read())

In [24]:
system_message = """
I have a raw transcript of a 911 call with some words and sentences mistranscribed. I want to turn this into an OpenAI-compatible messages JSON array so that I can use this to fine tune an LLM on being a 911 operator. Clean up the transcript slightly, rewriting (but not pmmitting) lines that make sense for the operator and caller to say. Use the "assistant" role for operator and "user" role for caller. Give me the messages array.

The messages array should be formatted like this:

{
    "messages": [
        {
            "role": "assistant",
            "content": "9-1-1, what's your emergency?"
        },
        {
            "role": "user",
            "content": "<caller message>"
        },
        ...
    ]
}

The output should always start with "9-1-1, what's your emergency?".

The user will input a transcript where each new line may have been said by the operator or the caller.
"""
async def process_text(text):
    result = await client.chat.completions.create(
        model="gpt-4o",
        response_format={ "type": "json_object" },
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": text}
        ]
    )
    return result.choices[0].message.content


In [25]:
import asyncio
from tqdm import tqdm
import json
import os
processed_texts_saved = []
async def process_batch(batch):
    tasks = [process_text(text) for text in batch]
    return await asyncio.gather(*tasks)

async def process_all_texts():
    global processed_texts_saved
    batch_size = 100
    delay = 30  # seconds
    processed_texts = []

    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size]
        results = await process_batch(batch)
        
        # Store processed results
        processed_texts.extend(results)
        
        # Save processed results to a file after each batch
        with open('processed_texts.json', 'w') as f:
            json.dump(processed_texts, f)
        
        if i + batch_size < len(texts):
            print(f"Waiting {delay} seconds before next batch...")
            await asyncio.sleep(delay)
            
    processed_texts_saved = processed_texts

    # After all batches are processed, save to individual files
    if not os.path.exists('processed'):
        os.makedirs('processed')
    
    for idx, processed_text in enumerate(processed_texts):
        try:
            with open(f'processed/text_{idx}.json', 'w') as f:
                json.dump(json.loads(processed_text), f, indent=2)
        except Exception as e:
            print(f"Error processing text {idx}: {e}")

# Run the async function
await process_all_texts()


  0%|          | 0/6 [00:00<?, ?it/s]

Waiting 30 seconds before next batch...


 17%|█▋        | 1/6 [01:59<09:58, 119.66s/it]

Waiting 30 seconds before next batch...


 33%|███▎      | 2/6 [03:25<06:39, 99.94s/it] 

Waiting 30 seconds before next batch...


 50%|█████     | 3/6 [04:54<04:44, 94.93s/it]

Waiting 30 seconds before next batch...


 67%|██████▋   | 4/6 [06:20<03:02, 91.16s/it]

Waiting 30 seconds before next batch...


100%|██████████| 6/6 [08:26<00:00, 84.42s/it]


Error processing text 64: Expecting property name enclosed in double quotes: line 553 column 1 (char 18102)
Error processing text 88: Expecting property name enclosed in double quotes: line 588 column 12 (char 18163)
Error processing text 143: Expecting property name enclosed in double quotes: line 461 column 1 (char 18456)
Error processing text 179: Unterminated string starting at: line 669 column 24 (char 18245)
Error processing text 282: Unterminated string starting at: line 585 column 24 (char 18173)
Error processing text 507: Unterminated string starting at: line 589 column 24 (char 18274)
