# Step 1. Get the LLM responses for the correction request

In [1]:
from openai import AzureOpenAI
import os
from dotenv import load_dotenv
import json
import pandas as pd
import time
import tiktoken

### A first look at the texts to be corrected

In [2]:
# Load the corpus from the parquet file and print the first document
# as a quick look at the kind of text that will be corrected.
df = pd.read_parquet("../data/cleaned-latam-xix.parquet")
print(df.at[0, "text"])

cualquier cosa, pues solo se hizo circular en hoja suelta: pero lo haremos conocer próximamente, AREQUIPA MARZO 5 DB 1884. pues tenemos el propósito de no omitir : esfuerzo hasta conseguirla. Mientras tanto, como reminiscencias que de ella se conservan en la memoria, insertamos uno de sus fragmentos, que poco mas ó menos dice a "Casas sin techo, Rio sin agua, Arboles sin hojas, Muchacho malcriado ...... Todo esto era el Perú A la muerte del General Castilla.“ Si hay alguna alteracion en esta parte, el autor por interes propio debe exhibir la Oda para rectificarla, previniendose que aun cuando parezca exagerado, este fragmento es de los menos malos. Y a proposito. ¿ Que daño pudo haberle hecho a este pedante el ilustre General Castilia, que con tanto desden miraba a los pequeños, para que intentase escarnecer y poner en rídiculo su respetada memoria? Nos esplicamos el motivo del encono que abriga el Redactor de "El Peru" para el país de su nacimiento, pero .... la saña que revela no sol

## 1. Prepare the LLM API Client and request parameters

In this case, the LLM is GPT, in its **GPT-4o** version.

In [3]:
def compute_price(model, in_tk, out_tk):
    """Compute the cost of a number of input and output tokens.

    :param model: Name of the model; must be a key of the pricing table below
    :param in_tk: Number of input (prompt) tokens
    :param out_tk: Number of output (completion) tokens
    :return: Input cost plus output cost, in the units of the pricing table
    :raises ValueError: If `model` is not in the pricing table
    """
    # Prices are expressed per 1,000 tokens.
    rates_by_model = {
        "gpt-4o": {"input": 0.005, "output": 0.015},
        "gpt-4o-mini": {"input": 0.00015, "output": 0.0006},
    }

    rates = rates_by_model.get(model)
    if rates is None:
        raise ValueError(f"Model {model} not found in the pricing list")

    return (in_tk / 1000) * rates["input"] + (out_tk / 1000) * rates["output"]

In [4]:
# Load the settings stored in the local .env file into the process environment.
load_dotenv('./.env')

# API client; every connection setting is read from the environment.
client = AzureOpenAI(
    api_key=os.getenv('AZURE_OPENAI_API_KEY'),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_version=os.getenv("AZURE_OPENAI_VERSION"),
)

# `engine` is what gets passed as `model=` to the chat completion call
# (presumably the Azure deployment name -- confirm against the .env file);
# `model` is the model name used to pick the tokenizer and the price table.
engine = os.getenv("AZURE_OPENAI_IMPLEMENTATION")
model = os.getenv("MODEL")
encoder = tiktoken.encoding_for_model(model)


def count_tk(t):
    """Return how many tokens `encoder` produces for the text `t`."""
    return len(encoder.encode(t))


# Upper bound on the retries `request` performs for one prompt.
MAX_RETRIES = 2

In [5]:
def _usage_from(api_response):
    """Return the token usage of an API response as {"input", "output"}; zeros when unavailable."""
    usage = getattr(api_response, "usage", None)
    if usage is None:
        return {"input": 0, "output": 0}
    return {"input": usage.prompt_tokens, "output": usage.completion_tokens}


def request(prompt, max_tokens=4096, temperature=0, try_count=0, max_input_tokens=4000, retry_wait=60):
    """Request a completion from the OpenAI API.
    :param prompt: The prompt to send to the API
    :param max_tokens: The maximum number of tokens in the output
    :param temperature: The degree of randomness in the output
    :param try_count: Retry control parameter (number of retries already spent)
    :param max_input_tokens: Prompts longer than this many tokens are rejected without calling the API
    :param retry_wait: Seconds to wait before retrying after a retryable failure
    :return response: The response text from the API ("" when no text could be obtained)
    :return usage: The count of token usage from the API { "input", "output" }
    :return finish_reason: The API finish reason, or a description of why the request failed:
        "length - INPUT too long (...)", "content_filter - <flags>",
        "ERROR [<exception type>]: <message>" or "RETRYING, but reached MAX_RETRIES (...)"
    """
    expected_tokens = count_tk(prompt)
    if expected_tokens > max_input_tokens:
        return "", {"input": 0, "output": 0}, f"length - INPUT too long ({expected_tokens} tokens)"

    while True:
        # Kept outside the try block so the error path can still read the usage
        # of a call that succeeded but whose post-processing failed.
        api_response = None
        try:
            api_response = client.chat.completions.create(
                model=engine,
                max_tokens=max_tokens,
                temperature=temperature,
                messages=[{"role": "user", "content": prompt}],
            )
            choice = api_response.choices[0]
            finish_reason = choice.finish_reason
            if finish_reason == "content_filter":
                # No usable text: record which filters fired instead.
                rsp = ""
                content_filter = choice.content_filter_results
                prompt_filter = api_response.prompt_filter_results[0]['content_filter_results']
                finish_reason += f" - {get_filter_flags(content_filter, prompt_filter)}"
            else:
                # Normalise a missing message body to "", like every other failure path.
                rsp = choice.message.content or ""
            return rsp, _usage_from(api_response), finish_reason
        except Exception as e:
            usage = _usage_from(api_response)
            message = f"{e}"
            # Only connection-level failures are retried; content-policy rejections
            # would fail again with the same prompt.
            # NOTE(review): this matches on the exception text -- confirm the client
            # library still words connection failures this way.
            retryable = (
                "content management policy" not in message
                and "Max retries exceeded with url" in message
            )
            if not retryable:
                return "", usage, f"ERROR [{type(e).__name__}]: {e}"
            if try_count >= MAX_RETRIES:
                return "", usage, f"RETRYING, but reached MAX_RETRIES ({MAX_RETRIES})"
            print(f"RETRYING ({try_count})...")
            time.sleep(retry_wait)
            try_count += 1

def _filter_tripped(verdict):
    """Tell whether one content-filter annotation reports anything other than a clean result."""
    if not isinstance(verdict, dict):
        # Unknown annotation shape: report it only when it carries any content.
        return bool(verdict)
    if not {'filtered', 'detected', 'severity'} & verdict.keys():
        # A dict with none of the known fields (e.g. an error report) is always surfaced.
        return True
    return bool(
        verdict.get('filtered')
        or verdict.get('detected')
        or verdict.get('severity', 'safe') != 'safe'
    )


def get_filter_flags(content_filter, prompt_filter):
    """Summarise which content-filter categories fired for a request.

    Each annotation is judged by its own fields rather than by its category name:
    it counts as fired when `filtered` or `detected` is true, or when `severity`
    is anything other than 'safe'. This keeps clean detection-style annotations
    (those carrying `detected` instead of `severity`) from being reported,
    whatever category they belong to.

    :param content_filter: Mapping category -> annotation for the completion (may be None)
    :param prompt_filter: Mapping category -> annotation for the prompt (may be None)
    :return: Comma-separated "content.<category>" / "prompt.<category>" entries,
        completion categories first; "" when nothing fired
    """
    flagged = [f"content.{k}" for k, v in (content_filter or {}).items() if _filter_tripped(v)]
    flagged += [f"prompt.{k}" for k, v in (prompt_filter or {}).items() if _filter_tripped(v)]
    return ', '.join(flagged)

Define the prompt to send with the text:

In [6]:
# Instruction placed before the document. In English: "Given the 19th-century text
# between ```, return only the text, correcting the spelling errors without changing
# the grammar. Do not correct the spelling of names". It ends by opening the fence.
prompt_start = 'Dado el texto del siglo xix entre ```, retorna únicamente el texto corrigiendo los errores ortográficos sin cambiar la gramática. No corrijas ortografía de nombres:\n```\n'
# Closing fence placed after the document.
prompt_end = '\n```'


def gen_prompt(text):
    """Build the full prompt: instruction header, then `text`, then the closing fence."""
    return prompt_start + f"{text}" + prompt_end

Recover the progress saved by the last execution, so that no data is lost if an error occurs:

In [7]:
RESPONSES_FILE = "./responsesLatam.json"

# Empty progress record: stored responses, number of processed texts,
# token counters (successful and failed requests) and price figures.
r = {
    "data": [],
    "checkpoint": 0,
    "input_tokens": 0,
    "output_tokens": 0,
    "fail_input_tokens": 0,
    "fail_output_tokens": 0,
    "total_price": 0,
    "price_per_req": 0,
}
if not os.path.exists(RESPONSES_FILE):
    # No previous run: write the empty record to disk.
    with open(RESPONSES_FILE, "w") as out:
        json.dump(r, out, indent=4)
else:
    # Resume from whatever the previous run saved.
    with open(RESPONSES_FILE, "r") as src:
        r = json.loads(src.read())

In [8]:
# The checkpoint counter must equal the number of stored responses.
assert len(r['data']) == r['checkpoint'], "Checkpoint does not match with corrected texts"
progress_pct = 100 * r['checkpoint'] / len(df)
print(f"Done {r['checkpoint']}/{len(df)} ({progress_pct:.2f}%)")

Done 51860/64077 (80.93%)


## 2. Send the requests to the API and store them periodically

In [None]:
# Number of processed texts between two saves of the progress file.
SAVE_EVERY = 10


def _save_progress():
    """Refresh the price figures in `r` and write `r` to RESPONSES_FILE atomically."""
    # Derive the counter from the stored data so the two can never disagree on disk.
    r["checkpoint"] = len(r["data"])
    price = compute_price(
        model,
        r['input_tokens'] + r["fail_input_tokens"],
        r['output_tokens'] + r["fail_output_tokens"],
    )
    r['total_price'] = round(price, 4)
    r['price_per_req'] = round(price / r['checkpoint'], 6) if r['checkpoint'] else 0
    # Write to a temporary file and swap it in, so an interruption in the middle
    # of the write cannot damage the responses accumulated so far.
    tmp_path = f"{RESPONSES_FILE}.tmp"
    with open(tmp_path, "w") as f:
        f.write(json.dumps(r, indent=4))
    os.replace(tmp_path, RESPONSES_FILE)


try:
    # The checkpoint is a count of processed rows, so resume by position.
    for text in df["text"].iloc[r['checkpoint']:]:
        prompt = gen_prompt(text)
        response, usage, finish_reason = request(prompt)

        r["data"].append({
            "text": text,
            "resp": response,
            "finish_reason": finish_reason
        })
        r["checkpoint"] = len(r["data"])

        # Tokens of requests that did not finish normally are tracked separately.
        if finish_reason in ['stop', 'length']:
            r["input_tokens"] += usage["input"]
            r["output_tokens"] += usage["output"]
        else:
            r["fail_input_tokens"] += usage["input"]
            r["fail_output_tokens"] += usage["output"]

        if r['checkpoint'] % SAVE_EVERY == 0:
            _save_progress()
            print(f"SAVED ({r['checkpoint']})")
finally:
    # Runs on normal completion, on a manual interrupt and on unexpected errors,
    # so responses obtained since the last periodic save are not lost.
    _save_progress()

SAVED (51870)
SAVED (51880)


There will be empty responses due to errors when sending the request. Some are caused by connection issues, and others by OpenAI's content management policy. When there are only a few such cases, the requests can be run manually.

In [None]:
# Finish reasons that leave "resp" empty and can be retried by hand: content-filter
# hits, request errors, and connection failures that exhausted the automatic retries.
failed_prefixes = ("content_filter", "ERROR", "RETRYING")

print("The following requests must be repeated manually (due to OpenAI's content filter or an error):\n")
for i, e in enumerate(r["data"]):
    if e["finish_reason"].startswith(failed_prefixes):
        print(f"------------------------------- {i} -----------------------------------")
        print(gen_prompt(e['text']))

# Put the manually obtained responses in the dictionary of the next cell, keyed by the index printed above, and run that cell

In [None]:
# Manually obtained corrections, keyed by the entry's index in r["data"]:
#   index: "corrected text"
manually_req_responses = {
    # ...
}

# `request` always stores content-filter hits as "content_filter - <flags>", so the
# reason has to be matched by prefix; an equality test would never select them.
# Connection failures that exhausted the retries ("RETRYING, ...") also leave an
# empty response and are handled the same way.
retry_prefixes = ("content_filter", "ERROR", "RETRYING")

for i, e in enumerate(r["data"]):
    if not e["finish_reason"].startswith(retry_prefixes):
        continue
    print(f"------------------------------- {i} -----------------------------------")
    print(e['text'])
    if i in manually_req_responses:
        e["resp"] = manually_req_responses[i]
        # Mark the entry as completed so it is not reported again.
        e["finish_reason"] = "stop"
        print(e["resp"])
        print(f"Index {i} OK")
    else:
        print(f"Index {i} NOT FOUND")

In [None]:
# Write the current state of `r` back to the responses file.
with open(RESPONSES_FILE, "w") as out:
    json.dump(r, out, indent=4)