# Uploading Batch Input File
 - 배치 파일을 보낼 때는 한 파일에 너무 많은 요청을 담지 말고, 여러 파일로 나누어 보내기

In [1]:
from getpass import getpass

# Ask for the API key interactively instead of hard-coding it in the notebook.
openai_api_key = getpass("OPENAI_API_KEY")

In [2]:
from openai import OpenAI
import json
import pandas as pd

# Client used by every API call below, authenticated with the key entered above.
client = OpenAI(api_key=openai_api_key)

In [20]:
# Upload the JSONL request file that the batch job will read.
# The context manager closes the local file handle once the upload returns,
# instead of leaving it open for the lifetime of the kernel.
with open("Translate_Elem6_batch.jsonl", "rb") as batch_file:
    batch_input_file = client.files.create(
        file=batch_file,
        purpose="batch",
    )

print(batch_input_file)

FileObject(id='file-WuXy3H1qYMRm34Ccu3TX5y', bytes=238118, created_at=1739457049, filename='Translate_Elem6_batch.jsonl', object='file', purpose='batch', status='processed', status_details=None)


# Create the Batch

In [21]:
# Keep the ID of the uploaded batch input file; it is passed to
# client.batches.create() when the batch is created.
batch_input_file_id = batch_input_file.id
print(batch_input_file_id)

file-WuXy3H1qYMRm34Ccu3TX5y


In [22]:
# Describe the batch job, then submit it for the uploaded input file.
batch_request = {
    "input_file_id": batch_input_file_id,
    "endpoint": "/v1/chat/completions",  # chat completions API
    "completion_window": "24h",  # currently the only supported window
    "metadata": {"description": "translate"},
}
batch_info = client.batches.create(**batch_request)

In [23]:
# Round-trip the batch object through JSON so it can be pretty-printed.
response_json = json.loads(batch_info.json())
# ensure_ascii=False prints non-ASCII text as-is instead of \uXXXX escapes.
response_output = json.dumps(response_json, ensure_ascii=False, indent = 2)
print(response_output)

{
  "id": "batch_67ae021e21bc819095b433a57d1d733b",
  "completion_window": "24h",
  "created_at": 1739457054,
  "endpoint": "/v1/chat/completions",
  "input_file_id": "file-WuXy3H1qYMRm34Ccu3TX5y",
  "object": "batch",
  "status": "validating",
  "cancelled_at": null,
  "cancelling_at": null,
  "completed_at": null,
  "error_file_id": null,
  "errors": null,
  "expired_at": null,
  "expires_at": 1739543454,
  "failed_at": null,
  "finalizing_at": null,
  "in_progress_at": null,
  "metadata": {
    "description": "translate"
  },
  "output_file_id": null,
  "request_counts": {
    "completed": 0,
    "failed": 0,
    "total": 0
  }
}


# Checking the Status of Batch
 - validating : the input file is being validated before the batch can begin
 - failed : the input file has failed the validation process
 - in_progress : the input file was successfully validated and the batch is currently being run
 - finalizing : the batch has completed and the results are being prepared
 - completed : the batch has been completed and the results are ready
 - expired : the batch was not able to be completed within the 24-hour time window
 - cancelling : the batch is being cancelled (may take up to 10 minutes)
 - cancelled : the batch was cancelled

- batch_id = 'batch_67ae019def808190b934416acc8e5567' : translate_elem3
- batch_id = 'batch_67ae01d1e4548190a12e58a11dda92dd' : translate_elem4
- batch_id = 'batch_67ae0204602881908af2968acf76607d' : translate_elem5
- batch_id = 'batch_67ae021e21bc819095b433a57d1d733b' : translate_elem6

In [57]:
# Fetch the current state of one batch by ID and show it as a plain dict.
batch_id = "batch_67ae021e21bc819095b433a57d1d733b"
batch = client.batches.retrieve(batch_id)
batch_json = json.loads(batch.json())
# Bare expression: the notebook displays the dict as the cell result.
batch_json

{'id': 'batch_67ae021e21bc819095b433a57d1d733b',
 'completion_window': '24h',
 'created_at': 1739457054,
 'endpoint': '/v1/chat/completions',
 'input_file_id': 'file-WuXy3H1qYMRm34Ccu3TX5y',
 'object': 'batch',
 'status': 'completed',
 'cancelled_at': None,
 'cancelling_at': None,
 'completed_at': 1739458603,
 'error_file_id': None,
 'errors': None,
 'expired_at': None,
 'expires_at': 1739543454,
 'failed_at': None,
 'finalizing_at': 1739458595,
 'in_progress_at': 1739457054,
 'metadata': {'description': 'translate'},
 'output_file_id': 'file-StaTJLssQVFmt5n2QpmBGY',
 'request_counts': {'completed': 100, 'failed': 0, 'total': 100}}

## 시간 확인

In [44]:
from datetime import datetime
import pytz

# Target timezone used when displaying batch timestamps.
kst = pytz.timezone('Asia/Seoul')

# Conversion helper
def convert_timestamp(starttime, endtime):
    """Format two Unix timestamps as datetimes in the ``kst`` timezone.

    Args:
        starttime: Start time, in seconds since the Unix epoch.
        endtime: End time, in seconds since the Unix epoch.

    Returns:
        A string of the form ``"시작 시간 : <start>, 끝나는 시간 : <end>"`` where
        both values are timezone-aware datetimes in the module-level ``kst``
        timezone.
    """
    # Build timezone-aware datetimes in one step. datetime.utcfromtimestamp()
    # returns naive values and is deprecated since Python 3.12.
    dt_kst_st = datetime.fromtimestamp(starttime, tz=kst)
    dt_kst_end = datetime.fromtimestamp(endtime, tz=kst)

    return f"시작 시간 : {dt_kst_st}, 끝나는 시간 : {dt_kst_end}"

# Show when the retrieved batch was created and when it completed.
convert_timestamp(starttime = batch_json['created_at'], endtime = batch_json['completed_at'])


  st = datetime.utcfromtimestamp(starttime)
  end = datetime.utcfromtimestamp(endtime)


'시작 시간 : 2025-02-11 18:26:02+09:00, 끝나는 시간 : 2025-02-11 18:58:31+09:00'

# Retrieving the Results
 - file-EVCbFfNykgV9HphK2h9wsk

In [69]:
# Download the batch results; the ID is the batch's output_file_id.
output_file_id = 'file-StaTJLssQVFmt5n2QpmBGY'
file_response = client.files.content(output_file_id)
print(file_response.text)

{"id": "batch_req_67ae08238ac88190bc43b84a272b0c10", "custom_id": "request-1", "response": {"status_code": 200, "request_id": "6c42c3b52db00df99a7703b6d56ac887", "body": {"id": "chatcmpl-B0UX5nHfPuRf6DbYh8M5KZv0nkVAh", "object": "chat.completion", "created": 1739457067, "model": "o3-mini-2025-01-31", "choices": [{"index": 0, "message": {"role": "assistant", "content": "{\n  \"key_expression_kor\": \"A: \uba87 \ud559\ub144\uc774\ub2c8? B: \ub098\ub294 3\ud559\ub144\uc774\uc57c.\",\n  \"sub_expression_kor\": \"A: \uba87 \ud559\ub144\uc774\ub2c8? B: \ub098\ub294 ()\ud559\ub144\uc774\uc57c. A: \uba87 \ud559\ub144\uc774\ub2c8? B: \ub098\ub294 ()\ud559\ub144\uc774\uc57c.\"\n}", "refusal": null}, "finish_reason": "stop"}], "usage": {"prompt_tokens": 564, "completion_tokens": 1104, "total_tokens": 1668, "prompt_tokens_details": {"cached_tokens": 0, "audio_tokens": 0}, "completion_tokens_details": {"reasoning_tokens": 1024, "audio_tokens": 0, "accepted_prediction_tokens": 0, "rejected_predictio

In [59]:
# Split the downloaded text into lines, dropping any that are blank or
# whitespace-only.
sample_list = [line for line in file_response.text.split("\n") if line.strip()]
sample_list

['{"id": "batch_req_67ae08238ac88190bc43b84a272b0c10", "custom_id": "request-1", "response": {"status_code": 200, "request_id": "6c42c3b52db00df99a7703b6d56ac887", "body": {"id": "chatcmpl-B0UX5nHfPuRf6DbYh8M5KZv0nkVAh", "object": "chat.completion", "created": 1739457067, "model": "o3-mini-2025-01-31", "choices": [{"index": 0, "message": {"role": "assistant", "content": "{\\n  \\"key_expression_kor\\": \\"A: \\uba87 \\ud559\\ub144\\uc774\\ub2c8? B: \\ub098\\ub294 3\\ud559\\ub144\\uc774\\uc57c.\\",\\n  \\"sub_expression_kor\\": \\"A: \\uba87 \\ud559\\ub144\\uc774\\ub2c8? B: \\ub098\\ub294 ()\\ud559\\ub144\\uc774\\uc57c. A: \\uba87 \\ud559\\ub144\\uc774\\ub2c8? B: \\ub098\\ub294 ()\\ud559\\ub144\\uc774\\uc57c.\\"\\n}", "refusal": null}, "finish_reason": "stop"}], "usage": {"prompt_tokens": 564, "completion_tokens": 1104, "total_tokens": 1668, "prompt_tokens_details": {"cached_tokens": 0, "audio_tokens": 0}, "completion_tokens_details": {"reasoning_tokens": 1024, "audio_tokens": 0, "accep

In [60]:
# Decode the model's JSON answer from every result line and label each one
# with its position in the output file.
sample_output = []

# enumerate() visits each line exactly once. The previous nested
# range(len(...)) loop appended every sample len(sample_list) times and
# paired it with unrelated index labels.
for i, sample in enumerate(sample_list):
    sample_json = json.loads(sample)
    content = sample_json["response"]["body"]["choices"][0]["message"]["content"]
    # The message content is itself a JSON document, so decode it again.
    content_json = json.loads(content)
    sample_output.append(f"{i} 번째")
    sample_output.append(content_json)
    # print(json.dumps(content_json, indent=2, ensure_ascii=False))

# print(json.dumps(sample_output))


# batch 결과

In [61]:
# Pretty-print the decoded answer of the first result line only.
for sample in sample_list[:1]:
    sample_json = json.loads(sample)
    first_choice = sample_json["response"]["body"]["choices"][0]
    content = first_choice["message"]["content"]
    content_json = json.loads(content)
    print(json.dumps(content_json, ensure_ascii=False, indent=2))

{
  "key_expression_kor": "A: 몇 학년이니? B: 나는 3학년이야.",
  "sub_expression_kor": "A: 몇 학년이니? B: 나는 ()학년이야. A: 몇 학년이니? B: 나는 ()학년이야."
}


# CSV 파일로 변환

In [62]:
# Decode every result line; once the loop ends, the variables hold the values
# of the last line.
for sample in sample_list:
    sample_json = json.loads(sample)
    message = sample_json["response"]["body"]["choices"][0]["message"]
    content = message["content"]
    content_json = json.loads(content)
    # print(json.dumps(content_json, indent=2, ensure_ascii=False))
    # break

In [63]:
# Decode the message content of whichever sample_json was parsed most recently.
content = sample_json["response"]["body"]["choices"][0]["message"]["content"]
content_json = json.loads(content)

In [64]:
def JsonToCSV(data: list[str], file_name: str) -> None:
    """Write the translated expressions from batch output lines to a CSV file.

    Args:
        data: Batch output lines. Each is a JSON string whose
            ``response.body.choices[0].message.content`` is itself a JSON
            string holding ``key_expression_kor`` and ``sub_expression_kor``.
        file_name: Output path without the ``.csv`` extension.

    Raises:
        json.JSONDecodeError: If a line or its message content is not valid
            JSON.
        KeyError: If an expected key is missing from a line.
    """
    columns = ["Key Expression (한국어)", "Sub Expression (한국어)"]
    data_list = []

    for sample in data:
        # Decode the outer result line, then the JSON document in the message.
        sample_json = json.loads(sample)
        content = sample_json["response"]["body"]["choices"][0]["message"]["content"]
        content_json = json.loads(content)

        # One CSV row per result line.
        data_list.append({
            columns[0]: content_json["key_expression_kor"],
            columns[1]: content_json["sub_expression_kor"],
        })

    # Naming the columns explicitly keeps the header row even when `data` is
    # empty.
    df = pd.DataFrame(data_list, columns=columns)

    # Save as CSV. utf-8-sig prepends a BOM (presumably so spreadsheet tools
    # detect the Korean text as UTF-8; confirm).
    df.to_csv(f"{file_name}.csv", index=False, encoding='utf-8-sig')

    print(f"CSV 변환 완료 파일명: {file_name}.csv")


In [65]:
# Base name (without ".csv") of the exported file.
# NOTE(review): the uploaded input file was "Translate_Elem6_batch.jsonl" but
# this output is named "Mid6"; confirm the intended name.
name = 'Translate_Mid6'
JsonToCSV(sample_list, name)

CSV 변환 완료 파일명: Translate_Mid6.csv


In [66]:
# Read the exported CSV back and preview its first three rows.
df = pd.read_csv(f'{name}.csv')
df.head(3)

Unnamed: 0,Key Expression (한국어),Sub Expression (한국어)
0,A: 몇 학년이니? B: 나는 3학년이야.,A: 몇 학년이니? B: 나는 ()학년이야. A: 몇 학년이니? B: 나는 ()학년이야.
1,A: 무엇을 원하시나요? B: 딸기 아이스크림을 주세요.,A: 무엇을 먹고 싶어? B: 나는 ()를 먹고 싶어. A: 무엇이 먹고 싶어? B...
2,A: 네가 가장 좋아하는 학교 과목은 뭐야? B: 내가 가장 좋아하는 과목은 미술이야.,A: 가장 즐기는 수업은 뭐야? B: 내가 가장 좋아하는 과목은 (). A: 제일 ...


# Cancelling a Batch

In [29]:
# cancel_batch = client.batches.cancel("")
# batch_json = json.loads(cancel_batch.json())
# batch_ouput = json.dumps(batch_json, ensure_ascii=False, indent=2)
# print(batch_ouput)