# Uploading Batch Input File
 - batch file 보낼 때 너무 많이 보내지 말고 구분해서 보내기

In [2]:
from getpass import getpass

openai_api_key = getpass("OPENAI_API_KEY")

In [3]:
from openai import OpenAI
import json
import pandas as pd

client = OpenAI(api_key=openai_api_key)

In [64]:
batch_input_file = client.files.create(
    file=open("VocaQuiz_MidHigh_batch_7.jsonl","rb"),
    purpose="batch"
)

print(batch_input_file)

FileObject(id='file-PAqfvpFeYJxUgDdgVYwUkC', bytes=6354030, created_at=1740034767, filename='VocaQuiz_MidHigh_batch_7.jsonl', object='file', purpose='batch', status='processed', status_details=None, expires_at=None)


# Create the Batch

In [65]:
# 업로드한 배치 입력 파일 ID 저장
batch_input_file_id = batch_input_file.id
print(batch_input_file_id)

file-PAqfvpFeYJxUgDdgVYwUkC


In [66]:
batch_info = client.batches.create(
    input_file_id = batch_input_file_id,
    endpoint = "/v1/chat/completions", # 챗봇 api
    completion_window = "24h", # 현재는 이 기능만 가능
    metadata = {
        "description": "voca quiz mid high 7"
    }
)

In [67]:
response_json = json.loads(batch_info.json())
response_output = json.dumps(response_json, ensure_ascii=False, indent = 2)
print(response_output)

{
  "id": "batch_67b6d2d5a6b881908a98870531ae6ed4",
  "completion_window": "24h",
  "created_at": 1740034773,
  "endpoint": "/v1/chat/completions",
  "input_file_id": "file-PAqfvpFeYJxUgDdgVYwUkC",
  "object": "batch",
  "status": "validating",
  "cancelled_at": null,
  "cancelling_at": null,
  "completed_at": null,
  "error_file_id": null,
  "errors": null,
  "expired_at": null,
  "expires_at": 1740121173,
  "failed_at": null,
  "finalizing_at": null,
  "in_progress_at": null,
  "metadata": {
    "description": "voca quiz mid high 7"
  },
  "output_file_id": null,
  "request_counts": {
    "completed": 0,
    "failed": 0,
    "total": 0
  }
}


# Checking the Status of Batch
 - validating : the input file is being validated before the batch can begin
 - failed : the input file has failed the validation process
 - in_progress : the input file was successfully validated and the batch is currently being run
 - finalizing : the batch has completed and the results are being prepared
 - completed : the batch has been completed and the results are ready
 - expired : the batch was not able to be completed within the 24-hour time window
 - cancelling : the batch is being cancelled (may take up to 10 minutes)
 - cancelled : the batch was cancelled

- batch_id = 'batch_67b687078a3c8190acc1ea2a6c52ef93' : voca quiz 1 : 
- batch_id = 'batch_67b69ba135008190a0935fcd239d129d' : voca quiz 2 : 
- batch_id = 'batch_67b6d26d04148190a91798d78bb25cd9' : voca quiz 3 : 
- batch_id = 'batch_67b6d2873d98819090d5caf51c21afd6' : voca quiz 4 : 
- batch_id = 'batch_67b6d2a1f024819087b4459427257a92' : voca quiz 5 : 
- batch_id = 'batch_67b6d2bc2f9c819091be3003f839cd44' : voca quiz 6 : 
- batch_id = 'batch_67b6d2d5a6b881908a98870531ae6ed4' : voca quiz 7 : 


In [98]:
batch_id = "batch_67b6d2a1f024819087b4459427257a92"
batch = client.batches.retrieve(batch_id)
batch_json = json.loads(batch.json())
batch_json

{'id': 'batch_67b6d2a1f024819087b4459427257a92',
 'completion_window': '24h',
 'created_at': 1740034722,
 'endpoint': '/v1/chat/completions',
 'input_file_id': 'file-MyxoTiD2PJTra49yAoJdfJ',
 'object': 'batch',
 'status': 'in_progress',
 'cancelled_at': None,
 'cancelling_at': None,
 'completed_at': None,
 'error_file_id': None,
 'errors': None,
 'expired_at': None,
 'expires_at': 1740121122,
 'failed_at': None,
 'finalizing_at': None,
 'in_progress_at': 1740034724,
 'metadata': {'description': 'voca quiz mid high 5'},
 'output_file_id': None,
 'request_counts': {'completed': 0, 'failed': 0, 'total': 1000}}

## 시간 확인

In [81]:
from datetime import datetime
import pytz

kst = pytz.timezone('Asia/Seoul')
st = datetime.utcfromtimestamp(1740015367)
dt_kst_st = st.replace(tzinfo=pytz.utc).astimezone(kst)
print(dt_kst_st)


2025-02-20 10:36:07+09:00


  st = datetime.utcfromtimestamp(1740015367)


In [None]:
from datetime import datetime
import pytz

kst = pytz.timezone('Asia/Seoul')

# 변환 함수
def convert_timestamp(starttime, endtime):
    st = datetime.utcfromtimestamp(starttime)
    end = datetime.utcfromtimestamp(endtime)
    dt_kst_st = st.replace(tzinfo=pytz.utc).astimezone(kst)
    dt_kst_end = end.replace(tzinfo=pytz.utc).astimezone(kst)

    return f"시작 시간 : {dt_kst_st}, 끝나는 시간 : {dt_kst_end}"

convert_timestamp(starttime = batch_json['created_at'], endtime = batch_json['completed_at'])


  st = datetime.utcfromtimestamp(starttime)
  end = datetime.utcfromtimestamp(endtime)


'시작 시간 : 2025-02-19 14:19:50+09:00, 끝나는 시간 : 2025-02-19 14:35:46+09:00'

# Retrieving the Results
 - file-EVCbFfNykgV9HphK2h9wsk

In [91]:
file_response = client.files.content('file-3MXFaEzzpTTskT9zAg4CjC') # output_file_id 이용
print(file_response.text)

{"id": "batch_req_67b576c07d508190b4e2af5aa74c36f5", "custom_id": "request-1", "response": {"status_code": 200, "request_id": "5334ac4fe19cf0f9e53c42a6b0ccc2e7", "body": {"id": "chatcmpl-B2WmcBOGFkSDhe9vejmrd23uobKY8", "object": "chat.completion", "created": 1739942374, "model": "o3-mini-2025-01-31", "choices": [{"index": 0, "message": {"role": "assistant", "content": "{\n  \"voca_quizzes\": [\n    {\n      \"fill_in_blank_quiz\": \"She speaks with a unique ___.\",\n      \"korean_translation\": \"\uadf8\ub140\ub294 \ub3c5\ud2b9\ud55c [[\uac15\uc138]]\ub85c \ub9d0\ud574\uc694.\",\n      \"answer\": \"accent\"\n    }\n  ],\n  \"hint\": [\n    {\n      \"hint_eng\": \"It is how someone pronounces words differently.\",\n      \"hint_kor\": \"\ub2e4\ub978 \uc9c0\uc5ed \uc0ac\ub78c\ub4e4\uc758 \ub9d0\ud558\ub294 \ubc29\uc2dd\uc774\uc5d0\uc694.\"\n    }\n  ]\n}", "refusal": null}, "finish_reason": "stop"}], "usage": {"prompt_tokens": 1029, "completion_tokens": 1780, "total_tokens": 2809, "pr

In [92]:
list = file_response.text.split("\n")
list = [item for item in list if item.strip()]
len(list)

1000

# batch 결과

In [93]:
for sample in list:
    sample_json = json.loads(sample)
    content = sample_json["response"]["body"]["choices"][0]["message"]["content"]
    content_json = json.loads(content)
    print(json.dumps(content_json, indent=2, ensure_ascii=False))
    break

{
  "voca_quizzes": [
    {
      "fill_in_blank_quiz": "She speaks with a unique ___.",
      "korean_translation": "그녀는 독특한 [[강세]]로 말해요.",
      "answer": "accent"
    }
  ],
  "hint": [
    {
      "hint_eng": "It is how someone pronounces words differently.",
      "hint_kor": "다른 지역 사람들의 말하는 방식이에요."
    }
  ]
}


# CSV 파일로 변환

In [94]:
for sample in list:
    sample_json = json.loads(sample)
    content = sample_json["response"]["body"]["choices"][0]["message"]["content"]
    content_json = json.loads(content)
    print(json.dumps(content_json, indent=2, ensure_ascii=False))
    break

{
  "voca_quizzes": [
    {
      "fill_in_blank_quiz": "She speaks with a unique ___.",
      "korean_translation": "그녀는 독특한 [[강세]]로 말해요.",
      "answer": "accent"
    }
  ],
  "hint": [
    {
      "hint_eng": "It is how someone pronounces words differently.",
      "hint_kor": "다른 지역 사람들의 말하는 방식이에요."
    }
  ]
}


In [95]:
def JsonToCSV(data:json, file_name:str) -> json:
    data_list = []

    for sample in data:
        # json 변환환
        sample_json = json.loads(sample)
        content = sample_json["response"]["body"]["choices"][0]["message"]["content"]
        content_json = json.loads(content)

        # voca_quizzes
        fill_in_blank_quiz = content_json['voca_quizzes'][0]['fill_in_blank_quiz']
        korean_translation = content_json['voca_quizzes'][0]['korean_translation']
        answer = content_json['voca_quizzes'][0]['answer']

        # hint
        hint_eng = content_json['hint'][0]['hint_eng']
        hint_kor = content_json['hint'][0]['hint_kor']

        # 데이터 만들기
        csv_data = {
            "fill_in_blank_quiz" : fill_in_blank_quiz,
            "korean_translation" : korean_translation,
            "answer" : answer,
            "hint_eng" : hint_eng,
            "hint_kor" : hint_kor
        }

        data_list.append(csv_data)

    df = pd.DataFrame(data_list)

    # CSV 저장
    df.to_csv(f"{file_name}.csv", index=False, encoding='utf-8-sig')

    print(f"CSV 변환 완료 파일명: {file_name}.csv")


In [None]:
name = "Voca_QUIZ_1.csv"
JsonToCSV(list, name)

CSV 변환 완료 파일명: Voca_QUIZ_1.csv.csv


In [102]:
df1 = pd.read_csv('Voca_QUIZ_1.csv.csv')
df2 = pd.read_csv('Voca_QUIZ_2.csv.csv')
df3 = pd.read_csv('Voca_QUIZ_3.csv.csv')
df4 = pd.read_csv('Voca_QUIZ_4.csv.csv')
df5 = pd.read_csv('Voca_QUIZ_5.csv.csv')
df6 = pd.read_csv('Voca_QUIZ_6.csv.csv')
df7 = pd.read_csv('Voca_QUIZ_7.csv.csv')

print(len(df1))
print(len(df2))
print(len(df3))
print(len(df4))
print(len(df5))
print(len(df6))
print(len(df7))

1000
1000
1000
1000
1000
1000
1251


# 데이터 합치기

In [103]:
df1 = pd.read_csv('Voca_QUIZ_1.csv.csv')
df2 = pd.read_csv('Voca_QUIZ_2.csv.csv')
df3 = pd.read_csv('Voca_QUIZ_3.csv.csv')
df4 = pd.read_csv('Voca_QUIZ_4.csv.csv')
df5 = pd.read_csv('Voca_QUIZ_5.csv.csv')
df6 = pd.read_csv('Voca_QUIZ_6.csv.csv')
df7 = pd.read_csv('Voca_QUIZ_7.csv.csv')

merged_data = pd.concat([df1, df2, df3, df4, df5, df6, df7], axis=0, ignore_index=True)
merged_data.shape
merged_data.to_csv("Quiz_Voca_Mid(total).csv", encoding="utf-8-sig")

# Cancelling a Batch

In [113]:
cancel_batch = client.batches.cancel("batch_67b5a5895580819098fff96c0e7e4eb9")
batch_json = json.loads(cancel_batch.json())
batch_ouput = json.dumps(batch_json, ensure_ascii=False, indent=2)
print(batch_ouput)

{
  "id": "batch_67b5a5895580819098fff96c0e7e4eb9",
  "completion_window": "24h",
  "created_at": 1739957641,
  "endpoint": "/v1/chat/completions",
  "input_file_id": "file-QgZjsfYUcxubyFYbhbjxzx",
  "object": "batch",
  "status": "cancelling",
  "cancelled_at": null,
  "cancelling_at": 1739958196,
  "completed_at": null,
  "error_file_id": null,
  "errors": null,
  "expired_at": null,
  "expires_at": 1740044041,
  "failed_at": null,
  "finalizing_at": null,
  "in_progress_at": 1739957643,
  "metadata": {
    "description": "voca quiz mid high 5"
  },
  "output_file_id": null,
  "request_counts": {
    "completed": 643,
    "failed": 0,
    "total": 1000
  }
}
