# Uploading Batch Input File
 - batch file 보낼 때 너무 많이 보내지 말고 구분해서 보내기

In [1]:
from openai.lib._parsing import type_to_response_format_param
from pydantic import BaseModel
from getpass import getpass
from openai import OpenAI
import json
import pandas as pd

# Ask for the API key at run time (input is not echoed) instead of hardcoding it in the notebook.
openai_api_key = getpass("OPENAI_API_KEY")

In [2]:
# OpenAI client built from the key entered above; the following cells use `client` for every API call.
client = OpenAI(api_key=openai_api_key)

In [3]:
# Upload the JSONL request file with purpose="batch" so it can serve as batch input.
# The context manager closes the local file handle once the upload call returns
# (the bare open() used before left the handle open for the life of the kernel).
with open("Structure_High_batch.jsonl", "rb") as batch_file:
    batch_input_file = client.files.create(
        file=batch_file,
        purpose="batch"
    )

print(batch_input_file)

FileObject(id='file-Li8BP1m9ejFCTtZM23dDv4', bytes=1581430, created_at=1739773279, filename='Structure_High_batch.jsonl', object='file', purpose='batch', status='processed', status_details=None)


# Create the Batch

In [4]:
# Keep the ID of the uploaded batch input file; it is passed to the batch creation call below.
batch_input_file_id = batch_input_file.id
print(batch_input_file_id)

file-Li8BP1m9ejFCTtZM23dDv4


In [5]:
# Free-form metadata attached to the batch job.
batch_metadata = {"description": "structure curriculum"}

# Submit the batch job for the uploaded input file.
batch_info = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",  # chat completions API
    completion_window="24h",  # original note: currently the only option available
    metadata=batch_metadata,
)

In [6]:
# Pretty-print the batch object returned by the creation call.
# model_dump_json() replaces the pydantic-v1-style .json(), which is deprecated
# (and emits a deprecation warning) when the SDK runs on pydantic v2.
response_json = json.loads(batch_info.model_dump_json())
response_output = json.dumps(response_json, ensure_ascii=False, indent=2)
print(response_output)

{
  "id": "batch_67b2d5719fc881909780326af63a24ae",
  "completion_window": "24h",
  "created_at": 1739773297,
  "endpoint": "/v1/chat/completions",
  "input_file_id": "file-Li8BP1m9ejFCTtZM23dDv4",
  "object": "batch",
  "status": "validating",
  "cancelled_at": null,
  "cancelling_at": null,
  "completed_at": null,
  "error_file_id": null,
  "errors": null,
  "expired_at": null,
  "expires_at": 1739859697,
  "failed_at": null,
  "finalizing_at": null,
  "in_progress_at": null,
  "metadata": {
    "description": "structure curriculum"
  },
  "output_file_id": null,
  "request_counts": {
    "completed": 0,
    "failed": 0,
    "total": 0
  }
}


# Checking the Status of Batch
 - validating : the input file is being validated before the batch can begin
 - failed : the input file has failed the validation process
 - in_progress : the input file was successfully validated and the batch is currently being run
 - finalizing : the batch has completed and the results are being prepared
 - completed : the batch has been completed and the results are ready
 - expired : the batch was not able to be completed within the 24-hour time window
 - cancelling : the batch is being cancelled (may take up to 10 minutes)
 - cancelled : the batch was cancelled

In [10]:
# Batch to inspect; this is the id printed when the batch was created.
batch_id = "batch_67b2d5719fc881909780326af63a24ae"
# track_batch_progress(batch_id)
batch = client.batches.retrieve(batch_id)
# model_dump_json() replaces the pydantic-v1-style .json(), which is deprecated
# (and emits a deprecation warning) when the SDK runs on pydantic v2.
batch_json = json.loads(batch.model_dump_json())
# Bare expression so the notebook displays the status dict.
batch_json

{'id': 'batch_67b2d5719fc881909780326af63a24ae',
 'completion_window': '24h',
 'created_at': 1739773297,
 'endpoint': '/v1/chat/completions',
 'input_file_id': 'file-Li8BP1m9ejFCTtZM23dDv4',
 'object': 'batch',
 'status': 'completed',
 'cancelled_at': None,
 'cancelling_at': None,
 'completed_at': 1739773723,
 'error_file_id': None,
 'errors': None,
 'expired_at': None,
 'expires_at': 1739859697,
 'failed_at': None,
 'finalizing_at': 1739773678,
 'in_progress_at': 1739773299,
 'metadata': {'description': 'structure curriculum'},
 'output_file_id': 'file-EqSavjen1mfXPAR3DYyD8R',
 'request_counts': {'completed': 329, 'failed': 0, 'total': 329}}

# Retrieving the Results

*작업*
- output_file_id = 'file-EqSavjen1mfXPAR3DYyD8R' : structure_high

In [11]:
# Download the batch result file and print its raw text (one JSON document per line).
file_response = client.files.content('file-EqSavjen1mfXPAR3DYyD8R') # uses the batch's output_file_id
print(file_response.text)

{"id": "batch_req_67b2d6ef32748190b9382858413a9f66", "custom_id": "request-1", "response": {"status_code": 200, "request_id": "1dad14317ad88a0453e59be31aa6b6aa", "body": {"id": "chatcmpl-B1ooSy0iGi0P6hRMYAnglMXWtVxko", "object": "chat.completion", "created": 1739773352, "model": "o3-mini-2025-01-31", "choices": [{"index": 0, "message": {"role": "assistant", "content": "{\n  \"SUBJECT\": \"\uc601\uc5b4\",\n  \"PUBLISHER\": \"\ub2a5\ub960(\uae40)\",\n  \"EDUCATION\": 2015,\n  \"LESSON\": 1,\n  \"TITLE\": \"The Part You Play\",\n  \"DESCRIPTION\": \"\uc804\uce58\uc0ac\uc758 \ubaa9\uc801\uc5b4\ub85c\uc11c\uc758 \ub3d9\uba85\uc0ac\",\n  \"KEY_EXPRESSION\": \"I enjoy playing in team sports.\",\n  \"KEY_EXPRESSION_Kor\": \"\ub098\ub294 \ud300 \uc2a4\ud3ec\uce20\ub97c \uc990\uae30\ub294 \uac83\uc744 \uc88b\uc544\ud574\uc694.\",\n  \"SUB_EXPRESSION\": \"She is excited about scoring goals in matches. He is nervous about competing in the tournament.\",\n  \"SUB_EXPRESSION_Kor\": \"\uadf8\ub140\ub

In [12]:
# One batch result per line; blank or whitespace-only lines are dropped.
sample_list = [line for line in file_response.text.split("\n") if line.strip()]

In [13]:
# Collect the parsed model answer of every batch result, each preceded by a
# "<index> 번째" label.
# Each sample is processed exactly once with its own index; the earlier nested
# `for i in range(len(sample_list))` loop re-parsed and re-appended every
# sample len(sample_list) times.
sample_output = []

for i, sample in enumerate(sample_list):
    sample_json = json.loads(sample)
    # The assistant message content is itself a JSON document, so it is parsed again.
    content = sample_json["response"]["body"]["choices"][0]["message"]["content"]
    content_json = json.loads(content)
    sample_output.append(f"{i} 번째")
    sample_output.append(content_json)
    #print(json.dumps(content_json, indent=2, ensure_ascii=False))

# print(json.dumps(sample_output))


# batch 결과

In [14]:
# Show the full chat-completion body of the first result only.
for s in sample_list[:1]:
    s_json = json.loads(s)
    contents = s_json['response']['body']
    print(contents)

{'id': 'chatcmpl-B1ooSy0iGi0P6hRMYAnglMXWtVxko', 'object': 'chat.completion', 'created': 1739773352, 'model': 'o3-mini-2025-01-31', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{\n  "SUBJECT": "영어",\n  "PUBLISHER": "능률(김)",\n  "EDUCATION": 2015,\n  "LESSON": 1,\n  "TITLE": "The Part You Play",\n  "DESCRIPTION": "전치사의 목적어로서의 동명사",\n  "KEY_EXPRESSION": "I enjoy playing in team sports.",\n  "KEY_EXPRESSION_Kor": "나는 팀 스포츠를 즐기는 것을 좋아해요.",\n  "SUB_EXPRESSION": "She is excited about scoring goals in matches. He is nervous about competing in the tournament.",\n  "SUB_EXPRESSION_Kor": "그녀는 경기에서 골을 넣는 것이 신나요. 그는 토너먼트에 참가하는 것이 떨려요."\n}', 'refusal': None}, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 1016, 'completion_tokens': 493, 'total_tokens': 1509, 'prompt_tokens_details': {'cached_tokens': 0, 'audio_tokens': 0}, 'completion_tokens_details': {'reasoning_tokens': 320, 'audio_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}}, 'serv

In [15]:
# Pretty-print the parsed model answer of the first result only.
for sample in sample_list[:1]:
    sample_json = json.loads(sample)
    message = sample_json["response"]["body"]["choices"][0]["message"]
    content = message["content"]
    content_json = json.loads(content)
    print(json.dumps(content_json, indent=2, ensure_ascii=False))
    

{
  "SUBJECT": "영어",
  "PUBLISHER": "능률(김)",
  "EDUCATION": 2015,
  "LESSON": 1,
  "TITLE": "The Part You Play",
  "DESCRIPTION": "전치사의 목적어로서의 동명사",
  "KEY_EXPRESSION": "I enjoy playing in team sports.",
  "KEY_EXPRESSION_Kor": "나는 팀 스포츠를 즐기는 것을 좋아해요.",
  "SUB_EXPRESSION": "She is excited about scoring goals in matches. He is nervous about competing in the tournament.",
  "SUB_EXPRESSION_Kor": "그녀는 경기에서 골을 넣는 것이 신나요. 그는 토너먼트에 참가하는 것이 떨려요."
}


# CSV 파일로 변환

In [16]:
# Pretty-print the parsed answer of the first result only.
# `sample_json` stays defined afterwards and is read by the next cell.
for sample in sample_list[:1]:
    sample_json = json.loads(sample)
    first_choice = sample_json["response"]["body"]["choices"][0]
    content = first_choice["message"]["content"]
    content_json = json.loads(content)
    print(json.dumps(content_json, indent=2, ensure_ascii=False))

{
  "SUBJECT": "영어",
  "PUBLISHER": "능률(김)",
  "EDUCATION": 2015,
  "LESSON": 1,
  "TITLE": "The Part You Play",
  "DESCRIPTION": "전치사의 목적어로서의 동명사",
  "KEY_EXPRESSION": "I enjoy playing in team sports.",
  "KEY_EXPRESSION_Kor": "나는 팀 스포츠를 즐기는 것을 좋아해요.",
  "SUB_EXPRESSION": "She is excited about scoring goals in matches. He is nervous about competing in the tournament.",
  "SUB_EXPRESSION_Kor": "그녀는 경기에서 골을 넣는 것이 신나요. 그는 토너먼트에 참가하는 것이 떨려요."
}


In [17]:
# Re-parse the answer from `sample_json`, which is left over from the loop in the previous cell.
# NOTE(review): hidden state - on a fresh kernel this cell fails unless that loop has run first.
content = sample_json["response"]["body"]["choices"][0]["message"]["content"]
content_json = json.loads(content)

In [18]:
# Bare expression so the notebook displays the parsed answer as a Python dict.
content_json

{'SUBJECT': '영어',
 'PUBLISHER': '능률(김)',
 'EDUCATION': 2015,
 'LESSON': 1,
 'TITLE': 'The Part You Play',
 'DESCRIPTION': '전치사의 목적어로서의 동명사',
 'KEY_EXPRESSION': 'I enjoy playing in team sports.',
 'KEY_EXPRESSION_Kor': '나는 팀 스포츠를 즐기는 것을 좋아해요.',
 'SUB_EXPRESSION': 'She is excited about scoring goals in matches. He is nervous about competing in the tournament.',
 'SUB_EXPRESSION_Kor': '그녀는 경기에서 골을 넣는 것이 신나요. 그는 토너먼트에 참가하는 것이 떨려요.'}

In [21]:
def JsonToCSV(data: list, file_name: str) -> None:
    """Flatten batch output lines into a table and save it as ``<file_name>.csv``.

    Each element of ``data`` is one JSON line of a batch output file. The
    assistant message found at ``response.body.choices[0].message.content``
    is itself a JSON document; its fields are copied into one CSV row.

    Args:
        data: Iterable of JSON strings, one per batch request result.
            Blank or whitespace-only lines are skipped.
        file_name: Output path without the ``.csv`` extension.

    Returns:
        None. The CSV is written to disk (UTF-8 with BOM, no index column)
        and a confirmation message is printed.

    Raises:
        json.JSONDecodeError: If a line or its message content is not valid JSON.
        KeyError: If a line lacks the expected response structure, or the
            message content lacks one of the expected fields.
    """
    # CSV column header -> key in the model's JSON answer.
    # Insertion order defines the column order of the output file.
    column_to_key = {
        "제목": "SUBJECT",
        "출판사(저자)": "PUBLISHER",
        "교육과정": "EDUCATION",
        "Lesson": "LESSON",
        "Title": "TITLE",
        "DESCRIPTION": "DESCRIPTION",
        "KEY_EXPRESSION": "KEY_EXPRESSION",
        "KEY_EXPRESSION_Kor": "KEY_EXPRESSION_Kor",
        "SUB_EXPRESSION": "SUB_EXPRESSION",
        "SUB_EXPRESSION_Kor": "SUB_EXPRESSION_Kor",
    }

    data_list = []

    for sample in data:
        # Tolerate empty lines (e.g. the trailing newline of the result file).
        if not sample.strip():
            continue

        # The outer line is JSON, and the message content is JSON again.
        sample_json = json.loads(sample)
        content = sample_json["response"]["body"]["choices"][0]["message"]["content"]
        content_json = json.loads(content)

        data_list.append(
            {column: content_json[key] for column, key in column_to_key.items()}
        )

    # Passing the columns explicitly keeps the header row even when there are no rows.
    df = pd.DataFrame(data_list, columns=list(column_to_key))

    # utf-8-sig writes a BOM, matching the encoding the original export used.
    df.to_csv(f"{file_name}.csv", index=False, encoding='utf-8-sig')

    print(f"CSV 변환 완료 파일명: {file_name}.csv")


In [22]:
# Base name (without extension) of the CSV file to generate.
name = 'Structure_Curriculum_High'
JsonToCSV(data=sample_list, file_name=name)

CSV 변환 완료 파일명: Structure_Curriculum_High.csv


In [23]:
# Read the generated CSV back and preview its first three rows.
df = pd.read_csv(name + '.csv')
df.head(3)

Unnamed: 0,제목,출판사(저자),교육과정,Lesson,Title,DESCRIPTION,KEY_EXPRESSION,KEY_EXPRESSION_Kor,SUB_EXPRESSION,SUB_EXPRESSION_Kor
0,영어,능률(김),2015,1,The Part You Play,전치사의 목적어로서의 동명사,I enjoy playing in team sports.,나는 팀 스포츠를 즐기는 것을 좋아해요.,She is excited about scoring goals in matches....,그녀는 경기에서 골을 넣는 것이 신나요. 그는 토너먼트에 참가하는 것이 떨려요.
1,영어,능률(김),2015,2,The Power of Creativity,명사를 수식하는 과거분사(구),The old bike fixed by my father is working smo...,아버지께서 고쳐주신 오래된 자전거가 부드럽게 작동해요.,The delicious cake baked by my teacher won fir...,선생님께서 구워주신 맛있는 케이크가 1등을 했어요. 학생이 쓴 감동적인 이야기가 많...
2,영어,능률(김),2015,3,Sound Life,사역동사+목적어+동사원형,They let me choose a beautiful song.,그들은 내가 아름다운 노래를 선택하게 해줬어요.,My teacher made me listen to a cheerful melody...,선생님은 내가 기분 좋은 멜로디를 듣게 했어요. 내 친구는 내가 상쾌한 커피를 마시...
