# Voca 초딩 예문 양산

In [1]:
from pydantic import BaseModel
from openai import OpenAI
from openai.lib._parsing import type_to_response_format_param
import json
from getpass import getpass

openai_api_key = getpass("OPENAI_API_KEY")
client = OpenAI(api_key=openai_api_key)

In [2]:
from jinja2 import Template

prompt_template = Template("""
[PERSONA]
You are going to be given a set of English words and some of their Korean meaning. As an elementary school English teacher, generate English sentences using each of those words, and provide Korean translation, too. You must refer to the given Korean meaning to generate the examples. Overall sentences must be very easy and kids-friendly. You must refer to the given Korean meaning to generate the examples.


[POLICY]
1. A sentence must not be over 6 words.
2. Separate the sentence with a tab in between.
3. For Korean translation, use '~요' at the end of each translated sentence to soften the tone. Make it natural.
4. The whole data must strictly follow CEFR A1 level.

[EXAMPLE]
```
Input:
    {
    "voca_NO" : 123
    "WordText" : "cat",
    "WordMeaning" : "고양이",
    "PoSText" : "명사"
    }

Output:
    {
     "example_sentence" : "The cat sleeps on the bed.",
     "example_sentence_kor" : "고양이가 침대에서 자요."
    }
```                           
                           
[Input]
```
voca_NO : {{voca_NO}}
WordText : {{WordText}}
WordMeaning : {{WordMeaning}}
PoSText : {{PoSText}}
```                   
"""                                    

)

In [3]:
prompt = prompt_template.render(
    voca_NO = "123",
    WordText = "alarm",
    WordMeaning = "놀람,경보",
    PoSText = "명사",
)

In [4]:
print(prompt)


[PERSONA]
You are going to be given a set of English words and some of their Korean meaning. As an elementary school English teacher, generate English sentences using each of those words, and provide Korean translation, too. You must refer to the given Korean meaning to generate the examples. Overall sentences must be very easy and kids-friendly. You must refer to the given Korean meaning to generate the examples.


[POLICY]
1. A sentence must not be over 6 words.
2. Separate the sentence with a tab in between.
3. For Korean translation, use '~요' at the end of each translated sentence to soften the tone. Make it natural.
4. The whole data must strictly follow CEFR A1 level.

[EXAMPLE]
```
Input:
    {
    "voca_NO" : 123
    "WordText" : "cat",
    "WordMeaning" : "고양이",
    "PoSText" : "명사"
    }

Output:
    {
     "example_sentence" : "The cat sleeps on the bed.",
     "example_sentence_kor" : "고양이가 침대에서 자요."
    }
```                           
                           
[Input]


In [5]:
class Example(BaseModel):
    example_sentence : str
    example_sentence_kor : str

In [6]:
response_format = type_to_response_format_param(Example)

In [7]:
response_format

{'type': 'json_schema',
 'json_schema': {'schema': {'properties': {'example_sentence': {'title': 'Example Sentence',
     'type': 'string'},
    'example_sentence_kor': {'title': 'Example Sentence Kor',
     'type': 'string'}},
   'required': ['example_sentence', 'example_sentence_kor'],
   'title': 'Example',
   'type': 'object',
   'additionalProperties': False},
  'name': 'Example',
  'strict': True}}

In [30]:
def completion(prompt : str) -> str:
    response = client.beta.chat.completions.parse(
        model = 'o3-mini',
        reasoning_effort='low',
        messages = [
            {"role" : "system", "content" : "Generate an example sentence for each word."},
            {"role" : "user", "content" : prompt}
        ],
        response_format = Example,
    )
    return response.choices[0].message.parsed

In [13]:
response = completion(prompt)

In [31]:
print(response)

example_sentence='The alarm rings at seven in the morning.' example_sentence_kor='아침마다 7시에 경보가 울려요.'


In [32]:
response_output = json.dumps(response.dict(), ensure_ascii=False, indent = 4)
print(response_output)

{
    "example_sentence": "The alarm rings at seven in the morning.",
    "example_sentence_kor": "아침마다 7시에 경보가 울려요."
}


# for문 돌려서 Batch API jsonl 파일 형식 만들기

In [48]:
def SentenceMake(data, output_filename):
    jsonl_data = []

    for i in range(len(data)):
        prompt = prompt_template.render(
            id = data.loc[i,'voca NO'],
            Wordtext = data.loc[i,"WordText"],
            WordMeaning = data.loc[i,"WordMeaning"],
            PoSText = data.loc[i,"품사"],
        )

        quiz_request = {
            "custom_id" : f"request-{i+1}",
            "method" : "POST",
            "url" : "/v1/chat/completions",
            "body" : {
                "model" : "o3-mini-2025-01-31",
                "messages" : [
                    {"role": "system", "content": "Generate easy example sentences."},
                    {"role": "user", "content": prompt}
                ],
                "response_format" : response_format
            }
        }

        jsonl_data.append(quiz_request)

        with open(output_filename, 'w', encoding='utf-8') as jsonl_file:
            for item in jsonl_data:
                jsonl_file.write(json.dumps(item, ensure_ascii=False) + '\n')

    print(f'JSONL 파일 생성 완료 : {output_filename}-{i+1}')

In [49]:
import pandas as pd

df = pd.read_csv('Final_Voca_MidHigh.csv', skiprows=2)
print(len(df))
df1 = df[:1000]
print(len(df1))
df2 = df[1000:2000]
print(len(df2))
df3 = df[2000:3000]
print(len(df3))
df4 = df[3000:4000]
print(len(df4))
df5 = df[4000:5000]
print(len(df5))
df6 = df[5000:6000]
print(len(df6))
df7 = df[6000:]
print(len(df7))

print(f'total : {len(df1) + len(df2) + len(df3) + len(df4) + len(df5) + len(df6) + len(df7)}')

#인덱스 리셋
df2 = df2.reset_index()
df3 = df3.reset_index()
df4 = df4.reset_index()
df5 = df5.reset_index()
df6 = df6.reset_index()
df7 = df7.reset_index()

7251
1000
1000
1000
1000
1000
1000
1251
total : 7251


In [51]:
SentenceMake(df1,"VocaList_batch_1.jsonl")
SentenceMake(df2,"VocaList_batch_2.jsonl")
SentenceMake(df3,"VocaList_batch_3.jsonl")
SentenceMake(df4,"VocaList_batch_4.jsonl")
SentenceMake(df4,"VocaList_batch_5.jsonl")
SentenceMake(df4,"VocaList_batch_6.jsonl")
SentenceMake(df4,"VocaList_batch_7.jsonl")

JSONL 파일 생성 완료 : VocaList_batch_1.jsonl-1000
JSONL 파일 생성 완료 : VocaList_batch_2.jsonl-1000
JSONL 파일 생성 완료 : VocaList_batch_3.jsonl-1000
JSONL 파일 생성 완료 : VocaList_batch_4.jsonl-1000
JSONL 파일 생성 완료 : VocaList_batch_5.jsonl-1000
JSONL 파일 생성 완료 : VocaList_batch_6.jsonl-1000
JSONL 파일 생성 완료 : VocaList_batch_7.jsonl-1000
