# Voca 중고딩 예문 양산

In [30]:
from pydantic import BaseModel
from openai import OpenAI
from openai.lib._parsing import type_to_response_format_param
import json
from getpass import getpass

openai_api_key = getpass("OPENAI_API_KEY")
client = OpenAI(api_key=openai_api_key)

In [31]:
from jinja2 import Template

prompt_template = Template("""
[PERSONA]
You are going to be given a set of English words and some of their Korean meaning. As an middle and high school English teacher, generate English sentences using each of those words, and provide Korean translation, too. You must refer to the given Korean meaning to generate the examples. Overall sentences must be very easy and kids-friendly. You must refer to the given Korean meaning to generate the examples.

[POLICY]
1. A sentence must be about 8 words.
2. Separate the sentence with a tab in between.
3. For Korean translation, use '~요' at the end of each translated sentence to soften the tone. Make it natural.
4. The whole data must strictly follow CEFR A2 level.


[EXAMPLE]
```
Input:
    {
    "voca_NO" : 123
    "WordText" : "challenge",
    "WordMeaning" : "난제",
    "PoSText" : "명사"
    }

Output:
    {
     "example_sentence" : "Math problems can be a big challenge.",
     "example_sentence_kor" : "수학 문제는 큰 난제가 될 수 있어요."
    }
```                           
                           
[Input]
```
voca_NO : {{voca_NO}}
WordText : {{WordText}}
WordMeaning : {{WordMeaning}}
PoSText : {{PoSText}}
```                   
"""              

)

In [32]:
prompt = prompt_template.render(
    voca_NO = "123",
    WordText = "alarm",
    WordMeaning = "놀람,경보",
    PoSText = "명사",
)

In [33]:
print(prompt)


[PERSONA]
You are going to be given a set of English words and some of their Korean meaning. As an middle and high school English teacher, generate English sentences using each of those words, and provide Korean translation, too. You must refer to the given Korean meaning to generate the examples. Overall sentences must be very easy and kids-friendly. You must refer to the given Korean meaning to generate the examples.

[POLICY]
1. A sentence must be about 8 words.
2. Separate the sentence with a tab in between.
3. For Korean translation, use '~요' at the end of each translated sentence to soften the tone. Make it natural.
4. The whole data must strictly follow CEFR A2 level.


[EXAMPLE]
```
Input:
    {
    "voca_NO" : 123
    "WordText" : "challenge",
    "WordMeaning" : "난제",
    "PoSText" : "명사"
    }

Output:
    {
     "example_sentence" : "Math problems can be a big challenge.",
     "example_sentence_kor" : "수학 문제는 큰 난제가 될 수 있어요."
    }
```                           
          

In [34]:
class Example(BaseModel):
    example_sentence : str
    example_sentence_kor : str

In [35]:
response_format = type_to_response_format_param(Example)

In [36]:
response_format

{'type': 'json_schema',
 'json_schema': {'schema': {'properties': {'example_sentence': {'title': 'Example Sentence',
     'type': 'string'},
    'example_sentence_kor': {'title': 'Example Sentence Kor',
     'type': 'string'}},
   'required': ['example_sentence', 'example_sentence_kor'],
   'title': 'Example',
   'type': 'object',
   'additionalProperties': False},
  'name': 'Example',
  'strict': True}}

In [37]:
def completion(prompt : str) -> str:
    response = client.beta.chat.completions.parse(
        model = 'o3-mini',
        reasoning_effort='low',
        messages = [
            {"role" : "system", "content" : "Generate an example sentence for each word."},
            {"role" : "user", "content" : prompt}
        ],
        response_format = Example,
    )
    return response.choices[0].message.parsed

In [38]:
response = completion(prompt)

In [39]:
print(response)

example_sentence='The\talarm\tring\talways\tstartles\tmy\tsmall\tcat.' example_sentence_kor='경보음은\t언제나\t제\t작은\t고양이를\t놀라게\t해요.'


In [40]:
response_output = json.dumps(response.dict(), ensure_ascii=False, indent = 4)
print(response_output)

{
    "example_sentence": "The\talarm\tring\talways\tstartles\tmy\tsmall\tcat.",
    "example_sentence_kor": "경보음은\t언제나\t제\t작은\t고양이를\t놀라게\t해요."
}


# for문 돌려서 Batch API jsonl 파일 형식 만들기

In [41]:
def SentenceMake(data, output_filename):
    jsonl_data = []

    for i in range(len(data)):
        prompt = prompt_template.render(
            voca_NO = data.loc[i,'voca NO'],
            WordText = data.loc[i,"WordText"],
            WordMeaning = data.loc[i,"WordMeaning"],
            PoSText = data.loc[i,"품사"],
        )

        quiz_request = {
            "custom_id" : f"request-{i+1}",
            "method" : "POST",
            "url" : "/v1/chat/completions",
            "body" : {
                "model" : "o3-mini-2025-01-31",
                "messages" : [
                    {"role": "system", "content": "Generate easy example sentences."},
                    {"role": "user", "content": prompt}
                ],
                "response_format" : response_format
            }
        }

        jsonl_data.append(quiz_request)

        with open(output_filename, 'w', encoding='utf-8') as jsonl_file:
            for item in jsonl_data:
                jsonl_file.write(json.dumps(item, ensure_ascii=False) + '\n')

    print(f'JSONL 파일 생성 완료 : {output_filename}-{i+1}')

In [42]:
import pandas as pd

df = pd.read_csv('Final_Voca_MidHigh.csv', skiprows=2)
print(len(df))
df1 = df[:1000]
print(len(df1))
df2 = df[1000:2000]
print(len(df2))
df3 = df[2000:3000]
print(len(df3))
df4 = df[3000:4000]
print(len(df4))
df5 = df[4000:5000]
print(len(df5))
df6 = df[5000:6000]
print(len(df6))
df7 = df[6000:]
print(len(df7))

print(f'total : {len(df1) + len(df2) + len(df3) + len(df4) + len(df5) + len(df6) + len(df7)}')

#인덱스 리셋
df2 = df2.reset_index()
df3 = df3.reset_index()
df4 = df4.reset_index()
df5 = df5.reset_index()
df6 = df6.reset_index()
df7 = df7.reset_index()

7251
1000
1000
1000
1000
1000
1000
1251
total : 7251


In [44]:
SentenceMake(df1,"VocaList_batch_1.jsonl")
SentenceMake(df2,"VocaList_batch_2.jsonl")
SentenceMake(df3,"VocaList_batch_3.jsonl")
SentenceMake(df4,"VocaList_batch_4.jsonl")
SentenceMake(df5,"VocaList_batch_5.jsonl")
SentenceMake(df6,"VocaList_batch_6.jsonl")
SentenceMake(df7,"VocaList_batch_7.jsonl")

JSONL 파일 생성 완료 : VocaList_batch_1.jsonl-1000
JSONL 파일 생성 완료 : VocaList_batch_2.jsonl-1000
JSONL 파일 생성 완료 : VocaList_batch_3.jsonl-1000
JSONL 파일 생성 완료 : VocaList_batch_4.jsonl-1000
JSONL 파일 생성 완료 : VocaList_batch_5.jsonl-1000
JSONL 파일 생성 완료 : VocaList_batch_6.jsonl-1000
JSONL 파일 생성 완료 : VocaList_batch_7.jsonl-1251


# 범용 데이터 전처리

In [66]:
import pandas as pd
df = pd.read_csv("Final_ExampleSentence_Voca_MidHigh.csv", skiprows=2)
df.columns

Index(['구분', 'voca NO', 'grade ID', 'grade NO', 'WordText', 'WordMeaning',
       'PoS', '품사', 'image ID', 'WordID(SoundID)', 'WordSymbol',
       'example_english', 'example_korean', 'new_example_english',
       'new_example_korean', 'fill_in_blank_quiz', 'korean_translation',
       'answer', 'hint_eng', 'hint_kor', '중고등1200 포함여부', '예시 단어 포함 여부',
       'answer/wordtext 일치여부', 'Unnamed: 23', 'Unnamed: 24', '사운드파일명', '사운드배치',
       '이미지파일명', '이미지배치', 'Unnamed: 29', 'Unnamed: 30', '고등교과서 gradeID 업데이트'],
      dtype='object')

In [67]:
df.head(1)

Unnamed: 0,구분,voca NO,grade ID,grade NO,WordText,WordMeaning,PoS,품사,image ID,WordID(SoundID),...,answer/wordtext 일치여부,Unnamed: 23,Unnamed: 24,사운드파일명,사운드배치,이미지파일명,이미지배치,Unnamed: 29,Unnamed: 30,고등교과서 gradeID 업데이트
0,중고등1200,1,중등,9,accent,강세,2,명사,5094,14274,...,True,,,14274,XCOPY /F/Y .\014274.mp3 .\cutMp3\,5094,XCOPY /F/Y .\005094.png .\cutPng\,,,


In [68]:
drop_col = ['구분', 'image ID', 'WordID(SoundID)', 'WordSymbol',
       'example_english', 'example_korean', 'new_example_english',
       'new_example_korean', 'fill_in_blank_quiz', 'korean_translation',
       'answer', 'hint_eng', 'hint_kor', '중고등1200 포함여부', '예시 단어 포함 여부',
       'answer/wordtext 일치여부', 'Unnamed: 23', 'Unnamed: 24', '사운드파일명', '사운드배치',
       '이미지파일명', '이미지배치', 'Unnamed: 29', 'Unnamed: 30', '고등교과서 gradeID 업데이트', 'grade ID', 'grade NO', 'PoS']
df.drop(columns=drop_col, inplace=True)
df.head(3)

Unnamed: 0,voca NO,WordText,WordMeaning,품사
0,1,accent,강세,명사
1,2,accent,억양,명사
2,3,advertise,광고하다,동사


In [69]:
df = df.rename({"품사" : "PoSText",
                "voca NO" : "id"}, axis=1)
df.head(3)

Unnamed: 0,id,WordText,WordMeaning,PoSText
0,1,accent,강세,명사
1,2,accent,억양,명사
2,3,advertise,광고하다,동사


In [70]:
df = df[7251:]
df = df.reset_index(drop=True)
print(f"총 사이즈 {len(df)}")
df1 = df[:1000]
df2 = df[1000:2000]
df3 = df[2000:3000]
df4 = df[3000:4000]
df5 = df[4000:5000]
df6 = df[5000:]

print(f"df1 사이즈 {len(df1)}")
print(f"df2 사이즈 {len(df2)}")
print(f"df3 사이즈 {len(df3)}")
print(f"df4 사이즈 {len(df4)}")
print(f"df5 사이즈 {len(df5)}")
print(f"df6 사이즈 {len(df6)}")

총 사이즈 5486
df1 사이즈 1000
df2 사이즈 1000
df3 사이즈 1000
df4 사이즈 1000
df5 사이즈 1000
df6 사이즈 486


In [71]:
df6.head()

Unnamed: 0,id,WordText,WordMeaning,PoSText
5000,24652,stimulating,"자극하는,활기를 주는",형용사
5001,24653,stir up,문제를 일으키다,숙어/관용어
5002,24654,store,"가게,상점",명사
5003,24655,store,저장하다,동사
5004,24656,straighten,"정돈하다,~을 똑바르게 하다",동사


In [72]:
df1.to_csv("Voca_MidHigh_example_sentencef1.csv", index=False, encoding='utf-8-sig')
df2.to_csv("Voca_MidHigh_example_sentencef2.csv", index=False, encoding='utf-8-sig')
df3.to_csv("Voca_MidHigh_example_sentencef3.csv", index=False, encoding='utf-8-sig')
df4.to_csv("Voca_MidHigh_example_sentencef4.csv", index=False, encoding='utf-8-sig')
df5.to_csv("Voca_MidHigh_example_sentencef5.csv", index=False, encoding='utf-8-sig')
df6.to_csv("Voca_MidHigh_example_sentencef6.csv", index=False, encoding='utf-8-sig')