In [7]:
from datasets import load_dataset
# Training split of the KRX sample instruction dataset; later cells read the
# 'prompt' and 'response' fields of each record.
dataset = load_dataset(
    "amphora/krx-sample-instructions",
    split="train",
)

# 역색인

In [8]:
from kiwipiepy import Kiwi
from collections import defaultdict
import math

In [9]:
class InvertedIndex:
    """In-memory inverted index over question text, ranked with BM25.

    Each question is tokenized with Kiwi and every token maps to a posting
    dict ``{doc_id: term frequency}``. The question/answer pair of each
    document is stored so that search results can return the full record.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Drop every indexed document and create a fresh Kiwi tokenizer."""
        self.index = defaultdict(dict)  # token -> {doc_id: term frequency}
        self.kiwi = Kiwi()
        self.document_lengths = {}  # doc_id -> number of tokens in the question
        self.total_documents = 0
        self.average_document_length = 0
        self.documents = {}  # doc_id -> {'question': ..., 'answer': ...}
        self._total_length = 0  # running sum of document_lengths values

    def tokenize(self, text):
        """Return the ``form`` of every token Kiwi produces for ``text``."""
        return [token.form for token in self.kiwi.tokenize(text)]

    def _remove_document(self, doc_id):
        """Remove ``doc_id`` and its postings so it can be re-indexed cleanly."""
        previous = self.documents.pop(doc_id)
        for token in set(self.tokenize(previous['question'])):
            postings = self.index.get(token)
            if postings is None:
                continue
            postings.pop(doc_id, None)
            if not postings:
                # Keep `token in self.index` and the document frequency accurate.
                del self.index[token]
        self._total_length -= self.document_lengths.pop(doc_id)
        self.total_documents -= 1

    def add_document(self, doc_id, question, answer):
        """Index ``question`` under ``doc_id`` and store the question/answer pair.

        Only the question is tokenized; the answer is kept for retrieval.
        Adding a ``doc_id`` that is already indexed replaces the previous entry
        instead of counting the document (and its term frequencies) twice.
        """
        if doc_id in self.documents:
            self._remove_document(doc_id)

        tokens = self.tokenize(question)
        self.document_lengths[doc_id] = len(tokens)
        self._total_length += len(tokens)
        self.total_documents += 1
        self.documents[doc_id] = {'question': question, 'answer': answer}

        # Count term frequencies in one pass instead of list.count() per token.
        term_frequencies = defaultdict(int)
        for token in tokens:
            term_frequencies[token] += 1
        for token, frequency in term_frequencies.items():
            self.index[token][doc_id] = frequency

        # The running total avoids re-summing every document length on each add.
        self.average_document_length = self._total_length / self.total_documents

    def calculate_bm25_score(self, query_tokens, doc_id, k1=1.5, b=0.75):
        """Return the BM25 score of document ``doc_id`` for ``query_tokens``.

        Args:
            query_tokens: Tokens of the query; a repeated token contributes
                once per occurrence.
            doc_id: Identifier of an indexed document.
            k1: Term-frequency saturation parameter.
            b: Strength of the document-length normalization.

        Returns:
            The summed per-token score; 0 when no query token occurs in the
            document.
        """
        score = 0

        for token in query_tokens:
            if token not in self.index or doc_id not in self.index[token]:
                continue

            tf = self.index[token][doc_id]
            df = len(self.index[token])
            # The "+ 1" inside the log keeps the idf positive even for tokens
            # that occur in more than half of the documents.
            idf = math.log((self.total_documents - df + 0.5) / (df + 0.5) + 1)

            numerator = tf * (k1 + 1)
            denominator = tf + k1 * (1 - b + b * self.document_lengths[doc_id] / self.average_document_length)
            score += idf * numerator / denominator

        return score

    def search(self, query, k=5):
        """Return the ``k`` best-matching documents for ``query``.

        Returns:
            A list of ``(doc_id, score, document)`` tuples sorted by descending
            BM25 score, where ``document`` is the stored question/answer dict.
            The list is empty when no query token is indexed.
        """
        query_tokens = self.tokenize(query)

        # Gather each candidate once. A dict keeps first-seen order, so ties are
        # broken deterministically by the stable sort below.
        candidates = {}
        for token in query_tokens:
            for doc_id in self.index.get(token, ()):
                candidates[doc_id] = None

        # calculate_bm25_score already sums over all query tokens, so it must be
        # called exactly once per document; calling it once per matching token
        # multiplied the score by the number of matches.
        scores = {
            doc_id: self.calculate_bm25_score(query_tokens, doc_id)
            for doc_id in candidates
        }

        top_k = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:k]
        return [(doc_id, score, self.documents[doc_id]) for doc_id, score in top_k]

In [10]:
# add docs
from tqdm import tqdm

# Index every record of the dataset, using its position as the document id.
index = InvertedIndex()
for doc_id, record in enumerate(tqdm(dataset)):
    index.add_document(doc_id, record['prompt'], record['response'])

100%|██████████| 25951/25951 [01:27<00:00, 296.43it/s]


# 비슷한 문제들을 참고해서 만들게 하자

In [11]:
import random

# Size of the random pool drawn from the corpus, and how many of those
# records are actually used as seeds for question generation.
SAMPLE_POOL_SIZE = 1000
NUM_SEED_QUESTIONS = 100

random.seed(42)
records = list(dataset)
# random.sample raises ValueError when k exceeds the population size, which is
# what happened when this cell was re-run after `dataset` had already been cut
# down to the seed questions. Capping k keeps the cell re-runnable; on a re-run
# the same records are kept, only their order may change.
seed_pool = random.sample(records, k=min(SAMPLE_POOL_SIZE, len(records)))
dataset = seed_pool[:NUM_SEED_QUESTIONS]

In [12]:
import openai
from dotenv import load_dotenv
import os

# Load environment variables from the .env file in the current working directory.
dotenv_path = os.path.join(os.getcwd(), '.env')
load_dotenv(dotenv_path)

# os.environ.get yields None when the variable is missing, same as os.getenv.
openai_key = os.environ.get('OPENAI_TEAM_API_KEY')
client = openai.OpenAI(api_key=openai_key)

In [13]:
# System prompt for the question writer: the model is told it receives two
# reference question/answer pairs and must write a new five-choice question in
# Korean. The prompt asks for the answer on the last line; the CSV-building cell
# later in this notebook treats the last non-empty line of each reply as the answer.
system_prompt = """
You are given two pairs of reference questions and reference answers.
Your role is a questioner who make a new question.
When making your questions, consider the following.
1. New Question must require choices such as 'Which is right', 'Which is not right', 'Which is most appropriate', and 'Which is not most appropriate'.
2. You have to make 5 choices, 1 answer choice and 4 wrong choices.
3. The choices must be generated in association with one of several keywords in the reference question and answer.
4. The wrong answer and the right answer are confused, but the wrong answer must be a clear wrong answer.
5. The choices does not deviate from the subject of the problem, but it must be different.
6. The choices sentence must be similar in length.
7. If a person is a financial expert, the person can solve the problem, but if the person is a beginner in financial knowledge, please make the problem with a difficulty that the person cannot solve because it is difficult.
8. Please don't create a problem that can be solved by reading other than the problem.
please write in Korean and you must write the answer on the last line.
"""

# User message template with four positional `{}` slots. The generation loop
# fills them in order: question 1, answer 1, question 2, answer 2.
user_prompt ="""
### Reference
### Question 1: {}
### Answer 1: {}

### Question 2: {}
### Answer 2: {}

### New Question : 

"""


In [14]:
import random
from tqdm import tqdm

# The system message is the same for every request, so build it once.
system_msg = {'role': 'system', 'content': system_prompt}

# Raw model replies, one string per successfully generated question.
new_questions = []
for idx, data in enumerate(tqdm(dataset, total=len(dataset))):
    question = data['prompt']

    # The prompt template needs exactly two reference question/answer pairs;
    # indexing a shorter result list would raise IndexError.
    similar_questions = index.search(question, k=2)
    if len(similar_questions) < 2:
        print(f"[skip] sample {idx}: only {len(similar_questions)} reference question(s) found")
        continue

    # Flatten to (question 1, answer 1, question 2, answer 2) in template order.
    references = [
        field
        for _, _, doc in similar_questions
        for field in (doc['question'], doc['answer'])
    ]
    user_msg = {'role': 'user', 'content': user_prompt.format(*references)}

    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[system_msg, user_msg],
    )

    # The message content can be None or empty; storing it would break the
    # text parsing done on new_questions afterwards.
    result = response.choices[0].message.content
    if not result:
        print(f"[skip] sample {idx}: the model returned no text")
        continue
    new_questions.append(result)

    # Show an occasional sample so the output can be checked while running.
    if idx % 100 == 0:
        print(result)
        print("*"*100)


  0%|          | 0/100 [00:00<?, ?it/s]

  1%|          | 1/100 [00:40<1:06:27, 40.28s/it]

주어진 편미분 방정식이 주어졌을 때, $S_t$와 $F_t$의 관계를 이해하기 위해 어떤 접근 방법이 가장 적절한가? 다음 중 가장 적절한 선택은 무엇인가?

1. $S_t$에 대한 확률적 미분 방정식을 다시 쓰고 그 해를 분석한다.
2. $dW$의 성질을 무시하고 $F_t$를 단순히 $S_t$로 대체한다.
3. 각각의 미분 방정식에서 모든 변수 간의 관계를 동일하게 본다.
4. $F_t$의 확률적 동작과 $S_t$ 간의 상관관계를 무시한다.
5. $S_t$와 $F_t$의 stochastic 과정에 대한 명확한 법칙을 따르고 그를 바탕으로 비교한다.

정답은 5번입니다.
****************************************************************************************************


100%|██████████| 100/100 [1:14:39<00:00, 44.79s/it] 


# CSV 파일로 만들기

In [24]:
import re

# A digit 1-5 that is not part of a longer number, so the "2" in "2023" is not
# mistaken for a choice number.
ANSWER_PATTERN = re.compile(r'(?<!\d)[1-5](?!\d)')


def parse_mcqa(raw_question):
    """Split one raw model reply into a question text and an answer number.

    The last non-empty line is treated as the answer line and everything
    before it as the question (stem plus choices).

    Args:
        raw_question: The reply text; may be None or empty.

    Returns:
        ``{"question": str, "answer": int}`` when a choice number 1-5 is found
        on the answer line. If none is found, ``answer`` is the stripped answer
        line itself so the row can be fixed by hand. Returns None when the
        reply has no non-empty line.
    """
    lines = [line.strip() for line in (raw_question or '').split('\n') if line.strip() != '']
    if not lines:
        return None

    question = '\n'.join(lines[:-1])
    answer = lines[-1]

    # Take the first standalone choice number on the answer line. Testing
    # `str(i) in answer` for i = 1..5 picked the smallest digit that appeared
    # anywhere on the line, even inside another number.
    match = ANSWER_PATTERN.search(answer)
    if match:
        answer = int(match.group())

    return {"question": question, 'answer': answer}


mcqa_data = []
for raw_question in new_questions:
    item = parse_mcqa(raw_question)
    if item is not None:
        mcqa_data.append(item)

In [25]:
import pandas as pd
# One row per parsed item, with 'question' and 'answer' columns; the row index
# is left out of the file.
result_df = pd.DataFrame(data=mcqa_data)
result_df.to_csv(
    '/root/KRX_LLM/data/krx_sample_mcqa_data1.csv',
    index=False,
)