LlamaIndex を用いた検索結果をプロンプトに挿入するかどうかを判断する言語モデルを作成するための、ファインチューニング用データを用意するソースコード

In [None]:
import os
import openai

# Title of the work; used to build the corpus file name.
title = "NoGame"

# Environment setup.
# NOTE: never commit a real API key in this file. A key that is already
# exported in the environment takes precedence; the literal below is only a
# fallback, so the redacted placeholder no longer clobbers a valid key.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = "-------伏字-------"
openai.api_key = os.environ["OPENAI_API_KEY"]

In [None]:
from llama_index import SimpleDirectoryReader

# Load the source document.
# Raw string: "\s" is not a valid escape sequence and triggers a warning on
# recent Python versions; the resulting path is unchanged.
documents = SimpleDirectoryReader(
    input_files=[r"row_anime_voices\scenes_summary.txt"]
).load_data()

In [None]:
from llama_index.text_splitter import SentenceSplitter
from llama_index.node_parser import SimpleNodeParser

# Prepare the node parser: the splitter uses "[SEP]" as its paragraph
# separator and no overlap between chunks.
text_splitter = SentenceSplitter(chunk_overlap=0, paragraph_separator="[SEP]")
node_parser = SimpleNodeParser.from_defaults(text_splitter=text_splitter)

In [None]:
from llama_index.schema import MetadataMode
import json

# Convert the documents into a corpus: node id -> node text without metadata.
nodes = node_parser.get_nodes_from_documents(documents)
corpus = {}
for node in nodes:
    corpus[node.node_id] = node.get_content(metadata_mode=MetadataMode.NONE)

# Save the corpus as UTF-8 JSON, keeping non-ASCII characters unescaped.
with open(f"RAG/{title}_summary_corpus.json", "w+", encoding="utf-8") as f:
    json.dump(corpus, f, ensure_ascii=False)


In [None]:
# Use this cell instead when the corpus file has already been created.
import json

# Read with the same UTF-8 encoding the corpus file is written with; the
# platform default (e.g. cp932 on Japanese Windows) would fail or garble it.
with open(f"RAG/{title}_summary_corpus.json", "r", encoding="utf-8") as f:
    corpus = json.load(f)
    

In [None]:
import re
import uuid
from llama_index.llms import OpenAI
from tqdm.notebook import tqdm

# Synthetic data generation function
def generate_queries(
    corpus,
    num_questions_per_chunk=10,
    prompt_template=None,
    verbose=False,
):
    """Generate questions for every corpus chunk with an LLM.

    Args:
        corpus: Mapping of node id to chunk text.
        num_questions_per_chunk: Number of questions requested per chunk.
            It is only substituted into the prompt; the number of questions
            actually returned by the model is not checked.
        prompt_template: Optional ``str.format`` template that may use the
            ``{context_str}`` and ``{num_questions_per_chunk}`` fields.
            Falls back to the built-in Japanese prompt when falsy.
        verbose: When true, print the questions extracted for each chunk.

    Returns:
        A ``(queries, queries_relevant_docs)`` tuple. ``queries`` maps a
        fresh UUID string to a question; ``queries_relevant_docs`` maps the
        same UUID to a one-element list holding the source node id.
    """
    llm = OpenAI(model="gpt-3.5-turbo")

    prompt_template = prompt_template or """\
    文脈は以下のとおりです。

    ---------------------
    {context_str}
    ---------------------

    あなたは教師です。 あなたの仕事は、試験問題を作成することです。
    日本のアニメ作品に文脈から重要な事実を捉える{num_questions_per_chunk} 個の質問を、以下の条件で作成します。
    - 質問は必ず提供された文脈に限定
    - 質問は必ず日本語で記述
    - 代名詞は絶対に使用しない
    - 質問は文書全体にわたって本質的に多様である必要がある
    """

    # Leading list numbering such as "1.", "2)" or "3 "; compiled once
    # instead of once per line.
    numbering = re.compile(r"^\d+[\).\s]")

    queries = {}
    queries_relevant_docs = {}
    for node_id, text in tqdm(corpus.items()):
        query = prompt_template.format(
            context_str=text,
            num_questions_per_chunk=num_questions_per_chunk,
        )
        response = llm.complete(query)

        questions = []
        for line in str(response).strip().split("\n"):
            # Strip first so that an indented "  1. ..." line also loses its
            # number; blank lines are dropped.
            question = numbering.sub("", line.strip()).strip()
            if question:
                questions.append(question)

        if verbose:
            print(f"{node_id}: {questions}")

        for question in questions:
            question_id = str(uuid.uuid4())
            queries[question_id] = question
            queries_relevant_docs[question_id] = [node_id]
    return queries, queries_relevant_docs

In [None]:
# Create the synthetic data, asking for 20 questions per corpus chunk.
queries, queries_relevant_docs = generate_queries(
    corpus,
    num_questions_per_chunk=20,
)

print(queries)

In [None]:
# Save the generated questions and the question id -> source node id mapping.
# UTF-8 is requested explicitly: with ensure_ascii=False the Japanese text is
# written verbatim, and the platform default encoding (e.g. cp932 on Japanese
# Windows) cannot represent every character.
with open(r"final_character_DB\NoGame_queries.json", "w", encoding="utf-8") as f:
    json.dump(queries, f, ensure_ascii=False)

with open(
    r"final_character_DB\NoGame_queries_relevant_docs.json",
    "w",
    encoding="utf-8",
) as f:
    json.dump(queries_relevant_docs, f, ensure_ascii=False)

In [None]:
# Print the whole corpus mapping (node id -> chunk text).
print(corpus)

In [None]:
# Split the queries mapping into two parallel lists that share the mapping's
# iteration order: question ids and question texts.
queries_id_list = list(queries.keys())
queries_list = list(queries.values())

In [None]:
import random
import re


def _csv_field(text):
    """Return *text* without newlines and with ASCII commas replaced by "、"."""
    return text.replace("\n", "").replace(",", "、")


queries_id_real_list = queries_id_list[:]

# Every odd position is paired with the passage of a randomly drawn question;
# even positions keep the passage of their own question.
shuffled_queries_id_list = [
    random.choice(queries_id_list) if i % 2 == 1 else real_id
    for i, real_id in enumerate(queries_id_real_list)
]

# Each row is "query,passage,label". The file is written as UTF-8 explicitly
# because the platform default encoding (e.g. cp932 on Japanese Windows)
# cannot represent every character of the Japanese text.
with open(r"train_dataset\train_dataset_for_usegate.csv", "w", encoding="utf-8") as f:
    for query, real_id, paired_id in zip(
        queries_list, queries_id_real_list, shuffled_queries_id_list
    ):
        real_node = queries_relevant_docs[real_id][0]
        paired_node = queries_relevant_docs[paired_id][0]
        # Label 0: the passage is the query's own source chunk; 1: it is a
        # different chunk. Node ids are compared rather than question ids,
        # because a random draw can land on another question generated from
        # the same chunk, in which case the passage is still the relevant one.
        label = 0 if paired_node == real_node else 1
        f.write(f"{_csv_field(query)},{_csv_field(corpus[paired_node])},{label}\n")
