In [None]:
import os
# Input/output locations for the competition overview.
# Paths are built with os.path.join so the notebook resolves the same files on
# Windows and POSIX (a literal backslash is not a separator outside Windows).
file_dir = os.path.join("..", "llm-20-questions")
file_path = os.path.join(file_dir, "overview.md")
# Output goes to a sibling directory named after the input directory + "_jp".
output_dir = file_dir + "_jp"
# exist_ok=True makes this cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "overview.md")

In [None]:
from calendar import c
from urllib import response
import nbformat
import time
from tqdm import tqdm

try:
    # Quota / rate-limit failures (HTTP 429) from the Gemini SDK surface as this
    # google-api-core exception. Binding the real class is what allows
    # `except ResourceExhausted` handlers in this notebook to match them; a
    # locally defined look-alike class is never raised by the SDK.
    from google.api_core.exceptions import ResourceExhausted
except ImportError:
    class ResourceExhausted(Exception):
        """Fallback placeholder used only when google-api-core cannot be imported."""
from prompt_toolkit import prompt
# Use a key already exported as GEMINI_API_KEY when there is one. The
# placeholder is only a fallback and must be replaced (or the environment
# variable set) before any request is sent; never commit a real key here.
os.environ.setdefault("GEMINI_API_KEY", "YOUR_API_KEY")
import google.generativeai as genai
genai.configure(api_key=os.environ["GEMINI_API_KEY"])
import re
from google.generativeai.types import HarmBlockThreshold, HarmCategory

# Generation settings passed to every GenerativeModel built in this notebook.
generation_config = dict(
    temperature=0,
    top_p=0.95,
    top_k=64,
    max_output_tokens=8192,
    response_mime_type="text/plain",
)

# System prompt (Japanese): tells the model to act as a professional translator
# and data scientist, and to translate Kaggle competition overviews into
# natural-sounding Japanese rather than translating word for word.
system_instruction = """あなたはプロの翻訳家でかつデータサイエンティストです。ユーザーから送られてくるkaggleのコンペティションの概要説明を翻訳するのがあなたの役割です。ただし、日本語に翻訳する際は単に直訳するのではなく、日本語として自然な文章になるよう心がけてください。"""

# Translation model: every listed harm category is set to BLOCK_NONE.
model = genai.GenerativeModel(
    model_name="gemini-1.5-pro",
    generation_config=generation_config,
    system_instruction=system_instruction,
    safety_settings={
        category: HarmBlockThreshold.BLOCK_NONE
        for category in (
            HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            HarmCategory.HARM_CATEGORY_HARASSMENT,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        )
    },
)
def modify_filename(path, new_suffix="_with_summary"):
    """Return ``path`` with ``new_suffix`` inserted before the file extension.

    Args:
        path: File path whose final component should be renamed.
        new_suffix: Text placed between the base name and the extension.

    Returns:
        The path with the same directory part and the renamed file name.
    """
    parent, base = os.path.split(path)
    stem, extension = os.path.splitext(base)
    return os.path.join(parent, stem + new_suffix + extension)

def split_text_by_tokens(file_path, target_tokens=4000, count_tokens=None):
    """Split a UTF-8 text file into chunks of roughly ``target_tokens`` tokens.

    The file is processed line by line. Blank (whitespace-only) lines are
    dropped. Lines are accumulated until adding the next one would push the
    running total above ``target_tokens``; the accumulated lines are then
    emitted as one chunk. Lines are never split, so a single line larger than
    ``target_tokens`` becomes a chunk of its own.

    Args:
        file_path: Path of the text file to read.
        target_tokens: Soft upper bound on the token count of each chunk.
        count_tokens: Optional callable mapping a string to its token count.
            When omitted, a Gemini model is created and queried, which costs
            one ``count_tokens`` API call per non-blank line.

    Returns:
        List of chunk strings, each stripped of leading/trailing whitespace.
        Empty when the file has no non-blank lines.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    if count_tokens is None:
        token_model = genai.GenerativeModel(
            model_name="gemini-1.5-pro",
            generation_config=generation_config,
            safety_settings={
                HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
                HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
                HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
                HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
            }
        )

        def count_tokens(text):
            return token_model.count_tokens(text).total_tokens

    chunks = []
    pending_lines = []
    pending_tokens = 0
    for line in content.split('\n'):
        # Skip blank lines.
        if not line.strip():
            continue
        line_tokens = count_tokens(line)

        # Flush before the limit would be exceeded, but never emit an empty chunk.
        if pending_lines and pending_tokens + line_tokens > target_tokens:
            chunks.append('\n'.join(pending_lines).strip())
            pending_lines = []
            pending_tokens = 0

        pending_lines.append(line)
        pending_tokens += line_tokens

    if pending_lines:
        chunks.append('\n'.join(pending_lines).strip())

    return chunks
# Request handling
def send_request(chat_session, prompt, max_retries=1):
    """Send ``prompt`` through ``chat_session``, retrying after quota errors.

    When the call raises ``ResourceExhausted``, a message is printed, the
    function waits 60 seconds (shown as a progress bar) and the request is
    sent again, up to ``max_retries`` times. Any other exception is reported
    with ``print`` and re-raised, on the first attempt and on retries alike.

    Args:
        chat_session: Object exposing ``send_message(prompt)``.
        prompt: Message to send.
        max_retries: How many times to retry after ``ResourceExhausted``.
            The default of 1 matches the original single retry.

    Returns:
        Whatever ``chat_session.send_message`` returns.

    Raises:
        ResourceExhausted: If the quota error persists after all retries.
        Exception: Any other error raised by ``send_message``.
    """
    retries_left = max_retries
    while True:
        try:
            return chat_session.send_message(prompt)
        except ResourceExhausted:
            if retries_left <= 0:
                raise
            retries_left -= 1
            print("Resource exhausted. Waiting for 1 minute before retrying...")
            # Sleep in 1-second steps so the progress bar advances visibly.
            for _ in tqdm(range(60), desc="Waiting", unit="second"):
                time.sleep(1)
        except Exception as e:
            print(f"An error occurred: {e}")
            raise

# Generate the summary
def summarize(path):
    """Prepend a generated summary and glossary to a text file.

    Reads the file at ``path`` and sends two prompts through one chat session
    (so the second request is made with the first exchange in its history):
    one asking for a summary of the competition overview with a simple concrete
    example, and one asking for short explanations of jargon a beginner may
    not know. The result is written to a sibling file whose name is produced
    by ``modify_filename(path)``; the input file itself is not modified.

    Args:
        path: Path of the UTF-8 text file to summarize.

    Returns:
        The summary section (heading, generated text and trailing separator).
    """
    with open(path, 'r', encoding='utf-8') as f:
        text = f.read()

    model = genai.GenerativeModel(
        model_name="gemini-1.5-pro",
        generation_config=generation_config,
        safety_settings={
            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
        }
    )
    chat_session = model.start_chat(history=[])
    prompt_summary = "以下に示すコンペの概要説明を読んで要約を生成してください。また、どのような問題に取り組むのか、簡単な具体例を入れて分かりやすく説明してください。。\n\n コンペの概要 :\n"+text
    prompt_know = "以下に示すコンペの概要説明を読んで、機械学習・深層学習の初心者がつまずきそうな専門用語の簡単な解説を列挙してください。ただし、初心者なだけで大学の学部でやるようなレベルの機械学習・深層学習の知識はあるので簡単なものや有名なものは説明不要です。ある程度マイナーなものや、実務を経験していないと馴染みのないもの、このコンペ特有のドメイン知識等に焦点を当ててください。\n\n コンペの概要 :\n"+text

    response = send_request(chat_session, prompt_summary)
    summary = "# 要約 \n" + response.text + "\n\n---\n"

    response = send_request(chat_session, prompt_know)
    known = "# 用語概説 \n" + response.text + "\n\n---\n"

    # Put the summary and glossary in front of the original text and save the
    # combined document next to the input file.
    output_path = modify_filename(path)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(summary + known + text)

    return summary

# Example usage: split the source overview file into chunks.

text_chunks = split_text_by_tokens(file_path=file_path)


In [None]:
# Send the chunks one at a time. Each request uses a new chat session seeded
# with every earlier chunk/response pair, so the model sees the prior exchanges.
history = []
responses = []
for i, chunk in enumerate(text_chunks):
    chat_session = model.start_chat(history=history)
    response = send_request(chat_session, chunk)
    user_history = dict(role="user", parts=chunk)
    model_history = dict(role="model", parts=response.text)
    history += [user_history, model_history]
    responses += [response.text]
    print(f"Chunk {i+1} response:")

In [None]:
# Display the collected response texts for a quick visual check.
responses

In [None]:
# Concatenate the responses into the output file, one newline after each.
with open(output_path, mode='w', encoding='utf-8') as file:
    for response in responses:
        file.write(response)
        file.write('\n')

In [None]:
# Add the summary and glossary to the written output; the returned summary
# is shown as the cell result.
summarize(path=output_path)