In [2]:
import logging
import sys
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext, load_index_from_storage
from llama_index.core.indices.query.query_transform import HyDEQueryTransform
from llama_index.core.node_parser.text import SentenceSplitter
from llama_index.llms.openai import OpenAI
from llama_index.core.query_engine import TransformQueryEngine
from IPython.display import display
from llama_index.vector_stores.singlestoredb import SingleStoreVectorStore
from llama_index.core.tools import QueryEngineTool, ToolMetadata, BaseTool
from llama_index.core.settings import Settings
from llama_index.core.callbacks import CallbackManager
from llama_index.core.agent.react.types import (
    ActionReasoningStep,
    ObservationReasoningStep,
    ResponseReasoningStep,
)
from llama_index.core.agent.react import ReActOutputParser

from llama_index.core.agent import Task, AgentChatResponse, ReActChatFormatter, QueryPipelineAgentWorker
from llama_index.core.query_pipeline import (
    AgentInputComponent,
    AgentFnComponent,
    CustomAgentComponent,
    QueryComponent,
    ToolRunnerComponent,
    InputComponent,
    Link,
    QueryPipeline
)
from llama_index.core.llms import MessageRole, ChatMessage, ChatResponse
from typing import Dict, Any, Optional, Tuple, List, Set, cast
from sqlalchemy import *
from pyvis.network import Network
from IPython.display import display, HTML
from llama_index.embeddings.openai import OpenAIEmbedding

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

callback_manager = CallbackManager()
Settings.callback_manager = callback_manager

In [1]:
from dotenv import load_dotenv

load_dotenv()

True

In [3]:
import os

host = os.getenv('HOST')
database = os.getenv('DATABASE')
password = os.getenv('PASSWORD')
user = os.getenv('USER')

In [4]:
from llama_index.vector_stores.singlestoredb import SingleStoreVectorStore

# 接続情報を明示的に設定
host = "svc-3482219c-a389-4079-b18b-d50662524e8a-shared-dml.aws-virginia-6.svc.singlestore.com"
port = "3333"
database = "db_hiroki_df9a3"
user = "hiroki-7544a"
password = password  # 実際のパスワードを入力

try:
    vector_store = SingleStoreVectorStore(
        table_name="embeddings",
        content_field="content",
        metadata_field="metadata",
        vector_field="vector",
        host=host,
        port=port,
        database=database,
        user=user,
        password=password,
    )
    print("Vector store created successfully")
except Exception as e:
    print(f"Error creating vector store: {e}")

Vector store created successfully


In [5]:
storage_context = StorageContext.from_defaults(vector_store=vector_store)

### embeddingモデル

In [6]:
embed_model = OpenAIEmbedding(model="text-embedding-3-large")
Settings.embed_model = embed_model

In [29]:
pdf_directory = "/Users/user/Desktop/GenerativeAI_apps/third_finance_competition/validation/documents"
pdf_files = [file for file in os.listdir(pdf_directory) if file.endswith('.pdf')]

def generate_storage_dir(file_name):
    base_name = os.path.splitext(file_name)[0]  # Remove extension
    safe_name = base_name.replace(' ', '_').replace('&', 'and')
    return os.path.join(storage_base_dir, safe_name)

storage_base_dir = "./storage"
Settings.transformations = [SentenceSplitter(chunk_size=1024)]

all_indices_loaded = True

indices = {}

for pdf_file in pdf_files:
    try:
        storage_dir = generate_storage_dir(pdf_file)
        storage_context = StorageContext.from_defaults(persist_dir=storage_dir)
        index = load_index_from_storage(storage_context)
        indices[pdf_file] = index
    except Exception as e:
        print(f"Failed to load index for {pdf_file}: {e}")
        all_indices_loaded = False

INFO:llama_index.core.indices.loading:Loading all indices.
Loading all indices.
INFO:llama_index.core.indices.loading:Loading all indices.
Loading all indices.
INFO:llama_index.core.indices.loading:Loading all indices.
Loading all indices.
INFO:llama_index.core.indices.loading:Loading all indices.
Loading all indices.
INFO:llama_index.core.indices.loading:Loading all indices.
Loading all indices.
INFO:llama_index.core.indices.loading:Loading all indices.
Loading all indices.
INFO:llama_index.core.indices.loading:Loading all indices.
Loading all indices.
INFO:llama_index.core.indices.loading:Loading all indices.
Loading all indices.
INFO:llama_index.core.indices.loading:Loading all indices.
Loading all indices.
INFO:llama_index.core.indices.loading:Loading all indices.
Loading all indices.


In [9]:
indices

{'太陽誘電株式会社-統合報告書-2024.pdf': <llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x12e88afd0>,
 '東洋紡株式会社-サステナビリティ報告書-2024.pdf': <llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x132a19190>,
 '電通グループ-統合レポート-2023年.pdf': <llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x133415190>,
 '日本化薬グループ-サステナビリティレポート-2024.pdf': <llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x13317d910>,
 'ウエルシアホールディングス株式会社-統合報告書-2024.pdf': <llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x134bc3890>,
 '大和ハウスグループ-サステナビリティ報告書-2024.pdf': <llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x132b04890>,
 '花王-統合レポート-2024年  花王-サステナビリティレポート-2024年  花王-有価証券報告書-2024年  花王-コーポレート・ガバナンスに関する報告書-2024年  花王-決算短信-2024年.pdf': <llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x1305af610>,
 'ダイドーグループホールディングス-統合報告書-2024.pdf': <llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x1331d68d0>,
 'エクシオグループ-統合報告書-2024.pdf': <llama

In [None]:
# import fitz
# import pymupdf4llm
# from llama_index.core.schema import Document

# def read_pdf_with_fallback(input_path: str) -> str:
#     """PDFをテキストに変換（pymupdf4llmとfitzを使用）"""
#     try:
#         # まずpymupdf4llmで試行
#         print(f"pymupdf4llmで変換を試みます: {input_path}")
#         text = pymupdf4llm.to_markdown(input_path)
#         print("pymupdf4llmでの変換が成功しました")
#         return text
#     except Exception as e:
#         print(f"pymupdf4llmでの変換に失敗しました: {str(e)}")
#         print("fitzを使用して再試行します...")

#         try:
#             # fitzにフォールバック
#             doc = fitz.open(input_path)
#             text = ""
#             for page in doc:
#                 text += page.get_text() + "\n\n"
#             doc.close()

#             # 構造化
#             lines = text.splitlines()
#             structured_text = ""
#             paragraph = ""

#             for line in lines:
#                 line = line.strip()
#                 if line:
#                     paragraph += line + " "
#                 else:
#                     if paragraph:
#                         structured_text += paragraph.strip() + "\n\n"
#                         paragraph = ""

#             if paragraph:
#                 structured_text += paragraph.strip() + "\n\n"

#             print("fitzでの変換が成功しました")
#             return structured_text

#         except Exception as e:
#             print(f"fitzでの変換にも失敗しました: {str(e)}")
#             raise e

# # メインのインデックス作成コードの修正部分
# if not all_indices_loaded:
#     for pdf_file in pdf_files:
#         if pdf_file not in indices:
#             try:
#                 input_path = os.path.join(pdf_directory, pdf_file)

#                 # PDFをテキストに変換
#                 text = read_pdf_with_fallback(input_path)
#                 docs = [Document(text=text, metadata={"source": pdf_file})]

#                 text_parser = SentenceSplitter(chunk_size=1024)
#                 text_chunks = []
#                 doc_idxs = []
#                 for doc_idx, doc in enumerate(docs):
#                     cur_text_chunks = text_parser.split_text(doc.text)
#                     text_chunks.extend(cur_text_chunks)
#                     doc_idxs.extend([doc_idx] * len(cur_text_chunks))

#                 from llama_index.core.schema import TextNode
#                 nodes = []
#                 for idx, text_chunk in enumerate(text_chunks):
#                     node = TextNode(text=text_chunk)
#                     src_doc = docs[doc_idxs[idx]]
#                     node.metadata = src_doc.metadata
#                     nodes.append(node)

#                 vectorstoreindex = VectorStoreIndex(
#                     nodes=nodes,
#                     storage_context=storage_context,
#                     transformations=Settings.transformations,
#                 )
#                 index = vectorstoreindex.from_documents(docs)

#                 storage_dir = generate_storage_dir(pdf_file)
#                 index.storage_context.persist(persist_dir=storage_dir)

#                 indices[pdf_file] = index
#             except Exception as e:
#                 print(f"Failed to process {pdf_file}: {e}")

In [18]:
from llama_parse import LlamaParse
import nest_asyncio
nest_asyncio.apply()

storage_base_dir = "./llama_storage"
Settings.transformations = [SentenceSplitter(chunk_size=1024)]

all_indices_loaded = True
indices = {}
for pdf_file in pdf_files:
    try:
        storage_dir = generate_storage_dir(pdf_file)
        storage_context = StorageContext.from_defaults(persist_dir=storage_dir)
        index = load_index_from_storage(storage_context)
        indices[pdf_file] = index
    except Exception as e:
        print(f"Failed to load index for {pdf_file}: {e}")
        all_indices_loaded = False

if not all_indices_loaded:
    for pdf_file in pdf_files:
        if pdf_file not in indices:
            try:
                input_path = os.path.join(pdf_directory, pdf_file)

                # SimpleDirectoryReaderの代わりにLlamaParseを使用
                parser = LlamaParse(
                    result_type="markdown",
                    verbose=True,
                    num_workers=4,
                    parsing_instruction = "提供された文書はPDFである。1ページにはファイルのタイトル、2ページは目次となっており、それ以降のページは各ページごとの副題を持つ。資料は、テキスト、図、表、グラフ、イラストを含む。",
                )
                docs = parser.load_data([input_path])

                text_parser = SentenceSplitter(chunk_size=1024)
                text_chunks = []
                doc_idxs = []
                for doc_idx, doc in enumerate(docs):
                    cur_text_chunks = text_parser.split_text(doc.text)
                    text_chunks.extend(cur_text_chunks)
                    doc_idxs.extend([doc_idx] * len(cur_text_chunks))

                from llama_index.core.schema import TextNode
                nodes = []
                for idx, text_chunk in enumerate(text_chunks):
                    node = TextNode(text=text_chunk)
                    src_doc = docs[doc_idxs[idx]]
                    node.metadata = src_doc.metadata
                    nodes.append(node)

                vectorstoreindex = VectorStoreIndex(nodes = nodes, storage_context=storage_context,transformations=Settings.transformations,)
                index = vectorstoreindex.from_documents(docs)

                storage_dir = generate_storage_dir(pdf_file)
                index.storage_context.persist(persist_dir=storage_dir)

                indices[pdf_file] = index
            except Exception as e:
                print(f"Failed to process {pdf_file}: {e}")

Failed to load index for 太陽誘電株式会社-統合報告書-2024.pdf: [Errno 2] No such file or directory: '/Users/user/Desktop/GenerativeAI_apps/third_finance_competition/src/llama_storage/太陽誘電株式会社-統合報告書-2024/docstore.json'
Failed to load index for 東洋紡株式会社-サステナビリティ報告書-2024.pdf: [Errno 2] No such file or directory: '/Users/user/Desktop/GenerativeAI_apps/third_finance_competition/src/llama_storage/東洋紡株式会社-サステナビリティ報告書-2024/docstore.json'
Failed to load index for 電通グループ-統合レポート-2023年.pdf: [Errno 2] No such file or directory: '/Users/user/Desktop/GenerativeAI_apps/third_finance_competition/src/llama_storage/電通グループ-統合レポート-2023年/docstore.json'
Failed to load index for 日本化薬グループ-サステナビリティレポート-2024.pdf: [Errno 2] No such file or directory: '/Users/user/Desktop/GenerativeAI_apps/third_finance_competition/src/llama_storage/日本化薬グループ-サステナビリティレポート-2024/docstore.json'
Failed to load index for ウエルシアホールディングス株式会社-統合報告書-2024.pdf: [Errno 2] No such file or directory: '/Users/user/Desktop/GenerativeAI_apps/third_finance_competi

Parsing files:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"
HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/257e22c9-d772-4a10-a99c-1f16daf3a21e "HTTP/1.1 200 OK"
HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/257e22c9-d772-4a10-a99c-1f16daf3a21e "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/257e22c9-d772-4a10-a99c-1f16daf3a21e "HTTP/1.1 200 OK"
HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/257e22c9-d772-4a10-a99c-1f16daf3a21e "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/257e22c9-d772-4a10-a99c-1f16daf3a21e "HTTP/1.1 200 OK"
HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/257e22c9-d772-4a10-a99c-1f16daf3a21e "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamain

Parsing files: 100%|██████████| 1/1 [07:02<00:00, 422.86s/it]


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Parsing files:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"
HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/fee0aef7-d2ae-41ba-851e-be6550ce891e "HTTP/1.1 200 OK"
HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/fee0aef7-d2ae-41ba-851e-be6550ce891e "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/fee0aef7-d2ae-41ba-851e-be6550ce891e "HTTP/1.1 200 OK"
HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/fee0aef7-d2ae-41ba-851e-be6550ce891e "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/fee0aef7-d2ae-41ba-851e-be6550ce891e "HTTP/1.1 200 OK"
HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/fee0aef7-d2ae-41ba-851e-be6550ce891e "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamain

Parsing files: 100%|██████████| 1/1 [06:23<00:00, 383.22s/it]


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Parsing files:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"
HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b40f5081-7d06-4301-a95e-b5e8d3a02c0e "HTTP/1.1 200 OK"
HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b40f5081-7d06-4301-a95e-b5e8d3a02c0e "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b40f5081-7d06-4301-a95e-b5e8d3a02c0e "HTTP/1.1 200 OK"
HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b40f5081-7d06-4301-a95e-b5e8d3a02c0e "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b40f5081-7d06-4301-a95e-b5e8d3a02c0e "HTTP/1.1 200 OK"
HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/b40f5081-7d06-4301-a95e-b5e8d3a02c0e "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamain

Parsing files: 100%|██████████| 1/1 [01:11<00:00, 71.93s/it]


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Parsing files:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"
HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/3d66f365-8182-4434-9267-01e904b3b05b "HTTP/1.1 200 OK"
HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/3d66f365-8182-4434-9267-01e904b3b05b "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/3d66f365-8182-4434-9267-01e904b3b05b "HTTP/1.1 200 OK"
HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/3d66f365-8182-4434-9267-01e904b3b05b "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/3d66f365-8182-4434-9267-01e904b3b05b "HTTP/1.1 200 OK"
HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/3d66f365-8182-4434-9267-01e904b3b05b "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamain

Parsing files: 100%|██████████| 1/1 [02:38<00:00, 158.50s/it]


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Parsing files:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"
HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/f23d08f3-d845-4e4a-8aeb-2dc70564846c "HTTP/1.1 200 OK"
HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/f23d08f3-d845-4e4a-8aeb-2dc70564846c "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/f23d08f3-d845-4e4a-8aeb-2dc70564846c "HTTP/1.1 200 OK"
HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/f23d08f3-d845-4e4a-8aeb-2dc70564846c "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/f23d08f3-d845-4e4a-8aeb-2dc70564846c "HTTP/1.1 200 OK"
HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/f23d08f3-d845-4e4a-8aeb-2dc70564846c "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamain

Parsing files: 100%|██████████| 1/1 [04:41<00:00, 281.40s/it]


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Parsing files:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"
HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/cde33688-3d42-417c-bac2-c5d6dcc15eb4 "HTTP/1.1 200 OK"
HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/cde33688-3d42-417c-bac2-c5d6dcc15eb4 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/cde33688-3d42-417c-bac2-c5d6dcc15eb4 "HTTP/1.1 200 OK"
HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/cde33688-3d42-417c-bac2-c5d6dcc15eb4 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/cde33688-3d42-417c-bac2-c5d6dcc15eb4 "HTTP/1.1 200 OK"
HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/cde33688-3d42-417c-bac2-c5d6dcc15eb4 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamain

Parsing files: 100%|██████████| 1/1 [08:20<00:00, 500.98s/it]


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Parsing files:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"
HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/221d75e5-9f1c-4d3f-aa30-3bf7764f034b "HTTP/1.1 200 OK"
HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/221d75e5-9f1c-4d3f-aa30-3bf7764f034b "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/221d75e5-9f1c-4d3f-aa30-3bf7764f034b "HTTP/1.1 200 OK"
HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/221d75e5-9f1c-4d3f-aa30-3bf7764f034b "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/221d75e5-9f1c-4d3f-aa30-3bf7764f034b "HTTP/1.1 200 OK"
HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/221d75e5-9f1c-4d3f-aa30-3bf7764f034b "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamain

Parsing files: 100%|██████████| 1/1 [01:31<00:00, 91.56s/it]


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Parsing files:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"
HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/cfad5f69-efe6-423d-adaf-3d007e265406 "HTTP/1.1 200 OK"
HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/cfad5f69-efe6-423d-adaf-3d007e265406 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/cfad5f69-efe6-423d-adaf-3d007e265406 "HTTP/1.1 200 OK"
HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/cfad5f69-efe6-423d-adaf-3d007e265406 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/cfad5f69-efe6-423d-adaf-3d007e265406 "HTTP/1.1 200 OK"
HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/cfad5f69-efe6-423d-adaf-3d007e265406 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamain

Parsing files: 100%|██████████| 1/1 [01:25<00:00, 85.42s/it]


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Parsing files:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"
HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/5a9c34c4-9da1-45b4-a8a8-cc3f6dcb0e9f "HTTP/1.1 200 OK"
HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/5a9c34c4-9da1-45b4-a8a8-cc3f6dcb0e9f "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/5a9c34c4-9da1-45b4-a8a8-cc3f6dcb0e9f "HTTP/1.1 200 OK"
HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/5a9c34c4-9da1-45b4-a8a8-cc3f6dcb0e9f "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/5a9c34c4-9da1-45b4-a8a8-cc3f6dcb0e9f "HTTP/1.1 200 OK"
HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/5a9c34c4-9da1-45b4-a8a8-cc3f6dcb0e9f "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.cloud.llamain

Parsing files: 100%|██████████| 1/1 [33:31<00:00, 2011.00s/it]


Error while parsing the file '/Users/user/Desktop/GenerativeAI_apps/third_finance_competition/validation/documents/エクシオグループ-統合報告書-2024.pdf': Timeout while parsing the file: 5a9c34c4-9da1-45b4-a8a8-cc3f6dcb0e9f


Parsing files:   0%|          | 0/1 [00:00<?, ?it/s]



Parsing files: 100%|██████████| 1/1 [01:28<00:00, 88.77s/it]

Error while parsing the file '/Users/user/Desktop/GenerativeAI_apps/third_finance_competition/validation/documents/大成温調-統合報告書-2024.pdf': 





In [19]:
vectorstoreindex = VectorStoreIndex(
                    nodes=nodes,
                    storage_context=storage_context,
                    transformations=Settings.transformations,
                )

### 保存したものを読み込むとき

In [30]:
from llama_index.core import StorageContext, load_index_from_storage, VectorStoreIndex

# 保存されているディレクトリからStorageContextを読み込む
storage_context = StorageContext.from_defaults(
    persist_dir=storage_dir  # 以前保存した際のディレクトリパス
)

# VectorStoreIndexを作成
vectorstoreindex = load_index_from_storage(
    storage_context=storage_context
)

INFO:llama_index.core.indices.loading:Loading all indices.
Loading all indices.


In [31]:
llm = OpenAI(model="gpt-4o-mini")
Settings.llm = llm

In [32]:
query_engines={}

for pdf_name, index in indices.items():
    try:
        query_engine = index.as_query_engine(similarity_top_k=5)

        hyde = HyDEQueryTransform(include_original=True)

        hyde_query_engine = TransformQueryEngine(query_engine, hyde)

        query_engines[pdf_name] = hyde_query_engine

        print(f"HyDE query engine created for {pdf_name}")

    except Exception as e:
        print(f"An error occurred while creating query engine for {pdf_name}: {e}")

HyDE query engine created for 太陽誘電株式会社-統合報告書-2024.pdf
HyDE query engine created for 東洋紡株式会社-サステナビリティ報告書-2024.pdf
HyDE query engine created for 電通グループ-統合レポート-2023年.pdf
HyDE query engine created for 日本化薬グループ-サステナビリティレポート-2024.pdf
HyDE query engine created for ウエルシアホールディングス株式会社-統合報告書-2024.pdf
HyDE query engine created for 大和ハウスグループ-サステナビリティ報告書-2024.pdf
HyDE query engine created for 花王-統合レポート-2024年  花王-サステナビリティレポート-2024年  花王-有価証券報告書-2024年  花王-コーポレート・ガバナンスに関する報告書-2024年  花王-決算短信-2024年.pdf
HyDE query engine created for ダイドーグループホールディングス-統合報告書-2024.pdf
HyDE query engine created for エクシオグループ-統合報告書-2024.pdf
HyDE query engine created for 大成温調-統合報告書-2024.pdf


In [33]:
query_engine_tools = []

for pdf_name, hyde_query_engine in query_engines.items():
    try:
        description = "Provides comprehensive corporate information from integrated reports, including business strategy, financial performance, sustainability initiatives, and corporate governance. Use specific questions about company operations, ESG practices, or financial metrics as input to the tool."

        metadata = ToolMetadata(
            name=pdf_name.replace(".pdf", "").replace(" ", "_").lower(),
            description=description
        )

        tool = QueryEngineTool(
            query_engine=hyde_query_engine,
            metadata=metadata
        )

        query_engine_tools.append(tool)

        print(f"QueryEngineTool created for {pdf_name}")

    except Exception as e:
        print(f"An error occurred while creating QueryEngineTool for {pdf_name}: {e}")

QueryEngineTool created for 太陽誘電株式会社-統合報告書-2024.pdf
QueryEngineTool created for 東洋紡株式会社-サステナビリティ報告書-2024.pdf
QueryEngineTool created for 電通グループ-統合レポート-2023年.pdf
QueryEngineTool created for 日本化薬グループ-サステナビリティレポート-2024.pdf
QueryEngineTool created for ウエルシアホールディングス株式会社-統合報告書-2024.pdf
QueryEngineTool created for 大和ハウスグループ-サステナビリティ報告書-2024.pdf
QueryEngineTool created for 花王-統合レポート-2024年  花王-サステナビリティレポート-2024年  花王-有価証券報告書-2024年  花王-コーポレート・ガバナンスに関する報告書-2024年  花王-決算短信-2024年.pdf
QueryEngineTool created for ダイドーグループホールディングス-統合報告書-2024.pdf
QueryEngineTool created for エクシオグループ-統合報告書-2024.pdf
QueryEngineTool created for 大成温調-統合報告書-2024.pdf


In [34]:
def agent_input_fn(task: Task, state: Dict[str, Any]) -> Dict[str, Any]:
    """Agent input function.

    Returns:
        A Dictionary of output keys and values. If you are specifying
        src_key when defining links between this component and other
        components, make sure the src_key matches the specified output_key.

    """
    if "current_reasoning" not in state:
        state["current_reasoning"] = []
    reasoning_step = ObservationReasoningStep(observation=task.input)
    state["current_reasoning"].append(reasoning_step)
    return {"input": task.input}


agent_input_component = AgentInputComponent(fn=agent_input_fn)

In [35]:
def react_prompt_fn(
    task: Task, state: Dict[str, Any], input: str, tools: List[BaseTool]
) -> List[ChatMessage]:
    # 「思考」「行動」「観察」のサイクルで問題を解決する
    chat_formatter = ReActChatFormatter()
    return chat_formatter.format(
        # 使用可能なツール群
        tools,
        # チャット履歴の結合
        chat_history=task.memory.get() + state["memory"].get_all(),
        # 現在の推論状態
        current_reasoning=state["current_reasoning"],
    )


react_prompt_component = AgentFnComponent(
    fn=react_prompt_fn, partial_dict={"tools": query_engine_tools}
)

In [36]:
def parse_react_output_fn(
    task: Task, state: Dict[str, Any], chat_response: ChatResponse
):
    """Parse ReAct output into a reasoning step."""
    output_parser = ReActOutputParser()
    reasoning_step = output_parser.parse(chat_response.message.content)
    return {"done": reasoning_step.is_done, "reasoning_step": reasoning_step}


parse_react_output = AgentFnComponent(fn=parse_react_output_fn)


def run_tool_fn(
    task: Task, state: Dict[str, Any], reasoning_step: ActionReasoningStep
):
    """Run tool and process tool output."""
    tool_runner_component = ToolRunnerComponent(
        query_engine_tools, callback_manager=task.callback_manager
    )
    tool_output = tool_runner_component.run_component(
        tool_name=reasoning_step.action,
        tool_input=reasoning_step.action_input,
    )
    observation_step = ObservationReasoningStep(observation=str(tool_output))
    state["current_reasoning"].append(observation_step)

    return {"response_str": observation_step.get_content(), "is_done": False}


run_tool = AgentFnComponent(fn=run_tool_fn)


def process_response_fn(
    task: Task, state: Dict[str, Any], response_step: ResponseReasoningStep
):
    """Process response."""
    state["current_reasoning"].append(response_step)
    response_str = response_step.response
    state["memory"].put(ChatMessage(content=task.input, role=MessageRole.USER))
    state["memory"].put(
        ChatMessage(content=response_str, role=MessageRole.ASSISTANT)
    )

    return {"response_str": response_str, "is_done": True}


process_response = AgentFnComponent(fn=process_response_fn)


def process_agent_response_fn(
    task: Task, state: Dict[str, Any], response_dict: dict
):
    """Process agent response."""
    return (
        AgentChatResponse(response_dict["response_str"]),
        response_dict["is_done"],
    )


process_agent_response = AgentFnComponent(fn=process_agent_response_fn)

In [37]:
qp = QueryPipeline(verbose=True)

qp.add_modules(
    {
        "agent_input": agent_input_component,
        "react_prompt": react_prompt_component,
        "llm": llm,
        "react_output_parser": parse_react_output,
        "run_tool": run_tool,
        "process_response": process_response,
        "process_agent_response": process_agent_response,
    }
)

qp.add_chain(["agent_input", "react_prompt", "llm", "react_output_parser"])

qp.add_link(
    "react_output_parser",
    "run_tool",
    condition_fn=lambda x: not x["done"],
    input_fn=lambda x: x["reasoning_step"],
)
qp.add_link(
    "react_output_parser",
    "process_response",
    condition_fn=lambda x: x["done"],
    input_fn=lambda x: x["reasoning_step"],
)

qp.add_link("process_response", "process_agent_response")
qp.add_link("run_tool", "process_agent_response")

In [38]:
net = Network(notebook=True, cdn_resources="in_line", directed=True)
net.from_nx(qp.clean_dag)

net.write_html("agent.html")

In [39]:
agent_worker = QueryPipelineAgentWorker(qp)
agent = agent_worker.as_agent(
    callback_manager=CallbackManager([]), verbose=True
)

In [44]:
query = "電通グループの社名が「株式会社電通」と改称されたのは何年ですか。"

task = agent.create_task(query)

step_output = agent.run_step(task.task_id)
print(step_output)

> Running step 961d8930-5798-489e-b9f9-b101518eb014. Step input: 電通グループの社名が「株式会社電通」と改称されたのは何年ですか。
[1;3;38;2;155;135;227m> Running module agent_input with input: 
state: {'sources': [], 'memory': ChatMemoryBuffer(chat_store=SimpleChatStore(store={}), chat_store_key='chat_history', token_limit=3000, tokenizer_fn=functools.partial(<bound method Encoding.encode of <Encod...
task: task_id='53b35152-0255-4a6b-afad-3a559810fbde' input='電通グループの社名が「株式会社電通」と改称されたのは何年ですか。' memory=ChatMemoryBuffer(chat_store=SimpleChatStore(store={}), chat_store_key='chat_history', token_limit=3000, t...

[0m[1;3;38;2;155;135;227m> Running module react_prompt with input: 
input: 電通グループの社名が「株式会社電通」と改称されたのは何年ですか。

[0m[1;3;38;2;155;135;227m> Running module llm with input: 
messages: [ChatMessage(role=<MessageRole.SYSTEM: 'system'>, additional_kwargs={}, blocks=[TextBlock(block_type='text', text='You are designed to help with a variety of tasks, from answering questions to providi...

[0mINFO:httpx:HTTP Request:

In [34]:
step_output

TaskStepOutput(output=AgentChatResponse(response="Observation: {'output': ToolOutput(content='2020年度から2023年度までのリサイクル率は以下の通りです：\\n\\n- 2020年: 80.2%\\n- 2021年: 80.8%\\n- 2022年: 81.8%\\n- 2023年: 91.0%\\n\\nこれらのリサイクル率の合計は、80.2 + 80.8 + 81.8 + 91.0 = 333.8% です。\\n\\n平均を求めるために、合計を年数（4年）で割ります：\\n333.8 ÷ 4 = 83.45%\\n\\nしたがって、2020年度から2023年度までのリサイクル率の平均は83.45%となります。四捨五入すると、答えは83%です。', tool_name='日本化薬グループ-サステナビリティレポート-2024', raw_input={'input': '2020年度から2023年度までのリサイクル率の平均は何％か、小数第二位を四捨五入して答えよ。'}, raw_output=Response(response='2020年度から2023年度までのリサイクル率は以下の通りです：\\n\\n- 2020年: 80.2%\\n- 2021年: 80.8%\\n- 2022年: 81.8%\\n- 2023年: 91.0%\\n\\nこれらのリサイクル率の合計は、80.2 + 80.8 + 81.8 + 91.0 = 333.8% です。\\n\\n平均を求めるために、合計を年数（4年）で割ります：\\n333.8 ÷ 4 = 83.45%\\n\\nしたがって、2020年度から2023年度までのリサイクル率の平均は83.45%となります。四捨五入すると、答えは83%です。', source_nodes=[NodeWithScore(node=TextNode(id_='b0cd22a1-17b2-4c63-a4ce-3bc7abb8b83b', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={