# Load dataset from NarrativeQA

We use the NarrativeQA dataset for question generation and RAG evaluation.

In [None]:
# Notebook setup: third-party imports plus the project helpers from Utils.
import pandas as pd
from Utils import *  # wildcard import; DatasetLoader, VectorDatabase, Embedder and DataProcessor presumably come from here — TODO confirm
data_loader = DatasetLoader()
from datasets import load_dataset
import os

# Shared handles reused by later cells.
vector_DB = VectorDatabase()
embedder = Embedder()
# The processor is wired to the embedder and vector database created above.
data_processor = DataProcessor(embedder=embedder, vectordatabase=vector_DB)

In [None]:
# Ask the vector database for its collections; as the last expression of the cell the result is displayed.
vector_DB.list_collections()

In [None]:
def extract_narrativeqa_text(split='train'):
    """Load a NarrativeQA split and return ``(df_doc, df_qa)``.

    ``df_doc`` holds one row per example whose summary text AND document text
    were both unseen so far, with columns ``summary``, ``document`` and
    ``metadata`` (document kind and summary title joined by a backslash).

    ``df_qa`` holds one row per dataset example with columns ``questions``,
    ``ground_truths`` (each answer text followed by ", ", concatenated) and the
    empty-string placeholder columns ``answers`` and ``context``.

    The function also prints the number of unique documents, the number of QA
    pairs, and the average summary / document length in characters.

    Args:
        split: Name of the dataset split passed to ``load_dataset``.

    Returns:
        A ``(df_doc, df_qa)`` tuple of pandas DataFrames.
    """
    corpus = load_dataset("deepmind/narrativeqa", split=split)

    # text -> metadata label; the keys double as "already seen" sets and keep
    # first-seen order.
    meta_by_summary = {}
    meta_by_document = {}
    summary_chars = 0
    document_chars = 0

    # One entry per example, regardless of deduplication.
    question_column = []
    truth_column = []

    for item in corpus:
        doc = item['document']
        summary_text = doc['summary']['text']
        document_text = doc['text']
        label = doc['kind'] + "\\" + doc['summary']['title']

        # Register the pair only when neither text has been seen before.
        seen_before = summary_text in meta_by_summary or document_text in meta_by_document
        if not seen_before:
            meta_by_summary[summary_text] = label
            meta_by_document[document_text] = label
            summary_chars += len(summary_text)
            document_chars += len(document_text)

        # Questions and answers are always recorded.
        question_column.append(item['question']['text'])
        truth_column.append("".join(ans['text'] + ", " for ans in item['answers']))

    # Averages over the unique documents (0 when there are none).
    unique_count = len(meta_by_summary)
    if unique_count > 0:
        mean_summary_chars = summary_chars / unique_count
        mean_document_chars = document_chars / unique_count
    else:
        mean_summary_chars = 0
        mean_document_chars = 0

    # Both dicts were filled in lockstep, so the three columns line up row by row.
    df_doc = pd.DataFrame({
        'summary': list(meta_by_summary),
        'document': list(meta_by_document),
        'metadata': list(meta_by_summary.values()),
    })

    row_count = len(question_column)
    df_qa = pd.DataFrame({
        'questions': question_column,
        'ground_truths': truth_column,
        'answers': [''] * row_count,
        'context': [''] * row_count,
    })

    print(f'唯一文檔數量: {unique_count}')
    print(f'問答對數量: {len(df_qa)}')
    print(f'平均摘要長度: {mean_summary_chars:.2f} 字符')
    print(f'平均文檔長度: {mean_document_chars:.2f} 字符')

    return df_doc, df_qa

In [None]:
# Export of the full train split, left disabled; the parquet files are read back instead.
# df_doc, df_qa = extract_narrativeqa_text(split="train")
# df_doc.to_parquet(".parquet/narrative_qa_doc_full.parquet")
# df_qa.to_parquet(".parquet/narrative_qa_qa_full.parquet")
# Reload the previously exported document and QA tables.
df_doc = pd.read_parquet(".parquet/narrative_qa_doc_full.parquet")
df_qa = pd.read_parquet(".parquet/narrative_qa_qa_full.parquet")

# Draw a reproducible 1% sample of the documents (fixed random_state) and persist it.
df_doc_sample = df_doc.sample(frac=0.01, random_state=42)
# df_qa_sample = df_qa.sample(frac=0.05, random_state=42)
df_doc_sample.to_parquet(".parquet/narrative_qa_doc_sample_11.parquet")
# NOTE(review): the QA sample export below is disabled, yet a later cell reads
# ".parquet/narrative_qa_qa_sample_11.parquet" — presumably left over from an earlier run; confirm.
# df_qa_sample.to_parquet(".parquet/narrative_qa_qa_sample_11.parquet")
# df_doc_sample

In [None]:
# Reload the persisted document sample from parquet.
df_doc_sample = pd.read_parquet(".parquet/narrative_qa_doc_sample_11.parquet")



# Transform the DataFrame into .txt files

In [None]:

import re
from bs4 import BeautifulSoup
import unicodedata

def preprocess_content(content: str) -> str:
    """Strip markup from ``content`` and normalise the remaining text.

    Steps, in order: extract the text with BeautifulSoup's ``html.parser``,
    apply NFKC normalisation, remove http(s) URLs, collapse runs of spaces and
    tabs, map typographic double quotes to ``"``, drop every character that is
    not a word character, whitespace or one of ``.,!?;:()"-``, collapse
    repeated punctuation, and force exactly one space after each of ``.,!?;:``.

    Caveats that follow from the patterns below: apostrophes are removed by
    the character filter, and any whitespace after a punctuation mark —
    including newlines — is replaced by a single space.

    Args:
        content: Raw document text, possibly containing HTML.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    soup = BeautifulSoup(content, 'html.parser')
    text = soup.get_text()

    # unify to NFKC normalization form
    text = unicodedata.normalize('NFKC', text)

    # remove url
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

    # remove extra whitespace
    text = re.sub(r'[ \t]+', ' ', text).strip()

    # unify quotes: this must run BEFORE the character filter below. NFKC leaves
    # the typographic quotes U+201C / U+201D untouched, and the filter would
    # delete them, so unifying afterwards (as before) could never have any effect.
    text = text.replace('\u201c', '"').replace('\u201d', '"')

    # remove special characters, but keep some punctuation
    text = re.sub(r'[^\w\s.,!?;:()"-]', '', text)

    # remove consecutive punctuation
    text = re.sub(r'([.,!?;:])\1+', r'\1', text)

    # ensure there is appropriate whitespace between sentences
    text = re.sub(r'([.,!?;:])\s*', r'\1 ', text)

    return text.strip()

def write_text_to_files_by_metadata(df):
    """Append each row's cleaned document to ``.txt/<sanitised metadata>.txt``.

    The ``metadata`` value is turned into a file stem by replacing spaces,
    ``/``, ``\\`` and ``:`` with ``_`` and dropping double quotes. The
    ``document`` value is passed through ``preprocess_content`` and appended
    to the file followed by two newlines. Rows that share a metadata value end
    up in the same file. Files are opened in append mode, so running the
    function again adds to existing files rather than replacing them.

    Args:
        df: DataFrame with at least the columns ``document`` and ``metadata``.

    Raises:
        ValueError: If either required column is missing.
    """
    # Ensure the dataframe has the required columns
    if not all(col in df.columns for col in ('document', 'metadata')):
        raise ValueError("Dataframe must contain 'document' and 'metadata' columns")

    os.makedirs(".txt/", exist_ok=True)

    # Single-pass replacement table for characters that are unsafe in file names.
    unsafe_chars = str.maketrans({" ": "_", "/": "_", "\\": "_", ":": "_", "\"": None})

    # Insertion-ordered record of the file stems written, for the summary line.
    written = {}

    for _, row in df.iterrows():
        stem = row['metadata'].translate(unsafe_chars)
        document = preprocess_content(row['document'])

        # Open per write instead of keeping one handle per metadata value open
        # for the whole loop: the number of simultaneously open files no longer
        # grows with the number of distinct metadata values.
        with open(f".txt/{stem}.txt", 'a', encoding='utf-8') as handle:
            handle.write(document + "\n\n")  # Add two newlines for separation
        written[stem] = None

    print(f"Files created: {', '.join(f'{stem}.txt' for stem in written)}")

In [None]:
# Write the sampled documents out as text files under .txt/.
write_text_to_files_by_metadata(df_doc_sample)


# Embed the .txt files into Milvus (GPU)

In [None]:
# Process the files under .txt/; the first argument is presumably the target collection name — confirm.
# NOTE(review): the meaning of the two positional True flags is not visible here — check DataProcessor.
data_processor.directory_files_process("narrative_qa_full_gpu", ".txt/", True, True)

# Import GraphRAG data into Neo4j

In [None]:
# Create the knowledge-graph database handle used by the following cells.
knowledge_DB = KnowledgeGraphDatabase()

In [None]:
# Hand the Graph RAG artifacts directory to the knowledge-graph database
# (going by the method name this loads them into Neo4j; implementation not visible here).
knowledge_DB.transform_graph_rag_to_neo4j(datapath="../graph_rag_sample/output/20240906-153334/artifacts")

In [None]:
# Quick check of global retrieval; the result is displayed as the cell's last expression.
# NOTE(review): the meaning of the 0 argument is not visible here — confirm against Retriever.
retriever = Retriever()
retriever.global_retrieve(0)

# Test Modular RAG

In [2]:
import pandas as pd

# Load the sampled QA table and keep the first five questions with their
# reference answers for a quick end-to-end run.
rag_evaluation_dataset = pd.read_parquet(".parquet/narrative_qa_qa_sample_11.parquet")
dataset_queries = rag_evaluation_dataset["questions"].head(5).tolist()
print(dataset_queries)
# print(vector_DB.list_collections())
answer = rag_evaluation_dataset["ground_truths"].head(5).tolist()
print(answer)


['During the last decade what kind of life has Alexis Paulvitch lived?', 'Where does the fifth incarnation take place?', 'What is the importance of the map in this story?', 'Who begins a musical career?', 'What does Mercury peresuade Clotho to do?']
['A life of abuse and disease among tribal people. A life of abuse and disease. ', 'New York New York ', 'The map shows the location of Luke Skywalker. It holds the location of Luke ', 'Nat Nat ', 'Kill the emperor kill the emperor ']


In [3]:
from Module import *
from Config.output_pydantic import *
from langchain_core.runnables.config import RunnableConfig

# Very high recursion_limit — presumably so long graph runs are not cut off; confirm it is intentional.
config = RunnableConfig(recursion_limit=100000)
workflow = WorkFlowModularHybridRAG()

# Run the workflow graph over the sample queries.
# NOTE(review): this targets "narrative_qa_standard_gpu", while the embedding cell earlier in the
# notebook uses "narrative_qa_full_gpu" — confirm which collection is intended.
results = workflow.graph.invoke({
    "specific_collection": "narrative_qa_standard_gpu",
    "dataset_queries": dataset_queries,
}, config=config)

# NOTE(review): a later cell indexes results["all_results"], so `results` looks dict-like
# and this loop would print its keys rather than the answers — confirm.
for result in results:
    print(result)


2024-09-14 21:11:55,439 - 140595937678848 - milvus_client.py-milvus_client:658 - DEBUG: Created new connection using: a63e8786d6fc4053ae4d56b77479d096


Initializing sparse embedder...
Embedder initialized
Connected to Milvus at localhost:19530 with database default.
VectorDatabase initialized.
GraphDatabase initialized.
Retriever initialized
Agents initialized (with model: "gpt-4o-mini" and temperature: "0.1")
Tasks initialized
MultiAgent RAG System initialized
index =  0
total =  5
[1m[95m [2024-09-14 21:11:55][DEBUG]: == Working Agent: Query Classifier[00m
[1m[95m [2024-09-14 21:11:55][INFO]: == Starting Task: 
Analyze the following user query and determine if it requires information retrieval to be answered accurately, while also evaluating its global or local scope:

User Query: "During the last decade what kind of life has Alexis Paulvitch lived?"

Your task is to:
1. Classify this query as either requiring retrieval or not (Boolean value)
2. Evaluate the query's domain range score (Integer value from 0 to 100)
3. Provide a brief justification for your decision (String value)
4. Pick up the most relevant keywords or entities



[1m[92m [2024-09-14 21:11:57][DEBUG]: == [Query Classifier] Task output: needs_retrieval=True domain_range_score=80 justification='The query asks for specific information about the life of a particular individual, Alexis Paulvitch, over the last decade, which likely requires external information retrieval to provide an accurate and detailed response.' relevant_keywords=['life', 'Alexis Paulvitch', 'last decade']

[00m
[1m[95m [2024-09-14 21:11:58][DEBUG]: == Working Agent: Reranker[00m
[1m[95m [2024-09-14 21:11:58][INFO]: == Starting Task: 
Your task is to evaluate each community's relevance to the user's query or sub-queries relevant to the user's query.
User Query: "During the last decade what kind of life has Alexis Paulvitch lived?"
And the sub-queries: "[]"

Your specific responsibilities are:
1. Compare each community to the user's query and sub-queries.
2. Assign a relevance score to each community based on how well it matches the user's query and sub-queries from 0 to 1



[1m[92m [2024-09-14 21:11:59][DEBUG]: == [Reranker] Task output: relevant_scores=[0, 0, 0, 0, 0, 0, 0]

[00m
[1m[95m [2024-09-14 21:11:59][DEBUG]: == Working Agent: Topic Searcher[00m
[1m[95m [2024-09-14 21:11:59][INFO]: == Starting Task: 
You have received multiple pieces of community information related to a user query or sub-queries decomposed from the user query by descending relevance scores.

----------------User Query----------------
During the last decade what kind of life has Alexis Paulvitch lived?

----------------Sub-queries----------------
[]

Your task is to analyze this information and help prepare for a vector database search to answer the user's query.

Follow these steps:
0. If no community information is provided, return 2 empty list.
1. Carefully read and analyze all provided community information.
2. Summarize the key points from this information and from the user query into concise community summaries.
3. Based on these summaries, imagine what relevant doc

In [3]:
# Display the "all_results" entry of the workflow output.
results["all_results"]

['Over the last decade, Alexis Paulvitch has lived a life characterized by extreme suffering, degradation, and emotional turmoil. Initially fleeing into the jungle to escape the wrath of Tarzan, he fell into the hands of a savage cannibal tribe. This marked the beginning of a harrowing existence where he became the target of relentless abuse, beaten and tortured by the tribe\'s members. His physical state deteriorated significantly; he suffered from recurrent fevers, smallpox, and the visible scars of his torment, making him appear much older than his actual age. \n\nDespite his suffering, Paulvitch\'s mental state is equally tragic. He is consumed by a "dull hatred" towards those who wronged him, particularly his former associate Rokoff, who led him into this nightmare. His thoughts are filled with resentment towards the world around him, including law enforcement and societal order, reflecting a profound emotional disconnection and despair. \n\nAfter a decade of misery, he was rescue

In [4]:
# Display the reference answers loaded earlier, for comparison with the generated ones.
answer

['A life of abuse and disease among tribal people. A life of abuse and disease. ',
 'New York New York ',
 'The map shows the location of Luke Skywalker. It holds the location of Luke ',
 'Nat Nat ',
 'Kill the emperor kill the emperor ']

In [None]:
from Module import *
from Config.output_pydantic import *
# Unit-function test variant of the workflow, driven by a single user query.
workflow = WorkFlowModularHybridRAG_Unit_Function_Test()

results = workflow.graph.invoke({
    "specific_collection": "narrative_qa_full_gpu",
    "user_query": "What is the main topic of this dataset?",
})

# Print each item obtained by iterating the returned result.
for result in results:
    print(result)

In [None]:

# Test out Retriever for local retriever and global retriever

# Test out Retriever for local retriever and global retriever

In [None]:
from Utils import *
retriever = Retriever()
knowledge_DB = KnowledgeGraphDatabase()
# Index / weight creation calls, left disabled.
# knowledge_DB.create_entity_vector_index()
# knowledge_DB.create_community_weight()
# Exercise both retrieval paths and keep the results for inspection.
# NOTE(review): the meaning of the 0 argument to global_retrieve is not visible here — confirm.
global_result = retriever.global_retrieve(0)
local_result = retriever.local_retrieve(["What is the meaning of life"])

In [None]:
# Display the value returned by the local retrieval call.
local_result

In [None]:
# Display the "communities" entry of the global retrieval result.
global_result["communities"]

In [None]:
import Config.constants as const
import json
from MultiAgent import *
from Utils import *

retriever = Retriever()
multi_agent = MultiAgent_RAG()
# Disabled experiment: split the globally retrieved communities into batches of
# const.NODE_BATCH_SIZE and run the batched topic reranking against a fixed query.
# all_communities = retriever.global_retrieve(0)["communities"]

# batches = []
# for i in range(0, len(all_communities), const.NODE_BATCH_SIZE):
#     batch_communities = all_communities[i:i + const.NODE_BATCH_SIZE]
#     batches.append({
#         "user_query": "What is the meaning of life",
#         "sub_queries": [],
#         "batch_communities": batch_communities,
#         "batch_size": len(batch_communities),
#     })


# all_scores = multi_agent.topic_reranking_run_batch_async(node_batch_inputs=batches).relevant_scores
# print(all_scores)
# print(len(all_scores))
# print(len(all_communities))

# Run only the query-classification step on one hand-written query; the result is displayed by the cell.
multi_agent.user_query_classification_run(user_query="Why does the author choose to use first-person point of view in this article?")



