# Load dataset from NarrativeQA

We use the NarrativeQA dataset for question generation and RAG evaluation.

In [None]:
import pandas as pd
# NOTE(review): wildcard import. DatasetLoader, VectorDatabase, Embedder and DataProcessor
# used below are not imported explicitly anywhere in this cell, so they presumably come
# from Utils — confirm, and prefer explicit imports so the dependencies are visible.
from Utils import *
data_loader = DatasetLoader()
from datasets import load_dataset

# Shared objects reused by later cells (data_processor is used in the Milvus embedding cell).
vector_DB = VectorDatabase()
embedder = Embedder()
# The processor is wired to the embedder and vector database instances created above.
data_processor = DataProcessor(embedder=embedder, vectordatabase=vector_DB)

In [None]:
def extract_narrativeqa_text(split='train', dataset=None):
    """Build a de-duplicated document table and a question/answer table from NarrativeQA.

    Args:
        split: Split name passed to ``load_dataset("deepmind/narrativeqa", ...)``.
            Ignored when ``dataset`` is supplied.
        dataset: Optional pre-loaded iterable of examples. Each example must expose
            ``example['document']['summary']['text']``,
            ``example['document']['summary']['title']``, ``example['document']['text']``,
            ``example['document']['kind']``, ``example['question']['text']`` and
            ``example['answers']`` (a list of dicts with a ``'text'`` key).
            When ``None`` (the default) the dataset is loaded with ``load_dataset``,
            exactly as before.

    Returns:
        tuple: ``(df_doc, df_qa)`` where

        * ``df_doc`` has columns ``summary``, ``document`` and ``metadata``
          (``"<kind>\\<title>"``), one row per unique summary/document pair, and
        * ``df_qa`` has columns ``questions``, ``ground_truths``, ``answers`` and
          ``context``, one row per example. ``answers`` and ``context`` are empty
          strings. Each ground truth is the concatenation of all reference answers,
          each followed by a single space (so the value ends with a trailing space).

    Side effects:
        Prints the number of unique documents, the number of QA pairs and the average
        summary/document length in characters.
    """
    # Only hit the dataset hub when the caller did not hand us examples directly.
    if dataset is None:
        dataset = load_dataset("deepmind/narrativeqa", split=split)

    # text -> metadata; dicts keep insertion order, which keeps the two tables row-aligned
    unique_summaries = {}
    unique_documents = {}

    total_summary_chars = 0
    total_document_chars = 0

    questions = []
    answers = []

    for example in dataset:
        doc_info = example['document']
        summary = doc_info['summary']['text']
        document = doc_info['text']
        metadata = doc_info['kind'] + "\\" + doc_info['summary']['title']

        # A row is added to df_doc only when BOTH the summary and the document are new.
        # Both dicts are always updated together, so they stay the same length.
        if summary not in unique_summaries and document not in unique_documents:
            unique_summaries[summary] = metadata
            unique_documents[document] = metadata
            total_summary_chars += len(summary)
            total_document_chars += len(document)

        # Every example contributes a QA row, regardless of document de-duplication.
        questions.append(example['question']['text'])
        # Each answer is followed by a space (trailing space kept for output compatibility).
        answers.append("".join(answer['text'] + " " for answer in example['answers']))

    summaries = list(unique_summaries.keys())
    documents = list(unique_documents.keys())
    metadata = [unique_summaries[s] for s in summaries]  # metadata aligned with summaries

    # Averages over unique documents; guarded against an empty dataset.
    num_examples = len(summaries)
    avg_summary_chars = total_summary_chars / num_examples if num_examples > 0 else 0
    avg_document_chars = total_document_chars / num_examples if num_examples > 0 else 0

    df_doc = pd.DataFrame({
        'summary': summaries,
        'document': documents,
        'metadata': metadata
    })

    # 'answers' and 'context' are placeholders, created as empty strings here.
    df_qa = pd.DataFrame({
        'questions': questions,
        'ground_truths': answers,
        'answers': ['' for _ in range(len(questions))],
        'context': ['' for _ in range(len(questions))]
    })

    print(f'唯一文檔數量: {num_examples}')
    print(f'問答對數量: {len(df_qa)}')
    print(f'平均摘要長度: {avg_summary_chars:.2f} 字符')
    print(f'平均文檔長度: {avg_document_chars:.2f} 字符')

    return df_doc, df_qa

In [None]:
import os

# Fixed seed so the 1% samples (and everything built from them downstream) are
# reproducible across kernel restarts; previously every run drew different rows.
SAMPLE_SEED = 42

df_doc, df_qa = extract_narrativeqa_text(split="train")
# NOTE(review): the two samples are drawn independently and df_qa carries no document
# key, so the sampled questions are not guaranteed to refer to the sampled documents.
df_doc_sample = df_doc.sample(frac=0.01, random_state=SAMPLE_SEED)
df_qa_sample = df_qa.sample(frac=0.01, random_state=SAMPLE_SEED)
# to_parquet does not create missing parent directories.
os.makedirs(".parquet", exist_ok=True)
df_doc_sample.to_parquet(".parquet/narrative_qa_doc_sample_11.parquet")
df_qa_sample.to_parquet(".parquet/narrative_qa_qa_sample_11.parquet")
df_doc_sample

In [None]:
# load parquet
# Reload the persisted samples so the cells below can run without re-sampling the dataset.
df_doc_sample = pd.read_parquet(".parquet/narrative_qa_doc_sample_11.parquet")
df_qa_sample = pd.read_parquet(".parquet/narrative_qa_qa_sample_11.parquet")
df_qa_sample



# Transform the DataFrame into .txt files

In [None]:

import re
from bs4 import BeautifulSoup
import unicodedata

def preprocess_content(content: str) -> str:
    """Strip markup and normalise raw document text.

    Steps, in order: extract text with BeautifulSoup's ``html.parser``, apply NFKC
    normalisation, remove ``http(s)://`` URLs, collapse runs of spaces/tabs, convert
    curly double quotes to ``"``, drop every character that is not a word character,
    whitespace or one of ``. , ! ? ; : ( ) " -``, collapse repeated punctuation, and
    put exactly one space after each of ``. , ! ? ; :`` (any whitespace that followed
    the punctuation, including line breaks, is replaced by that single space).

    Args:
        content: Raw document text, possibly containing HTML markup.

    Returns:
        The cleaned text, stripped of leading/trailing whitespace. Falsy input
        (``""`` or ``None``) yields ``""``.
    """
    if not content:
        return ""

    soup = BeautifulSoup(content, 'html.parser')
    text = soup.get_text()

    # unify to NFKC normalization form
    text = unicodedata.normalize('NFKC', text)

    # remove url
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

    # remove extra whitespace (spaces and tabs only; newlines are kept at this point)
    text = re.sub(r'[ \t]+', ' ', text).strip()

    # unify quotes: this must run BEFORE the character filter below, otherwise the
    # curly quotes (U+201C / U+201D) are deleted by the filter instead of converted.
    text = text.replace('\u201c', '"').replace('\u201d', '"')

    # remove special characters, but keep some punctuation
    # NOTE(review): apostrophes are not in the keep-list, so contractions lose them
    # ("don't" -> "dont") — confirm this is intended.
    text = re.sub(r'[^\w\s.,!?;:()"-]', '', text)

    # remove consecutive punctuation
    text = re.sub(r'([.,!?;:])\1+', r'\1', text)

    # ensure there is exactly one space after sentence punctuation
    text = re.sub(r'([.,!?;:])\s*', r'\1 ', text)

    return text.strip()

def write_text_to_files_by_metadata(df):
    """Append each preprocessed document to ``.txt/<metadata>.txt``.

    Rows sharing the same ``metadata`` value are appended to the same file, each
    document followed by a blank line. Files are opened in append mode, so calling
    this again with the same data appends the documents a second time.

    Args:
        df: DataFrame with a ``document`` column (raw text passed through
            ``preprocess_content``) and a ``metadata`` column (used as the file name).

    Raises:
        ValueError: If ``df`` lacks the ``document`` or ``metadata`` column.

    Side effects:
        Creates the ``.txt`` directory if needed, writes the files, and prints the
        names of the files written.
    """
    import os  # local import: this cell does not otherwise have ``os`` in scope

    # Ensure the dataframe has the required columns
    if not all(col in df.columns for col in ['document', 'metadata']):
        raise ValueError("Dataframe must contain 'document' and 'metadata' columns")

    output_dir = ".txt"
    os.makedirs(output_dir, exist_ok=True)

    # Path separators of the current platform; a title containing one would otherwise
    # point into a sub-directory that does not exist and make open() fail.
    separators = [sep for sep in (os.sep, os.altsep) if sep]

    # metadata value -> target path (insertion-ordered, used for the summary message)
    target_paths = {}

    for _, row in df.iterrows():
        metadata = row['metadata']
        document = preprocess_content(row['document'])

        if metadata not in target_paths:
            safe_name = str(metadata)
            for sep in separators:
                # Two titles that differ only by a separator vs. "_" share one file.
                safe_name = safe_name.replace(sep, "_")
            target_paths[metadata] = os.path.join(output_dir, f"{safe_name}.txt")

        # Open per write instead of keeping one handle per file open for the whole
        # loop; append mode makes this equivalent while bounding open descriptors.
        with open(target_paths[metadata], 'a', encoding='utf-8') as handle:
            handle.write(document + "\n\n")  # Add two newlines for separation

    print(f"Files created: {', '.join(os.path.basename(path) for path in target_paths.values())}")

In [None]:
# Write the sampled documents to .txt/<metadata>.txt.
# Files are opened in append mode, so re-running this cell appends the same documents again.
write_text_to_files_by_metadata(df_doc_sample)

# Embed the .txt files into Milvus (GPU)

In [None]:
# Process every file under .txt/ with the DataProcessor created in the setup cell;
# the first argument is presumably the target Milvus collection name — TODO confirm.
# NOTE(review): the two positional booleans are opaque from here — confirm their meaning
# against DataProcessor.directory_files_process and consider passing them by keyword.
data_processor.directory_files_process("narrative_qa_sample_gpu", ".txt/", True, True)

# Import GraphRAG data into Neo4j

In [None]:
# KnowledgeGraphDatabase is not imported explicitly; presumably it is provided by
# `from Utils import *` in the setup cell — TODO confirm.
knowledge_DB = KnowledgeGraphDatabase()

In [None]:
# NOTE(review): the datapath is hard-coded to one timestamped run directory
# (20240906-153334); update it when the GraphRAG artifacts are regenerated.
knowledge_DB.transform_graph_rag_to_neo4j(datapath="../graph_rag_sample/output/20240906-153334/artifacts")

In [None]:
# Smoke test of global retrieval after the graph import.
# NOTE(review): the meaning of the argument 0 is not visible here (presumably a
# community level) — confirm against Retriever.global_retrieve.
retriever = Retriever()
retriever.global_retrieve(0)

# Test Modular RAG

In [None]:
import pandas as pd
# Load the persisted QA sample and take its first two questions as test queries.
rag_evaluation_dataset = pd.read_parquet(".parquet/narrative_qa_qa_sample_11.parquet")
queries = rag_evaluation_dataset["questions"].tolist()[:2]
print(queries)
# Debug helper, left disabled:
# print(vector_DB.list_collections())




In [1]:
# NOTE(review): wildcard imports; WorkFlowModularHybridRAG presumably comes from Module — confirm.
from Module import *
from Config.output_pydantic import *
workflow = WorkFlowModularHybridRAG()

# Stream the workflow graph with an initial state dict; each streamed item is printed below.
# NOTE(review): "narrative_qa_standard_gpu" differs from the collection name used by the
# embedding cell earlier in this notebook ("narrative_qa_sample_gpu") — confirm which
# collection is intended.
results = workflow.graph.stream({
    "repeat_times": 0,
    "specific_collection": "narrative_qa_standard_gpu",
    "user_query": "What is the main topic of this dataset?",
    "all_results": [],
    "all_contexts": [[]],
})

for result in results:
    print(result)

* 'allow_population_by_field_name' has been renamed to 'populate_by_name'
* 'smart_union' has been removed
2024-09-11 14:32:08,068 - 137473468536320 - milvus_client.py-milvus_client:658 - DEBUG: Created new connection using: 300cfc5147644c23a064d29039cc2c6a


Initializing sparse embedder...
Embedder initialized
Connected to Milvus at localhost:19530 with database default.
VectorDatabase initialized.
GraphDatabase initialized.
Retriever initialized
Agents initialized (with model: "gpt-4o-mini" and temperature: "0.1")
Tasks initialized
MultiAgent RAG System initialized
[1m[95m [2024-09-11 14:32:08][DEBUG]: == Working Agent: Query Classifier[00m
[1m[95m [2024-09-11 14:32:08][INFO]: == Starting Task: 
Analyze the following user query and determine if it requires information retrieval to be answered accurately, while also evaluating its global or local scope:

User Query: "What is the main topic of this dataset?"

Your task is to:
1. Classify this query as either requiring retrieval or not
2. Evaluate the query's domain range score

Consider the following guidelines:

1. Queries that typically require retrieval:
   - Specific factual information (e.g., historical dates, statistics, current events)
   - Detailed explanations of complex topic



[1m[92m [2024-09-11 14:32:11][DEBUG]: == [Query Classifier] Task output: needs_retrieval=True domain_range_score=71 justification="The query 'What is the main topic of this dataset?' requires information retrieval because it pertains to a specific dataset, which likely contains unique content that cannot be answered with general knowledge. The scope is broad, as understanding the main topic of a dataset typically involves analyzing its contents, which may vary significantly across different datasets. Therefore, it falls into the category of requiring detailed factual information and integration of knowledge from various sources."

[00m
{'user_query_classification_node': {'user_query_classification_result': UserQueryClassificationResult(needs_retrieval=True, domain_range_score=71, justification="The query 'What is the main topic of this dataset?' requires information retrieval because it pertains to a specific dataset, which likely contains unique content that cannot be answered with



[1m[92m [2024-09-11 14:32:15][DEBUG]: == [Reranker] Task output: relevant_scores=[40, 30, 50, 60]

[00m
[1m[95m [2024-09-11 14:32:15][DEBUG]: == Working Agent: Topic Searcher[00m
[1m[95m [2024-09-11 14:32:15][INFO]: == Starting Task: 
You have received multiple pieces of community information related to a user query or sub-queries decomposed from the user query by descending relevance scores.

----------------User Query----------------
What is the main topic of this dataset?

----------------Sub-queries----------------
[]

Your task is to analyze this information and help prepare for a vector database search to answer the user's query.

Follow these steps:
0. If no community information is provided, return 2 empty list.
1. Carefully read and analyze all provided community information.
2. Summarize the key points from this information and from the user query into concise community summaries.
3. Based on these summaries, imagine what relevant document chunks might exist in a vect



[1m[95m [2024-09-11 14:33:15][DEBUG]: == Working Agent: Reranker[00m
[1m[95m [2024-09-11 14:33:15][INFO]: == Starting Task: 
Your task is to evaluate each retrieved data item's relevance to the user's query or sub-queries relevant to the user's query.
-----User Query-----
"What is the main topic of this dataset?"

-----Sub-queries-----
"[]"

Your specific responsibilities are:
1. Compare each data item to the user's query and sub-queries.
2. Assign a relevance score to each data item based on how well it matches the user's query from 0 to 100.
   - Higher scores indicate better relevance.
3. Create a list of these relevance scores, don't include any other information.
4. CRITICAL: Ensure the number of scores in your output list EXACTLY matches the number of data items :5 in the input.

You will receive a list of 5 data items. 
----------------batch_retrieved_data----------------
[{'content': "<b>\t\t\t\tGEORGE\n</b>\t\tYes, he's sitting right there at the end\n\t\tof the table.\n\



[1m[92m [2024-09-11 14:33:20][DEBUG]: == [Reranker] Task output: relevance_scores=[10, 20, 30, 40, 50]

[00m
[1m[95m [2024-09-11 14:33:21][DEBUG]: == Working Agent: Information Organizer[00m
[1m[95m [2024-09-11 14:33:21][INFO]: == Starting Task: 
Input:
3. User Query: What is the main topic of this dataset?
4. Sub-queries relevant to the user's query: []

Your tasks:
1. Carefully review and categorize the retrieved data and topic information.
2. Organize the information into coherent themes or topics, even if the original data is fragmented.
3. Preserve the original language, especially modal verbs like "shall", "may", "must", etc., that indicate levels of certainty or possibility.
4. Identify and highlight any connections or contradictions between different pieces of information.
5. Structure the data in a way that facilitates easy understanding and future use, without losing important details or nuances.
6. Include relevant metadata, organizing it alongside the corresponding 



[32;1m[1;3mI now can give a great answer
Final Answer: 

**1. Themes and Topics:**

**A. Drug Trade and Criminality**
- **George's Involvement in Drug Smuggling:**
  - "I was arrested in Chicago with six hundred and sixty pounds of grass. I think that qualifies me."
  - "The first year we made about 100 million dollars between us. It was an expensive operation."
  - "He was very anti-government. He talked about revolution, forming his own country or island, he was looking for power as well as money."
  - "You failed because you had the wrong dream." - Diego to George, indicating the moral complexities of their choices.

**B. Familial Relationships and Redemption**
- **George's Relationship with Family:**
  - "I want my kid out of protective custody. Now. No fucking around."
  - "His mother's disappointment in his criminal lifestyle creates a rift, while his longing to reconnect with Kristina illustrates his desire for redemption."
- **Emotional Conflict:**
  - "George's relationships

# Test the Retriever: local and global retrieval

In [1]:
# NOTE(review): wildcard import; Retriever and KnowledgeGraphDatabase presumably come from Utils.
from Utils import *
retriever = Retriever()
knowledge_DB = KnowledgeGraphDatabase()
# Graph preparation steps, left disabled here (presumably one-time setup — TODO confirm):
# knowledge_DB.create_entity_vector_index()
# knowledge_DB.create_community_weight()
# Run one global and one local retrieval; the results are inspected in the next cells.
global_result = retriever.global_retrieve(0)
local_result = retriever.local_retrieve(["What is the meaning of life"])

Initializing sparse embedder...
Embedder initialized
Connected to Milvus at localhost:19530 with database default.
VectorDatabase initialized.
GraphDatabase initialized.
Retriever initialized
GraphDatabase initialized.


In [2]:
# Display the full local-retrieval result (the recorded output starts with a 'text_mapping' key).
local_result

{'text_mapping': [' with dirt. Craig\n looks up at her. CRAIG\n You were him last night, werent you? LOTTE\n (quietly)\n Yes. CRAIG\n And he was with her. LOTTE\n We love her, Craig. Im sorry. CRAIG\n We? LOTTE\n Me and John. CRAIG\n Dont forget me. LOTTE\n Well, you have the Maxine action\n figure to play with. Craig looks down at his coffee. LOTTE (CONTD)\n Im sorry. That was nasty. CRAIG\n Life is confusing, isnt it? LOTTE\n Sometimes were forced to make\n hard decisions. (beat)\n Id like for us to stay together, Craig. You know, platonically, if thats possible. I truly value\n our friendship. CRAIG\n I feel that somehow my parents never\n prepared me to make this particular\n decision. Not that I blame them. How\n could they know? Todays world is so\n complicated. (beat)\n No. I have to go away now. Im sorry, Lotte. Im so sorry. CUT TO: INT. CRAIG AND MAXINES OFFICE - MORNING\n\n Craig enters with red-rimmed eyes. Maxine sits at her\n desk, actually looking kind of radiant. MAXINE\

In [3]:
# Display the community reports returned by the global retrieval.
global_result["communities"]

["# The Wedding-Guest and the Ancient Mariner\n\nThis community centers around the interaction between the Wedding-Guest and the Ancient Mariner, highlighting the emotional and narrative dynamics that unfold during their encounter. The Wedding-Guest serves as a bridge between the audience and the Mariner's haunting tale, reflecting themes of fear, curiosity, and transformation amidst the backdrop of a wedding celebration.\n\n## The Wedding-Guest as a narrative conduit\n\nThe Wedding-Guest is a pivotal character who embodies the audience's engagement with the Ancient Mariner's tale. His initial fear and subsequent curiosity reflect the emotional journey that listeners often experience when confronted with unsettling stories. This dynamic illustrates how storytelling can evoke strong emotions and create a connection between the storyteller and the audience. The Wedding-Guest's reactions serve to amplify the Mariner's narrative, making the audience's experience more immersive and impactfu

In [6]:
import Config.constants as const
import json
# NOTE(review): wildcard imports; MultiAgent_RAG and Retriever presumably come from these modules.
from MultiAgent import *
from Utils import *

retriever = Retriever()
multi_agent = MultiAgent_RAG()
# Disabled experiment: split the retrieved communities into batches of
# const.NODE_BATCH_SIZE and rerank them with the multi-agent system.
# all_communities = retriever.global_retrieve(0)["communities"]

# batches = []
# for i in range(0, len(all_communities), const.NODE_BATCH_SIZE):
#     batch_communities = all_communities[i:i + const.NODE_BATCH_SIZE]
#     batches.append({
#         "user_query": "What is the meaning of life",
#         "sub_queries": [],
#         "batch_communities": batch_communities,
#         "batch_size": len(batch_communities),
#     })


# all_scores = multi_agent.topic_reranking_run_batch_async(node_batch_inputs=batches).relevant_scores
# print(all_scores)
# print(len(all_scores))
# print(len(all_communities))

# Classify a single query; the recorded output shows a UserQueryClassificationResult.
multi_agent.user_query_classification_run(user_query="Why does the author choose to use first-person point of view in this article?")





2024-09-10 14:44:13,780 - 133405013833216 - milvus_client.py-milvus_client:658 - DEBUG: Created new connection using: eb1140a954114038bfee403fbc57e0dc


Initializing sparse embedder...
Embedder initialized
Connected to Milvus at localhost:19530 with database default.
VectorDatabase initialized.
GraphDatabase initialized.
Retriever initialized
Agents initialized (with model: "gpt-4o-mini" and temperature: "0.1")
Tasks initialized
LLMMA RAG System initialized


[1m> Entering new CrewAgentExecutor chain...[0m
[32;1m[1;3mThought: I now can give a great answer  
Final Answer: UserQueryClassificationResult(needs_retrieval=False, domain_range_score=36, justification="The query asks about the author's choice of narrative perspective in an article, which can typically be answered through analysis of the text itself rather than requiring external information retrieval. The scope is moderate as it pertains to literary analysis, which may involve some integration of concepts but does not necessitate specific factual lookup.")[0m

[1m> Finished chain.[0m
[93m Error parsing JSON: Expecting value: line 1 column 1 (char 0). Attempting to handl

UserQueryClassificationResult(needs_retrieval=False, domain_range_score=36, justification="The query asks about the author's choice of narrative perspective in an article, which can typically be answered through analysis of the text itself rather than requiring external information retrieval. The scope is moderate as it pertains to literary analysis, which may involve some integration of concepts but does not necessitate specific factual lookup.")