# Load dataset from NarrativeQA

We use the NarrativeQA dataset for question generation and RAG evaluation.

In [5]:
# Notebook setup: imports plus the shared objects that later cells rely on.
import pandas as pd
# NOTE(review): the star import presumably supplies DatasetLoader, VectorDatabase,
# Embedder and DataProcessor used below — confirm against Utils.
from Utils import *
data_loader = DatasetLoader()
from datasets import load_dataset
import os

# Build the vector database client and the embedder, then hand both to the data processor.
vector_DB = VectorDatabase()
embedder = Embedder()
data_processor = DataProcessor(embedder=embedder, vectordatabase=vector_DB)

DatasetLoads initialized
Connected to Milvus at localhost:19530 with database default.
VectorDatabase initialized.
Initializing sparse embedder...
Embedder initialized
Data Processor initialized


[nltk_data] Downloading package words to /home/yarikama/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [6]:
# Show which collections already exist in the connected vector database.
vector_DB.list_collections()

['narrative_qa_full_gpu',
 'alice',
 'narrative_test_cpu',
 'narrative_qa_standard_gpu',
 'alice2',
 'narrative_qa_sample_gpu',
 'squad',
 'hotpot_qa',
 'narrative_test']

In [7]:
def extract_narrativeqa_text(split='train'):
    """Build a document table and a question/answer table from NarrativeQA.

    Loads the requested split of ``deepmind/narrativeqa`` and walks it once.

    Args:
        split: Name of the dataset split passed to ``load_dataset``.

    Returns:
        A ``(df_doc, df_qa)`` tuple of DataFrames.

        ``df_doc`` has one row per unique (summary, document) pair with the
        columns ``summary``, ``document`` and ``metadata``; ``metadata`` is the
        document kind and the summary title joined by a single backslash.

        ``df_qa`` has one row per dataset example with the columns
        ``questions``, ``ground_truths`` (every answer text of the example
        concatenated, each followed by one space), and the empty-string
        placeholder columns ``answers`` and ``context``.

    Side effects:
        Prints the number of unique documents, the number of QA pairs and the
        average summary/document length in characters.
    """
    narrativeqa = load_dataset("deepmind/narrativeqa", split=split)

    # Insertion-ordered maps from text to its metadata label; the keys double
    # as the "already seen" sets used for de-duplication.
    summary_to_label = {}
    document_to_label = {}

    questions = []
    answers = []

    for example in narrativeqa:
        doc_info = example['document']
        summary = doc_info['summary']['text']
        document = doc_info['text']
        label = doc_info['kind'] + "\\" + doc_info['summary']['title']

        # A pair is recorded only when neither its summary nor its document
        # text has been seen before, so both maps always grow together.
        already_seen = summary in summary_to_label or document in document_to_label
        if not already_seen:
            summary_to_label[summary] = label
            document_to_label[document] = label

        # Every example contributes a question/answer row, duplicates included.
        questions.append(example['question']['text'])
        answers.append("".join(answer['text'] + " " for answer in example['answers']))

    summaries = list(summary_to_label)
    documents = list(document_to_label)
    metadata = list(summary_to_label.values())  # same order as `summaries`

    # Average lengths over the unique pairs; fall back to 0 for an empty split.
    num_examples = len(summaries)
    if num_examples > 0:
        avg_summary_chars = sum(map(len, summaries)) / num_examples
        avg_document_chars = sum(map(len, documents)) / num_examples
    else:
        avg_summary_chars = 0
        avg_document_chars = 0

    df_doc = pd.DataFrame({
        'summary': summaries,
        'document': documents,
        'metadata': metadata
    })

    # `answers` and `context` start out blank, one empty string per question.
    df_qa = pd.DataFrame({
        'questions': questions,
        'ground_truths': answers,
        'answers': [''] * len(questions),
        'context': [''] * len(questions)
    })

    print(f'唯一文檔數量: {num_examples}')
    print(f'問答對數量: {len(df_qa)}')
    print(f'平均摘要長度: {avg_summary_chars:.2f} 字符')
    print(f'平均文檔長度: {avg_document_chars:.2f} 字符')

    return df_doc, df_qa

In [19]:
# One-time extraction, kept for reference: build both frames from the dataset and
# cache them as parquet so later runs can read the cache instead.
# df_doc, df_qa = extract_narrativeqa_text(split="train")
# df_doc.to_parquet(".parquet/narrative_qa_doc_full.parquet")
# df_qa.to_parquet(".parquet/narrative_qa_qa_full.parquet")
# Load the cached full document and QA frames.
df_doc = pd.read_parquet(".parquet/narrative_qa_doc_full.parquet")
df_qa = pd.read_parquet(".parquet/narrative_qa_qa_full.parquet")

# Draw a reproducible 1% sample of the documents (fixed random_state) and cache it.
df_doc_sample = df_doc.sample(frac=0.01, random_state=42)
# df_qa_sample = df_qa.sample(frac=0.05, random_state=42)
df_doc_sample.to_parquet(".parquet/narrative_qa_doc_sample_11.parquet")
# df_qa_sample.to_parquet(".parquet/narrative_qa_qa_sample_11.parquet")
# df_doc_sample

In [20]:
# Reload the cached document sample from parquet.
df_doc_sample = pd.read_parquet(".parquet/narrative_qa_doc_sample_11.parquet")



# Write the DataFrame documents to .txt files

In [21]:

import re
from bs4 import BeautifulSoup
import unicodedata

def preprocess_content(content: str) -> str:
    """Strip markup from ``content`` and normalize it to plain text.

    The steps run in this order:

    1. Parse with BeautifulSoup's ``html.parser`` and keep only the text.
    2. Apply Unicode NFKC normalization.
    3. Convert typographic double quotes (U+201C, U+201D) to a straight ``"``.
    4. Remove ``http://`` / ``https://`` URLs.
    5. Collapse runs of spaces and tabs into one space and trim the ends.
    6. Drop every character that is not a word character, whitespace, or one
       of ``. , ! ? ; : ( ) " -``.
    7. Collapse a repeated punctuation mark (e.g. ``...``) into a single one.
    8. Force exactly one space after each of ``. , ! ? ; :``.

    Args:
        content: Raw document text, possibly containing HTML.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    soup = BeautifulSoup(content, 'html.parser')
    text = soup.get_text()

    # unify to NFKC normalization form
    text = unicodedata.normalize('NFKC', text)

    # unify quotes: this has to happen before the character filter below,
    # because that filter keeps only the straight '"' and would otherwise
    # delete typographic quotes instead of converting them.
    text = text.replace('\u201c', '"').replace('\u201d', '"')

    # remove url (note: `$-_` inside the character class is a range, which is
    # what lets characters such as '/' and ':' be consumed as part of the URL)
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

    # remove extra whitespace (spaces and tabs only; newlines are left alone here)
    text = re.sub(r'[ \t]+', ' ', text).strip()

    # remove special characters, but keep some punctuation
    text = re.sub(r'[^\w\s.,!?;:()"-]', '', text)

    # remove consecutive punctuation
    text = re.sub(r'([.,!?;:])\1+', r'\1', text)

    # ensure there is appropriate whitespace between sentences; `\s*` also
    # swallows any newlines that directly follow the punctuation mark
    text = re.sub(r'([.,!?;:])\s*', r'\1 ', text)

    return text.strip()

def write_text_to_files_by_metadata(df):
    """Append each row's cleaned document to ``.txt/<sanitized metadata>.txt``.

    For every row, the ``metadata`` value is turned into a file name by
    replacing spaces, ``/``, ``\\`` and ``:`` with ``_`` and removing double
    quotes. The ``document`` value is passed through ``preprocess_content`` and
    appended to that file, followed by a blank line. Rows that share a
    metadata label end up in the same file, in row order.

    Each file is opened, written and closed per row rather than held open for
    the whole run, so the number of distinct labels is not limited by the
    process's open-file limit.

    Files are opened in append mode: calling this again for the same labels
    adds the documents a second time instead of overwriting them.

    Args:
        df: DataFrame with at least the columns ``document`` and ``metadata``.

    Raises:
        ValueError: If either required column is missing.

    Side effects:
        Creates the ``.txt/`` directory if needed, writes the files, and
        prints the names of the files written during this call.
    """
    # Ensure the dataframe has the required columns
    if not all(col in df.columns for col in ['document', 'metadata']):
        raise ValueError("Dataframe must contain 'document' and 'metadata' columns")

    os.makedirs(".txt/", exist_ok=True)

    # Insertion-ordered record of the sanitized labels written during this call.
    written_labels = {}

    for _, row in df.iterrows():
        metadata = row['metadata']
        metadata = metadata.replace(" ", "_").replace("/", "_").replace("\\", "_").replace(":", "_").replace("\"", "")
        document = preprocess_content(row['document'])

        # Open in append mode and close again immediately so that only one
        # descriptor is in use at any time.
        filename = f".txt/{metadata}.txt"
        with open(filename, 'a', encoding='utf-8') as handle:
            handle.write(document + "\n\n")  # Add two newlines for separation
        written_labels[metadata] = None

    print(f"Files created: {', '.join(f'{metadata}.txt' for metadata in written_labels)}")

In [23]:
# Write the sampled documents to .txt/, one text file per sanitized metadata label.
write_text_to_files_by_metadata(df_doc_sample)


Files created: movie_30_Minutes_or_Less.txt, gutenberg_Up_the_Down_Staircase.txt, movie_Cobb_(film).txt, movie_The_Losers_(film).txt, gutenberg_The_Velveteen_Rabbit.txt, gutenberg_The_Two_Noble_Kinsmen.txt, movie_Rise_of_the_Guardians.txt, gutenberg_Honorine_(novel).txt, gutenberg_The_Adventure_of_the_Cardboard_Box.txt, movie_Yes_Man_(film).txt, movie_Fight_Club.txt


# Embed the .txt files into Milvus (GPU)

In [8]:
# Presumably processes the files under .txt/ into the "narrative_qa_full_gpu" collection.
# NOTE(review): the two positional True flags are not self-describing — confirm their
# meaning against DataProcessor.directory_files_process and consider passing them by keyword.
data_processor.directory_files_process("narrative_qa_full_gpu", ".txt/", True, True)

aggregating documents:   1%|          | 12/1102 [00:00<00:19, 55.33it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:   2%|▏         | 18/1102 [00:00<00:26, 41.06it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:   3%|▎         | 32/1102 [00:00<00:20, 52.07it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:   5%|▍         | 53/1102 [00:00<00:14, 72.21it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:   7%|▋         | 76/1102 [00:01<00:11, 90.42it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:   9%|▊         | 96/1102 [00:01<00:13, 77.17it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  10%|▉         | 105/1102 [00:01<00:14, 67.73it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  11%|█▏        | 126/1102 [00:01<00:13, 71.26it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  13%|█▎        | 146/1102 [00:02<00:12, 74.13it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  14%|█▍        | 155/1102 [00:02<00:14, 64.52it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  16%|█▌        | 171/1102 [00:02<00:14, 63.37it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  16%|█▌        | 178/1102 [00:02<00:15, 59.08it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  18%|█▊        | 193/1102 [00:02<00:15, 59.73it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  19%|█▉        | 211/1102 [00:03<00:14, 63.30it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  20%|█▉        | 218/1102 [00:03<00:15, 56.40it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  21%|██▏       | 235/1102 [00:03<00:13, 66.62it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  23%|██▎       | 255/1102 [00:03<00:10, 77.78it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  25%|██▍       | 272/1102 [00:04<00:10, 77.96it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  27%|██▋       | 293/1102 [00:04<00:09, 84.57it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  29%|██▊       | 316/1102 [00:04<00:08, 91.09it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  30%|███       | 336/1102 [00:04<00:09, 80.27it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  31%|███▏      | 345/1102 [00:04<00:09, 79.89it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  34%|███▍      | 376/1102 [00:05<00:07, 91.62it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  36%|███▋      | 400/1102 [00:05<00:06, 102.20it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  37%|███▋      | 411/1102 [00:05<00:08, 85.17it/s] 

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  39%|███▉      | 430/1102 [00:05<00:08, 81.24it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  40%|███▉      | 439/1102 [00:05<00:08, 80.09it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  42%|████▏     | 459/1102 [00:06<00:08, 76.88it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  44%|████▍     | 483/1102 [00:06<00:07, 86.48it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  45%|████▌     | 498/1102 [00:06<00:06, 97.56it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  46%|████▌     | 508/1102 [00:06<00:06, 89.42it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  48%|████▊     | 526/1102 [00:07<00:08, 69.55it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  50%|█████     | 552/1102 [00:07<00:06, 89.23it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  52%|█████▏    | 569/1102 [00:07<00:04, 109.47it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  54%|█████▎    | 592/1102 [00:07<00:06, 81.62it/s] 

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  55%|█████▍    | 601/1102 [00:07<00:06, 80.73it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  56%|█████▌    | 618/1102 [00:08<00:07, 68.69it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  58%|█████▊    | 641/1102 [00:08<00:05, 86.98it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  60%|██████    | 663/1102 [00:08<00:04, 88.56it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  62%|██████▏   | 682/1102 [00:08<00:05, 76.37it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  64%|██████▎   | 701/1102 [00:09<00:05, 74.99it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  64%|██████▍   | 709/1102 [00:09<00:05, 68.17it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  65%|██████▌   | 721/1102 [00:09<00:04, 79.69it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  67%|██████▋   | 739/1102 [00:09<00:05, 67.84it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  68%|██████▊   | 747/1102 [00:09<00:05, 59.79it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  69%|██████▉   | 764/1102 [00:10<00:05, 62.53it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  71%|███████▏  | 787/1102 [00:10<00:04, 70.20it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  73%|███████▎  | 808/1102 [00:10<00:03, 80.31it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  75%|███████▍  | 826/1102 [00:10<00:03, 81.01it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  77%|███████▋  | 845/1102 [00:11<00:03, 79.99it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  77%|███████▋  | 854/1102 [00:11<00:03, 74.42it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  80%|███████▉  | 881/1102 [00:11<00:02, 81.94it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  82%|████████▏ | 904/1102 [00:11<00:02, 93.03it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  83%|████████▎ | 914/1102 [00:12<00:02, 73.40it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  85%|████████▍ | 934/1102 [00:12<00:02, 82.01it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  86%|████████▋ | 952/1102 [00:12<00:01, 83.50it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  87%|████████▋ | 961/1102 [00:12<00:02, 67.96it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  89%|████████▉ | 982/1102 [00:12<00:01, 76.51it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  91%|█████████ | 999/1102 [00:13<00:01, 64.99it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  92%|█████████▏| 1014/1102 [00:13<00:01, 62.18it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  94%|█████████▎| 1033/1102 [00:13<00:00, 74.40it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  96%|█████████▌| 1053/1102 [00:13<00:00, 81.28it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  96%|█████████▋| 1062/1102 [00:14<00:00, 78.46it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  98%|█████████▊| 1079/1102 [00:14<00:00, 72.55it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  99%|█████████▉| 1096/1102 [00:14<00:00, 63.63it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents: 100%|██████████| 1102/1102 [00:14<00:00, 75.03it/s]


Data preprocessing done.
Data preprocessing done.
Now splitting document...
Split 1102 documents into 810567 chunks.
Document split successfully.

Now fitting sparse embedder with new documents...
Sparse embedder fitted and saved successfully.

Now generate embeddings and storing them in Milvus...
Successfully created collection narrative_qa_full_gpu with dense dimension 1536 and sparse embeddings.
Successfully created indexes for collection narrative_qa_full_gpu.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Content: I...
Content: I...
Inserted data into Milvus.
Inserted data int

# Import GraphRAG data into Neo4j

In [None]:
# Create the knowledge-graph database handle used by the import step below.
# NOTE(review): KnowledgeGraphDatabase is presumably provided by the earlier
# `from Utils import *`; connection settings are not visible here — confirm.
knowledge_DB = KnowledgeGraphDatabase()

In [None]:
# Push the GraphRAG artifacts found under `datapath` into Neo4j (per the method name).
# NOTE(review): the path is pinned to one timestamped GraphRAG run
# (20240906-153334); update it whenever the indexing pipeline is re-run.
knowledge_DB.transform_graph_rag_to_neo4j(datapath="../graph_rag_sample/output/20240906-153334/artifacts")

In [None]:
# Quick manual check of the retriever after the import.
# NOTE(review): the meaning of the argument `0` passed to global_retrieve is
# not visible in this notebook — confirm against the Retriever implementation.
retriever = Retriever()
retriever.global_retrieve(0)

# Test Modular RAG

In [9]:
import pandas as pd
# Load the sampled NarrativeQA question/answer table used for RAG evaluation.
rag_evaluation_dataset = pd.read_parquet(".parquet/narrative_qa_qa_sample_11.parquet")

# Take the first two questions in reversed order (row 1 first, then row 0).
dataset_queries = rag_evaluation_dataset["questions"].iloc[1::-1].tolist()
print(dataset_queries)

# Matching ground-truth answers, selected with the same reversed two-row slice.
answer = rag_evaluation_dataset["ground_truths"].iloc[1::-1].tolist()
print(answer)




['Where does the fifth incarnation take place?', 'During the last decade what kind of life has Alexis Paulvitch lived?']
['New York New York ', 'A life of abuse and disease among tribal people. A life of abuse and disease. ']


In [6]:
from Module import *
from Config.output_pydantic import *
# Instantiate the modular hybrid-RAG workflow.
workflow = WorkFlowModularHybridRAG()

# Stream the workflow graph from its initial state: the target collection,
# the queries selected above, empty result/context lists and a zero repeat count.
results = workflow.graph.stream(
    dict(
        specific_collection="narrative_qa_standard_gpu",
        dataset_queries=dataset_queries,
        all_results=[],
        all_contexts=[],
        repeat_times=0,
    )
)

# Print every update the stream yields, in order.
for result in results:
    print(result)


2024-09-11 20:16:01,342 - 136280243418624 - milvus_client.py-milvus_client:658 - DEBUG: Created new connection using: f0fd51639f454525b8153dc9036c5af5


Initializing sparse embedder...
Embedder initialized
Connected to Milvus at localhost:19530 with database default.
VectorDatabase initialized.
GraphDatabase initialized.
Retriever initialized
Agents initialized (with model: "gpt-4o-mini" and temperature: "0.1")
Tasks initialized
MultiAgent RAG System initialized
index =  0
total =  2
{'update_next_query_node': {'user_query': 'Where does the fifth incarnation take place?'}}
[1m[95m [2024-09-11 20:16:01][DEBUG]: == Working Agent: Query Classifier[00m
[1m[95m [2024-09-11 20:16:01][INFO]: == Starting Task: 
Analyze the following user query and determine if it requires information retrieval to be answered accurately, while also evaluating its global or local scope:

User Query: "Where does the fifth incarnation take place?"

Your task is to:
1. Classify this query as either requiring retrieval or not
2. Evaluate the query's domain range score

Consider the following guidelines:

1. Queries that typically require retrieval:
   - Specifi



all_information =  ["Chapter IV contributes to the development of Dorothy Vernon's story and relationships.", 'Chapter IV is the fourth chapter of the book, contributing to the unfolding narrative and character arcs.', "Chapter V continues to advance Dorothy Vernon's narrative and character development.", 'Chapter VI furthers the plot involving Dorothy Vernon and her interactions.', 'Chapter V is the fifth chapter of the book, continuing the exploration of themes and character relationships.', "Chapter XV continues to explore Dorothy Vernon's character and plot.", 'Chapter XV is the fifteenth chapter of the book, contributing to the overall story arc.', 'The community revolves around the intricate relationships between Sir John, Lord Rutland, and Dorothy, set against the backdrop of a longstanding family feud. The emotional entanglements and moral dilemmas faced by these characters reflect broader societal themes of loyalty, honor, and the consequences of familial legacies.', 'ïProject

2024-09-11 20:16:08,734 - 136269682181824 - base_events.py-base_events:1758 - ERROR: Task was destroyed but it is pending!
task: <Task pending name='Task-1464' coro=<Crew.kickoff_for_each_async.<locals>.run_crew() running at /opt/anaconda3/envs/poetry310/lib/python3.10/site-packages/crewai/crew.py:514> wait_for=<Future pending cb=[_chain_future.<locals>._call_check_cancel() at /opt/anaconda3/envs/poetry310/lib/python3.10/asyncio/futures.py:385, Task.task_wakeup()]>>


[1m[95m [2024-09-11 20:16:40][DEBUG]: == Working Agent: Reranker[00m
[1m[95m [2024-09-11 20:16:40][INFO]: == Starting Task: 
Your task is to evaluate each retrieved data item's relevance to the user's query or sub-queries relevant to the user's query.
-----User Query-----
"Where does the fifth incarnation take place?"

-----Sub-queries-----
"[]"

Your specific responsibilities are:
1. Compare each data item to the user's query and sub-queries.
2. Assign a relevance score to each data item based on how well it matches the user's query from 0 to 100.
   - Higher scores indicate better relevance.
3. Create a list of these relevance scores, don't include any other information.
4. CRITICAL: Ensure the number of scores in your output list EXACTLY matches the number of data items :5 in the input.

You will receive a list of 5 data items. 
----------------batch_retrieved_data----------------
[{'content': 'bright paradise whence he had descended." Of the ninth incarnation of India, the Sav



[1m[92m [2024-09-11 20:16:42][DEBUG]: == [Reranker] Task output: relevance_scores=[10, 15, 20, 5, 0]

[00m
[1m[92m [2024-09-11 20:16:42][DEBUG]: == [Reranker] Task output: relevance_scores=[10, 20, 15, 5, 25]

[00m
[1m[92m [2024-09-11 20:16:42][DEBUG]: == [Reranker] Task output: relevance_scores=[10, 15, 20, 5, 0]

[00m
[1m[92m [2024-09-11 20:16:42][DEBUG]: == [Reranker] Task output: relevance_scores=[10, 20, 15, 5, 25]

[00m
{'detailed_search_node': {'detailed_search_result': DetailedSearchResult(sorted_retrieved_data=[{'content': '3. _His last Hours_.--"When Chrishna knew his hour had come, forbidding\nhis disciples to follow him, he repaired to the bank of the River\nGanges; and having performed three ablutions, he knelt down, and looking\nup to heaven, he prayed to Brahma." While nailed to the cross, the\ntree on which he was suspended became suddenly covered with great red\nflowers, which diffused their fragrance all around. And it is said\nhe often appeared to his dis



[32;1m[1;3mI now can give a great answer  
Final Answer: 

**1. Themes and Topics:**

**A. Incarnations and Divine Figures:**
- **Chrishna's Last Hours:** "When Chrishna knew his hour had come, forbidding his disciples to follow him, he repaired to the bank of the River Ganges; and having performed three ablutions, he knelt down, and looking up to heaven, he prayed to Brahma." This indicates a significant moment of transition for Chrishna, emphasizing his divine nature.
- **Second Advent of Chrishna:** "There is not a Hindoo or a Brahmin who does not look upon the second coming of Chrishna as an established article of faith." The Vedas and Gita prophesy his return, describing him as "crowned with lights" and stating that "all men, all animated beings, beasts, birds, trees, and plants, will chant his praises."
- **Sakia (Buddha) and the Ninth Avatar:** The text mentions a saint who recognized the new avatar, Sakia, and declared him to be the "great avatar (Savior or prophet)." This su



[32;1m[1;3mI now can give a great answer.  
Final Answer: Based on the information available, I must inform you that I don't have enough specific information to answer your query regarding the location of the fifth incarnation. The data retrieved does not explicitly mention where the fifth incarnation takes place. However, I can provide some context based on general knowledge and the themes discussed in the provided material.

The text references significant figures in Hinduism, such as Chrishna and Sakia (Buddha), and discusses themes of resurrection and ascension, but it does not specify the geographical location associated with the fifth incarnation. In Hindu belief, incarnations of deities often symbolize various aspects of divine intervention in the world, and their locations can be tied to significant cultural or spiritual sites. 

For instance, the River Ganges is mentioned in relation to Chrishna's last hours, indicating its importance in Hindu spirituality. However, without 



all_information =  ["The community centers around the audience's engagement with various performers, including Derek Mantini, Malkovich, and Dorothy. The relationships highlight the emotional investment of the audience and the performers' responses to their reactions, reflecting broader themes of aspiration, competition, and social dynamics in performance contexts.", "Malkovich's bank account represents financial resources that Maxine mentions as a means of livelihood for them.", '\n Its my job to ask the questions. Yours to answer them. MALKOVICH\n Says who? HARRY S. TRUMAN PUPPET\n Says me. Do you dream often? MALKOVICH\n Do you? We see the audience fidgeting in their seats, coughing. CUT TO: INT. BROADHURST BACKSTAGE - CONTINUOUS\n\n The dialogue drones on as Maxine watches coolly from the\n wings. She drags on a cigarette. Mr. Flemmer, dressed as\n a stagehand, stands behind Maxine. He also watches the\n actors, with an occasional sideways glance at Maxine. MAXINE\n (without turnin



all_scores =  [10, 5, 15, 20, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 5, 15, 20, 5, 0]
[1m[95m [2024-09-11 20:17:00][DEBUG]: == Working Agent: Topic Searcher[00m
[1m[95m [2024-09-11 20:17:00][INFO]: == Starting Task: 
You have received multiple pieces of data related to a user query or sub-queries decomposed from the user query by descending relevance scores.

----------------User Query----------------
During the last decade what kind of life has Alexis Paulvitch lived?

----------------Sub-queries----------------
[]

Your task is to analyze this batch of data and help prepare for a vector database search to answer the user's query.

The batch of data may include:
- Communities extracted from the article
- Chunks extracted from the article
- Entities extracted from the article
- Relations extracted from the articles

Follow these steps:
1. If no data is provided, return 2 empty list.
2. Carefully read and analyze all provided data.
3. Based on these data, imagine what relevant docum

2024-09-11 20:17:08,953 - 136269682181824 - base_events.py-base_events:1758 - ERROR: Task was destroyed but it is pending!
task: <Task pending name='Task-5693' coro=<Crew.kickoff_for_each_async.<locals>.run_crew() running at /opt/anaconda3/envs/poetry310/lib/python3.10/site-packages/crewai/crew.py:514> wait_for=<Future pending cb=[_chain_future.<locals>._call_check_cancel() at /opt/anaconda3/envs/poetry310/lib/python3.10/asyncio/futures.py:385, Task.task_wakeup()]>>
2024-09-11 20:17:08,953 - 136269682181824 - base_events.py-base_events:1758 - ERROR: Task was destroyed but it is pending!
task: <Task pending name='Task-5697' coro=<Crew.kickoff_for_each_async.<locals>.run_crew() running at /opt/anaconda3/envs/poetry310/lib/python3.10/site-packages/crewai/crew.py:514> wait_for=<Future pending cb=[_chain_future.<locals>._call_check_cancel() at /opt/anaconda3/envs/poetry310/lib/python3.10/asyncio/futures.py:385, Task.task_wakeup()]>>


[1m[95m [2024-09-11 20:17:46][DEBUG]: == Working Agent: Reranker[00m
[1m[95m [2024-09-11 20:17:46][INFO]: == Starting Task: 
Your task is to evaluate each retrieved data item's relevance to the user's query or sub-queries relevant to the user's query.
-----User Query-----
"During the last decade what kind of life has Alexis Paulvitch lived?"

-----Sub-queries-----
"[]"

Your specific responsibilities are:
1. Compare each data item to the user's query and sub-queries.
2. Assign a relevance score to each data item based on how well it matches the user's query from 0 to 100.
   - Higher scores indicate better relevance.
3. Create a list of these relevance scores, don't include any other information.
4. CRITICAL: Ensure the number of scores in your output list EXACTLY matches the number of data items :5 in the input.

You will receive a list of 5 data items. 
----------------batch_retrieved_data----------------
[{'content': 'The Marjorie W. had been chartered by a syndicate of wealthy



[1m[92m [2024-09-11 20:17:49][DEBUG]: == [Reranker] Task output: relevance_scores=[10, 15, 20, 25, 30]

[00m
{'detailed_search_node': {'detailed_search_result': DetailedSearchResult(sorted_retrieved_data=[{'content': "Paulvitch had taken to the jungle when he had seen the beasts of Tarzan\nand their savage lord swarm the deck of the Kincaid, and in his terror\nlest Tarzan pursue and capture him he had stumbled on deep into the\njungle, only to fall at last into the hands of one of the savage\ncannibal tribes that had felt the weight of Rokoff's evil temper and\ncruel brutality.  Some strange whim of the chief of this tribe saved\nPaulvitch from death only to plunge him into a life of misery and torture.  For ten years he had been the butt of the village, beaten and\nstoned by the women and children, cut and slashed and disfigured by the\nwarriors; a victim of often recurring fevers of the most malignant\nvariety.  Yet he did not die.  Smallpox laid its hideous clutches upon\nhim; le



[32;1m[1;3mI now can give a great answer
Final Answer: 

**Theme 1: Life of Alexis Paulvitch**
- **Background and Initial Circumstances**: 
  - "Paulvitch had taken to the jungle when he had seen the beasts of Tarzan and their savage lord swarm the deck of the Kincaid, and in his terror lest Tarzan pursue and capture him he had stumbled on deep into the jungle, only to fall at last into the hands of one of the savage cannibal tribes that had felt the weight of Rokoff's evil temper and cruel brutality."
  - "Some strange whim of the chief of this tribe saved Paulvitch from death only to plunge him into a life of misery and torture."
  
- **Physical and Mental State**: 
  - "For ten years he had been the butt of the village, beaten and stoned by the women and children, cut and slashed and disfigured by the warriors; a victim of often recurring fevers of the most malignant variety."
  - "Yet he did not die. Smallpox laid its hideous clutches upon him; leaving him unspeakably branded wit

In [5]:
all_information =  [' That is not allowed. My God, you are supposed to be one of us. You know you must never partake of\n Malkovich by yourself! LOTTE\n No, I didnt know that. LESTER\n Oh, didnt anyone show you the\n indoctrination video? LOTTE\n No. LESTER\n Oh, sorry. Right this way. CUT TO: INT. SCREENING ROOM - NIGHT\n\n Lotte site next to Lester in the darkened auditorium. The projector whirs. The screen lights up. TITLE: SO YOU WANT TO BE JOHN MALKOVICH\n\n A much younger Lester addresses the camera in this black\n and white film, which seems to have been made in the 50s. LESTER ON FILM\n Welcome, my fellow Malkovichians. As you may already know, today a\n baby was born into this sad world. We see a shot of a newborn. LESTER ON FILM (CONTD)\n His name is John Horatio Hannibal\n Malkovich. And we are the keepers\n of the door to his soul. One day, when his brain is big enough, we\n will all journey into his head and\n live there for all eternity. Following\n the teachings of our leader Karl Marx, we will build the ultimate communist\n community, one body and hundreds, maybe thousands, of brains inside\n working together to form a super\n human intellect capable of curing\n disease, stopping all war, and\n ruling the world with a benevolent\n fist. We will take a wife, a woman\n of uncommon beauty and intellect, who\n is, as yet, still an infant herself. We see a photo of another infant, this one with a ribbon in\n her hair. LESTER ON FILM (CONTD)\n Her name is Floris Horatia Hannibella\n DeMent. LOTTE\n Does Floris know that shes the\n chosen? LESTER\n Well, I tried to explain it to her, but. Lester points to his ear and shrugs. CUT TO: INT. MALKOVICHS BEDROOM - NIGHT\n\n Malkovich and Maxine lie naked on the bed, looking quite\n relaxed. MAXINE\n You still there, sweets? MALKOVICH\n Yeah. Ive figured out how to hold\n on as long as I want. Oddly enough, its all in the wrists. MAXINE\n Wow. (little girl pout)\n Do a puppet show for me,', ' but. 
Lester points to his ear and shrugs. CUT TO: INT. MALKOVICHS BEDROOM - NIGHT\n\n Malkovich and Maxine lie naked on the bed, looking quite\n relaxed. MAXINE\n You still there, sweets? MALKOVICH\n Yeah. Ive figured out how to hold\n on as long as I want. Oddly enough, its all in the wrists. MAXINE\n Wow. (little girl pout)\n Do a puppet show for me, Craig honey. MALKOVICH\n You mean with Malkovich? MAXINE\n Id love to see your work. MALKOVICH\n (pleased)\n Really? Yeah. Okay. Malkovich leans over and kisses her, then gets up. MALKOVICH (CONTD)\n Ill do something I call "Craigs\n Dance of Despair and Disillusionment. "\n\n Malkovich performs the same dance that the Craig pupper\n did at the beginning of the film. It is exactly the same, complete with impossible somersaults and perspiring brow. He finishes by falling to his knees and weeping. MAXINE\n (moved)\n That was incredible. Youre brilliant! MALKOVICH\n You see, Maxine, it isnt just playing\n with dolls. MAXINE\n Youre right, my darling, its\n so much more. Its playing with\n people! Malkovich kisses Maxine. She snuggles close to him. MAXINE\n Stay in him forever? MALKOVICH\n (as Malkovich, screaming)\n No! (as Craig, calmly)\n But how will we make a living, my love, if our clientele doesnt\n have access to our product? MAXINE\n Well, well have all the money in\n Malkovichs bank account, plus he\n still gets acting work occasionally. MALKOVICH\n (as Malkovich, breaking through)\n No! Please! (as Craig, to Malkovich)\n Shut up, will you? Were trying to\n think here. (to Maxine)\n It is sort of like being a puppeteer. I like that about it. MAXINE\n No one would ever have to know its\n not him. MALKOVICH\n (an idea)\n Wait a minute! What if everybody knew? What if we presented Malkovich as the\n worlds most complicated puppet and\n me as the only puppeteer sophisticated\n enough to work him? Wed wipe the floor\n with the Great Mantini! MAXINE\n Oh, Craiggy,', ' enters with a greasy white paper bag. 
FLEMMER\n Have a seat. I wracking my brain\n over this Malkovich thing. LESTER\n We saw his show at the Luxor last\n night. FLEMMER\n (impressed)\n Vegas? Whatd you think? LESTER\n The kids got talent. Youve never\n seen Malkovich like this. Schwartz\n had him up there singing and dancing. Impressions. FLEMMER\n Impressions? Those are hard. LESTER\n Very talented son of a bitch. Too bad\n we cant kill him. FLEMMER\n I suppose I could come to him in a\n dream. I dont know. Thats the best\n I can think of right now. LESTER\n A scary dream? FLEMMER\n No, a sexy dream. Of course, a scary\n dream. LESTER\n (noncommittally)\n I like that. CUT TO: INT. HOTEL SUITE - NIGHT\n\n Malkovich sits on the floor in silk pajamas. He is\n surrounded by newspaper clippings. He is drinking\n champagne from the bottle. Maxine is at a dressing\n table, brushing her hair. MALKOVICH\n They love me, darling! "Craig Schwartz\n is fantastic! " The New York Times. "If only Craig Schwartz had always\n been inside Malkovich! " Womens Wear\n Daily. "Craig Schwartz - The worlds\n greatest puppeteer! " Paul Wunder, WBAI Radio. MAXINE\n Oh, darling. Its a dream come true. Were going to ride this straight to\n the top. MALKOVICH\n Sleepy suddenly. MAXINE\n Busy day, my little fire chief. Why\n dont you climb into bed, and Ill\n meet you there in just. But Malkovich is already passed out on the floor on top of\n his clippings. Maxine smiles maternally, gets up and puts \n blanket over him. We stay on Malkovichs face. DISSOLVE TO: INT. HELL - NIGHT\n\n Craig wanders across a jagged, rocky landscape. Geysers of\n flame shoot up around him. The sky is red. He is frightened. He arrives at a desk. The man behind the desk is facing away\n from him. He swivels to face Craig. It is Flemmer, looking\n the same as usual except for little red horns and a sinister\n grin', ')\n Why arent you at work? LOTTE\n Ive been going over and over my\n experience last night. It was amazing. 
(beat)\n Ive decided Im a transsexual. Isnt\n that the craziest thing? CRAIG\n What, are you nuts? Thats Oprah\n talking. LOTTE\n Everything felt right for the first\n time. I need to go back to make sure, then if the feeling is still there. Im going to speak to Dr. Feldman\n about sexual reassignment surgery. CRAIG\n This is absurd. Besides Feldmans an\n allergist. If youre going to do\n something, do it right. CRAIG (contd)\n (beat)\n Its just the thrill of seeing through\n someone elses eyes, sweetie. Itll\n pass. LOTTE\n Dont stand in the way of my\n actualization as a man, Craig. MAXINE\n (hanging up the phone)\n Let her go, Craig. I mean him. "\n\n CRAIG\n (anything for Maxine) \n Yeah, okay. (opens the portal door) \n Ill pick you up. Lotte enters. Craig closes the door. stands there. MAXINE\n You better hurry. Traffic. Maxine tosses Craig his car keys. He heads out the door. Maxine dials the phone. MAXINE (CONTD)\n (into phone)\n Davey? Max. Get me John Malkovichs\n home phone? Thats great. Love ya\n and owe ya. CUT TO: INT. JOHN MALKOVICHS LIVING ROOM - DAY\n\n Malkovichs POV. He sits on the couch. drinks coffee, and reads a copy of Awake and Sing. Bach plays on the\n stereo in the background. MALKOVICH\n (reading aloud)\n So you believe in God. you got\n something for it? You worked for\n all the capitalists. You harvested\n the fruit from your labor? You got\n God! LOTTE (V. 0. )\n What raw, animal power! MALKOVICH\n But the past comforts you? The\n present smiles on you, yes? The phone rings. Malkovich puts down the script, and picks\n up the phone. MALKOVICH (CONTD)\n (into phone)\n Yeah? MAXINE (0. S. )\n (telephone voice) \n Mr. Malkovich? MALKOVICH\n Whos calling? MAXINE (0. S. )\n You dont know me, but Im', '\n Its my job to ask the questions. Yours to answer them. MALKOVICH\n Says who? HARRY S. TRUMAN PUPPET\n Says me. Do you dream often? MALKOVICH\n Do you? We see the audience fidgeting in their seats, coughing. CUT TO: INT. 
BROADHURST BACKSTAGE - CONTINUOUS\n\n The dialogue drones on as Maxine watches coolly from the\n wings. She drags on a cigarette. Mr. Flemmer, dressed as\n a stagehand, stands behind Maxine. He also watches the\n actors, with an occasional sideways glance at Maxine. MAXINE\n (without turning around)\n Keep your eyes in your pants, old\n timer. CUT TO: INT. THE BROADHURST LOBBY - A BIT LATER\n\n Its intermission. The lobby is crowded. Maxine moves\n through the crowd listening to snippets of conversation. Flemmer, now in a tuxedo, moves about also. First couple: THEATERGOER 1\n That Truman puppet is downright\n boring as the psychiatrist. THEATERGOER 2\n Its a wooden performance, really. Get it? Wooden? Second couple: THEATERGOER 3\n Whats with the Malkovich puppet? He was much better in Vegas when he\n played the piano with his feet. THEATERGOER 4\n I hate it when they try to stretch. Its like Woody Allen. Third couple: THEATERGOER 5\n They both stink! Im going across the\n street to second act Miss Saigon. CUT TO: INT. DRESSING ROOM - A FEW MINUTES LATER\n\n Malkovich watches himself in his dressing table mirror. Maxine enters, flops herself down on the couch and lights\n up a cigarette. MAXINE\n Youd better turn on the pyrotechnics, lover, cause right now youre running\n neck and neck with the dead president. And youre both in last place. Malkovich continues to watch himself in the mirror, nods his\n head. CUT TO: INT. CATWALK ABOVE STAGE - CONTINUOUS\n\n Mantini leans against a rail and smokes a cigarette. Charles\n Nelson Reilly, in a tuxedo, confers with him in hushed tones. CHARLES NELSON REILLY\n Youre doing beautifully, my boy. I\n wept at the speech about your wife. Flemmer materializes behind', "The community centers around the audience's engagement with various performers, including Derek Mantini, Malkovich, and Dorothy. 
The relationships highlight the emotional investment of the audience and the performers' responses to their reactions, reflecting broader themes of aspiration, competition, and social dynamics in performance contexts.", 'This community centers around the interactions between the crowd and key figures such as Robert Brown and Malkovich, highlighting the complex dynamics of audience engagement, spiritual influence, and the reactions to performances and events. The crowd serves as a multifaceted entity that shapes and is shaped by the narratives surrounding them.', "The community centers around Erroll, a character yearning for transformation and connection, and the financial implications of his desire to inhabit another person's identity. Key entities include dollars, the Fat Man, and Overeaters Anonymous, all of which reflect the economic and social dimensions of identity change and personal struggles.", "The community centers around Erroll, a character yearning for transformation and connection, and the financial implications of his desire to inhabit another person's identity. Key entities include dollars, the Fat Man, and Overeaters Anonymous, all of which reflect the economic and social dimensions of identity change and personal struggles.", 'The community centers around the tumultuous relationship between Jerome and Becky, characterized by aggression and conflict within various settings, including the Bakersfield House and social events. 
Their interactions reveal deeper societal themes of power dynamics, domestic violence, and the complexities of personal relationships.', "Malkovich is discussed in relation to Floris, indicating that she is to be a significant part of his future, as per the community's plans.", "The newborn is introduced in the context of Malkovich's life, indicating a connection to the future of the community's plans.", "Maxine's apartment is the setting for Malkovich's visit, indicating a significant moment in their relationship.", 'The purple silk robe worn by Malkovich complements his regal appearance, reinforcing his status and control.', 'The cabbie recognizes John Malkovich, indicating a relationship based on public recognition and celebrity status.', "Malkovich's bedroom is a personal space where he interacts with Maxine, indicating a private aspect of his life.", "The newspaper clippings highlight Malkovich's achievements, indicating a relationship of representation and public perception.", "The newspaper clippings highlight Malkovich's achievements, indicating a relationship of representation and public perception.", "Malkovich's bedroom is a personal space where he interacts with Maxine, indicating a private aspect of his life.", "Malkovich is a complex character who navigates a tumultuous landscape of confusion, fear, and manipulation. He experiences a range of emotions, from being overshadowed in performances to feeling controlled like a puppet by Craig Schwartz. Malkovich's journey includes delivering dramatic monologues on stage, showcasing his acting prowess, and engaging in a series of tormented performances, including a tap dance routine. 
His interactions with other characters, particularly Maxine and Lotte, reveal a tangled web of relationships marked by romantic entanglements and legal disputes over a portal he discovered, which he feels entitled to.\n\nThroughout the narrative, Malkovich grapples with his identity, often feeling manipulated and expressing a desire for control. He is portrayed as a powerful figure, receiving admiration from fans and critics alike, yet he also faces moments of vulnerability, such as experiencing a horrifying nightmare involving the Devil and feeling disoriented in surreal settings. His role as a puppeteer further emphasizes themes of control and performance, as he prepares for various theatrical endeavors while confronting the complexities of his relationships.\n\nMalkovich's character is central to the conflict within the story, as both Lotte and Elijah express concern over his influence on Craig. He is depicted as a talented performer, receiving praise and recognition for his work, yet he remains entangled in a narrative that questions the nature of identity and autonomy. Ultimately, Malkovich embodies the struggles of an artist caught between the demands of performance and the quest for personal truth, making him a significant figure in the unfolding drama.", "Malkovich's bank account represents financial resources that Maxine mentions as a means of livelihood for them.", 'Newspaper clippings that surround Malkovich, highlighting his achievements and the positive reviews he has received from various publications.', "Malkovich's bedroom is a private space where Malkovich and Maxine are depicted in an intimate setting, indicating a personal aspect of Malkovich's life.", "John Horatio Hannibal Malkovich is a multifaceted character who serves as the central figure in a narrative exploring themes of identity, control, and the surreal nature of existence. 
He is depicted in various states, from a vulnerable moment while showering to a contemplative figure engaged in philosophical reflection in his living room. Malkovich's involvement in the unfolding events is highlighted by his invitation to Maxine's apartment, indicating his integral role in the complex relationships that develop around him.\n\nHe is idolized by Lotte, who has created a shrine in his honor, showcasing her deep admiration and the extent to which she is influenced by his persona. This admiration culminates in a surreal experience where Lotte temporarily assumes Malkovich's identity, marking a pivotal moment in her journey of self-discovery. Additionally, Malkovich is portrayed as a well-respected actor of the 20th century, who becomes the subject of a supernatural portal that allows others to experience the world through his eyes, further complicating the notions of self and reality.\n\nIn a striking visual representation, Malkovich is depicted as a larger-than-life figure seated on a jeweled throne, adorned with a crown and robe, exuding an air of superiority. This imagery reinforces his status as an idol and a symbol of aspiration, particularly for characters like Erroll, who wishes to embody an idealized version of himself through Malkovich. Ultimately, John Malkovich emerges as a complex character whose identity and experiences serve as a lens through which the narrative explores deeper societal values and the intricacies of human connection."]
all_information

[' That is not allowed. My God, you are supposed to be one of us. You know you must never partake of\n Malkovich by yourself! LOTTE\n No, I didnt know that. LESTER\n Oh, didnt anyone show you the\n indoctrination video? LOTTE\n No. LESTER\n Oh, sorry. Right this way. CUT TO: INT. SCREENING ROOM - NIGHT\n\n Lotte site next to Lester in the darkened auditorium. The projector whirs. The screen lights up. TITLE: SO YOU WANT TO BE JOHN MALKOVICH\n\n A much younger Lester addresses the camera in this black\n and white film, which seems to have been made in the 50s. LESTER ON FILM\n Welcome, my fellow Malkovichians. As you may already know, today a\n baby was born into this sad world. We see a shot of a newborn. LESTER ON FILM (CONTD)\n His name is John Horatio Hannibal\n Malkovich. And we are the keepers\n of the door to his soul. One day, when his brain is big enough, we\n will all journey into his head and\n live there for all eternity. Following\n the teachings of our leader Karl Marx, we

# Test the Retriever: local and global retrieval

In [None]:
# Smoke-test both retrieval paths of Retriever: one global call and one local call.
# NOTE(review): Retriever and KnowledgeGraphDatabase are presumably supplied by the
# star import from Utils — confirm, since the origin of the names is not visible here.
from Utils import *
retriever = Retriever()
# knowledge_DB is only referenced by the disabled setup calls below.
knowledge_DB = KnowledgeGraphDatabase()
# Disabled setup calls on the knowledge graph database.
# NOTE(review): presumably these need to have been run once before retrieval
# returns useful results — confirm before relying on a fresh database.
# knowledge_DB.create_entity_vector_index()
# knowledge_DB.create_community_weight()
# Global retrieval. The result is later indexed with the "communities" key.
# NOTE(review): the meaning of the argument 0 is not visible here — confirm.
global_result = retriever.global_retrieve(0)
# Local retrieval is called with a list of query strings; a single query is passed.
local_result = retriever.local_retrieve(["What is the meaning of life"])

In [None]:
# Display the value returned by retriever.local_retrieve(...) (bare last expression).
local_result

In [None]:
# Display only the "communities" entry of the global retrieval result.
global_result["communities"]

In [None]:
# Build a Retriever and a MultiAgent_RAG instance, then classify one sample user query.
# `const` is only referenced by the disabled batching experiment further down.
import Config.constants as const
# NOTE(review): json is not referenced anywhere in this cell.
import json
from MultiAgent import *
from Utils import *

# retriever is only referenced by the disabled experiment below.
retriever = Retriever()
multi_agent = MultiAgent_RAG()
# Disabled experiment: take the "communities" list from a global retrieval, slice it
# into batches of const.NODE_BATCH_SIZE, wrap each slice in an input dict, and pass
# the batches to topic_reranking_run_batch_async to read back relevant_scores.
# all_communities = retriever.global_retrieve(0)["communities"]

# batches = []
# for i in range(0, len(all_communities), const.NODE_BATCH_SIZE):
#     batch_communities = all_communities[i:i + const.NODE_BATCH_SIZE]
#     batches.append({
#         "user_query": "What is the meaning of life",
#         "sub_queries": [],
#         "batch_communities": batch_communities,
#         "batch_size": len(batch_communities),
#     })


# all_scores = multi_agent.topic_reranking_run_batch_async(node_batch_inputs=batches).relevant_scores
# print(all_scores)
# print(len(all_scores))
# print(len(all_communities))

# Run query classification on a sample question. As the cell's last expression,
# the return value is what the notebook displays.
multi_agent.user_query_classification_run(user_query="Why does the author choose to use first-person point of view in this article?")



