# Load dataset from NarrativeQA

We use the NarrativeQA dataset for question generation and RAG evaluation.

In [2]:
# Notebook setup: third-party imports plus the project helpers.
# NOTE(review): DatasetLoader, VectorDatabase, Embedder and DataProcessor are not imported
# by name anywhere in this cell, so they presumably come from the wildcard `Utils` import.
# Statement order is kept as-is because a wildcard import can shadow (or be shadowed by)
# the names imported around it.
import pandas as pd
from Utils import *
data_loader = DatasetLoader()
from datasets import load_dataset
import os

# Build the processing pipeline: the DataProcessor is handed both the embedder and the
# vector database instance. The recorded cell output shows a Milvus connection being
# opened at this point.
vector_DB = VectorDatabase()
embedder = Embedder()
data_processor = DataProcessor(embedder=embedder, vectordatabase=vector_DB)

2024-09-14 15:41:41,999 - 133897845868032 - milvus_client.py-milvus_client:658 - DEBUG: Created new connection using: b70c5c42c1154f39a7ea0afb3c29d437


DatasetLoads initialized
Connected to Milvus at localhost:19530 with database default.
VectorDatabase initialized.
Initializing sparse embedder...
Embedder initialized
Data Processor initialized


[nltk_data] Downloading package words to /home/yarikama/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [3]:
# Sanity check: display the collections that already exist in the connected vector database.
vector_DB.list_collections()

['narrative_test_cpu',
 'narrative_qa_standard_gpu',
 'narrative_qa_full_gpu',
 'alice',
 'hotpot_qa',
 'narrative_test',
 'alice2',
 'narrative_qa_sample_gpu',
 'squad']

In [7]:
def extract_narrativeqa_text(split='train'):
    """Load one NarrativeQA split and turn it into a documents frame and a QA frame.

    Args:
        split: Name of the dataset split, forwarded to ``load_dataset``.

    Returns:
        tuple: ``(df_doc, df_qa)``.
            ``df_doc`` holds one row per document whose summary text and full text
            were both unseen so far, with columns ``summary``, ``document`` and
            ``metadata`` (``"<kind>\\<title>"``).
            ``df_qa`` holds one row per dataset example with columns ``questions``,
            ``ground_truths`` (all reference answers, each followed by a space) and
            the empty placeholder columns ``answers`` and ``context``.

    Side effects:
        Prints the number of unique documents, the number of QA pairs and the
        average summary/document length in characters.
    """
    dataset = load_dataset("deepmind/narrativeqa", split=split)

    # Insertion-ordered maps from text to its metadata label; they double as
    # "already seen" sets and keep summaries/documents in matching order.
    label_by_summary = {}
    label_by_document = {}

    # One entry per example, regardless of whether its document was a duplicate.
    questions = []
    answers = []

    for example in dataset:
        doc_info = example['document']
        summary_text = doc_info['summary']['text']
        document_text = doc_info['text']
        label = doc_info['kind'] + "\\" + doc_info['summary']['title']

        # Register the document only when neither its summary nor its body was seen before.
        seen_before = summary_text in label_by_summary or document_text in label_by_document
        if not seen_before:
            label_by_summary[summary_text] = label
            label_by_document[document_text] = label

        # Questions and answers are always collected.
        questions.append(example['question']['text'])
        answers.append("".join(answer['text'] + " " for answer in example['answers']))

    # Flatten the maps into parallel lists (metadata is aligned with the summaries).
    summaries = list(label_by_summary)
    documents = list(label_by_document)
    metadata = [label_by_summary[text] for text in summaries]

    # Average lengths over the unique documents; 0 when the split is empty.
    num_examples = len(summaries)
    if num_examples > 0:
        avg_summary_chars = sum(len(text) for text in summaries) / num_examples
        avg_document_chars = sum(len(text) for text in documents) / num_examples
    else:
        avg_summary_chars = 0
        avg_document_chars = 0

    df_doc = pd.DataFrame({
        'summary': summaries,
        'document': documents,
        'metadata': metadata
    })

    blank_column = [''] * len(questions)
    df_qa = pd.DataFrame({
        'questions': questions,
        'ground_truths': answers,
        'answers': list(blank_column),
        'context': list(blank_column)
    })

    print(f'唯一文檔數量: {num_examples}')
    print(f'問答對數量: {len(df_qa)}')
    print(f'平均摘要長度: {avg_summary_chars:.2f} 字符')
    print(f'平均文檔長度: {avg_document_chars:.2f} 字符')

    return df_doc, df_qa

In [19]:
# Load the full NarrativeQA frames from the parquet cache when it exists; otherwise build
# them once from the dataset and cache them, so the cell also works on a fresh checkout
# (previously the code that creates these files was commented out).
doc_full_path = ".parquet/narrative_qa_doc_full.parquet"
qa_full_path = ".parquet/narrative_qa_qa_full.parquet"
os.makedirs(".parquet", exist_ok=True)  # to_parquet does not create the directory itself
if os.path.exists(doc_full_path) and os.path.exists(qa_full_path):
    df_doc = pd.read_parquet(doc_full_path)
    df_qa = pd.read_parquet(qa_full_path)
else:
    df_doc, df_qa = extract_narrativeqa_text(split="train")
    df_doc.to_parquet(doc_full_path)
    df_qa.to_parquet(qa_full_path)

# Reproducible 1% sample of the documents (fixed seed); in the recorded run this is 11 documents.
df_doc_sample = df_doc.sample(frac=0.01, random_state=42)
# QA sampling is currently disabled:
# df_qa_sample = df_qa.sample(frac=0.05, random_state=42)
df_doc_sample.to_parquet(".parquet/narrative_qa_doc_sample_11.parquet")
# df_qa_sample.to_parquet(".parquet/narrative_qa_qa_sample_11.parquet")

In [20]:
# Reload the sampled documents from the parquet file written by the previous cell,
# so the following cells can start from the cached sample.
df_doc_sample = pd.read_parquet(".parquet/narrative_qa_doc_sample_11.parquet")



# Export the DataFrame documents to .txt files

In [21]:

import re
from bs4 import BeautifulSoup
import unicodedata

def preprocess_content(content: str) -> str:
    """Strip markup and normalise raw document text.

    The steps run in this order: HTML tags are removed with BeautifulSoup, the text
    is NFKC-normalised, URLs are dropped, runs of spaces/tabs are collapsed, curly
    double quotes are converted to straight ones, characters outside a small
    allow-list are removed, repeated punctuation is collapsed, and a single space
    is enforced after each punctuation mark.

    Args:
        content: Raw document text, possibly containing HTML.

    Returns:
        The cleaned text with leading/trailing whitespace removed.
    """
    soup = BeautifulSoup(content, 'html.parser')
    text = soup.get_text()

    # unify to NFKC normalization form
    text = unicodedata.normalize('NFKC', text)

    # remove url
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

    # remove extra whitespace (spaces and tabs only; newlines are left alone here)
    text = re.sub(r'[ \t]+', ' ', text).strip()

    # unify quotes: map curly double quotes to the straight quote. This has to happen
    # BEFORE the allow-list filter below, which would otherwise delete the curly quotes
    # outright; NFKC does not fold them either. Explicit escapes keep the intent
    # unambiguous in the source.
    text = text.replace('\u201c', '"').replace('\u201d', '"')

    # remove special characters, but keep some punctuation
    text = re.sub(r'[^\w\s.,!?;:()"-]', '', text)

    # remove consecutive punctuation (only runs of the SAME mark, e.g. "..." -> ".")
    text = re.sub(r'([.,!?;:])\1+', r'\1', text)

    # ensure there is exactly one space after each punctuation mark.
    # NOTE(review): `\s*` also consumes newlines that follow punctuation, and the rule
    # applies inside tokens such as "3.14" -> "3. 14"; kept as-is to avoid changing the
    # text downstream consumers already see.
    text = re.sub(r'([.,!?;:])\s*', r'\1 ', text)

    return text.strip()

def write_text_to_files_by_metadata(df, output_dir: str = ".txt/", overwrite: bool = False):
    """Write each row's cleaned document to ``<output_dir>/<sanitised metadata>.txt``.

    Rows sharing the same sanitised metadata label end up in the same file, each
    document followed by a blank line. Files are opened and closed per row so the
    number of simultaneously open descriptors stays constant no matter how many
    distinct labels the frame contains.

    Args:
        df: DataFrame with a ``document`` column (raw text) and a ``metadata``
            column (label used to derive the file name).
        output_dir: Directory that receives the files; created if missing.
        overwrite: When False (default) documents are appended to any existing
            file, so re-running on the same frame duplicates content. When True,
            a file is truncated the first time its label is seen in this call.

    Raises:
        ValueError: If ``df`` lacks the ``document`` or ``metadata`` column.
    """
    # Ensure the dataframe has the required columns
    if not all(col in df.columns for col in ['document', 'metadata']):
        raise ValueError("Dataframe must contain 'document' and 'metadata' columns")

    os.makedirs(output_dir, exist_ok=True)

    # Labels written during this call, in first-seen order (dict used as an ordered set).
    written = {}

    for _, row in df.iterrows():
        metadata = row['metadata']
        # Replace characters that are awkward in file names; double quotes are dropped.
        metadata = metadata.replace(" ", "_").replace("/", "_").replace("\\", "_").replace(":", "_").replace("\"", "")
        document = preprocess_content(row['document'])

        # Truncate only on the first write of a label, and only when asked to.
        mode = 'w' if (overwrite and metadata not in written) else 'a'
        filename = os.path.join(output_dir, f"{metadata}.txt")
        with open(filename, mode, encoding='utf-8') as handle:
            handle.write(document + "\n\n")  # Add two newlines for separation
        written[metadata] = True

    print(f"Files created: {', '.join(f'{metadata}.txt' for metadata in written.keys())}")

In [23]:
# Export the sampled documents as text files, one file per sanitised metadata label.
write_text_to_files_by_metadata(df_doc_sample)


Files created: movie_30_Minutes_or_Less.txt, gutenberg_Up_the_Down_Staircase.txt, movie_Cobb_(film).txt, movie_The_Losers_(film).txt, gutenberg_The_Velveteen_Rabbit.txt, gutenberg_The_Two_Noble_Kinsmen.txt, movie_Rise_of_the_Guardians.txt, gutenberg_Honorine_(novel).txt, gutenberg_The_Adventure_of_the_Cardboard_Box.txt, movie_Yes_Man_(film).txt, movie_Fight_Club.txt


# Embed the .txt files into Milvus (GPU)

In [8]:
# Process the files under ".txt/" for the "narrative_qa_full_gpu" collection
# (presumably embedding and inserting them into Milvus — confirm in Utils.DataProcessor).
# NOTE(review): the two positional True flags are defined by directory_files_process,
# which is not visible here; confirm their meaning and consider passing them by keyword.
data_processor.directory_files_process("narrative_qa_full_gpu", ".txt/", True, True)

aggregating documents:   1%|          | 12/1102 [00:00<00:19, 55.33it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:   2%|▏         | 18/1102 [00:00<00:26, 41.06it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:   3%|▎         | 32/1102 [00:00<00:20, 52.07it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:   5%|▍         | 53/1102 [00:00<00:14, 72.21it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:   7%|▋         | 76/1102 [00:01<00:11, 90.42it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:   9%|▊         | 96/1102 [00:01<00:13, 77.17it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  10%|▉         | 105/1102 [00:01<00:14, 67.73it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  11%|█▏        | 126/1102 [00:01<00:13, 71.26it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  13%|█▎        | 146/1102 [00:02<00:12, 74.13it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  14%|█▍        | 155/1102 [00:02<00:14, 64.52it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  16%|█▌        | 171/1102 [00:02<00:14, 63.37it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  16%|█▌        | 178/1102 [00:02<00:15, 59.08it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  18%|█▊        | 193/1102 [00:02<00:15, 59.73it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  19%|█▉        | 211/1102 [00:03<00:14, 63.30it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  20%|█▉        | 218/1102 [00:03<00:15, 56.40it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  21%|██▏       | 235/1102 [00:03<00:13, 66.62it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  23%|██▎       | 255/1102 [00:03<00:10, 77.78it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  25%|██▍       | 272/1102 [00:04<00:10, 77.96it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  27%|██▋       | 293/1102 [00:04<00:09, 84.57it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  29%|██▊       | 316/1102 [00:04<00:08, 91.09it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  30%|███       | 336/1102 [00:04<00:09, 80.27it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  31%|███▏      | 345/1102 [00:04<00:09, 79.89it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  34%|███▍      | 376/1102 [00:05<00:07, 91.62it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  36%|███▋      | 400/1102 [00:05<00:06, 102.20it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  37%|███▋      | 411/1102 [00:05<00:08, 85.17it/s] 

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  39%|███▉      | 430/1102 [00:05<00:08, 81.24it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  40%|███▉      | 439/1102 [00:05<00:08, 80.09it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  42%|████▏     | 459/1102 [00:06<00:08, 76.88it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  44%|████▍     | 483/1102 [00:06<00:07, 86.48it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  45%|████▌     | 498/1102 [00:06<00:06, 97.56it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  46%|████▌     | 508/1102 [00:06<00:06, 89.42it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  48%|████▊     | 526/1102 [00:07<00:08, 69.55it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  50%|█████     | 552/1102 [00:07<00:06, 89.23it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  52%|█████▏    | 569/1102 [00:07<00:04, 109.47it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  54%|█████▎    | 592/1102 [00:07<00:06, 81.62it/s] 

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  55%|█████▍    | 601/1102 [00:07<00:06, 80.73it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  56%|█████▌    | 618/1102 [00:08<00:07, 68.69it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  58%|█████▊    | 641/1102 [00:08<00:05, 86.98it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  60%|██████    | 663/1102 [00:08<00:04, 88.56it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  62%|██████▏   | 682/1102 [00:08<00:05, 76.37it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  64%|██████▎   | 701/1102 [00:09<00:05, 74.99it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  64%|██████▍   | 709/1102 [00:09<00:05, 68.17it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  65%|██████▌   | 721/1102 [00:09<00:04, 79.69it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  67%|██████▋   | 739/1102 [00:09<00:05, 67.84it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  68%|██████▊   | 747/1102 [00:09<00:05, 59.79it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  69%|██████▉   | 764/1102 [00:10<00:05, 62.53it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  71%|███████▏  | 787/1102 [00:10<00:04, 70.20it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  73%|███████▎  | 808/1102 [00:10<00:03, 80.31it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  75%|███████▍  | 826/1102 [00:10<00:03, 81.01it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  77%|███████▋  | 845/1102 [00:11<00:03, 79.99it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  77%|███████▋  | 854/1102 [00:11<00:03, 74.42it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  80%|███████▉  | 881/1102 [00:11<00:02, 81.94it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  82%|████████▏ | 904/1102 [00:11<00:02, 93.03it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  83%|████████▎ | 914/1102 [00:12<00:02, 73.40it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  85%|████████▍ | 934/1102 [00:12<00:02, 82.01it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  86%|████████▋ | 952/1102 [00:12<00:01, 83.50it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  87%|████████▋ | 961/1102 [00:12<00:02, 67.96it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  89%|████████▉ | 982/1102 [00:12<00:01, 76.51it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  91%|█████████ | 999/1102 [00:13<00:01, 64.99it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  92%|█████████▏| 1014/1102 [00:13<00:01, 62.18it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  94%|█████████▎| 1033/1102 [00:13<00:00, 74.40it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  96%|█████████▌| 1053/1102 [00:13<00:00, 81.28it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  96%|█████████▋| 1062/1102 [00:14<00:00, 78.46it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  98%|█████████▊| 1079/1102 [00:14<00:00, 72.55it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents:  99%|█████████▉| 1096/1102 [00:14<00:00, 63.63it/s]

Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.
Data preprocessing done.


aggregating documents: 100%|██████████| 1102/1102 [00:14<00:00, 75.03it/s]


Data preprocessing done.
Data preprocessing done.
Now splitting document...
Split 1102 documents into 810567 chunks.
Document split successfully.

Now fitting sparse embedder with new documents...
Sparse embedder fitted and saved successfully.

Now generate embeddings and storing them in Milvus...
Successfully created collection narrative_qa_full_gpu with dense dimension 1536 and sparse embeddings.
Successfully created indexes for collection narrative_qa_full_gpu.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Content: I...
Content: I...
Inserted data into Milvus.
Inserted data int

# Import GraphRAG data into Neo4j

In [None]:
# Instantiate the knowledge-graph database client; the next cell calls
# transform_graph_rag_to_neo4j() on it.
# NOTE(review): KnowledgeGraphDatabase is presumably exported by `from Utils import *` — confirm.
knowledge_DB = KnowledgeGraphDatabase()

In [None]:
# Hand the GraphRAG output artifacts of one indexing run to the knowledge-graph
# client for transformation into Neo4j.
# NOTE(review): the "20240906-153334" folder name looks like a run timestamp, so
# this path is tied to one specific run — confirm before re-running.
graph_rag_artifacts_path = "../graph_rag_sample/output/20240906-153334/artifacts"
knowledge_DB.transform_graph_rag_to_neo4j(datapath=graph_rag_artifacts_path)

In [None]:
# Build a retriever and run a global retrieval. Because the call is the cell's
# last expression, its return value is shown as the cell output.
# NOTE(review): the meaning of the argument 0 is not visible in this notebook
# (presumably a community level) — confirm.
retriever = Retriever()
retriever.global_retrieve(0)

# Test Modular RAG

In [9]:
import pandas as pd

# Slice selecting list entries at index 1 and then index 0, i.e. the first two
# entries in reverse order (equivalent to the literal subscript [1::-1]).
first_two_reversed = slice(1, None, -1)

# Load the sampled NarrativeQA question/answer pairs.
rag_evaluation_dataset = pd.read_parquet(".parquet/narrative_qa_qa_sample_11.parquet")

# Pick the sample questions and print them.
question_column = rag_evaluation_dataset["questions"].tolist()
dataset_queries = question_column[first_two_reversed]
print(dataset_queries)
# print(vector_DB.list_collections())

# Pick the ground-truth answers with the same slice so they line up with the questions.
ground_truth_column = rag_evaluation_dataset["ground_truths"].tolist()
answer = ground_truth_column[first_two_reversed]
print(answer)




['Where does the fifth incarnation take place?', 'During the last decade what kind of life has Alexis Paulvitch lived?']
['New York New York ', 'A life of abuse and disease among tribal people. A life of abuse and disease. ']


In [1]:
from Module import *
from Config.output_pydantic import *

# Build the modular hybrid RAG workflow and stream a single query through its graph.
# NOTE(review): the recorded output of this cell ends with
# "TypeError: object of type 'NoneType' has no len()" right after "index =  0";
# the cause is not visible in this notebook.
workflow = WorkFlowModularHybridRAG()

# Input state handed to the graph for this run.
initial_state = {
    "specific_collection": "narrative_qa_standard_gpu",
    "user_query": "What is the meaning of life?",
    "all_results": [],
    "all_contexts": [],
    "repeat_times": 0,
}
results = workflow.graph.stream(initial_state)

# Print every item as the stream yields it.
for result in results:
    print(result)


* 'allow_population_by_field_name' has been renamed to 'populate_by_name'
* 'smart_union' has been removed
2024-09-14 15:47:36,577 - 127728978470400 - milvus_client.py-milvus_client:658 - DEBUG: Created new connection using: 24895a66ca2a4ab083e169c30e1ccb62


Initializing sparse embedder...
Embedder initialized
Connected to Milvus at localhost:19530 with database default.
VectorDatabase initialized.
GraphDatabase initialized.
Retriever initialized
Agents initialized (with model: "gpt-4o-mini" and temperature: "0.1")
Tasks initialized
MultiAgent RAG System initialized
index =  0


TypeError: object of type 'NoneType' has no len()

In [1]:
from Module import *
from Config.output_pydantic import *

# Build the unit-function-test variant of the workflow and stream one query through its graph.
workflow = WorkFlowModularHybridRAG_Unit_Function_Test()

# Input state handed to the graph: only the collection name and the question.
initial_state = {
    "specific_collection": "narrative_qa_full_gpu",
    "user_query": "What is the main topic of this dataset?",
}
results = workflow.graph.stream(initial_state)

# Print every item as the stream yields it; in the recorded output these are
# dicts keyed by node name (e.g. 'user_query_classification_node').
for result in results:
    print(result)

* 'allow_population_by_field_name' has been renamed to 'populate_by_name'
* 'smart_union' has been removed
2024-09-14 16:46:03,839 - 128019998758400 - milvus_client.py-milvus_client:658 - DEBUG: Created new connection using: a7d42744e32e499fb6064d0256358b7a


Initializing sparse embedder...
Embedder initialized
Connected to Milvus at localhost:19530 with database default.
VectorDatabase initialized.
GraphDatabase initialized.
Retriever initialized
Agents initialized (with model: "gpt-4o-mini" and temperature: "0.1")
Tasks initialized
MultiAgent RAG System initialized
[1m[95m [2024-09-14 16:46:04][DEBUG]: == Working Agent: Query Classifier[00m
[1m[95m [2024-09-14 16:46:04][INFO]: == Starting Task: 
Analyze the following user query and determine if it requires information retrieval to be answered accurately, while also evaluating its global or local scope:

User Query: "What is the main topic of this dataset?"

Your task is to:
1. Classify this query as either requiring retrieval or not (Boolean value)
2. Evaluate the query's domain range score (Integer value from 0 to 100)
3. Provide a brief justification for your decision (String value)
4. Pick up the most relevant keywords or entities from the user query (List[str])

Consider the foll



[1m[92m [2024-09-14 16:46:06][DEBUG]: == [Query Classifier] Task output: needs_retrieval=True domain_range_score=81 justification='The query asks about the main topic of a specific dataset, which requires external information retrieval to accurately identify the content and context of that dataset.' relevant_keywords=['main topic', 'dataset']

[00m
{'user_query_classification_node': {'user_query_classification_result': UserQueryClassificationResult(needs_retrieval=True, domain_range_score=81, justification='The query asks about the main topic of a specific dataset, which requires external information retrieval to accurately identify the content and context of that dataset.', relevant_keywords=['main topic', 'dataset'])}}
[1m[95m [2024-09-14 16:46:06][DEBUG]: == Working Agent: Reranker[00m
[1m[95m [2024-09-14 16:46:06][INFO]: == Starting Task: 
Your task is to evaluate each community's relevance to the user's query or sub-queries relevant to the user's query.
User Query: "What i



[1m[92m [2024-09-14 16:46:09][DEBUG]: == [Reranker] Task output: relevant_scores=[60, 50, 40, 55, 45, 70, 30, 25, 35, 65]

[00m
[1m[95m [2024-09-14 16:46:09][DEBUG]: == Working Agent: Topic Searcher[00m
[1m[95m [2024-09-14 16:46:09][INFO]: == Starting Task: 
You have received multiple pieces of community information related to a user query or sub-queries decomposed from the user query by descending relevance scores.

----------------User Query----------------
What is the main topic of this dataset?

----------------Sub-queries----------------
[]

Your task is to analyze this information and help prepare for a vector database search to answer the user's query.

Follow these steps:
0. If no community information is provided, return 2 empty list.
1. Carefully read and analyze all provided community information.
2. Summarize the key points from this information and from the user query into concise community summaries.
3. Based on these summaries, imagine what relevant document chun



[1m[95m [2024-09-14 16:46:55][DEBUG]: == Working Agent: Reranker[00m
[1m[95m [2024-09-14 16:46:55][INFO]: == Starting Task: 
Your task is to evaluate each retrieved data item's relevance to the user's query or sub-queries relevant to the user's query.
-----User Query-----
"What is the main topic of this dataset?"

-----Sub-queries-----
"[]"

Your specific responsibilities are:
1. Compare each data item to the user's query and sub-queries.
2. Assign a relevance score to each data item based on how well it matches the user's query from 0 to 100.
   - Higher scores indicate better relevance.
3. Create a list of these relevance scores, don't include any other information.
4. CRITICAL: Ensure the number of scores in your output list EXACTLY matches the number of data items :10 in the input.

You will receive a list of 10 data items. 
----------------batch_retrieved_data----------------
[{'content': 'serve the turn as well. CHAPTER XXII IMARS TALE--WAR "That which I have always admired 



[1m[92m [2024-09-14 16:46:56][DEBUG]: == [Reranker] Task output: relevance_scores=[10, 20, 15, 30, 25, 5, 10, 15, 20, 5]

[00m
[1m[92m [2024-09-14 16:46:57][DEBUG]: == [Reranker] Task output: relevance_scores=[20, 30, 10, 40, 50, 20, 15, 25, 5, 35]

[00m
[1m[92m [2024-09-14 16:46:57][DEBUG]: == [Reranker] Task output: relevance_scores=[10, 20, 15, 25, 30, 5, 10, 15, 20, 25]

[00m
[1m[92m [2024-09-14 16:46:57][DEBUG]: == [Reranker] Task output: relevance_scores=[10, 15, 20, 25, 30, 35, 40, 45, 50, 55]

[00m
all_data_with_scores =  [('The community revolves around Teresa, a cake shop owner turned nurse, her fiancé Vincenzo, and Luigi, a hall-boy facing wrongful accusations. Their interconnected stories reflect themes of resilience, love, justice, and the impact of war on personal and community dynamics.', 95), ('This community explores the interconnected themes of Spring, Earth, and Bellona, emphasizing the cyclical nature of life, renewal, and the contrast between natural be



[32;1m[1;3mI now can give a great answer
Final Answer: 

**1. Community Dynamics and Relationships:**
   - **Teresa's Community:** "The community revolves around Teresa, a cake shop owner turned nurse, her fiancé Vincenzo, and Luigi, a hall-boy facing wrongful accusations. Their interconnected stories reflect themes of resilience, love, justice, and the impact of war on personal and community dynamics."
   - **Spring, Earth, and Bellona:** "This community explores the interconnected themes of Spring, Earth, and Bellona, emphasizing the cyclical nature of life, renewal, and the contrast between natural beauty and the harsh realities of war."
   - **Palamon, Arcite, and Emilia:** "The community centers around the intricate relationships between Palamon, Arcite, and Emilia, highlighting themes of love, rivalry, and honor."
   - **Carl's Journey:** "The community revolves around Carl, whose interactions with key entities like Beer, Nametag, and the Police Station reflect broader societal

# Test the Retriever: local and global retrieval

In [None]:
from Utils import *
# Create a fresh retriever and knowledge-graph client for this section.
retriever = Retriever()
knowledge_DB = KnowledgeGraphDatabase()
# Disabled knowledge-graph setup calls.
# NOTE(review): presumably one-time setup that has already been run — confirm before re-enabling.
# knowledge_DB.create_entity_vector_index()
# knowledge_DB.create_community_weight()
# Run both retrieval modes and keep the results; the following cells display them.
# global_retrieve is called with an integer, local_retrieve with a list of query strings.
# NOTE(review): the meaning of the argument 0 is not visible in this notebook — confirm.
global_result = retriever.global_retrieve(0)
local_result = retriever.local_retrieve(["What is the meaning of life"])

In [None]:
# Display the result of retriever.local_retrieve(...) assigned in the previous cell.
local_result

In [None]:
# Display only the "communities" entry of the global retrieval result.
global_result["communities"]

In [None]:
# NOTE(review): in the active code of this cell, `json`, `const` and `retriever` are
# unused; `const` and `retriever` are referenced only by the disabled experiment below.
import Config.constants as const
import json
from MultiAgent import *
from Utils import *

retriever = Retriever()
multi_agent = MultiAgent_RAG()
# --- Disabled experiment: batch reranking of globally retrieved communities ---
# When enabled, it takes the "communities" list from a global retrieval, splits it
# into batches of const.NODE_BATCH_SIZE, sends the batches to
# topic_reranking_run_batch_async, and prints the returned scores together with
# the score count and the community count for comparison.
# all_communities = retriever.global_retrieve(0)["communities"]

# batches = []
# for i in range(0, len(all_communities), const.NODE_BATCH_SIZE):
#     batch_communities = all_communities[i:i + const.NODE_BATCH_SIZE]
#     batches.append({
#         "user_query": "What is the meaning of life",
#         "sub_queries": [],
#         "batch_communities": batch_communities,
#         "batch_size": len(batch_communities),
#     })


# all_scores = multi_agent.topic_reranking_run_batch_async(node_batch_inputs=batches).relevant_scores
# print(all_scores)
# print(len(all_scores))
# print(len(all_communities))

# Active part of the cell: run the user-query classification on one question.
# As the cell's last expression, the return value is shown as the cell output.
multi_agent.user_query_classification_run(user_query="Why does the author choose to use first-person point of view in this article?")



