# Load dataset from NarrativeQA

We use the NarrativeQA dataset for question generation and RAG evaluation.

In [1]:
# NOTE(review): the wildcard import is presumably what provides DatasetLoader,
# VectorDatabase, Embedder and DataProcessor used below — confirm against Utils
# and prefer explicit imports so the origin of each name is visible.
from Utils import *
data_loader = DatasetLoader()
from datasets import load_dataset
import pandas as pd

# Instantiate the objects the rest of the notebook relies on; the processor
# receives both the embedder and the vector database.
vector_DB = VectorDatabase()
embedder = Embedder()
data_processor = DataProcessor(embedder=embedder, vectordatabase=vector_DB)

DatasetLoads initialized
Connected to Milvus at localhost:19530 with database default.
VectorDatabase initialized.
Initializing sparse embedder...
Embedder initialized
Data Processor initialized


[nltk_data] Downloading package words to /home/yarikama/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [None]:

def extract_narrativeqa_text(split='train', dataset=None):
    """
    Collect the unique (summary, document) pairs of NarrativeQA into a DataFrame.

    An example is kept only when neither its summary text nor its document
    text has been seen before, so repeated questions about the same story
    contribute a single row.

    Args:
    split (str): Split passed to `load_dataset` when `dataset` is not given.
    dataset (iterable, optional): Already-loaded examples to use instead of
        loading "deepmind/narrativeqa". Each example must provide
        example['document']['summary']['text'], example['document']['text']
        and example['document']['kind'].

    Returns:
    pandas.DataFrame: Columns 'summary', 'document' and 'metadata' (the
    document kind), one row per unique pair, in first-seen order.
    """
    # Load the dataset only when the caller did not supply one
    if dataset is None:
        dataset = load_dataset("deepmind/narrativeqa", split=split)

    seen_summaries = set()
    seen_documents = set()
    records = []

    for example in dataset:
        doc_info = example['document']
        summary = doc_info['summary']['text']
        document = doc_info['text']

        # Only add if both summary and document are unique
        if summary in seen_summaries or document in seen_documents:
            continue

        seen_summaries.add(summary)
        seen_documents.add(document)
        # One record per row keeps summary, document and kind aligned by construction
        records.append({
            'summary': summary,
            'document': document,
            'metadata': doc_info['kind'],
        })

    # Explicit columns so an empty result still has the expected schema
    df = pd.DataFrame(records, columns=['summary', 'document', 'metadata'])

    # Calculate averages (guarding against an empty result)
    num_examples = len(records)
    total_summary_chars = sum(len(record['summary']) for record in records)
    total_document_chars = sum(len(record['document']) for record in records)
    avg_summary_chars = total_summary_chars / num_examples if num_examples > 0 else 0
    avg_document_chars = total_document_chars / num_examples if num_examples > 0 else 0

    print(f'Number of unique examples: {num_examples}')
    print(f'Average summary length: {avg_summary_chars:.2f} characters')
    print(f'Average document length: {avg_document_chars:.2f} characters')

    return df

# Example usage
# df = extract_narrativeqa_text(split='train')

In [None]:
# Build the de-duplicated training frame and report how many rows it has.
df_text = extract_narrativeqa_text(split="train")
print(len(df_text))

# Character lengths of the first summary and the first document.
first_summary = df_text["summary"][0]
first_document = df_text["document"][0]
print(len(first_summary), len(first_document))

In [None]:
# Number of rows in the de-duplicated frame.
df_text.shape[0]

In [None]:
# Draw a 5% sample of the rows. The fixed seed makes the sample (and everything
# derived from it) identical across kernel restarts.
random_sample_text = df_text.sample(frac=0.05, random_state=42)
len(random_sample_text)

# Write the DataFrame documents to .txt files

In [None]:
def write_text_to_files_by_metadata(df, output_dir=".txt/", overwrite=False):
    """
    Write every document to a text file named after its metadata value.

    All rows sharing a metadata value go to `<output_dir>/<metadata>.txt`,
    each document followed by a blank line.

    Args:
    df (pandas.DataFrame): DataFrame with 'document' and 'metadata' columns.
    output_dir (str): Directory receiving the files; created if missing.
    overwrite (bool): When False (default) documents are appended to existing
        files, so calling this twice duplicates the content. Pass True to
        truncate each file the first time it is opened during the call.

    Raises:
    ValueError: If 'document' or 'metadata' is missing from the DataFrame.
    """
    import os
    from contextlib import ExitStack

    # Ensure the dataframe has the required columns
    if not all(col in df.columns for col in ['document', 'metadata']):
        raise ValueError("Dataframe must contain 'document' and 'metadata' columns")

    # open() does not create missing directories, so make sure the target exists
    os.makedirs(output_dir, exist_ok=True)
    mode = 'w' if overwrite else 'a'

    # One handle per metadata value; ExitStack closes them all, even on error
    file_handles = {}
    with ExitStack() as stack:
        for metadata, document in zip(df['metadata'], df['document']):
            if metadata not in file_handles:
                filename = os.path.join(output_dir, f"{metadata}.txt")
                file_handles[metadata] = stack.enter_context(
                    open(filename, mode, encoding='utf-8')
                )

            # Two newlines separate consecutive documents
            file_handles[metadata].write(document + "\n\n")

    print(f"Files created: {', '.join(f'{metadata}.txt' for metadata in file_handles.keys())}")

In [None]:
# Dump the full de-duplicated frame, one text file per document kind.
write_text_to_files_by_metadata(df=df_text)

# Embed the DataFrame documents into Milvus (GPU)

In [None]:
from langchain.schema.document import Document

def transform_to_langchain_documents(df):
    """
    Wrap every row of the DataFrame in a Langchain Document.

    Args:
    df (pandas.DataFrame): DataFrame with 'document' and 'metadata' columns.
        Other columns are ignored.

    Returns:
    list: One Langchain Document per row, in row order, whose page_content is
    the row's 'document' and whose metadata is {"kind": <row's 'metadata'>}.
    """
    return [
        Document(
            page_content=record['document'],
            metadata={"kind": record['metadata']},
        )
        for _, record in df.iterrows()
    ]


In [None]:
# Convert the sampled rows into Langchain documents.
docs = transform_to_langchain_documents(df=random_sample_text)

# Embed the .txt files into Milvus (GPU)

In [2]:
# Target collection and the directory holding the generated text files.
# NOTE(review): the meaning of the two positional True flags is not visible in
# this notebook — confirm against DataProcessor.directory_files_process.
collection_name = "narrative_qa_standard_gpu"
txt_directory = ".txt/"
data_processor.directory_files_process(collection_name, txt_directory, True, True)

aggregating documents: 100%|██████████| 2/2 [00:00<00:00,  5.05it/s]


Now splitting document...
Split 2 documents into 978866 chunks.
Document split successfully.

Now fitting sparse embedder with new documents...
Sparse embedder fitted and saved successfully.

Now generate embeddings and storing them in Milvus...
Successfully created collection narrative_qa_standard_gpu with dense dimension 1536 and sparse embeddings.
Successfully created indexes for collection narrative_qa_standard_gpu.
Inserted data into Milvus.
Content: I...
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Content: *       *       *       *       *...
Content: *       *       *       *       *...
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Inserted data into Milvus.
Content: *       *       *       *       *...
Inserted data into Milvus.
Inserted data into 

# Embedding test

In [4]:
# NOTE(review): this call is not the cell's last expression, so its return
# value is not displayed — confirm whether it is still needed here.
vector_DB.list_collections()
# Drop the collection that the directory-processing cell populated.
vector_DB.drop_collection("narrative_qa_standard_gpu")

Collection narrative_qa_standard_gpu dropped.


In [5]:
# Show the remaining collections to confirm the drop above took effect.
vector_DB.list_collections()


['narrative_test',
 'narrative_test_cpu',
 'hotpot_qa',
 'alice2',
 'alice',
 'squad']