# Load dataset from NarrativeQA

We use the NarrativeQA dataset for question generation and RAG evaluation.

In [1]:
# NOTE(review): wildcard import — DatasetLoader comes from the local Utils module, and
# presumably so do VectorDatabase, Embedder and DataProcessor used further down
# (nothing else in this notebook imports them). Confirm before narrowing the import.
from Utils import *
data_loader = DatasetLoader()
from datasets import load_dataset
import pandas as pd

  from tqdm.autonotebook import tqdm, trange


DatasetLoads initialized


In [2]:
import pandas as pd
from datasets import load_dataset

def extract_narrativeqa_text(split='train'):
    """
    Collect the unique (summary, document) pairs of a NarrativeQA split.

    An example is kept only when neither its summary text nor its document
    text has been seen before. Counts and average text lengths are printed.

    Args:
    split (str): Name of the split passed to `load_dataset`.

    Returns:
    pandas.DataFrame: One row per kept example with 'summary', 'document'
    and 'metadata' columns ('metadata' holds the document's 'kind' field).
    """
    rows = load_dataset("deepmind/narrativeqa", split=split)

    # Insertion-ordered maps from text to kind; they double as "already seen"
    # lookups, and both grow together so their orders stay aligned.
    kind_by_summary = {}
    kind_by_document = {}

    for row in rows:
        summary_text = row['document']['summary']['text']
        document_text = row['document']['text']
        kind = row['document']['kind']

        # Drop the example as soon as either text is a repeat
        if summary_text in kind_by_summary or document_text in kind_by_document:
            continue

        kind_by_summary[summary_text] = kind
        kind_by_document[document_text] = kind

    summaries = list(kind_by_summary)
    documents = list(kind_by_document)
    metadata = [kind_by_summary[text] for text in summaries]

    # Average character counts over the kept examples (0 when nothing was kept)
    num_examples = len(summaries)
    if num_examples > 0:
        avg_summary_chars = sum(map(len, summaries)) / num_examples
        avg_document_chars = sum(map(len, documents)) / num_examples
    else:
        avg_summary_chars = 0
        avg_document_chars = 0

    df = pd.DataFrame({
        'summary': summaries,
        'document': documents,
        'metadata': metadata
    })

    print(f'Number of unique examples: {num_examples}')
    print(f'Average summary length: {avg_summary_chars:.2f} characters')
    print(f'Average document length: {avg_document_chars:.2f} characters')

    return df

# Example usage
# df = extract_narrativeqa_text(split='train')

In [3]:
# Build the de-duplicated frame for the training split
df_text = extract_narrativeqa_text(split="train")
print(df_text.shape[0])
# Character counts of the first row's summary and full document
print(len(df_text["summary"].iloc[0]), len(df_text["document"].iloc[0]))

Number of unique examples: 1102
Average summary length: 3392.44 characters
Average document length: 343771.38 characters
1102
5098 798807


In [4]:
# Draw a 5% sample of the rows. The fixed seed keeps the sample (and the files
# and embeddings derived from it) identical across kernel restarts.
random_sample_text = df_text.sample(frac=0.05, random_state=42)

In [5]:
# Preview the first five sampled rows
random_sample_text.head(5)

Unnamed: 0,summary,document,metadata
1095,The narrative begins with the formation of th...,ï»¿The Project Gutenberg EBook of The American...,gutenberg
99,"Four days after the events of Rush Hour, LAPD...",<html>\n<head><title>Rush Hour 2 Script at IMS...,movie
358,"The first story, ""The Blonde Lady"", opens wit...",ï»¿Project Gutenberg's ArsÃ¨ne Lupin versus He...,gutenberg
320,Junior risk analyst Seth Bregman (Penn Badgle...,<html>\n<head><title>Margin Call Script at IMS...,movie
550,"The film takes place in 1936, at the height o...",<html>\n<head>\n<script>\n<!--\n\n/*\nBreak-ou...,movie


In [10]:
# Number of rows in the 5% sample
random_sample_text.shape[0]

55

# Write the DataFrame documents to .txt files

In [8]:
def write_text_to_files_by_metadata(df, output_dir=".txt"):
    """
    Append every document to a text file named after its metadata value.

    Each distinct value in the 'metadata' column gets one file,
    `<output_dir>/<metadata>.txt`. Files are opened in append mode, so
    calling this again adds to existing files instead of replacing them.

    Args:
    df (pandas.DataFrame): DataFrame with 'document' and 'metadata' columns.
    output_dir (str): Directory that receives the files; created if missing.

    Raises:
    ValueError: If 'document' or 'metadata' is not a column of `df`.
    """
    import os  # local import so the function does not rely on other cells

    # Ensure the dataframe has the required columns
    if not all(col in df.columns for col in ['document', 'metadata']):
        raise ValueError("Dataframe must contain 'document' and 'metadata' columns")

    # open() does not create parent directories, so make sure the target exists
    os.makedirs(output_dir, exist_ok=True)

    # One open handle per metadata value, reused for all of its documents
    file_handles = {}

    try:
        for metadata, document in zip(df['metadata'], df['document']):
            # Create or get file handle
            if metadata not in file_handles:
                filename = os.path.join(output_dir, f"{metadata}.txt")
                file_handles[metadata] = open(filename, 'a', encoding='utf-8')

            # Two newlines separate consecutive documents
            file_handles[metadata].write(document + "\n\n")

    finally:
        # Close every handle even if a write failed part-way through
        for handle in file_handles.values():
            handle.close()

    print(f"Files created: {', '.join(f'{metadata}.txt' for metadata in file_handles.keys())}")

In [9]:
# Append each sampled document to the .txt file of its kind
write_text_to_files_by_metadata(df=random_sample_text)

Files created: gutenberg.txt, movie.txt


# Embed the DataFrame documents into Milvus (GPU)

In [None]:
# VectorDatabase, Embedder and DataProcessor are not imported explicitly in this
# notebook — presumably they come from the wildcard `from Utils import *`.
vector_DB = VectorDatabase()
embedder = Embedder()
# The processor is wired to the embedder and vector database created above
data_processor = DataProcessor(embedder=embedder, vectordatabase=vector_DB)

Connected to Milvus at localhost:19530 with database default.
VectorDatabase initialized.
Initializing sparse embedder...
Embedder initialized
Data Processor initialized


[nltk_data] Downloading package words to /home/yarikama/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [8]:
from langchain.schema.document import Document

def transform_to_langchain_documents(df):
    """
    Build one Langchain Document per DataFrame row.

    The row's 'document' value becomes the page content and its 'metadata'
    value is stored under the "kind" key of the Document metadata.

    Args:
    df (pandas.DataFrame): DataFrame with 'document' and 'metadata' columns.

    Returns:
    list: Langchain Document objects, in row order.
    """
    return [
        Document(
            page_content=record['document'],
            metadata={"kind": record['metadata']},
        )
        for _, record in df.iterrows()
    ]



In [9]:
# Wrap every sampled row in a Langchain Document
docs = transform_to_langchain_documents(df=random_sample_text)

ValidationError: 1 validation error for Document
metadata
  value is not a valid dict (type=type_error.dict)

# Embed the .txt files into Milvus (GPU)

In [None]:
# Process the files under ".txt/" (the per-kind text files written earlier);
# "routing_narrativeqa" is presumably the target collection name.
# NOTE(review): the meaning of the two positional True flags is not visible in
# this notebook — confirm against DataProcessor.directory_files_process.
data_processor.directory_files_process("routing_narrativeqa", ".txt/", True, True)