# Load dataset from NarrativeQA

We use the NarrativeQA dataset for question generation and RAG evaluation.

In [2]:
# NOTE(review): star import - DatasetLoader, VectorDatabase, Embedder and DataProcessor
# are presumably provided by Utils; confirm, and consider importing them by name.
from Utils import *
# Not referenced again in the cells visible in this notebook.
data_loader = DatasetLoader()
from datasets import load_dataset
import pandas as pd

# Services handed to the DataProcessor that the final embedding cell uses.
# The recorded cell output reports a connection to Milvus at localhost:19530,
# so a running Milvus instance appears to be required before executing this cell.
vector_DB = VectorDatabase()
embedder = Embedder()
data_processor = DataProcessor(embedder=embedder, vectordatabase=vector_DB)

DatasetLoads initialized
Connected to Milvus at localhost:19530 with database default.
VectorDatabase initialized.
Initializing sparse embedder...
Embedder initialized
Data Processor initialized


[nltk_data] Downloading package words to /home/yarikama/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [3]:

def extract_narrativeqa_text(split='train'):
    """Collect the distinct (summary, document) pairs of a NarrativeQA split.

    The "deepmind/narrativeqa" dataset is loaded for the requested split and
    scanned in order. An example is kept only when neither its summary text
    nor its document text has been kept before; the document's 'kind' value
    is recorded alongside it.

    Args:
        split: Name of the dataset split passed to ``load_dataset``.

    Returns:
        pandas.DataFrame with the columns 'summary', 'document' and
        'metadata' (the 'kind' value), one row per kept example.

    Side effects:
        Prints the number of kept examples and the average character length
        of the kept summaries and documents.
    """
    rows = load_dataset("deepmind/narrativeqa", split=split)

    # Texts already kept, for O(1) membership checks
    seen_summaries = set()
    seen_documents = set()

    # Parallel lists; index i of each list belongs to the same example
    summaries = []
    documents = []
    kinds = []

    for row in rows:
        summary_text = row['document']['summary']['text']
        document_text = row['document']['text']
        kind = row['document']['kind']

        # Skip the example as soon as either text has been seen before
        if summary_text in seen_summaries or document_text in seen_documents:
            continue

        seen_summaries.add(summary_text)
        seen_documents.add(document_text)
        summaries.append(summary_text)
        documents.append(document_text)
        kinds.append(kind)

    # Averages over the kept examples (0 when nothing was kept)
    num_examples = len(summaries)
    if num_examples > 0:
        avg_summary_chars = sum(map(len, summaries)) / num_examples
        avg_document_chars = sum(map(len, documents)) / num_examples
    else:
        avg_summary_chars = 0
        avg_document_chars = 0

    df = pd.DataFrame({
        'summary': summaries,
        'document': documents,
        'metadata': kinds
    })

    print(f'Number of unique examples: {num_examples}')
    print(f'Average summary length: {avg_summary_chars:.2f} characters')
    print(f'Average document length: {avg_document_chars:.2f} characters')

    return df

# Example usage
# df = extract_narrativeqa_text(split='train')

In [4]:
# Build the de-duplicated training-split frame, then print its row count and the
# character lengths of the summary and document stored at index label 0.
df_text = extract_narrativeqa_text(split="train")
print(len(df_text))
print(len(df_text["summary"][0]), len(df_text["document"][0]))

Resolving data files:   0%|          | 0/24 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/24 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/18 [00:00<?, ?it/s]

Number of unique examples: 1102
Average summary length: 3392.44 characters
Average document length: 343771.38 characters
1102
5098 798807


In [5]:
# Row count of the de-duplicated frame (bare expression so the notebook displays it).
len(df_text)

1102

In [8]:
# Draw roughly 1% of the rows. The seed is fixed so that re-running the notebook
# selects the same rows instead of a different sample on every execution.
random_sample_text = df_text.sample(frac=0.01, random_state=42)
# random_sample_text.to_parquet(".parquet/narrative_qa_sample_11.parquet")


In [11]:
# Load the previously saved sample. The to_parquet call that would write this file
# is commented out in the sampling cell, so the file must already exist on disk.
samples = pd.read_parquet(".parquet/narrative_qa_sample_11.parquet")
# Bare last expression so the notebook renders the frame.
samples


Unnamed: 0,summary,document,metadata
4,"""Crash"" Davis (Costner), a veteran of 12 year...",<html>\n<head><title>Bull Durham Script at IMS...,movie
156,The Vicar - Dr Charles Primrose - lives an id...,ï»¿The Project Gutenberg EBook of The Vicar of...,gutenberg
805,At a bistro in the Montmartre district of Par...,<html>\n<head><title>Ronin Script at IMSDb.</t...,movie
492,According to The Oxford Companion to English ...,"ï»¿The Project Gutenberg EBook of Adam Bede, b...",gutenberg
713,"In the woods outside of Cherry Falls, Virgini...",<html>\n<head><title>Cherry Falls Script at IM...,movie
599,Tom Swift's father has been working diligentl...,ï»¿Project Gutenberg's Tom Swift and his Subma...,gutenberg
318,"The novel, which is intensely autobiographica...",ï»¿The Project Gutenberg EBook of The Book of ...,gutenberg
19,"In the winter of 1987, Minneapolis car salesm...",<html>\n<head><title>Fargo Script at IMSDb.</t...,movie
617,A wealthy American man named Longmore is intr...,ï»¿The Project Gutenberg EBook of Madame de Ma...,gutenberg
472,Trevor Gooden (Dean Winters) survives a car a...,<html>\n<head><title>Hellraiser: Hellseeker Sc...,movie


# Write the DataFrame documents to .txt files

In [23]:

import re
from bs4 import BeautifulSoup
import unicodedata

def preprocess_content(content: str) -> str:
    """Strip markup from a raw document and normalise it to plain text.

    Steps, in order: parse with BeautifulSoup's 'html.parser' and keep only
    the text; drop a mis-decoded byte-order mark at the start; apply NFKC
    normalisation; remove URLs; collapse runs of spaces/tabs; map curly double
    quotes to the ASCII double quote; delete every character that is not a
    word character, whitespace or one of ``. , ! ? ; : ( ) " -``; collapse
    repeated punctuation; and put exactly one space after each punctuation
    mark.

    Args:
        content: Raw document text, possibly containing HTML.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    soup = BeautifulSoup(content, 'html.parser')
    text = soup.get_text()

    # A UTF-8 byte-order mark decoded as Latin-1/cp1252 shows up as these three
    # characters. The character filter below would delete the last two but keep
    # the first (it is a letter), leaving a stray character in front of the text.
    mojibake_bom = '\u00ef\u00bb\u00bf'
    if text.startswith(mojibake_bom):
        text = text[len(mojibake_bom):]

    # Normalise to NFKC form
    text = unicodedata.normalize('NFKC', text)

    # Remove URLs
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

    # Collapse runs of spaces and tabs (newlines are left alone here)
    text = re.sub(r'[ \t]+', ' ', text).strip()

    # Unify quotes: map curly double quotes to the ASCII double quote. This has to
    # happen before the character filter, which would otherwise delete them.
    text = text.translate(str.maketrans({
        '\u201c': '"',
        '\u201d': '"',
        '\u201e': '"',
        '\u201f': '"',
    }))

    # Remove special characters, keeping a small set of punctuation marks
    text = re.sub(r'[^\w\s.,!?;:()"-]', '', text)

    # Collapse consecutive repeats of the same punctuation mark
    text = re.sub(r'([.,!?;:])\1+', r'\1', text)

    # Ensure a single space after punctuation; note that \s* also consumes any
    # newlines that directly follow the punctuation mark
    text = re.sub(r'([.,!?;:])\s*', r'\1 ', text)

    return text.strip()

def write_text_to_files_by_metadata(df, output_dir=".txt"):
    """Append each row's cleaned document to a text file named after its metadata.

    Every row's 'document' value is passed through ``preprocess_content`` and
    appended, followed by a blank line, to ``<output_dir>/<metadata>.txt``.
    Files are opened in append mode, so calling this again with the same rows
    adds the same documents a second time.

    Args:
        df: pandas.DataFrame with at least the columns 'document' and 'metadata'.
        output_dir: Directory that receives the files; created if it is missing.
            Defaults to ".txt".

    Raises:
        ValueError: If 'document' or 'metadata' is not a column of ``df``.

    Side effects:
        Creates ``output_dir`` when needed, writes the files, and prints the
        names of the files that were written to.
    """
    import os
    from contextlib import ExitStack

    # Ensure the dataframe has the required columns
    if not all(col in df.columns for col in ['document', 'metadata']):
        raise ValueError("Dataframe must contain 'document' and 'metadata' columns")

    # open() does not create missing parent directories, so make sure it exists
    os.makedirs(output_dir, exist_ok=True)

    # One open handle per distinct metadata value
    file_handles = {}

    # ExitStack closes every handle opened so far, even if a row raises midway
    with ExitStack() as stack:
        for _, row in df.iterrows():
            metadata = row['metadata']
            document = preprocess_content(row['document'])

            # Create or get file handle
            if metadata not in file_handles:
                filename = os.path.join(output_dir, f"{metadata}.txt")
                file_handles[metadata] = stack.enter_context(
                    open(filename, 'a', encoding='utf-8')
                )

            # Two newlines separate consecutive documents in the same file
            file_handles[metadata].write(document + "\n\n")

    print(f"Files created: {', '.join(f'{metadata}.txt' for metadata in file_handles.keys())}")

In [24]:
# The target files are opened in append mode, so re-running this cell appends
# the same documents to the existing files again.
write_text_to_files_by_metadata(samples)

Files created: movie.txt, gutenberg.txt


# Embed the DataFrame documents into Milvus (GPU)

In [None]:
from langchain.schema.document import Document

def transform_to_langchain_documents(df):
    """
    Wrap every row of the DataFrame in a Langchain Document.

    Args:
    df (pandas.DataFrame): Frame whose 'document' column supplies the page
        content and whose 'metadata' column supplies the value stored under
        the "kind" metadata key. No other column is read.

    Returns:
    list: One Langchain Document per row, in row order.
    """
    return [
        Document(
            page_content=record['document'],
            metadata={
                "kind": record['metadata'],
            }
        )
        for _, record in df.iterrows()
    ]


In [None]:
# Converts the in-memory random sample (not the parquet-backed `samples` frame).
docs = transform_to_langchain_documents(random_sample_text)

# Embed the .txt files into Milvus (GPU)

In [None]:
# Process the files under ".txt/", the directory write_text_to_files_by_metadata writes to.
# NOTE(review): "narrative_qa_standard_gpu" is presumably the target collection name and
# the meaning of the two True flags is not visible here - confirm against
# DataProcessor.directory_files_process.
data_processor.directory_files_process("narrative_qa_standard_gpu", ".txt/", True, True)