In [1]:
import base64, gzip, json, os, re, sys
import text_utils as tu
import eval_utils as eu
import pandas as pd
from collections import Counter
from pprint import pprint
from json.decoder import JSONDecodeError
from absl import app
from absl import flags
from bs4 import BeautifulSoup


import jinja2
import numpy as np

# Split to process; the cells below can be re-run with the training split instead.
dataset = "dev"
# Folder holding the v1.0 files for that split, located under the user's home directory.
DATA_DIR = os.path.join(os.path.expanduser("~"), "data", "v1.0", dataset)

In [2]:
def read_json(file_path):
    """
    Read a JSON Lines file, or a file holding one JSON document, into a list.

    Files whose name ends in ``.gz`` are opened with ``gzip``; anything else is
    opened as plain text. Both are decoded as UTF-8 and read fully into memory.

    Every non-blank line is parsed as its own JSON value. If a line fails to
    parse before any record has been collected, the whole content is tried
    once as a single JSON document (for example a pretty-printed object or
    array). Lines that still cannot be parsed are skipped silently.

    Args:
        file_path (str or os.PathLike): Path to the JSON or gzipped JSON file

    Returns:
        list: Parsed JSON values. A whole-file document that is not itself a
        list is wrapped in a one-element list, so the result is always a list.
    """
    # Accept pathlib.Path as well as str; the suffix test below needs a string.
    file_path = os.fspath(file_path)

    is_gzipped = file_path.endswith('.gz')
    open_method = gzip.open if is_gzipped else open
    mode = 'rt' if is_gzipped else 'r'

    with open_method(file_path, mode, encoding='utf-8') as file:
        content = file.read()

    parsed_objects = []
    whole_file_tried = False

    for line in content.splitlines():
        stripped = line.strip()
        if not stripped:
            # Blank lines carry no record and must not trigger the fallback.
            continue
        try:
            parsed_objects.append(json.loads(stripped))
        except JSONDecodeError:
            # Once records exist (or the fallback already failed) a bad line is
            # simply dropped; re-parsing the whole content again cannot help.
            if parsed_objects or whole_file_tried:
                continue
            whole_file_tried = True
            try:
                whole = json.loads(content)
            except JSONDecodeError:
                continue
            # Keep the documented "list" contract for single-document files.
            return whole if isinstance(whole, list) else [whole]

    return parsed_objects

# dict_keys(['annotations', 'document_html', 'document_title', 'document_tokens', 'document_url', 'example_id', 'long_answer_candidates', 'question_text', 'question_tokens'])
#dev_nq = read_json('nq-dev-sample.jsonl.gz')
#dev_nq_simplified = read_json('simplified-nq-dev-sample.jsonl.gz')

In [3]:
import gzip
import json
from pathlib import Path

def read_simplified_nq(data_dir, dataset="dev", start_file=0, end_file=None):
    """
    Lazily yield examples from the numbered simplified-NQ shards in a folder.

    Shards are the files matching ``simplified-nq-{dataset}-??.jsonl.gz`` inside
    ``data_dir``. They are visited in sorted name order, and the name of each
    shard is printed just before its first line is read.

    Args:
        data_dir: Directory containing the simplified files
        dataset: Split name embedded in the file names ('train' or 'dev')
        start_file: Lowest shard number to include (inclusive)
        end_file: Highest shard number to include (inclusive); None means no upper bound

    Yields:
        dict: One parsed JSON object per line of each selected shard
    """
    def shard_number(path):
        # The shard number is the last two characters before the first dot in the name.
        stem = path.name.split('.')[0]
        return int(stem[-2:])

    def is_selected(path):
        number = shard_number(path)
        if end_file is not None and number > end_file:
            return False
        return number >= start_file

    candidates = sorted(Path(data_dir).glob(f"simplified-nq-{dataset}-??.jsonl.gz"))
    # Resolve the selection up front so a badly named shard fails before anything is yielded.
    selected = [path for path in candidates if is_selected(path)]

    for shard in selected:
        print(f"Reading {shard.name}")
        with gzip.open(shard, 'rt', encoding='utf-8') as handle:
            for raw_line in handle:
                yield json.loads(raw_line.strip())



In [4]:
def get_answer_text(simplified_example):
    """
    Collect the answer strings marked by any annotator of one simplified NQ example.

    No agreement between annotators is required. Long and short answer texts
    are de-duplicated in first-seen order; yes/no answers are kept once per
    annotation that gives one (duplicates included).

    Args:
        simplified_example: Output from simplify_nq_example()

    Returns:
        Dictionary with 'example_id', 'question', 'document_title' and the lists
        'long_answers', 'short_answers', 'yes_no_answers'
    """
    # presumably the document's token strings, indexable by the span offsets — confirm in text_utils
    tokens = tu.get_nq_tokens(simplified_example)

    long_texts, short_texts, yes_no_votes = [], [], []
    result = {
        'example_id': simplified_example['example_id'],
        'question': simplified_example['question_text'],
        'document_title': simplified_example['document_title'],
        'long_answers': long_texts,
        'short_answers': short_texts,
        'yes_no_answers': yes_no_votes
    }

    def span_text(span):
        # Join the tokens in [start_token, end_token) with single spaces.
        return ' '.join(tokens[span['start_token']:span['end_token']])

    for annotation in simplified_example['annotations']:
        # A start_token of -1 marks "no long answer" for this annotation.
        long_span = annotation['long_answer']
        if long_span['start_token'] != -1:
            candidate = span_text(long_span)
            if candidate not in long_texts:
                long_texts.append(candidate)

        for short_span in annotation['short_answers']:
            candidate = span_text(short_span)
            if candidate not in short_texts:
                short_texts.append(candidate)

        # A missing key and the literal 'NONE' both mean "no yes/no answer".
        verdict = annotation.get('yes_no_answer', 'NONE')
        if verdict != 'NONE':
            yes_no_votes.append(verdict)

    return result

In [5]:
# read_simplified_nq is a generator, so no file is opened until simplified_nqex is iterated.
# The configured split is passed explicitly; without it the call always read "dev",
# even when `dataset` in the config cell had been switched.
simplified_nqex = read_simplified_nq(DATA_DIR, dataset=dataset)

In [11]:
def convert_spans_to_dict(obj):
    """
    Recursively turn custom objects into plain dicts/lists.

    Containers are checked before the generic ``__dict__`` test. Dict and list
    subclasses such as ``Counter`` or ``OrderedDict`` also expose an instance
    ``__dict__``; testing that first would replace their contents with that
    (usually empty) attribute mapping.

    Args:
        obj: Any value; nested dicts, lists, tuples and objects are walked.

    Returns:
        dict for mappings and for objects that have a ``__dict__`` (built from
        their attributes), list for lists and tuples, and the value itself
        for everything else.
    """
    if isinstance(obj, dict):
        return {k: convert_spans_to_dict(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [convert_spans_to_dict(item) for item in obj]
    if hasattr(obj, '__dict__'):
        # Custom objects are represented by their attribute dictionary.
        return {k: convert_spans_to_dict(v) for k, v in vars(obj).items()}
    return obj

In [None]:
def process_and_combine(file_path):
    """
    Enrich one simplified NQ shard and write it next to the input as a "combined-" file.

    Each example gets a 'processed_answers' entry built by get_answer_text.
    When its example_id is present in the mapping returned by
    eu.read_annotation, it also gets 'gold_has_long_answer' and
    'gold_has_short_answer'.

    Args:
        file_path: Path (str or Path) of a gzipped JSONL shard whose file name
            contains 'simplified-'.

    Returns:
        tuple: (output_path as str, number of examples read from the input).
        Examples that fail JSON serialization are reported and left out of the
        output file, but are still included in the count.

    Raises:
        ValueError: If the file name does not contain 'simplified-', because the
            derived output path would then be the input file itself.
    """
    source_path = str(file_path)

    # Rewrite only the file name: replacing across the whole path would also
    # alter any directory component that happens to contain 'simplified-'.
    directory, file_name = os.path.split(source_path)
    output_path = os.path.join(directory, file_name.replace('simplified-', 'combined-'))
    if output_path == source_path:
        raise ValueError(
            f"Cannot derive an output name from {source_path!r}: "
            "file name does not contain 'simplified-'"
        )

    # presumably a mapping keyed by example_id — confirm in eval_utils
    annotations = eu.read_annotation(source_path)

    combined_data = []
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            example = json.loads(line)
            example_id = example['example_id']

            # Gold flags are only available for examples that have annotations.
            if example_id in annotations:
                example['gold_has_long_answer'] = eu.gold_has_long_answer(annotations[example_id])
                example['gold_has_short_answer'] = eu.gold_has_short_answer(annotations[example_id])

            # Keep only the answer lists; id, question and title are already on the example.
            answer_text = get_answer_text(example)
            example['processed_answers'] = {
                'long_answers': answer_text['long_answers'],
                'short_answers': answer_text['short_answers'],
                'yes_no_answers': answer_text['yes_no_answers']
            }

            combined_data.append(example)

    # Written only after the whole input was read, so a failure while reading
    # never leaves a partial output file behind.
    with gzip.open(output_path, 'wt', encoding='utf-8') as f:
        for item in combined_data:
            try:
                f.write(json.dumps(item) + '\n')
            except TypeError as e:
                print(f"Error serializing item: {e}")
                # Point at the top-level keys that hold the non-serializable values.
                for k, v in item.items():
                    try:
                        json.dumps(v)
                    except TypeError:
                        print(f"Problem with key: {k}, type: {type(v)}")

    return output_path, len(combined_data)

# Process all files with progress tracking
# Every "simplified-nq-<dataset>-NN.jsonl.gz" shard in DATA_DIR is handled in sorted
# name order; process_and_combine writes the matching "combined-" file for each one.
# The per-file counts are summed into total_processed.
pattern = f"simplified-nq-{dataset}-??.jsonl.gz"
files = sorted(Path(DATA_DIR).glob(pattern))
total_processed = 0

for file in files:
    print(f"\nProcessing {file}")
    output_file, num_processed = process_and_combine(file)
    total_processed += num_processed
    print(f"Processed {num_processed} examples")
    print(f"Saved combined data to {output_file}")

print(f"\nTotal examples processed: {total_processed}")

In [6]:
# Example usage:
# Paths to shard 00 of the configured split inside DATA_DIR.
# NOTE(review): original_file and simplified_file are built from the same
# "simplified-nq-..." name and therefore point at the same file — confirm whether
# original_file was meant to reference the un-simplified shard instead.
original_file = Path(DATA_DIR) / f"simplified-nq-{dataset}-00.jsonl.gz"
simplified_file = Path(DATA_DIR) / f"simplified-nq-{dataset}-00.jsonl.gz"
combined_file = Path(DATA_DIR) / f"combined-nq-{dataset}-00.jsonl.gz"

In [7]:
# Load each of the three files completely into memory as a list of parsed records.
# read_json takes a string path, hence the str() conversions of the Path objects.
nq_og = read_json(str(original_file))
nq_simplified = read_json(str(simplified_file))
nq_combined = read_json(str(combined_file))

In [None]:
# Show the second record loaded from original_file.
nq_og[1]

In [None]:
# Show the second record loaded from simplified_file.
nq_simplified[1]

In [None]:
# List the top-level keys of the first record loaded from combined_file.
nq_combined[0].keys()

In [None]:
def inspect_files(original_file, combined_file, num_examples=1):
    """
    Print a side-by-side summary of the first examples of two gzipped JSONL files.

    Only the first ``num_examples`` lines of each file are read and parsed, so
    inspecting a few examples does not require decoding the whole shard.
    Examples are paired by position. For each pair the function prints both
    key lists, the keys present only in the combined example, the question and
    the 'processed_answers' entry (plus 'annotation_eval' when that key exists).

    Args:
        original_file: Path to simplified NQ file
        combined_file: Path to combined file with annotations and answers
        num_examples: Number of examples to display; must be non-negative
            (None reads every line)

    Returns:
        None. All output goes to stdout.
    """
    from itertools import islice  # local import: used only by this function

    def read_head(path):
        # Stop after num_examples lines instead of parsing the entire file.
        with gzip.open(path, 'rt', encoding='utf-8') as f:
            return [json.loads(line) for line in islice(f, num_examples)]

    print(f"Reading from:\n  Original: {original_file}\n  Combined: {combined_file}\n")

    original = read_head(original_file)
    combined = read_head(combined_file)

    # zip() stops at the shorter list, so unmatched trailing examples are not shown.
    for i, (orig, comb) in enumerate(zip(original, combined)):
        print(f"\nExample {i+1}:")
        print("\nOriginal keys:", list(orig.keys()))
        print("Combined keys:", list(comb.keys()))

        # Keys that exist only in the combined example.
        new_keys = set(comb.keys()) - set(orig.keys())
        print("\nNewly added fields:", new_keys)

        print(f"\nQuestion: {comb['question_text']}")
        print("\nProcessed Answers:")
        print(json.dumps(comb['processed_answers'], indent=2))

        if 'annotation_eval' in comb:
            print("\nAnnotation Eval:")
            print(json.dumps(comb['annotation_eval'], indent=2))

# Example usage:
# Compare shard 00 of the simplified data with its combined counterpart, printing
# up to 100 examples. Note that this rebinds the module-level names original_file
# and combined_file.
original_file = Path(DATA_DIR) / f"simplified-nq-{dataset}-00.jsonl.gz"
combined_file = Path(DATA_DIR) / f"combined-nq-{dataset}-00.jsonl.gz"
inspect_files(original_file, combined_file, num_examples=100)

In [7]:
def read_json(file_path):
    """
    Read a JSON Lines file, or a file holding one JSON document, into a list.

    NOTE: an earlier cell of this notebook also defines read_json; whichever
    definition ran last is the one in effect, so keep the two in sync (or
    drop one of them).

    Files whose name ends in ``.gz`` are opened with ``gzip``; anything else is
    opened as plain text. Both are decoded as UTF-8 and read fully into memory.

    Every non-blank line is parsed as its own JSON value. If a line fails to
    parse before any record has been collected, the whole content is tried
    once as a single JSON document (for example a pretty-printed object or
    array). Lines that still cannot be parsed are skipped silently.

    Args:
        file_path (str or os.PathLike): Path to the JSON or gzipped JSON file

    Returns:
        list: Parsed JSON values. A whole-file document that is not itself a
        list is wrapped in a one-element list, so the result is always a list.
    """
    # Accept pathlib.Path as well as str; the suffix test below needs a string.
    file_path = os.fspath(file_path)

    is_gzipped = file_path.endswith('.gz')
    open_method = gzip.open if is_gzipped else open
    mode = 'rt' if is_gzipped else 'r'

    with open_method(file_path, mode, encoding='utf-8') as file:
        content = file.read()

    parsed_objects = []
    whole_file_tried = False

    for line in content.splitlines():
        stripped = line.strip()
        if not stripped:
            # Blank lines carry no record and must not trigger the fallback.
            continue
        try:
            parsed_objects.append(json.loads(stripped))
        except JSONDecodeError:
            # Once records exist (or the fallback already failed) a bad line is
            # simply dropped; re-parsing the whole content again cannot help.
            if parsed_objects or whole_file_tried:
                continue
            whole_file_tried = True
            try:
                whole = json.loads(content)
            except JSONDecodeError:
                continue
            # Keep the documented "list" contract for single-document files.
            return whole if isinstance(whole, list) else [whole]

    return parsed_objects

# dict_keys(['annotations', 'document_html', 'document_title', 'document_tokens', 'document_url', 'example_id', 'long_answer_candidates', 'question_text', 'question_tokens'])
# Load the two sample files. The names are relative, so they are resolved against
# the current working directory rather than DATA_DIR.
dev_nq = read_json('nq-dev-sample.jsonl.gz')
dev_nq_simplified = read_json('simplified-nq-dev-sample.jsonl.gz')

In [None]:
def analyze_answer_distribution(data_dir, dataset):
    """
    Count combined examples by which answer types they contain and print a summary.

    Reads every ``combined-nq-{dataset}-??.jsonl.gz`` file in ``data_dir`` (sorted
    by name). An example "has" a long or short answer when the corresponding
    list under its 'processed_answers' entry is non-empty.

    Args:
        data_dir: Directory containing the combined files
        dataset: Split name embedded in the file names

    Returns:
        dict: Counters 'total', 'has_long', 'has_short', 'has_both',
        'has_neither', 'only_long' and 'only_short'. When no example was found
        all counters are 0 and every percentage is printed as 0.0%.
    """
    pattern = f"combined-nq-{dataset}-??.jsonl.gz"
    files = sorted(Path(data_dir).glob(pattern))

    counts = {
        'total': 0,
        'has_long': 0,
        'has_short': 0,
        'has_both': 0,
        'has_neither': 0,
        'only_long': 0,
        'only_short': 0
    }

    for file in files:
        print(f"Processing {file}")
        with gzip.open(file, 'rt', encoding='utf-8') as f:
            for line in f:
                answers = json.loads(line)['processed_answers']

                has_long = len(answers['long_answers']) > 0
                has_short = len(answers['short_answers']) > 0

                counts['total'] += 1
                counts['has_long'] += has_long
                counts['has_short'] += has_short
                counts['has_both'] += has_long and has_short
                counts['has_neither'] += not has_long and not has_short
                counts['only_long'] += has_long and not has_short
                counts['only_short'] += has_short and not has_long

    total = counts['total']

    def percentage(count):
        # Guard against an empty data set (no matching files or only empty files).
        return count / total * 100 if total else 0.0

    summary_rows = [
        ("Examples with long answers", 'has_long'),
        ("Examples with short answers", 'has_short'),
        ("Examples with both types", 'has_both'),
        ("Examples with neither type", 'has_neither'),
        ("Examples with only long answers", 'only_long'),
        ("Examples with only short answers", 'only_short'),
    ]

    print("\nAnswer Distribution:")
    print(f"Total examples: {total}")
    for label, key in summary_rows:
        print(f"{label}: {counts[key]} ({percentage(counts[key]):.1f}%)")

    return counts

# Use it
# Tally the combined shards of the configured split; the summary is printed and
# the raw counters are kept in `stats`.
stats = analyze_answer_distribution(DATA_DIR, dataset)

In [3]:
# Example id of the first simplified sample record, used by the lookups in the cells below.
# The notebook never assigns `simplified_dev_nq`; the simplified sample list is
# loaded under the name `dev_nq_simplified`.
e_id_test = dev_nq_simplified[0]['example_id']

In [4]:
# Author's note: scores are zero by default, so the eu.gold_has_X_answer functions
# should still be used for each example_id.
# NOTE(review): the structure returned by read_annotation is defined in eval_utils,
# not in this notebook — confirm there.
dev_nq_eval = eu.read_annotation('nq-dev-sample.jsonl.gz')

In [None]:
# Author's note: each example_id maps to a 5-element list, one entry per annotation,
# holding the byte/token offsets of its short and/or long answers.
dev_nq_eval[e_id_test]

In [None]:
# Ask eval_utils whether the gold annotations of the test example count as having a short answer.
eu.gold_has_short_answer(dev_nq_eval[e_id_test])

In [None]:
# Ask eval_utils whether the gold annotations of the test example count as having a long answer.
eu.gold_has_long_answer(dev_nq_eval[e_id_test])

In [None]:
# Author's notes on the simplified format:
# - each example_id identifies one element of dev_nq
# - a simplified example is a dict with the keys:
#   ['question_text', 'example_id', 'document_url', 'document_text', 'long_answer_candidates', 'annotations']
# - 'long_answer_candidates' is a list of dicts with keys: ['start_token', 'top_level', 'end_token']
# - 'annotations' is a list of dicts with keys: ['annotation_id', 'yes_no_answer', 'long_answer', 'short_answers']
# - iterate over each annotation's 'long_answer' and 'short_answers' to get the start and end tokens
simplified_example = tu.simplify_nq_example(dev_nq[0])  # Simplify first example
simplified_example

In [12]:
def get_simple_answer_text(simplified_example):
    """
    Gather every annotated answer string of one simplified NQ example.

    Annotators do not have to agree: any long or short span marked by any
    annotation is included once, in first-seen order. Yes/no answers other than
    'NONE' are appended once per annotation, without de-duplication.

    Args:
        simplified_example: Output from simplify_nq_example()

    Returns:
        Dictionary with 'example_id', 'question', 'document_title' and the lists
        'long_answers', 'short_answers', 'yes_no_answers'
    """
    # presumably the document's token strings, indexable by the span offsets — confirm in text_utils
    tokens = tu.get_nq_tokens(simplified_example)

    def add_unique(bucket, span):
        # Render the span as space-joined tokens and keep it if it is new.
        text = ' '.join(tokens[span['start_token']:span['end_token']])
        if text not in bucket:
            bucket.append(text)

    result = {
        'example_id': simplified_example['example_id'],
        'question': simplified_example['question_text'],
        'document_title': simplified_example['document_title'],
        'long_answers': [],
        'short_answers': [],
        'yes_no_answers': []
    }

    for annotation in simplified_example['annotations']:
        # start_token == -1 means this annotation has no long answer.
        long_span = annotation['long_answer']
        if not long_span['start_token'] == -1:
            add_unique(result['long_answers'], long_span)

        for short_span in annotation['short_answers']:
            add_unique(result['short_answers'], short_span)

        # Absent key and 'NONE' are both treated as "no yes/no answer".
        if annotation.get('yes_no_answer', 'NONE') != 'NONE':
            result['yes_no_answers'].append(annotation['yes_no_answer'])

    return result

In [None]:
# NOTE(review): no module-level `example` is assigned anywhere in the visible notebook,
# so this cell presumably relies on leftover kernel state — confirm which record
# was intended (the next cell runs the same call on simplified_example).
answer_text_test = get_simple_answer_text(example)
answer_text_test

In [None]:
# Extract and display the answer texts of the simplified first sample example.
answer_texts = get_simple_answer_text(simplified_example)
answer_texts

In [None]:
nq_label_examples = []

for a in simplified_example['annotations']:
    e_id = simplified_example['example_id']
    # Key fixed from 'long_anser' to 'long_answer'; the misspelled key raised KeyError.
    # The first two Span arguments are -1 — presumably "no byte offsets"; confirm
    # against eval_utils.Span.
    # NOTE(review): nothing is appended to nq_label_examples yet, so the list stays empty.
    simple_span = eu.Span(-1, -1, a['long_answer']['start_token'], a['long_answer']['end_token'])
    

In [None]:


# Print the question followed by whichever answer types were found.
# The keys match what get_simple_answer_text returns ('long_answers',
# 'short_answers', 'yes_no_answers'); the previous '*_text' / 'yes_no_answer'
# keys are not part of that dictionary and raised KeyError.
print(f"Question: {answer_texts['question']}")
if answer_texts['long_answers']:
    # Several annotators may have picked different long answers; show each on its own line.
    long_answer_block = '\n'.join(answer_texts['long_answers'])
    print(f"\nLong answer: {long_answer_block}")
if answer_texts['short_answers']:
    print(f"\nShort answers: {', '.join(answer_texts['short_answers'])}")
if answer_texts['yes_no_answers']:
    # 'NONE' values are already filtered out when the list is built.
    print(f"\nYes/No answer: {', '.join(answer_texts['yes_no_answers'])}")

In [None]:
def create_nq_json(examples, output_file='processed_nq_data.json'):
    """
    Convert NQ examples into a JSON file together with validation statistics.

    Each example is passed to ``process_nq_example`` (which must be defined in
    the notebook namespace before this function is called). An example whose
    processing raises is reported on stdout and skipped; it is not counted in
    'total_examples'.

    Args:
        examples: Iterable of examples accepted by process_nq_example.
        output_file: Path of the JSON file to write (UTF-8, indented).

    Returns:
        tuple: (processed_data, validation_stats) — the list of processed
        examples and a dict with 'total_examples', 'examples_with_both_answers',
        'examples_with_valid_containment' and 'problematic_examples'. The same
        two objects are written to ``output_file`` under the keys 'data' and
        'validation_stats'.

    Raises:
        NameError: If process_nq_example is not defined.
    """
    # Resolve the helper once, outside the per-example try block. Otherwise a
    # missing helper would be swallowed by the blanket except for every single
    # example and the function would silently produce an empty result.
    process_example = process_nq_example

    processed_data = []
    problematic_examples = []
    validation_stats = {
        'total_examples': 0,
        'examples_with_both_answers': 0,
        'examples_with_valid_containment': 0,
        'problematic_examples': problematic_examples
    }

    for example in examples:
        try:
            processed = process_example(example)

            validation_stats['total_examples'] += 1
            if processed['has_long_answer'] and processed['has_short_answer']:
                validation_stats['examples_with_both_answers'] += 1
                if processed['validation']['all_short_answers_in_long']:
                    validation_stats['examples_with_valid_containment'] += 1
                else:
                    problematic_examples.append({
                        'example_id': processed['example_id'],
                        'question': processed['question'],
                        'problematic_short_answers': processed['validation']['problematic_short_answers']
                    })

            processed_data.append(processed)

        except Exception as e:
            # Best effort: one bad example must not abort the whole conversion.
            print(f"Error processing example: {str(e)}")
            continue

    # Print validation summary
    print("\nValidation Summary:")
    print(f"Total examples processed: {validation_stats['total_examples']}")
    print(f"Examples with both long and short answers: {validation_stats['examples_with_both_answers']}")
    print(f"Examples with valid containment: {validation_stats['examples_with_valid_containment']}")
    print(f"Number of problematic examples: {len(problematic_examples)}")

    if problematic_examples:
        print("\nSample of problematic examples:")
        for problem in problematic_examples[:5]:  # Show first 5
            print(f"\nQuestion: {problem['question']}")
            print(f"Problematic short answers: {problem['problematic_short_answers']}")

    # Write to JSON file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump({
            'data': processed_data,
            'validation_stats': validation_stats
        }, f, ensure_ascii=False, indent=2)

    return processed_data, validation_stats

# Usage:
# Convert the loaded dev sample and write the result to the default
# 'processed_nq_data.json' in the current working directory.
processed_data, validation_stats = create_nq_json(dev_nq)