In [1]:
import base64, gzip, json, os, re, sys
from json.decoder import JSONDecodeError

# Split to process. The notebook can be re-run on the training dataset by
# changing this value.
dataset = "dev"
# Directory that holds the dataset files: ~/data/chroniclingamericaQA
DATA_DIR = os.path.expanduser(os.path.join("~", "data", "chroniclingamericaQA"))

In [30]:
def load_ca_data(dataset):
    """Load one split of the dataset stored under ``DATA_DIR`` as a list of records.

    The file is looked up as ``<DATA_DIR>/<dataset>.json``. If that file does
    not exist but a gzip-compressed ``<dataset>.json.gz`` sits next to it, the
    compressed file is read instead.

    Two on-disk layouts are accepted:

    * a single JSON document (compact or pretty-printed), usually an array;
    * JSON Lines, i.e. one JSON value per line.

    Args:
        dataset: Split name used as the file stem (e.g. ``"dev"``).

    Returns:
        list: The parsed records. A top-level JSON array is returned as-is,
        any other single JSON value is wrapped in a one-element list, and an
        empty list is returned when nothing could be parsed.

    Raises:
        FileNotFoundError: If neither the plain nor the gzip file exists.
    """
    file_path = os.path.join(DATA_DIR, f"{dataset}.json")
    # The plain path never ends in ".gz", so compressed data is only reachable
    # through this explicit fallback.
    if not os.path.exists(file_path) and os.path.exists(file_path + ".gz"):
        file_path += ".gz"

    open_method = gzip.open if file_path.endswith(".gz") else open
    with open_method(file_path, "rt", encoding="utf-8") as file:
        content = file.read()

    # Try the whole file as one JSON document first. Parsing line by line
    # first would turn a single-line array into a list nested in a list.
    try:
        document = json.loads(content)
    except JSONDecodeError:
        pass
    else:
        return document if isinstance(document, list) else [document]

    # Otherwise treat the file as JSON Lines. Blank or malformed lines are
    # skipped on purpose so one bad row does not discard the whole split.
    records = []
    for line in content.splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            records.append(json.loads(line))
        except JSONDecodeError:
            continue
    return records

In [31]:

def load_qa_dataset(dataset_name: str, dataset_type: str):
    """
    Dispatch to the loader registered for a QA dataset.

    Args:
        dataset_name: One of 'hotpot', 'chroniclingamerica' or
            'naturalquestions'; matched case-insensitively.
        dataset_type: Passed unchanged to the selected loader
            (the notebook passes a split name such as "dev").

    Returns:
        Whatever the selected loader returns.

    Raises:
        ValueError: If the lower-cased name is not one of the known datasets.
    """
    known_names = ('hotpot', 'chroniclingamerica', 'naturalquestions')

    dataset_name = dataset_name.lower()  # normalize input

    # Each loader is referenced only inside its own branch, so a loader that
    # has not been defined yet is only a problem when its dataset is requested.
    if dataset_name == 'hotpot':
        return load_hp_data(dataset_type)
    if dataset_name == 'chroniclingamerica':
        return load_ca_data(dataset_type)
    if dataset_name == 'naturalquestions':
        return load_nq_data(dataset_type)

    raise ValueError(f"Unknown dataset: {dataset_name}. Available datasets: {list(known_names)}")

In [32]:


# Load the raw (not yet standardized) examples of the configured split.
dev_data = load_qa_dataset(dataset_name="chroniclingamerica", dataset_type=dataset)

In [54]:
def standardize_keys(example, dataset_name):
    """
    Map one raw QA example onto the notebook's common key names.

    Args:
        example (dict): A raw example; for 'chroniclingamerica' it must
            provide 'query_id', 'question', 'answer' and 'context'.
        dataset_name (str): Name of the source dataset. Only
            'chroniclingamerica' is supported.

    Returns:
        dict: A new dict with keys 'x_id', 'question', 'answer', 'context'.

    Raises:
        ValueError: If dataset_name is not supported.
        KeyError: If the example lacks one of the required keys.
    """
    if dataset_name != 'chroniclingamerica':
        raise ValueError(f"Unsupported dataset type: {dataset_name}")

    # standardized key -> key in the raw example
    source_key_for = {
        'x_id': 'query_id',
        'question': 'question',
        'answer': 'answer',
        'context': 'context',
    }
    return {target: example[source] for target, source in source_key_for.items()}

In [None]:
def process_dataset(dataset_name: str, dataset_type: str):
    """
    Load a dataset and return its examples with standardized keys.

    Args:
        dataset_name: Dataset identifier, forwarded to load_qa_dataset and
            standardize_keys.
        dataset_type: Forwarded to load_qa_dataset (the notebook passes a
            split name such as "dev").

    Returns:
        list: One standardized example per loaded example; an empty list
        when the loader returns None.

    Raises:
        Exception: Any error from loading or standardizing is printed and
        then re-raised unchanged.
    """
    try:
        loaded = load_qa_dataset(dataset_name, dataset_type)

        if loaded is None:
            print("Warning: load_qa_dataset returned None")
            return []

        # Progress messages are printed only when the loader returned a list;
        # any other iterable is standardized silently.
        report = isinstance(loaded, list)
        if report and len(loaded) > 0:
            print(f"First item keys: {loaded[0].keys()}")

        standardized = []
        for item in loaded:
            standardized.append(standardize_keys(item, dataset_name))

        if report:
            print(f"Standardized {len(standardized)} examples")
        return standardized

    except Exception as exc:
        print(f"Error in process_dataset: {exc}")
        raise

# Smoke test: run the full pipeline on the split configured in `dataset`
# rather than a hard-coded 'dev', so the check follows along when the
# notebook is repeated on another split.
result = process_dataset('chroniclingamerica', dataset)
print(f"Final result length: {len(result)}")

In [None]:
# Run the load-and-standardize pipeline for the configured split and keep
# the standardized examples in `test`.
test = process_dataset(dataset_name="chroniclingamerica", dataset_type=dataset)