In [1]:
from datasets import load_dataset
import os
from abc import ABC, abstractmethod
import pandas as pd
import json
from typing import List, Dict, Any
from collections import Counter
import numpy as np
import os
import json
import random
from abc import ABC, abstractmethod
from typing import List, Dict, Any
from collections import Counter
import pandas as pd
import numpy as np
from datasets import load_from_disk

In [4]:
import os
from datasets import load_dataset

# Create directory for HF datasets
os.makedirs('/kaggle/working/hf_datasets', exist_ok=True)

# 1. Infinite Chats (Conversation/Unverifiable)
print("1/4 Downloading infinite-chats-eval...")
ic = load_dataset("liweijiang/infinite-chats-eval")
ic.save_to_disk('/kaggle/working/hf_datasets/infinite-chats-eval')

# 2. GovReport (Summarization)
print("\n2/4 Downloading govreport-summarization...")
gr = load_dataset("ccdv/govreport-summarization")
gr.save_to_disk('/kaggle/working/hf_datasets/govreport-summarization')

# 3. CNN/DailyMail (Summarization) - replacing XSum
print("\n3/4 Downloading cnn_dailymail...")
cnn = load_dataset("cnn_dailymail", "3.0.0", trust_remote_code=True)
cnn.save_to_disk('/kaggle/working/hf_datasets/cnn_dailymail')

# 4. WritingPrompts (Creative Writing)
print("\n4/4 Downloading writingprompts...")
wp = load_dataset("euclaise/writingprompts")
wp.save_to_disk('/kaggle/working/hf_datasets/writingprompts')

print("\nâœ… All unverifiable domain datasets downloaded!")

1/4 Downloading infinite-chats-eval...


Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]


2/4 Downloading govreport-summarization...


Saving the dataset (0/2 shards):   0%|          | 0/17517 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/973 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/973 [00:00<?, ? examples/s]


4/4 Downloading writingprompts...


README.md:   0%|          | 0.00/837 [00:00<?, ?B/s]

data/train-00000-of-00002-105e07cb0d1994(â€¦):   0%|          | 0.00/272M [00:00<?, ?B/s]

data/train-00001-of-00002-4fdb982c110564(â€¦):   0%|          | 0.00/272M [00:00<?, ?B/s]

data/test-00000-of-00001-16503b0c26ed00c(â€¦):   0%|          | 0.00/30.0M [00:00<?, ?B/s]

data/validation-00000-of-00001-137b93e1e(â€¦):   0%|          | 0.00/30.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/272600 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/15138 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/15620 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/272600 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/15138 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/15620 [00:00<?, ? examples/s]


âœ… All unverifiable domain datasets downloaded!


In [7]:
# ==================== CONFIGURATION ====================

# Output path
OUTPUT_PATH = "/kaggle/working/unverifiable_test_dataset.jsonl"

# Sample sizes per domain
SAMPLES_PER_DOMAIN = 100

# Domains
DOMAINS = [
    "creative_writing",
    "summarization", 
    "conversation",
]

In [8]:
# ==================== UTILITIES ====================

def to_json_serializable(obj: Any) -> Any:
    """Convert numpy/pandas objects to JSON-serializable Python types"""
    if isinstance(obj, (np.ndarray, pd.Series)):
        return obj.tolist()
    elif isinstance(obj, (np.integer, np.floating)):
        return obj.item()
    elif isinstance(obj, dict):
        return {k: to_json_serializable(v) for k, v in obj.items()}
    elif isinstance(obj, (list, tuple)):
        return [to_json_serializable(item) for item in obj]
    elif pd.isna(obj):
        return None
    else:
        return obj

In [11]:
# ==================== BASE ADAPTER ====================

class DatasetAdapter(ABC):
    """Base adapter for all datasets"""
    
    def __init__(self, domain: str, source_name: str):
        self.domain = domain
        self.source_name = source_name
    
    @abstractmethod
    def load(self) -> pd.DataFrame:
        """Load raw data"""
        pass
    
    @abstractmethod
    def transform(self, df: pd.DataFrame) -> List[Dict]:
        """Transform to unified schema"""
        pass
    
    def validate(self, data: List[Dict]) -> List[Dict]:
        """Validate required fields and quality"""
        validated = []
        for item in data:
            # Check required fields
            if not all(k in item for k in ['domain', 'prompt']):
                continue
            
            # Check prompt length (>5 words)
            if len(item['prompt'].split()) < 5:
                continue
            
            validated.append(item)
        
        return validated
    
    def process(self, max_samples: int = None) -> List[Dict]:
        """Full pipeline: load â†’ transform â†’ validate â†’ sample"""
        print(f"\n{'='*50}")
        print(f"Processing {self.source_name}...")
        print(f"{'='*50}")
        
        try:
            # Load
            print("  [1/4] Loading raw data...")
            df = self.load()
            print(f"        Loaded {len(df)} rows")
            
            # Transform
            print("  [2/4] Transforming to unified schema...")
            transformed = self.transform(df)
            print(f"        Transformed {len(transformed)} samples")
            
            # Validate
            print("  [3/4] Validating...")
            validated = self.validate(transformed)
            print(f"        Validated {len(validated)} samples")
            
            # Sample if needed
            if max_samples and len(validated) > max_samples:
                print(f"  [4/4] Sampling {max_samples} from {len(validated)}...")
                validated = random.sample(validated, max_samples)
            
            return validated
            
        except Exception as e:
            print(f"  ERROR: {e}")
            import traceback
            traceback.print_exc()
            return []


# ==================== CONCRETE ADAPTERS ====================

class InfiniteChatsAdapter(DatasetAdapter):
    """Adapter for liweijiang/infinite-chats-eval
    
    Format: Single column with conversation/chat data for unverifiable evaluation.
    We use each sample as a conversational prompt.
    """
    
    def __init__(self):
        super().__init__(domain="conversation", source_name="Infinite-Chats-Eval")
    
    def load(self) -> pd.DataFrame:
        from datasets import load_dataset
        dataset = load_dataset("liweijiang/infinite-chats-eval", split="train")
        return dataset.to_pandas()
    
    def transform(self, df: pd.DataFrame) -> List[Dict]:
        data = []
        
        # Get the first column name (single column dataset)
        col_name = df.columns[0]
        
        for _, row in df.iterrows():
            content = str(row[col_name]).strip()
            
            if len(content) > 20:  # Minimum length
                # Use the content as a conversation prompt
                data.append({
                    "domain": self.domain,
                    "prompt": content[:2000],  # Truncate if too long
                    "answer": "",  # No ground truth for unverifiable
                    "metadata": {"source": self.source_name}
                })
        
        return data


class GovReportAdapter(DatasetAdapter):
    """Adapter for ccdv/govreport-summarization
    
    Format: Has 'report' (full text) and 'summary' columns.
    We use the summary and ask to summarize it further (meta-summarization).
    """
    
    def __init__(self):
        super().__init__(domain="summarization", source_name="GovReport")
    
    def load(self) -> pd.DataFrame:
        from datasets import load_dataset
        dataset = load_dataset("ccdv/govreport-summarization", split="test")
        return dataset.to_pandas()
    
    def transform(self, df: pd.DataFrame) -> List[Dict]:
        data = []
        
        for _, row in df.iterrows():
            summary = row.get('summary', '')
            
            if summary and len(summary) > 100:
                # Truncate very long summaries
                summary_text = str(summary)[:2000]
                
                prompt = f"Summarize the following text in 2-3 sentences:\n\n{summary_text}"
                
                data.append({
                    "domain": self.domain,
                    "prompt": prompt,
                    "answer": "",  # No ground truth for meta-summary
                    "metadata": {"source": self.source_name}
                })
        
        return data


class CNNDailyMailAdapter(DatasetAdapter):
    """Adapter for cnn_dailymail (replacing XSum which has deprecated scripts)
    
    Format: Has 'article' and 'highlights' columns.
    We use the article and ask for a summary.
    """
    
    def __init__(self):
        super().__init__(domain="summarization", source_name="CNN-DailyMail")
    
    def load(self) -> pd.DataFrame:
        from datasets import load_dataset
        dataset = load_dataset("cnn_dailymail", "3.0.0", split="test", trust_remote_code=True)
        return dataset.to_pandas()
    
    def transform(self, df: pd.DataFrame) -> List[Dict]:
        data = []
        
        for _, row in df.iterrows():
            article = row.get('article', '')
            highlights = row.get('highlights', '')
            
            if article and len(article) > 100:
                # Truncate long articles
                article_text = str(article)[:1500]
                
                prompt = f"Provide a brief summary of the following news article:\n\n{article_text}"
                
                data.append({
                    "domain": self.domain,
                    "prompt": prompt,
                    "answer": highlights,  # Original highlights as reference
                    "metadata": {"source": self.source_name}
                })
        
        return data


class WritingPromptsAdapter(DatasetAdapter):
    """Adapter for euclaise/writingprompts
    
    Format: Has prompts for creative story writing.
    """
    
    def __init__(self):
        super().__init__(domain="creative_writing", source_name="WritingPrompts")
    
    def load(self) -> pd.DataFrame:
        from datasets import load_dataset
        dataset = load_dataset("euclaise/writingprompts", split="train")
        return dataset.to_pandas()
    
    def transform(self, df: pd.DataFrame) -> List[Dict]:
        data = []
        
        # Check column names
        prompt_col = 'prompt' if 'prompt' in df.columns else df.columns[0]
        story_col = 'story' if 'story' in df.columns else (df.columns[1] if len(df.columns) > 1 else None)
        
        for _, row in df.iterrows():
            prompt = str(row[prompt_col]).strip()
            
            # Clean common prefixes from writing prompts
            for prefix in ['[WP]', '[SP]', '[EU]', '[CW]', '[RF]', '[TT]']:
                prompt = prompt.replace(prefix, '').strip()
            
            if len(prompt) > 20:
                full_prompt = f"Write a short story based on this prompt:\n\n{prompt}"
                
                data.append({
                    "domain": self.domain,
                    "prompt": full_prompt,
                    "answer": str(row[story_col])[:500] if story_col else "",  # Sample of story if available
                    "metadata": {"source": self.source_name}
                })
        
        return data


# ==================== DATASET MIXER ====================

class UnverifiableDomainMixer:
    """Combines all adapters into unverifiable domain test dataset"""
    
    def __init__(self):
        self.adapters = [
            InfiniteChatsAdapter(),
            GovReportAdapter(),
            CNNDailyMailAdapter(),
            WritingPromptsAdapter(),
        ]
    
    def create_dataset(self, samples_per_domain: int = 100) -> List[Dict]:
        """Create balanced unverifiable domain dataset"""
        print("\n" + "="*60)
        print("CREATING UNVERIFIABLE DOMAINS DATASET")
        print("="*60)
        
        all_data = []
        
        # Process each adapter
        for adapter in self.adapters:
            data = adapter.process(max_samples=samples_per_domain)
            all_data.extend(data)
        
        # Shuffle
        random.seed(42)
        random.shuffle(all_data)
        
        return all_data
    
    def save_dataset(self, data: List[Dict], output_path: str):
        """Save dataset as JSONL"""
        with open(output_path, 'w') as f:
            for item in data:
                serializable_item = to_json_serializable(item)
                f.write(json.dumps(serializable_item) + '\n')
        
        print(f"\n{'='*60}")
        print(f"SAVED UNVERIFIABLE DOMAINS DATASET")
        print(f"{'='*60}")
        print(f"Location: {output_path}")
        print(f"Total samples: {len(data)}")
        
        # Print domain distribution
        domain_counts = Counter(item['domain'] for item in data)
        print(f"\nðŸ“Š Domain Distribution:")
        for domain in sorted(domain_counts.keys()):
            count = domain_counts[domain]
            pct = (count / len(data)) * 100
            print(f"  {domain:20s}: {count:5d} ({pct:5.1f}%)")


In [12]:
print("="*60)
print("UNVERIFIABLE DOMAINS DATASET CURATION")
print("="*60)

# Create mixer
mixer = UnverifiableDomainMixer()

# Create dataset
data = mixer.create_dataset(samples_per_domain=SAMPLES_PER_DOMAIN)

# Save
mixer.save_dataset(data, OUTPUT_PATH)

# Save sample for inspection
sample_path = OUTPUT_PATH.replace('.jsonl', '_sample.jsonl')
with open(sample_path, 'w') as f:
    for item in data[:20]:
        f.write(json.dumps(to_json_serializable(item)) + '\n')

print(f"\nSaved 20 samples to {sample_path}")
print("\nâœ… UNVERIFIABLE DOMAINS DATASET CREATION COMPLETE!")

UNVERIFIABLE DOMAINS DATASET CURATION

CREATING UNVERIFIABLE DOMAINS DATASET

Processing Infinite-Chats-Eval...
  [1/4] Loading raw data...
        Loaded 100 rows
  [2/4] Transforming to unified schema...
        Transformed 100 samples
  [3/4] Validating...
        Validated 100 samples

Processing GovReport...
  [1/4] Loading raw data...


`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'cnn_dailymail' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.


        Loaded 973 rows
  [2/4] Transforming to unified schema...
        Transformed 973 samples
  [3/4] Validating...
        Validated 973 samples
  [4/4] Sampling 100 from 973...

Processing CNN-DailyMail...
  [1/4] Loading raw data...
        Loaded 11490 rows
  [2/4] Transforming to unified schema...
        Transformed 11490 samples
  [3/4] Validating...
        Validated 11490 samples
  [4/4] Sampling 100 from 11490...

Processing WritingPrompts...
  [1/4] Loading raw data...
        Loaded 272600 rows
  [2/4] Transforming to unified schema...
        Transformed 268807 samples
  [3/4] Validating...
        Validated 268807 samples
  [4/4] Sampling 100 from 268807...

SAVED UNVERIFIABLE DOMAINS DATASET
Location: /kaggle/working/unverifiable_test_dataset.jsonl
Total samples: 400

ðŸ“Š Domain Distribution:
  conversation        :   100 ( 25.0%)
  creative_writing    :   100 ( 25.0%)
  summarization       :   200 ( 50.0%)

Saved 20 samples to /kaggle/working/unverifiable_test_data