# Data Preparation
- Join multiple dataset resources into a single dataset.
- The new dataset will be used to fine-tune the Gemma 3 1B model.

## Import

In [None]:
from datasets import load_dataset
import os
from abc import ABC, abstractmethod
import pandas as pd
import json
from typing import List, Dict, Any
from collections import Counter
import numpy as np

## Data Download From HuggingFace

In [None]:
# Download the four HuggingFace datasets and store each one under
# /kaggle/working/hf_datasets so the adapters can read them back with
# load_from_disk().
os.makedirs('/kaggle/working/hf_datasets', exist_ok=True)

print("Downloading HuggingFace datasets...")

# 1. WritingPrompts
print("\n1/4 WritingPrompts...")
wp = load_dataset("euclaise/writingprompts")
wp.save_to_disk('/kaggle/working/hf_datasets/writingprompts')

# 2. StrategyQA (primary repo first, mirror as fallback)
print("\n2/4 StrategyQA...")
try:
    # NOTE(review): trust_remote_code=True lets the dataset repository run
    # its own loading script - confirm the repository is trusted.
    sqa = load_dataset("wics/strategy-qa", trust_remote_code=True)
except Exception as err:
    # Catch Exception rather than using a bare `except:` so that a kernel
    # interrupt (KeyboardInterrupt) or SystemExit is not swallowed.
    print(f"  wics/strategy-qa failed ({err}); falling back to metaeval/strategy-qa")
    sqa = load_dataset("metaeval/strategy-qa")
sqa.save_to_disk('/kaggle/working/hf_datasets/strategy-qa')

# 3. No Robots
print("\n3/4 No Robots...")
nr = load_dataset("HuggingFaceH4/no_robots")
nr.save_to_disk('/kaggle/working/hf_datasets/no_robots')

# 4. Dolly-15k
print("\n4/4 Dolly-15k...")
dolly = load_dataset("databricks/databricks-dolly-15k")
dolly.save_to_disk('/kaggle/working/hf_datasets/dolly-15k')

print("\nAll HuggingFace datasets downloaded!")

## SETUP

In [None]:
# Central configuration: one entry per source dataset, keyed by a short name.
# Each adapter class reads its own entry ("path"/"paths", "domain" and, where
# present, "max_samples" or "filter_category").
# NOTE(review): the "format" values are not read by any adapter in this
# notebook - they appear to be descriptive only.
DATASET_CONFIG = {
    # These two values match the dataset id written to dataset-metadata.json
    # later in the notebook; no adapter reads them.
    "username": "fissalalsharef",
    "dataset_slug": "temporal-flux-calibration-v2",
    
    # Kaggle datasets (accessible via /kaggle/input/)
    "gsm8k": {
        "path": "/kaggle/input/grade-school-math-8k-q-a/main_train.csv",
        "format": "csv",
        "domain": "math",
    },
    "mbpp": {
        "path": "/kaggle/input/mbppjsonl/mbpp.jsonl",
        "format": "jsonl",
        "domain": "coding",
    },
    "sciq": {
        # All three splits are concatenated by SciQAdapter.load()
        "paths": {
            "train": "/kaggle/input/sciq-a-dataset-for-science-question-answering/train.csv",
            "validation": "/kaggle/input/sciq-a-dataset-for-science-question-answering/validation.csv",
            "test": "/kaggle/input/sciq-a-dataset-for-science-question-answering/test.csv",
        },
        "format": "csv",
        "domain": "science",
    },
    "cnn_dailymail": {
        # All three splits are concatenated by CNNDMAdapter.load()
        "paths": {
            "train": "/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/train.csv",
            "validation": "/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/validation.csv",
            "test": "/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/test.csv",
        },
        "format": "csv",
        "domain": "summarization",
        "max_samples": 10000,  # Cap: the merged splits are randomly down-sampled to this many rows
    },
    
    # HuggingFace datasets (downloaded to /kaggle/working/hf_datasets/)
    "strategy_qa": {
        "path": "/kaggle/working/hf_datasets/strategy-qa",
        "format": "hf_dataset",
        "domain": "logic",
    },
    "writing_prompts": {
        "path": "/kaggle/working/hf_datasets/writingprompts",
        "format": "hf_dataset",
        "domain": "creative_writing",
    },
    "dolly": {
        "path": "/kaggle/working/hf_datasets/dolly-15k",
        "format": "hf_dataset",
        "domain": "creative_ideation",
        # Only rows whose 'category' column equals this value are kept
        "filter_category": "brainstorming",
    },
    "no_robots": {
        "path": "/kaggle/working/hf_datasets/no_robots",
        "format": "hf_dataset",
        "domain": "creative_ideation",
        # Only rows whose 'category' column equals this value are kept
        "filter_category": "Brainstorm",
    },
}

# Unified schema: the four keys every adapter's transform() emits for each
# sample, with a human-readable description of each key.
# NOTE(review): this dict is not read by the code in this notebook; it
# serves as documentation of the record layout.
UNIFIED_SCHEMA = {
    "domain": "Domain category",
    "prompt": "The task/question",
    "answer": "Ground truth response",
    "metadata": "Optional source-specific data"
}

# Domain list: the distinct "domain" values used in DATASET_CONFIG.
# NOTE(review): not referenced by the code in this notebook.
DOMAINS = [
    "math",
    "coding", 
    "science",
    "summarization",
    "logic",
    "creative_writing",
    "creative_ideation"
]

In [None]:
def to_json_serializable(obj: Any) -> Any:
    """Recursively convert numpy/pandas objects to JSON-serializable Python types.

    Args:
        obj: Any value found in a sample: a scalar, dict, list/tuple,
            numpy array, pandas Series, or a nesting of these.

    Returns:
        An equivalent structure built from plain Python types. Arrays,
        Series and tuples become lists, numpy scalars become Python
        scalars, and missing values (None, NaN, NaT, pd.NA) become None.
        Objects of any other type are returned unchanged.
    """
    if isinstance(obj, dict):
        return {
            # json.dumps rejects numpy scalars as keys, so unbox them too.
            (key.item() if isinstance(key, np.generic) else key): to_json_serializable(value)
            for key, value in obj.items()
        }
    if isinstance(obj, (np.ndarray, pd.Series)):
        # tolist() leaves the elements of object arrays untouched (nested
        # arrays, dicts, NaN), so the result is converted recursively.
        return to_json_serializable(obj.tolist())
    if isinstance(obj, (list, tuple)):
        return [to_json_serializable(item) for item in obj]
    if isinstance(obj, np.generic):
        # Covers np.bool_, np.str_ and friends, not only integer/floating.
        obj = obj.item()
    # pd.isna returns an array for array-likes, so only test real scalars.
    if pd.api.types.is_scalar(obj) and pd.isna(obj):
        return None
    return obj

## Base Class Adapter

In [None]:
class DatasetAdapter(ABC):
    """Base adapter for all datasets.

    Subclasses implement load() and transform(); process() runs the full
    load -> transform -> validate pipeline and returns the kept samples.
    """

    def __init__(self, domain: str, source_name: str):
        """Store the domain label and the human-readable source name."""
        self.domain = domain
        self.source_name = source_name

    @abstractmethod
    def load(self) -> pd.DataFrame:
        """Load ALL raw data (merge all splits if needed)"""
        pass

    @abstractmethod
    def transform(self, df: pd.DataFrame) -> List[Dict]:
        """Transform to unified schema"""
        pass

    def validate(self, data: List[Dict]) -> List[Dict]:
        """Keep only the samples that have usable prompt and answer text.

        A sample is dropped when it lacks one of the 'domain', 'prompt' or
        'answer' keys, when the prompt or answer is not a string (e.g. NaN
        coming from an empty CSV cell), when the prompt has fewer than
        5 whitespace-separated words, or when the answer is blank.

        Args:
            data: Samples produced by transform().

        Returns:
            The samples that passed every check, in their original order.
        """
        validated = []
        for item in data:
            # Check required fields
            if not all(k in item for k in ['domain', 'prompt', 'answer']):
                continue

            prompt = item['prompt']
            answer = item['answer']

            # Non-string values (NaN/None) would crash .split()/.strip() and
            # make the caller discard the whole source, so skip them here.
            if not isinstance(prompt, str) or not isinstance(answer, str):
                continue

            # Check prompt length (at least 5 words)
            if len(prompt.split()) < 5:
                continue

            # Check answer not empty
            if not answer.strip():
                continue

            validated.append(item)

        return validated

    def process(self) -> List[Dict]:
        """Full pipeline: load → transform → validate"""
        print(f"\n{'='*50}")
        print(f"Processing {self.source_name}...")
        print(f"{'='*50}")

        # Load
        print("  [1/3] Loading raw data...")
        df = self.load()
        print(f"        Loaded {len(df)} rows")

        # Transform
        print("  [2/3] Transforming to unified schema...")
        transformed = self.transform(df)
        print(f"        Transformed {len(transformed)} samples")

        # Validate
        print("  [3/3] Validating...")
        validated = self.validate(transformed)
        print(f"        Validated {len(validated)} samples")

        return validated

In [None]:
class GSM8KAdapter(DatasetAdapter):
    """Maps the GSM8K math CSV (Kaggle input) onto the unified schema."""

    def __init__(self):
        cfg = DATASET_CONFIG["gsm8k"]
        super().__init__(domain=cfg["domain"], source_name="GSM8K")
        self.path = cfg["path"]

    def load(self) -> pd.DataFrame:
        """Read the single GSM8K CSV file; there are no splits to merge."""
        return pd.read_csv(self.path)

    def transform(self, df: pd.DataFrame) -> List[Dict]:
        """Build one unified-schema sample per row.

        The 'question' column becomes the prompt and the 'answer' column,
        converted to a string, becomes the answer.
        """
        return [
            {
                "domain": self.domain,
                "prompt": record['question'],
                "answer": str(record['answer']),
                "metadata": {
                    "source": self.source_name,
                },
            }
            for _, record in df.iterrows()
        ]


# ============================================================
# CONCRETE ADAPTER: MBPP (Coding)
# ============================================================

class MBPPAdapter(DatasetAdapter):
    """Adapter for MBPP coding dataset from Kaggle"""

    def __init__(self):
        config = DATASET_CONFIG["mbpp"]
        super().__init__(domain=config["domain"], source_name="MBPP")
        self.path = config["path"]

    def load(self) -> pd.DataFrame:
        """Load the MBPP JSONL file, one JSON object per non-blank line.

        The file is opened as UTF-8 explicitly so the result does not depend
        on the platform's default encoding. Blank lines (such as a trailing
        newline at the end of the file) are skipped instead of crashing
        json.loads.
        """
        with open(self.path, encoding="utf-8") as f:
            records = [json.loads(line) for line in f if line.strip()]
        return pd.DataFrame(records)

    def transform(self, df: pd.DataFrame) -> List[Dict]:
        """Transform MBPP to unified schema.

        'text' becomes the prompt and 'code' the answer; the optional
        'test_list' column is kept in the metadata as 'test_cases'.
        """
        return [
            {
                "domain": self.domain,
                "prompt": row['text'],
                "answer": row['code'],
                "metadata": {
                    "source": self.source_name,
                    "test_cases": row.get('test_list', []),
                },
            }
            for _, row in df.iterrows()
        ]


# ============================================================
# CONCRETE ADAPTER: SciQ (Science)
# ============================================================

class SciQAdapter(DatasetAdapter):
    """Maps the SciQ science question CSVs (Kaggle input) onto the unified schema."""

    def __init__(self):
        cfg = DATASET_CONFIG["sciq"]
        super().__init__(domain=cfg["domain"], source_name="SciQ")
        self.paths = cfg["paths"]

    def load(self) -> pd.DataFrame:
        """Read the train, validation and test CSVs and stack them in that order."""
        frames = [
            pd.read_csv(self.paths[split])
            for split in ("train", "validation", "test")
        ]
        return pd.concat(frames, ignore_index=True)

    def transform(self, df: pd.DataFrame) -> List[Dict]:
        """Build one unified-schema sample per row.

        'question' is the prompt and 'correct_answer' the answer; the
        optional 'support' column is carried along in the metadata.
        """
        return [
            {
                "domain": self.domain,
                "prompt": record['question'],
                "answer": record['correct_answer'],
                "metadata": {
                    "source": self.source_name,
                    "support": record.get('support', ''),
                },
            }
            for _, record in df.iterrows()
        ]


# ============================================================
# CONCRETE ADAPTER: CNN/DailyMail (Summarization)
# ============================================================

class CNNDMAdapter(DatasetAdapter):
    """Maps the CNN/DailyMail summarization CSVs (Kaggle input) onto the unified schema."""

    def __init__(self):
        cfg = DATASET_CONFIG["cnn_dailymail"]
        super().__init__(domain=cfg["domain"], source_name="CNN/DailyMail")
        self.paths = cfg["paths"]
        self.max_samples = cfg.get("max_samples", None)

    def load(self) -> pd.DataFrame:
        """Stack the three splits and, when a cap is set, down-sample to it.

        Sampling uses random_state=42 so the selected rows are reproducible.
        """
        frames = [
            pd.read_csv(self.paths[split])
            for split in ("train", "validation", "test")
        ]
        combined = pd.concat(frames, ignore_index=True)

        # No cap configured, or already small enough: return everything.
        if not self.max_samples or len(combined) <= self.max_samples:
            return combined

        return combined.sample(self.max_samples, random_state=42)

    def transform(self, df: pd.DataFrame) -> List[Dict]:
        """Build one unified-schema sample per row.

        The prompt is the 'article' text prefixed with a summarization
        instruction; the answer is the 'highlights' column.
        """
        return [
            {
                "domain": self.domain,
                "prompt": f"Summarize this article:\n\n{record['article']}",
                "answer": record['highlights'],
                "metadata": {
                    "source": self.source_name,
                },
            }
            for _, record in df.iterrows()
        ]


# ============================================================
# CONCRETE ADAPTER: StrategyQA (Logic)
# ============================================================

class StrategyQAAdapter(DatasetAdapter):
    """Adapter for StrategyQA logic dataset from HuggingFace"""

    def __init__(self):
        config = DATASET_CONFIG["strategy_qa"]
        super().__init__(domain=config["domain"], source_name="StrategyQA")
        self.path = config["path"]

    def load(self) -> pd.DataFrame:
        """Load StrategyQA from disk, merging all splits when there are several."""
        from datasets import load_from_disk
        dataset = load_from_disk(self.path)
        # An object exposing keys() is treated as a collection of splits.
        if hasattr(dataset, 'keys'):
            all_data = [dataset[split].to_pandas() for split in dataset.keys()]
            return pd.concat(all_data, ignore_index=True)
        return dataset.to_pandas()

    @staticmethod
    def _to_yes_no(value: Any):
        """Map a raw answer value to "Yes"/"No", or None when it cannot be interpreted.

        Plain truthiness is not enough here: the strings "False" and "no",
        as well as NaN, are all truthy and would be labelled "Yes".
        """
        if isinstance(value, (bool, np.bool_)):
            return "Yes" if value else "No"
        if isinstance(value, str):
            token = value.strip().lower()
            if token in ("true", "yes", "1"):
                return "Yes"
            if token in ("false", "no", "0"):
                return "No"
            return None
        if isinstance(value, (int, float, np.integer, np.floating)):
            if pd.isna(value):
                return None
            return "Yes" if value else "No"
        return None

    def transform(self, df: pd.DataFrame) -> List[Dict]:
        """Transform StrategyQA to unified schema.

        'question' is the prompt and the normalized 'answer' ("Yes"/"No") is
        the answer. Rows whose answer is missing or unrecognized are skipped
        rather than silently labelled, and the number skipped is printed.
        """
        data = []
        skipped = 0
        for _, row in df.iterrows():
            answer = self._to_yes_no(row.get('answer'))
            if answer is None:
                skipped += 1
                continue

            data.append({
                "domain": self.domain,
                "prompt": row['question'],
                "answer": answer,
                "metadata": {
                    "source": self.source_name,
                    "decomposition": row.get('decomposition', []),
                    "facts": row.get('facts', []),
                }
            })
        if skipped:
            print(f"        Skipped {skipped} rows with missing/unrecognized answers")
        return data


# ============================================================
# CONCRETE ADAPTER: WritingPrompts (Creative Writing)
# ============================================================

class WritingPromptsAdapter(DatasetAdapter):
    """Adapter for WritingPrompts creative writing dataset from HuggingFace"""

    # Bracketed tags removed from the prompt text.
    _PROMPT_TAGS = ('[WP]', '[TT]', '[FF]', '[EU]', '[PI]')

    def __init__(self):
        config = DATASET_CONFIG["writing_prompts"]
        super().__init__(domain=config["domain"], source_name="WritingPrompts")
        self.path = config["path"]
        # Read the cap from the config like CNNDMAdapter does; it defaults
        # to 6000 when the entry has no "max_samples" key.
        self.max_samples = config.get("max_samples", 6000)

    def load(self) -> pd.DataFrame:
        """Load the 'train' split from disk and down-sample it to the cap.

        Sampling uses random_state=42 so the selected rows are reproducible.
        """
        from datasets import load_from_disk
        dataset = load_from_disk(self.path)
        # An object exposing keys() is treated as a collection of splits;
        # only its 'train' split is used.
        if hasattr(dataset, 'keys'):
            df = dataset['train'].to_pandas()
        else:
            df = dataset.to_pandas()

        if self.max_samples and len(df) > self.max_samples:
            df = df.sample(self.max_samples, random_state=42)

        return df

    def transform(self, df: pd.DataFrame) -> List[Dict]:
        """Transform WritingPrompts to unified schema.

        The 'prompt' column, with the bracketed tags removed and surrounding
        whitespace stripped, is the prompt; the 'story' column is the answer.
        """
        data = []
        for _, row in df.iterrows():
            # Remove [WP], [TT], etc tags from prompt
            prompt = row['prompt']
            for tag in self._PROMPT_TAGS:
                prompt = prompt.replace(tag, '').strip()

            data.append({
                "domain": self.domain,
                "prompt": prompt,
                "answer": row['story'],
                "metadata": {
                    "source": self.source_name,
                }
            })
        return data


# ============================================================
# CONCRETE ADAPTER: Dolly (Creative Ideation)
# ============================================================

class DollyAdapter(DatasetAdapter):
    """Maps the Dolly-15k rows of the configured category onto the unified schema."""

    def __init__(self):
        cfg = DATASET_CONFIG["dolly"]
        super().__init__(domain=cfg["domain"], source_name="Dolly-15k")
        self.path = cfg["path"]
        self.filter_category = cfg.get("filter_category")

    def load(self) -> pd.DataFrame:
        """Read Dolly from disk and keep only rows of the configured category."""
        from datasets import load_from_disk
        dataset = load_from_disk(self.path)

        # An object exposing keys() is treated as a collection of splits;
        # only its 'train' split is used.
        if hasattr(dataset, 'keys'):
            frame = dataset['train'].to_pandas()
        else:
            frame = dataset.to_pandas()

        # The filter applies only when a category is configured and the
        # frame actually has a 'category' column.
        if self.filter_category and 'category' in frame.columns:
            wanted = frame['category'] == self.filter_category
            frame = frame[wanted]

        return frame

    def transform(self, df: pd.DataFrame) -> List[Dict]:
        """Build one unified-schema sample per row.

        'instruction' is the prompt and 'response' the answer; 'category'
        and 'context' are carried along in the metadata when present.
        """
        return [
            {
                "domain": self.domain,
                "prompt": record['instruction'],
                "answer": record['response'],
                "metadata": {
                    "source": self.source_name,
                    "category": record.get('category', ''),
                    "context": record.get('context', ''),
                },
            }
            for _, record in df.iterrows()
        ]


# ============================================================
# CONCRETE ADAPTER: No Robots (Creative Ideation)
# ============================================================

class NoRobotsAdapter(DatasetAdapter):
    """Maps the No Robots rows of the configured category onto the unified schema."""

    def __init__(self):
        cfg = DATASET_CONFIG["no_robots"]
        super().__init__(domain=cfg["domain"], source_name="NoRobots")
        self.path = cfg["path"]
        self.filter_category = cfg.get("filter_category")

    def load(self) -> pd.DataFrame:
        """Read every split from disk, stack them, and filter by category."""
        from datasets import load_from_disk
        dataset = load_from_disk(self.path)

        # Concatenate all available splits into one frame.
        frames = [dataset[split].to_pandas() for split in dataset.keys()]
        frame = pd.concat(frames, ignore_index=True)

        # The filter applies only when a category is configured and the
        # frame actually has a 'category' column.
        if self.filter_category and 'category' in frame.columns:
            wanted = frame['category'] == self.filter_category
            frame = frame[wanted]

        return frame

    def transform(self, df: pd.DataFrame) -> List[Dict]:
        """Build one unified-schema sample per row.

        The prompt is the 'prompt' column as a string. The answer is the
        content of the first message whose role is 'assistant', or an empty
        string when there is none.
        """
        samples = []
        for _, record in df.iterrows():
            turns = record.get('messages', [])

            # to_pandas() may hand back a numpy array; normalize to a list.
            if hasattr(turns, 'tolist'):
                turns = turns.tolist()

            # Anything that is not a list/tuple contributes no answer.
            if not isinstance(turns, (list, tuple)):
                turns = ()

            reply = next(
                (
                    str(turn.get('content', ''))
                    for turn in turns
                    if isinstance(turn, dict) and turn.get('role') == 'assistant'
                ),
                "",
            )

            samples.append({
                "domain": self.domain,
                "prompt": str(record['prompt']),
                "answer": reply,
                "metadata": {
                    "source": self.source_name,
                    "category": str(record.get('category', '')),
                },
            })
        return samples


## Data Mixer

In [None]:
class DatasetMixer:
    """Combines all dataset adapters into full pool"""

    def __init__(self):
        """Instantiate one adapter per source dataset."""
        self.adapters = [
            GSM8KAdapter(),
            MBPPAdapter(),
            SciQAdapter(),
            CNNDMAdapter(),
            StrategyQAAdapter(),
            WritingPromptsAdapter(),
            DollyAdapter(),
            NoRobotsAdapter(),
        ]

    def create_full_pool(self) -> List[Dict]:
        """Process all adapters and combine their samples into one shuffled pool.

        An adapter that raises is reported and skipped (best effort), so the
        pool may lack that source.

        Returns:
            All validated samples from the adapters that succeeded, shuffled
            deterministically with seed 42.
        """
        import random

        print("\n" + "="*60)
        print("CREATING FULL DATASET POOL")
        print("="*60)

        all_data = []

        for adapter in self.adapters:
            try:
                data = adapter.process()
            except Exception as e:
                print(f"\nERROR processing {adapter.source_name}: {e}")
                continue
            all_data.extend(data)

        # A private generator gives the same permutation as seeding the
        # module-level one with 42, without resetting the global random state.
        random.Random(42).shuffle(all_data)

        return all_data

    def save_pool(self, data: List[Dict], output_path: str):
        """Write the pool as JSONL and print a per-domain summary.

        Args:
            data: Samples to write, one JSON object per line.
            output_path: Destination file; overwritten if it exists.
        """
        with open(output_path, 'w', encoding='utf-8') as f:
            for item in data:
                # Convert numpy/pandas objects to JSON-serializable types
                serializable_item = to_json_serializable(item)
                f.write(json.dumps(serializable_item) + '\n')

        print(f"\n{'='*60}")
        print("SAVED POOL")
        print(f"{'='*60}")
        print(f"Location: {output_path}")
        print(f"Total samples: {len(data)}")

        # Print domain distribution (the loop is empty when data is empty,
        # so the division below never sees a zero total).
        domain_counts = Counter(item['domain'] for item in data)
        print("\n📊 Domain Distribution:")
        for domain in sorted(domain_counts):
            count = domain_counts[domain]
            pct = (count / len(data)) * 100
            print(f"  {domain:20s}: {count:5d} ({pct:5.1f}%)")

## Join Datasets

In [None]:
print("TESTING FULL DATASET PIPELINE - ALL 8 ADAPTERS")

# Build the mixer (this instantiates every adapter), then run each adapter
# and collect the shuffled, validated samples. Both names are reused by the
# cells that follow.
mixer = DatasetMixer()
full_pool = mixer.create_full_pool()

## Save Dataset

In [None]:
# Save the full pool as JSONL (save_pool also prints the domain distribution)
output_path = '/kaggle/working/full_dataset_pool.jsonl'
mixer.save_pool(full_pool, output_path)

# Save a small sample (at most the first 20 items) for manual inspection
sample_path = '/kaggle/working/sample_pool.jsonl'
sample_output = full_pool[:20]
with open(sample_path, 'w', encoding='utf-8') as f:
    for item in sample_output:
        serializable_item = to_json_serializable(item)
        f.write(json.dumps(serializable_item) + '\n')

# Report the real sample size: the pool may hold fewer than 20 items.
print(f"\nSaved {len(sample_output)} samples to {sample_path}")
print("\nPIPELINE COMPLETE!")

## Testing & Validation

In [None]:
import json

# Write the dataset descriptor (title, id, license) as indented JSON.
# NOTE(review): presumably consumed by the Kaggle dataset upload tooling -
# confirm before relying on the exact field names.
metadata_path = '/kaggle/working/dataset-metadata.json'
metadata = dict(
    title="Temporal Flux Calibration Dataset v2",
    id="fissalalsharef/temporal-flux-calibration-v2",
    licenses=[{"name": "CC0-1.0"}],
)
with open(metadata_path, 'w') as f:
    f.write(json.dumps(metadata, indent=2))
print("Metadata file created")

In [None]:
# Find the truncated question in the pool, if any.
# No DataFrame named `df` exists at notebook scope, so build one from the
# pool. Fixing the column list keeps the lookup valid for an empty pool.
df = pd.DataFrame(full_pool, columns=['domain', 'prompt', 'answer', 'metadata'])
# regex=False: the search text is a literal substring, not a pattern.
matches = df.loc[
    df['prompt'].str.contains('Hani said she would do 3 more situps per', na=False, regex=False),
    'prompt',
]
# Show the first matching prompt, or None when nothing matches.
matches.iloc[0] if not matches.empty else None