# Generating Human text data from Wikipedia

In [None]:
import requests
import time
import json
import re
from typing import List, Dict, Optional, Set
from datetime import datetime

# 1. WIKIPEDIA API COLLECTOR

class WikipediaCollector:

    def __init__(self):
        self.base_url = "https://en.wikipedia.org/w/api.php"
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'AI-Detection-Research/1.0 (Educational Purpose)'
        })

    def get_articles_by_category(self, category: str,
                                 num_articles: int = 50,
                                 continue_token: Optional[str] = None) -> tuple[List[str], Optional[str]]:
        """
        Get article titles from a specific Wikipedia category
        Returns: (titles, next_continue_token)
        """
        titles = []

        params = {
            'action': 'query',
            'format': 'json',
            'list': 'categorymembers',
            'cmtitle': f'Category:{category}',
            'cmlimit': min(num_articles, 50),  # API max is 50
            'cmtype': 'page'
        }

        if continue_token:
            params['cmcontinue'] = continue_token

        try:
            response = self.session.get(self.base_url, params=params)
            data = response.json()

            for page in data['query']['categorymembers']:
                titles.append(page['title'])

            # Get continuation token if available
            next_token = None
            if 'continue' in data:
                next_token = data['continue']['cmcontinue']

            print(f"Fetched {len(titles)} articles from '{category}'...")
            time.sleep(0.5)

            return titles, next_token

        except Exception as e:
            print(f"Error getting category articles: {e}")
            return [], None

    def get_article_content(self, title: str,
                           extract_sections: bool = True) -> Optional[Dict]:
        """
        Get the full content of a Wikipedia article
        """
        params = {
            'action': 'query',
            'format': 'json',
            'titles': title,
            'prop': 'extracts|info',
            'explaintext': True,
            'exsectionformat': 'plain' if extract_sections else 'raw',
            'inprop': 'url'
        }

        try:
            response = self.session.get(self.base_url, params=params)
            data = response.json()

            pages = data['query']['pages']
            page_id = list(pages.keys())[0]

            if page_id == '-1':
                print(f"Article '{title}' not found")
                return None

            page = pages[page_id]

            return {
                'title': page.get('title'),
                'content': page.get('extract', ''),
                'url': page.get('fullurl', ''),
                'page_id': page_id
            }

        except Exception as e:
            print(f"Error getting article '{title}': {e}")
            return None


# 2. TEXT PROCESSING AND CHUNKING


def clean_wikipedia_text(text: str) -> str:
    """
    Clean Wikipedia text to remove metadata and formatting
    """
    if not text:
        return ""

    sections_to_remove = [
        r'\n==\s*See also\s*==.*',
        r'\n==\s*References\s*==.*',
        r'\n==\s*External links\s*==.*',
        r'\n==\s*Notes\s*==.*',
        r'\n==\s*Further reading\s*==.*'
    ]

    for pattern in sections_to_remove:
        text = re.sub(pattern, '', text, flags=re.DOTALL | re.IGNORECASE)

    text = re.sub(r'\n=+\s*.*?\s*=+\n', '\n', text)
    text = re.sub(r'\[\d+\]|\[citation needed\]|\[clarification needed\]', '', text)
    text = re.sub(r'\n\s*\n', '\n\n', text)
    text = re.sub(r' +', ' ', text)

    return text.strip()


def chunk_text(text: str,
               chunk_size: int = 200,
               min_words: int = 80,
               overlap: int = 40) -> List[str]:
    words = text.split()
    chunks = []

    for start in range(0, len(words), chunk_size - overlap):
        chunk = words[start:start + chunk_size]
        if len(chunk) >= min_words:
            chunks.append(" ".join(chunk))

    return chunks


# 3. DATASET CREATION WITH DEDUPLICATION


def collect_wikipedia_dataset(num_samples: int = 2000,
                              domains: Optional[List[str]] = None,
                              target_words: int = 300,
                              overlap: int = 40) -> List[Dict]:
    """
    Collect Wikipedia dataset from specific categories with deduplication

    Args:
        num_samples: Number of text samples to collect
        domains: List of Wikipedia categories to collect from
        target_words: Target word count per sample
        overlap: Word overlap between chunks (set to 0 to avoid chunk overlap)
    """

    collector = WikipediaCollector()
    samples = []

    # Track processed articles to avoid duplication
    processed_articles: Set[str] = set()

    # Track continuation tokens for each domain
    domain_tokens: Dict[str, Optional[str]] = {}

    if domains is None:
        domains = [
            "Science", "History", "Technology", "Biology", "Physics",
            'Geography', 'Literature', 'Philosophy', "Mathematics",
            "Computer Science", "Medicine", "Economics", "Psychology",
            'climate change', 'world war', 'constitution',
            'quantum mechanics', 'human evolution', 'renewable energy',
            'ancient civilization', 'modern art', 'space exploration',
            "Artificial Intelligence", "The Industrial Revolution", "Quantum Mechanics",
            "AStronomy", "Environmental Science", "Machine Learning",
            "Renewable Energy", "Genetics", "Space Exploration", "Blockchain","Animals and Zoology",
            "Engineering","Education"
        ]

    print(f"Starting Wikipedia collection: {num_samples} samples")
    print(f"Method: Category-based with deduplication")
    print(f"Domains: {len(domains)} categories")
    print(f"Overlap: {overlap} words")
    print("=" * 60)

    domain_index = 0
    max_attempts = num_samples * 3  # Safety limit
    attempts = 0

    while len(samples) < num_samples and attempts < max_attempts:
        domain = domains[domain_index % len(domains)]

        # Get continuation token for this domain
        continue_token = domain_tokens.get(domain)

        print(f"\nCollecting from category: {domain} (attempt {attempts + 1})")

        titles, next_token = collector.get_articles_by_category(
            domain,
            num_articles=50,
            continue_token=continue_token
        )

        # Update continuation token
        domain_tokens[domain] = next_token

        # If no new articles from this domain, move to next
        if not titles:
            print(f"No more articles in '{domain}', moving to next domain...")
            domain_index += 1
            continue

        new_articles_processed = 0

        for title in titles:
            if len(samples) >= num_samples:
                break

            # Skip if already processed
            if title in processed_articles:
                print(f"Skipping duplicate: {title}")
                continue

            processed_articles.add(title)

            article = collector.get_article_content(title)
            if not article or not article['content']:
                continue

            processed_samples = process_article(
                article,
                domain=domain.lower(),
                target_words=target_words,
                overlap=overlap
            )

            if not processed_samples:
                continue

            samples.extend(processed_samples)
            new_articles_processed += 1

            if len(samples) >= num_samples:
                break

            time.sleep(0.3)

        print(f"New articles processed: {new_articles_processed}")
        print(f"Total samples collected: {len(samples)}/{num_samples}")
        print(f"Unique articles processed: {len(processed_articles)}")

        attempts += 1
        domain_index += 1

    # Trim to exact number requested
    samples = samples[:num_samples]

    print(f"\n{'=' * 60}")
    print(f"✓ Collection complete!")
    print(f"Total samples collected: {len(samples)}")
    print(f"Unique articles used: {len(processed_articles)}")
    print(f"Average chunks per article: {len(samples) / len(processed_articles):.2f}")

    return samples


def process_article(article: Dict, domain: str,
                   target_words: int = 300,
                   overlap: int = 40) -> List[Dict]:
    """
    Process a single Wikipedia article into multiple samples
    """

    cleaned_text = clean_wikipedia_text(article['content'])
    word_count = len(cleaned_text.split())

    # Skip very short / stub articles
    if word_count < 800:
        return []

    # Chunk the full article
    chunks = chunk_text(
        cleaned_text,
        chunk_size=target_words,
        min_words=int(target_words * 0.6),
        overlap=overlap
    )

    samples = []
    for i, text in enumerate(chunks):
        samples.append({
            'id': f"wiki_{article['page_id']}_{i}",
            'text': text,
            'label': 0,
            'domain': domain,
            'source': 'wikipedia',
            'article_title': article['title'],
            'url': article['url'],
            'word_count': len(text.split()),
            'collection_date': datetime.now().isoformat(),
            'modifications': 'none',
            'chunk_index': i
        })

    return samples


# 4. SAVE AND EXPORT

def save_wikipedia_dataset(samples: List[Dict], output_dir: str = 'wikipedia_data'):
    """
    Save collected Wikipedia dataset
    """
    import os
    os.makedirs(output_dir, exist_ok=True)

    # Save as JSON
    with open(f'{output_dir}/wikipedia_samples.json', 'w', encoding='utf-8') as f:
        json.dump(samples, f, indent=2, ensure_ascii=False)

    # Save as CSV for easy viewing
    import pandas as pd
    df = pd.DataFrame(samples)
    df.to_csv(f'{output_dir}/wikipedia_samples.csv', index=False)

    # Generate detailed statistics
    stats = {
        'total_samples': len(samples),
        'unique_articles': len(set(s['article_title'] for s in samples)),
        'avg_word_count': sum(s['word_count'] for s in samples) / len(samples),
        'min_word_count': min(s['word_count'] for s in samples),
        'max_word_count': max(s['word_count'] for s in samples),
        'domains': {},
        'collection_date': datetime.now().isoformat()
    }

    # Count by domain
    for sample in samples:
        domain = sample.get('domain', 'unknown')
        stats['domains'][domain] = stats['domains'].get(domain, 0) + 1

    # Calculate chunks per article
    article_chunks = {}
    for sample in samples:
        article = sample['article_title']
        article_chunks[article] = article_chunks.get(article, 0) + 1

    stats['avg_chunks_per_article'] = sum(article_chunks.values()) / len(article_chunks)
    stats['max_chunks_from_single_article'] = max(article_chunks.values())

    with open(f'{output_dir}/wikipedia_stats.json', 'w') as f:
        json.dump(stats, f, indent=2)

    print(f"\n✓ Dataset saved to {output_dir}/")
    print(f"  - wikipedia_samples.json")
    print(f"  - wikipedia_samples.csv")
    print(f"  - wikipedia_stats.json")

    return df


# 5. EXAMPLE USAGE


if __name__ == "__main__":
    print("Wikipedia Human Text Collector - Category-Based with Deduplication")
    print("=" * 60)

    # Collect 2000 samples from specific categories
    samples = collect_wikipedia_dataset(
        num_samples=2000,
        domains=[
            "Science", "History", "Technology", "Biology", "Physics",
            'Geography', 'Literature', 'Philosophy', "Mathematics",
            "Computer Science", "Medicine", "Economics", "Psychology",
            'climate change', 'world war', 'constitution',
            'quantum mechanics', 'human evolution', 'renewable energy',
            'ancient civilization', 'modern art', 'space exploration',
            "Artificial Intelligence", "The Industrial Revolution", "Quantum Mechanics",
            "AStronomy", "Environmental Science", "Machine Learning",
            "Renewable Energy", "Genetics", "Space Exploration", "Blockchain","Animals and Zoology",
            "Engineering","Education",
        ],
        target_words=300,
        overlap=40  # Set to 0 for no overlap between chunks
    )

    # Save dataset
    df = save_wikipedia_dataset(samples, output_dir='wikipedia_human_texts')

    print("\n" + "=" * 60)
    print("Collection complete!")
    print(f"Total samples: {len(samples)}")
    print(f"Unique articles: {df['article_title'].nunique()}")
    print(f"Average word count: {df['word_count'].mean():.1f}")
    print("\nDomain distribution:")
    print(df['domain'].value_counts())

Wikipedia Human Text Collector - Category-Based with Deduplication
Starting Wikipedia collection: 2000 samples
Method: Category-based with deduplication
Domains: 35 categories
Overlap: 40 words

Collecting from category: Science (attempt 1)
Fetched 4 articles from 'Science'...
New articles processed: 2
Total samples collected: 36/2000
Unique articles processed: 4

Collecting from category: History (attempt 2)
Fetched 13 articles from 'History'...
New articles processed: 11
Total samples collected: 211/2000
Unique articles processed: 17

Collecting from category: Technology (attempt 3)
Fetched 49 articles from 'Technology'...
New articles processed: 17
Total samples collected: 400/2000
Unique articles processed: 66

Collecting from category: Biology (attempt 4)
Fetched 23 articles from 'Biology'...
New articles processed: 9
Total samples collected: 521/2000
Unique articles processed: 89

Collecting from category: Physics (attempt 5)
Fetched 26 articles from 'Physics'...
New articles pro

# Restructuring Data

In [None]:
import json

# Load your original file
with open("/content/wikipedia_human_texts/wikipedia_samples.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Keep only required fields
cleaned_data = [
    {
        "text": sample["text"],
        "label": sample["label"]
    }
    for sample in data
]

# Save cleaned file
with open("wikipedia_samples", "w", encoding="utf-8") as f:
    json.dump(cleaned_data, f, ensure_ascii=False, indent=2)

print(f"Converted {len(cleaned_data)} samples successfully.")

Converted 2000 samples successfully.


# Generating AI text Data Samples

In [1]:
import json
import random
import torch
from transformers import pipeline, set_seed
from tqdm import tqdm
import time
import warnings
import logging

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')
logging.getLogger('transformers').setLevel(logging.ERROR)

# --- Configuration ---
MODEL_NAME = "EleutherAI/gpt-neo-1.3B"
NUM_SAMPLES = 2000
TARGET_CHUNK_LENGTH = 300  # Words in final cut
MIN_GENERATION_LENGTH = 350  # Reduced from 450 (you don't need that much)
OUTPUT_FILENAME = "ai_wiki_data.json"
SEED = 42

# OPTIMIZATION: Batch processing
BATCH_SIZE = 4  # Generate 4 samples at once (adjust based on GPU memory)

# Set Reproducibility
random.seed(SEED)
torch.manual_seed(SEED)
set_seed(SEED)

# Check for GPU
device = 0 if torch.cuda.is_available() else -1
print(f"Using device: {'GPU (CUDA)' if device == 0 else 'CPU'}")

if device == 0:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

# --- 1. Initialize Model ---
print(f"\nLoading {MODEL_NAME}...")
start_time = time.time()

# OPTIMIZATION: Enable better settings for speed
generator = pipeline(
    'text-generation',
    model=MODEL_NAME,
    device=device,
    torch_dtype=torch.float16 if device == 0 else torch.float32,  # Half precision on GPU
    batch_size=BATCH_SIZE  # Enable batch processing
)

# CRITICAL FIX: Set pad_token for batching
generator.tokenizer.pad_token_id = generator.model.config.eos_token_id
generator.tokenizer.padding_side = "left"  # Padding on left for generation

load_time = time.time() - start_time
print(f"✓ Model loaded in {load_time:.2f} seconds")
print(f"✓ Tokenizer configured for batching (pad_token_id: {generator.tokenizer.pad_token_id})")

# --- 2. Define Assets ---
TOPICS = ["Artificial Intelligence", "The Industrial Revolution", "Quantum Mechanics",
          "Photosynthesis", "The History of the Internet", "Machine Learning",
          "Renewable Energy", "Genetics", "Space Exploration", "Blockchain"
         ]

TEMPLATES = [

    "Present an encyclopedic article on {topic}. Cover definitions, important theories, and scholarly perspectives.",
    "Write a comprehensive Wikipedia-style encyclopedia entry about {topic}. Focus on its definition, history, and core principles. Use a neutral, formal tone.",
    "Provide a detailed academic overview of {topic}. Explain the fundamental principles and major developments in this field.",
]

# --- 3. Helper Functions ---

def get_random_chunk(text, target_words=300):
    """Extract random chunk from middle of text"""
    words = text.split()

    if len(words) <= target_words:
        return " ".join(words)

    # Avoid first 15 words (skip intro) and last 15 words (skip conclusion)
    safe_start = min(15, len(words) // 4)
    safe_end = len(words) - min(15, len(words) // 4)
    max_start = max(safe_start, safe_end - target_words)

    if max_start <= safe_start:
        start_index = safe_start
    else:
        start_index = random.randint(safe_start, max_start)

    chunk_words = words[start_index : start_index + target_words]
    return " ".join(chunk_words)


def clean_text(text):
    """Clean generated text"""
    # Remove extra whitespace
    text = " ".join(text.split())
    return text.strip()


def generate_batch(prompts, max_new_tokens=400):
    """
    OPTIMIZATION: Generate multiple samples at once
    """
    try:
        outputs = generator(
            prompts,
            max_new_tokens=max_new_tokens,  # CRITICAL: Use max_new_tokens instead of max_length
            do_sample=True,
            temperature=0.8,
            top_k=50,
            top_p=0.95,
            num_return_sequences=1,
            truncation=True,
            return_full_text=False,  # OPTIMIZATION: Don't return the prompt
            pad_token_id=generator.tokenizer.pad_token_id  # Explicitly set to avoid warning
        )

        # Extract generated text
        results = []
        for output in outputs:
            text = output[0]['generated_text'].strip()
            text = clean_text(text)
            results.append(text)

        return results

    except Exception as e:
        print(f"\nError in batch generation: {e}")
        return [""] * len(prompts)


# --- 4. Main Generation Loop ---

dataset = []
failed_count = 0

print(f"\nStarting generation of {NUM_SAMPLES} samples...")
print(f"Batch size: {BATCH_SIZE}")
print("=" * 70)

generation_start = time.time()

# OPTIMIZATION: Process in batches
num_batches = (NUM_SAMPLES + BATCH_SIZE - 1) // BATCH_SIZE

for batch_idx in tqdm(range(num_batches), desc="Generating batches"):
    # Prepare batch of prompts
    batch_prompts = []
    batch_topics = []

    for _ in range(BATCH_SIZE):
        if len(dataset) >= NUM_SAMPLES:
            break

        topic = random.choice(TOPICS)
        template = random.choice(TEMPLATES)
        prompt = template.format(topic=topic)

        batch_prompts.append(prompt)
        batch_topics.append(topic)

    if not batch_prompts:
        break

    # Generate batch
    raw_texts = generate_batch(batch_prompts, max_new_tokens=400)

    # Process each generated text
    for raw_text, topic in zip(raw_texts, batch_topics):
        # Check if generation was successful
        if len(raw_text.split()) < TARGET_CHUNK_LENGTH:
            failed_count += 1

            # OPTIMIZATION: Don't retry immediately, just skip
            # (Retries slow things down significantly)
            continue

        # Extract random chunk
        final_chunk = get_random_chunk(raw_text, target_words=TARGET_CHUNK_LENGTH)

        # Validate chunk
        if len(final_chunk.split()) < TARGET_CHUNK_LENGTH * 0.8:  # At least 80% of target
            failed_count += 1
            continue

        # Store formatted data
        entry = {
            "text": final_chunk,
            "label": 1
        }
        dataset.append(entry)

        if len(dataset) >= NUM_SAMPLES:
            break

    # OPTIMIZATION: Clear CUDA cache periodically
    if device == 0 and batch_idx % 50 == 0:
        torch.cuda.empty_cache()

generation_time = time.time() - generation_start

# --- 5. Handle any remaining samples needed ---
# If we didn't get enough samples due to failures, generate more
attempts = 0
max_attempts = 100

while len(dataset) < NUM_SAMPLES and attempts < max_attempts:
    topic = random.choice(TOPICS)
    template = random.choice(TEMPLATES)
    prompt = template.format(topic=topic)

    raw_texts = generate_batch([prompt], max_new_tokens=400)
    raw_text = raw_texts[0]

    if len(raw_text.split()) >= TARGET_CHUNK_LENGTH:
        final_chunk = get_random_chunk(raw_text, target_words=TARGET_CHUNK_LENGTH)

        if len(final_chunk.split()) >= TARGET_CHUNK_LENGTH * 0.8:
            entry = {
                "text": final_chunk,
                "label": 1
            }
            dataset.append(entry)

    attempts += 1

# --- 6. Save Output ---
print("\n" + "=" * 70)
print(f"Generation Complete!")
print(f"  Successful samples: {len(dataset)}")
print(f"  Failed generations: {failed_count}")
print(f"  Success rate: {len(dataset)/(len(dataset)+failed_count)*100:.1f}%")
print(f"  Total time: {generation_time/60:.2f} minutes")
print(f"  Average time per sample: {generation_time/len(dataset):.2f} seconds")

with open(OUTPUT_FILENAME, 'w', encoding='utf-8') as f:
    json.dump(dataset, f, indent=2, ensure_ascii=False)

print(f"\n✓ Data saved to {OUTPUT_FILENAME}")

# Statistics
word_counts = [len(entry['text'].split()) for entry in dataset]
print(f"\nDataset Statistics:")
print(f"  Average words: {sum(word_counts)/len(word_counts):.1f}")
print(f"  Min words: {min(word_counts)}")
print(f"  Max words: {max(word_counts)}")

# Show sample
print(f"\n{'='*70}")
print("Sample output (first 200 chars):")
print(dataset[0]['text'][:200] + "...")

# Clean up
if device == 0:
    torch.cuda.empty_cache()

print("\n✓ Done!")

Using device: GPU (CUDA)
GPU: Tesla T4
GPU Memory: 15.83 GB

Loading EleutherAI/gpt-neo-1.3B...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/5.31G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

✓ Model loaded in 119.30 seconds
✓ Tokenizer configured for batching (pad_token_id: 50256)

Starting generation of 2000 samples...
Batch size: 4


Generating batches: 100%|██████████| 500/500 [1:33:54<00:00, 11.27s/it]



Generation Complete!
  Successful samples: 1494
  Failed generations: 575
  Success rate: 72.2%
  Total time: 93.92 minutes
  Average time per sample: 3.77 seconds

✓ Data saved to ai_wiki_data.json

Dataset Statistics:
  Average words: 298.5
  Min words: 286
  Max words: 300

Sample output (first 200 chars):
esteemed scholars. In the course of our research, we have tried to present all the main theories about the Industrial Revolution, such as the first industrial revolution, the causes, the development, ...

✓ Done!


# re-generating missed samples

In [3]:
import json
import random
import torch
from transformers import pipeline, set_seed
from tqdm import tqdm
import time
import warnings
import logging

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')
logging.getLogger('transformers').setLevel(logging.ERROR)

# --- Configuration ---
MODEL_NAME = "EleutherAI/gpt-neo-1.3B"
NUM_SAMPLES = 600
TARGET_CHUNK_LENGTH = 300  # Words in final cut
MIN_GENERATION_LENGTH = 350  # Reduced from 450 (you don't need that much)
OUTPUT_FILENAME = "ai_wiki_data_2.json"
SEED = 42

# OPTIMIZATION: Batch processing
BATCH_SIZE = 4  # Generate 4 samples at once (adjust based on GPU memory)

# Set Reproducibility
random.seed(SEED)
torch.manual_seed(SEED)
set_seed(SEED)

# Check for GPU
device = 0 if torch.cuda.is_available() else -1
print(f"Using device: {'GPU (CUDA)' if device == 0 else 'CPU'}")

if device == 0:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

# --- 1. Initialize Model ---
print(f"\nLoading {MODEL_NAME}...")
start_time = time.time()

# OPTIMIZATION: Enable better settings for speed
generator = pipeline(
    'text-generation',
    model=MODEL_NAME,
    device=device,
    torch_dtype=torch.float16 if device == 0 else torch.float32,  # Half precision on GPU
    batch_size=BATCH_SIZE  # Enable batch processing
)

# CRITICAL FIX: Set pad_token for batching
generator.tokenizer.pad_token_id = generator.model.config.eos_token_id
generator.tokenizer.padding_side = "left"  # Padding on left for generation

load_time = time.time() - start_time
print(f"✓ Model loaded in {load_time:.2f} seconds")
print(f"✓ Tokenizer configured for batching (pad_token_id: {generator.tokenizer.pad_token_id})")

# --- 2. Define Assets ---
TOPICS = ["Artificial Intelligence", "The Industrial Revolution", "Quantum Mechanics",
          "Photosynthesis", "The History of the Internet", "Machine Learning",
          "Renewable Energy", "Genetics", "Space Exploration", "Blockchain"
         ]

TEMPLATES = [

    "Present an encyclopedic article on {topic}. Cover definitions, important theories, and scholarly perspectives.",
    "Write a comprehensive Wikipedia-style encyclopedia entry about {topic}. Focus on its definition, history, and core principles. Use a neutral, formal tone.",
    "Provide a detailed academic overview of {topic}. Explain the fundamental principles and major developments in this field.",
]

# --- 3. Helper Functions ---

def get_random_chunk(text, target_words=300):
    """Extract random chunk from middle of text"""
    words = text.split()

    if len(words) <= target_words:
        return " ".join(words)

    # Avoid first 15 words (skip intro) and last 15 words (skip conclusion)
    safe_start = min(15, len(words) // 4)
    safe_end = len(words) - min(15, len(words) // 4)
    max_start = max(safe_start, safe_end - target_words)

    if max_start <= safe_start:
        start_index = safe_start
    else:
        start_index = random.randint(safe_start, max_start)

    chunk_words = words[start_index : start_index + target_words]
    return " ".join(chunk_words)


def clean_text(text):
    """Clean generated text"""
    # Remove extra whitespace
    text = " ".join(text.split())
    return text.strip()


def generate_batch(prompts, max_new_tokens=400):
    """
    OPTIMIZATION: Generate multiple samples at once
    """
    try:
        outputs = generator(
            prompts,
            max_new_tokens=max_new_tokens,  # CRITICAL: Use max_new_tokens instead of max_length
            do_sample=True,
            temperature=0.8,
            top_k=50,
            top_p=0.95,
            num_return_sequences=1,
            truncation=True,
            return_full_text=False,  # OPTIMIZATION: Don't return the prompt
            pad_token_id=generator.tokenizer.pad_token_id  # Explicitly set to avoid warning
        )

        # Extract generated text
        results = []
        for output in outputs:
            text = output[0]['generated_text'].strip()
            text = clean_text(text)
            results.append(text)

        return results

    except Exception as e:
        print(f"\nError in batch generation: {e}")
        return [""] * len(prompts)


# --- 4. Main Generation Loop ---

dataset = []
failed_count = 0

print(f"\nStarting generation of {NUM_SAMPLES} samples...")
print(f"Batch size: {BATCH_SIZE}")
print("=" * 70)

generation_start = time.time()

# OPTIMIZATION: Process in batches
num_batches = (NUM_SAMPLES + BATCH_SIZE - 1) // BATCH_SIZE

for batch_idx in tqdm(range(num_batches), desc="Generating batches"):
    # Prepare batch of prompts
    batch_prompts = []
    batch_topics = []

    for _ in range(BATCH_SIZE):
        if len(dataset) >= NUM_SAMPLES:
            break

        topic = random.choice(TOPICS)
        template = random.choice(TEMPLATES)
        prompt = template.format(topic=topic)

        batch_prompts.append(prompt)
        batch_topics.append(topic)

    if not batch_prompts:
        break

    # Generate batch
    raw_texts = generate_batch(batch_prompts, max_new_tokens=400)

    # Process each generated text
    for raw_text, topic in zip(raw_texts, batch_topics):
        # Check if generation was successful
        if len(raw_text.split()) < TARGET_CHUNK_LENGTH:
            failed_count += 1

            # OPTIMIZATION: Don't retry immediately, just skip
            # (Retries slow things down significantly)
            continue

        # Extract random chunk
        final_chunk = get_random_chunk(raw_text, target_words=TARGET_CHUNK_LENGTH)

        # Validate chunk
        if len(final_chunk.split()) < TARGET_CHUNK_LENGTH * 0.8:  # At least 80% of target
            failed_count += 1
            continue

        # Store formatted data
        entry = {
            "text": final_chunk,
            "label": 1
        }
        dataset.append(entry)

        if len(dataset) >= NUM_SAMPLES:
            break

    # OPTIMIZATION: Clear CUDA cache periodically
    if device == 0 and batch_idx % 50 == 0:
        torch.cuda.empty_cache()

generation_time = time.time() - generation_start

# --- 5. Handle any remaining samples needed ---
# If we didn't get enough samples due to failures, generate more
attempts = 0
max_attempts = 100

while len(dataset) < NUM_SAMPLES and attempts < max_attempts:
    topic = random.choice(TOPICS)
    template = random.choice(TEMPLATES)
    prompt = template.format(topic=topic)

    raw_texts = generate_batch([prompt], max_new_tokens=400)
    raw_text = raw_texts[0]

    if len(raw_text.split()) >= TARGET_CHUNK_LENGTH:
        final_chunk = get_random_chunk(raw_text, target_words=TARGET_CHUNK_LENGTH)

        if len(final_chunk.split()) >= TARGET_CHUNK_LENGTH * 0.8:
            entry = {
                "text": final_chunk,
                "label": 1
            }
            dataset.append(entry)

    attempts += 1

# --- 6. Save Output ---
print("\n" + "=" * 70)
print(f"Generation Complete!")
print(f"  Successful samples: {len(dataset)}")
print(f"  Failed generations: {failed_count}")
print(f"  Success rate: {len(dataset)/(len(dataset)+failed_count)*100:.1f}%")
print(f"  Total time: {generation_time/60:.2f} minutes")
print(f"  Average time per sample: {generation_time/len(dataset):.2f} seconds")

with open(OUTPUT_FILENAME, 'w', encoding='utf-8') as f:
    json.dump(dataset, f, indent=2, ensure_ascii=False)

print(f"\n✓ Data saved to {OUTPUT_FILENAME}")

# Statistics
word_counts = [len(entry['text'].split()) for entry in dataset]
print(f"\nDataset Statistics:")
print(f"  Average words: {sum(word_counts)/len(word_counts):.1f}")
print(f"  Min words: {min(word_counts)}")
print(f"  Max words: {max(word_counts)}")

# Show sample
print(f"\n{'='*70}")
print("Sample output (first 200 chars):")
print(dataset[0]['text'][:200] + "...")

# Clean up
if device == 0:
    torch.cuda.empty_cache()

print("\n✓ Done!")

Using device: GPU (CUDA)
GPU: Tesla T4
GPU Memory: 15.83 GB

Loading EleutherAI/gpt-neo-1.3B...
✓ Model loaded in 38.00 seconds
✓ Tokenizer configured for batching (pad_token_id: 50256)

Starting generation of 600 samples...
Batch size: 4


Generating batches: 100%|██████████| 150/150 [28:10<00:00, 11.27s/it]



Generation Complete!
  Successful samples: 494
  Failed generations: 175
  Success rate: 73.8%
  Total time: 28.18 minutes
  Average time per sample: 3.42 seconds

✓ Data saved to ai_wiki_data_2.json

Dataset Statistics:
  Average words: 298.6
  Min words: 286
  Max words: 300

Sample output (first 200 chars):
esteemed scholars. In the course of our research, we have tried to present all the main theories about the Industrial Revolution, such as the first industrial revolution, the causes, the development, ...

✓ Done!
