In [1]:
#!/usr/bin/env python3

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset, Dataset
from tqdm import tqdm
import numpy as np


def calculate_perplexity_batch(texts, model, tokenizer, max_length=2048):
    """Calculate perplexity for a batch of texts."""
    try:
        encodings = tokenizer(
            texts,
            return_tensors='pt',
            max_length=max_length,
            truncation=True,
            padding=False  # No padding
        )
        encodings = {k: v.to(model.device) for k, v in encodings.items()}
        
        with torch.no_grad():
            outputs = model(**encodings)
            shift_logits = outputs.logits[..., :-1, :].contiguous()
            shift_labels = encodings['input_ids'][..., 1:].contiguous()
            
            loss_fct = torch.nn.CrossEntropyLoss(reduction='none')
            losses = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1)
            )
            
            # Calculate mean loss
            loss = losses.mean()
            perplexity = torch.exp(loss).cpu().item()
            
        return [perplexity]  # Return as list for consistency
    
    except Exception as e:
        print(f"Error in batch: {e}")
        return [float('inf')]

In [2]:
def main():
    MODEL_ID = "meta-llama/Llama-3.1-8B"
    NUM_SAMPLES = 128
    BATCH_SIZE = 32
    SEQUENCE_LENGTH = 2048
    INITIAL_SAMPLES = 1000

    print("Loading DCLM-micro dataset...")
    ds = load_dataset("robbiegwaldd/dclm-micro")

    print(f"Loading {MODEL_ID} for perplexity scoring...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        device_map="auto",
    )

    indices = [int(i) for i in np.random.choice(len(ds['train']), INITIAL_SAMPLES, replace=False)]
    initial_data = ds['train'].select(indices)
    texts = initial_data['text']
    valid_texts = [text[:SEQUENCE_LENGTH] for text in texts if len(text) >= SEQUENCE_LENGTH]
    print(f"Found {len(valid_texts)} valid sequences")

    perplexities = []
    for i in tqdm(range(0, len(valid_texts), BATCH_SIZE), desc="Calculating perplexities"):
        batch = valid_texts[i:i + BATCH_SIZE]
        try:
            batch_perplexities = calculate_perplexity_batch(batch, model, tokenizer)
            perplexities.extend(zip(batch, batch_perplexities))
        except Exception as e:
            print(f"Error processing batch starting at index {i}: {e}")
            continue

    print("Selecting best sequences...")
    perplexities = [(text, score) for text, score in perplexities if score != float('inf')]
    best_sequences = sorted(perplexities, key=lambda x: x[1])[:NUM_SAMPLES]
    calibration_texts = [seq for seq, _ in best_sequences]
    calibration_data = Dataset.from_dict({'text': calibration_texts})

    print(f"Selected {len(calibration_texts)} sequences for calibration")
    print("\nPerplexity scores of selected sequences:")
    for i, (_, score) in enumerate(best_sequences[:5]):
        print(f"Sequence {i + 1}: {score:.2f}")

    print("\nSaving perplexity scores...")
    with open('perplexity_scores.txt', 'w') as f:
        for text, score in best_sequences:
            f.write(f"{score:.4f}\t{text[:100]}...\n")

    del model
    torch.cuda.empty_cache()
    return calibration_data


if __name__ == '__main__':
    calibration_data = main()

Loading DCLM-micro dataset...
Loading meta-llama/Llama-3.1-8B for perplexity scoring...


  warn(


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Found 642 valid sequences


Calculating perplexities:  48%|████▊     | 10/21 [00:00<00:00, 96.88it/s]

Error in batch: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).
Error in batch: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).
Error in batch: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).
Error in batch: Unable to create tensor, you should probably activate truncation and/or pad

Calculating perplexities: 100%|██████████| 21/21 [00:00<00:00, 83.42it/s]


Error in batch: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).
Error in batch: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).
Error in batch: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).
Selecting best sequences...
Selected 0 sequences for calibration

Perplexity scores of sele

In [3]:
#!/usr/bin/env python3
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset, Dataset
from tqdm import tqdm
import numpy as np

def calculate_perplexity(text, model, tokenizer, max_length=2048):
    """Calculate perplexity for a single text."""
    try:
        # Ensure text is a string and truncate if needed
        if isinstance(text, list):
            text = ' '.join(text)
        text = text[:max_length * 4]  # Rough estimate for token/char ratio
        
        # Tokenize
        input_ids = tokenizer.encode(text, return_tensors='pt').to(model.device)
        
        # Truncate if still too long
        if input_ids.size(1) > max_length:
            input_ids = input_ids[:, :max_length]
        
        with torch.no_grad():
            outputs = model(input_ids)
            shift_logits = outputs.logits[..., :-1, :].contiguous()
            shift_labels = input_ids[..., 1:].contiguous()
            
            loss_fct = torch.nn.CrossEntropyLoss(reduction='mean')
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), 
                          shift_labels.view(-1))
            
        return torch.exp(loss).cpu().item()
    except Exception as e:
        print(f"Error processing text: {e}")
        return float('inf')

def main():
    MODEL_ID = "meta-llama/Llama-3.1-8B"
    NUM_SAMPLES = 128
    SEQUENCE_LENGTH = 2048
    INITIAL_SAMPLES = 1000
    
    print("Loading DCLM-micro dataset...")
    ds = load_dataset("robbiegwaldd/dclm-micro")
    
    print(f"Loading {MODEL_ID} for perplexity scoring...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    
    # Sample initial sequences
    indices = [int(i) for i in np.random.choice(len(ds['train']), INITIAL_SAMPLES, replace=False)]
    initial_data = ds['train'].select(indices)
    texts = initial_data['text']
    
    # Filter valid sequences
    valid_texts = []
    print("Filtering sequences...")
    for text in texts:
        if len(text) >= 100:  # Minimum length check
            valid_texts.append(text[:SEQUENCE_LENGTH * 4])  # Rough char limit
    print(f"Found {len(valid_texts)} valid sequences")
    
    # Calculate perplexity for each sequence
    perplexities = []
    for text in tqdm(valid_texts, desc="Calculating perplexities"):
        perplexity = calculate_perplexity(text, model, tokenizer)
        if perplexity != float('inf'):
            perplexities.append((text, perplexity))
    
    print(f"Successfully processed {len(perplexities)} sequences")
    
    # Sort and select best sequences
    print("Selecting best sequences...")
    best_sequences = sorted(perplexities, key=lambda x: x[1])[:NUM_SAMPLES]
    calibration_texts = [seq for seq, _ in best_sequences]
    
    # Create dataset
    calibration_data = Dataset.from_dict({'text': calibration_texts})
    
    print(f"Selected {len(calibration_texts)} sequences for calibration")
    print("\nTop 5 sequences perplexity scores:")
    for i, (_, score) in enumerate(best_sequences[:5]):
        print(f"Sequence {i + 1}: {score:.2f}")
    
    # Save results
    print("\nSaving perplexity scores...")
    with open('perplexity_scores.txt', 'w', encoding='utf-8') as f:
        for text, score in best_sequences:
            f.write(f"{score:.4f}\t{text[:100]}...\n")
    
    del model
    torch.cuda.empty_cache()
    
    return calibration_data

if __name__ == "__main__":
    calibration_data = main()

Loading DCLM-micro dataset...
Loading meta-llama/Llama-3.2-1B for perplexity scoring...


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Filtering sequences...
Found 1000 valid sequences


Calculating perplexities: 100%|██████████| 1000/1000 [00:35<00:00, 28.28it/s]


Successfully processed 1000 sequences
Selecting best sequences...
Selected 128 sequences for calibration

Top 5 sequences perplexity scores:
Sequence 1: 1.23
Sequence 2: 2.98
Sequence 3: 3.19
Sequence 4: 3.71
Sequence 5: 4.04

Saving perplexity scores...


In [4]:
#!/usr/bin/env python3
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset, Dataset
from tqdm import tqdm
import pandas as pd
import csv

def calculate_perplexity(text, model, tokenizer, max_length=2048):
    """Calculate perplexity for a single text."""
    try:
        # Ensure text is a string and truncate if needed
        if isinstance(text, list):
            text = ' '.join(text)
        text = text[:max_length * 4]  # Rough estimate for token/char ratio
        
        # Tokenize
        input_ids = tokenizer.encode(text, return_tensors='pt').to(model.device)
        
        # Truncate if still too long
        if input_ids.size(1) > max_length:
            input_ids = input_ids[:, :max_length]
        
        with torch.no_grad():
            outputs = model(input_ids)
            shift_logits = outputs.logits[..., :-1, :].contiguous()
            shift_labels = input_ids[..., 1:].contiguous()
            
            loss_fct = torch.nn.CrossEntropyLoss(reduction='mean')
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), 
                          shift_labels.view(-1))
            
        return torch.exp(loss).cpu().item()
    except Exception as e:
        print(f"Error processing text: {e}")
        return float('inf')

def main():
    MODEL_ID = "meta-llama/Llama-3.1-8B"
    SEQUENCE_LENGTH = 2048
    CSV_FILE = 'dclm_perplexities.csv'
    
    print("Loading DCLM-micro dataset...")
    ds = load_dataset("robbiegwaldd/dclm-micro")
    texts = ds['train']['text']
    
    print(f"Loading {MODEL_ID} for perplexity scoring...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    
    # Open CSV file and write header
    with open(CSV_FILE, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['index', 'text', 'perplexity'])
        
        # Process all sequences
        for idx, text in enumerate(tqdm(texts, desc="Calculating perplexities")):
            if len(text) >= 100:  # Minimum length check
                perplexity = calculate_perplexity(text, model, tokenizer)
                # Write immediately to CSV to save progress
                writer.writerow([idx, text[:1000], perplexity])  # Truncate text for CSV
            
            # Save progress every 100 sequences
            if idx % 100 == 0:
                f.flush()
    
    print(f"\nResults saved to {CSV_FILE}")
    
    # Load and display some statistics
    df = pd.read_csv(CSV_FILE)
    print("\nDataset statistics:")
    print(f"Total sequences processed: {len(df)}")
    print(f"Average perplexity: {df['perplexity'].mean():.2f}")
    print(f"Median perplexity: {df['perplexity'].median():.2f}")
    print("\nTop 5 sequences by perplexity:")
    print(df.nsmallest(5, 'perplexity')[['index', 'perplexity']])
    
    del model
    torch.cuda.empty_cache()

if __name__ == "__main__":
    main()

Loading DCLM-micro dataset...
Loading meta-llama/Llama-3.1-8B for perplexity scoring...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating perplexities:   0%|          | 680/305259 [01:43<12:49:54,  6.59it/s]


KeyboardInterrupt: 

In [None]:
#!/usr/bin/env python3
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd
import numpy as np

def calculate_batch_perplexity(texts, model, tokenizer, max_length=2048):
    """Calculate perplexity for a batch of texts."""
    try:
        # Tokenize batch
        encodings = tokenizer(
            texts,
            return_tensors='pt',
            max_length=max_length,
            truncation=True,
            padding=True
        ).to(model.device)
        
        with torch.no_grad():
            outputs = model(**encodings)
            shift_logits = outputs.logits[..., :-1, :].contiguous()
            shift_labels = encodings['input_ids'][..., 1:].contiguous()
            shift_mask = encodings['attention_mask'][..., 1:].contiguous()
            
            # Calculate loss for each sequence
            loss_fct = torch.nn.CrossEntropyLoss(reduction='none')
            losses = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), 
                            shift_labels.view(-1))
            
            # Reshape and apply mask
            losses = losses.view(shift_labels.size()) * shift_mask
            
            # Calculate mean loss per sequence
            seq_lengths = shift_mask.sum(dim=1)
            seq_losses = losses.sum(dim=1) / seq_lengths
            
            # Convert to perplexity
            perplexities = torch.exp(seq_losses).cpu().numpy()
            
        return perplexities
    except Exception as e:
        print(f"Error in batch: {e}")
        return np.array([float('inf')] * len(texts))

def main():
    MODEL_ID = "meta-llama/Llama-3.1-8B"
    BATCH_SIZE = 8  # Adjust based on your GPU memory
    SEQUENCE_LENGTH = 512
    CSV_FILE = 'dclm_perplexities.csv'
    
    print("Loading DCLM-micro dataset...")
    ds = load_dataset("robbiegwaldd/dclm-micro")
    
    print(f"Loading {MODEL_ID} for perplexity scoring...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    
    # Process in batches and save results
    results = []
    texts = ds['train']['text']
    
    for i in tqdm(range(0, len(texts), BATCH_SIZE), desc="Processing batches"):
        batch_texts = texts[i:i + BATCH_SIZE]
        # Filter out too short sequences
        valid_indices = [j for j, text in enumerate(batch_texts) if len(text) >= 100]
        if not valid_indices:
            continue
            
        valid_texts = [batch_texts[j] for j in valid_indices]
        batch_perplexities = calculate_batch_perplexity(valid_texts, model, tokenizer)
        
        # Save results
        for idx, perp in zip(range(i, i + len(valid_texts)), batch_perplexities):
            results.append({
                'index': idx,
                'text': texts[idx][:1000],  # Truncate text for CSV
                'perplexity': float(perp)
            })
        
        # Save progress periodically
        if len(results) % 100 == 0:
            df = pd.DataFrame(results)
            df.to_csv(CSV_FILE, index=False)
    
    # Save final results
    df = pd.DataFrame(results)
    df.to_csv(CSV_FILE, index=False)
    
    print(f"\nResults saved to {CSV_FILE}")
    print("\nDataset statistics:")
    print(f"Total sequences processed: {len(df)}")
    print(f"Average perplexity: {df['perplexity'].mean():.2f}")
    print(f"Median perplexity: {df['perplexity'].median():.2f}")
    print("\nTop 5 sequences by perplexity:")
    print(df.nsmallest(5, 'perplexity')[['index', 'perplexity']])
    
    del model
    torch.cuda.empty_cache()

if __name__ == "__main__":
    main()

Loading DCLM-micro dataset...
Loading meta-llama/Llama-3.1-8B for perplexity scoring...


  warn(


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Processing batches:   0%|          | 47/38158 [02:09<28:47:27,  2.72s/it]

In [None]:
#!/usr/bin/env python3
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd
import numpy as np

def filter_by_token_length(texts, tokenizer, min_length=100, max_length=512):
    """Filter texts by their tokenized length."""
    valid_texts = []
    valid_indices = []
    
    for idx, text in enumerate(texts):
        tokens = tokenizer(text, truncation=False)['input_ids']
        token_length = len(tokens)
        if min_length <= token_length <= max_length:
            valid_texts.append(text)
            valid_indices.append(idx)
    
    return valid_texts, valid_indices

def calculate_batch_perplexity(texts, model, tokenizer, max_length=512):
    """Calculate perplexity for a batch of texts."""
    try:
        encodings = tokenizer(
            texts,
            return_tensors='pt',
            max_length=max_length,
            truncation=True,
            padding=True
        ).to(model.device)
        
        with torch.no_grad():
            outputs = model(**encodings)
            shift_logits = outputs.logits[..., :-1, :].contiguous()
            shift_labels = encodings['input_ids'][..., 1:].contiguous()
            shift_mask = encodings['attention_mask'][..., 1:].contiguous()
            
            loss_fct = torch.nn.CrossEntropyLoss(reduction='none')
            losses = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), 
                            shift_labels.view(-1))
            
            losses = losses.view(shift_labels.size()) * shift_mask
            seq_lengths = shift_mask.sum(dim=1)
            seq_losses = losses.sum(dim=1) / seq_lengths
            perplexities = torch.exp(seq_losses).cpu().numpy()
            
        return perplexities
    except Exception as e:
        print(f"Error in batch: {e}")
        return np.array([float('inf')] * len(texts))

def main():
    MODEL_ID = "meta-llama/Llama-3.1-8B"
    BATCH_SIZE = 64
    SEQUENCE_LENGTH = 512
    CSV_FILE = 'dclm_perplexities_512.csv'
    
    print("Loading DCLM-micro dataset...")
    ds = load_dataset("robbiegwaldd/dclm-micro")
    
    print(f"Loading {MODEL_ID} for perplexity scoring...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    model.eval()
    
    results = []
    texts = ds['train']['text']
    
    for i in tqdm(range(0, len(texts), BATCH_SIZE), desc="Processing batches"):
        batch_texts = texts[i:i + BATCH_SIZE]
        
        # Filter by token length
        valid_texts, valid_indices = filter_by_token_length(
            batch_texts, 
            tokenizer, 
            min_length=100, 
            max_length=SEQUENCE_LENGTH
        )
        
        if not valid_texts:
            continue
        
        batch_perplexities = calculate_batch_perplexity(valid_texts, model, tokenizer)
        
        # Save results using original indices
        for local_idx, perp in zip(valid_indices, batch_perplexities):
            results.append({
                'index': i + local_idx,
                'text': batch_texts[local_idx][:1000],  # Truncate text for CSV
                'perplexity': float(perp),
                'token_length': len(tokenizer(batch_texts[local_idx])['input_ids'])
            })
        
        # Save progress periodically
        if len(results) % 100 == 0:
            df = pd.DataFrame(results)
            df.to_csv(CSV_FILE, index=False)
    
    # Save final results
    df = pd.DataFrame(results)
    df.to_csv(CSV_FILE, index=False)
    
    print(f"\nResults saved to {CSV_FILE}")
    print("\nDataset statistics:")
    print(f"Total sequences processed: {len(df)}")
    print(f"Average perplexity: {df['perplexity'].mean():.2f}")
    print(f"Median perplexity: {df['perplexity'].median():.2f}")
    print(f"Average token length: {df['token_length'].mean():.2f}")
    print("\nTop 5 sequences by perplexity:")
    print(df.nsmallest(5, 'perplexity')[['index', 'token_length', 'perplexity']])
    
    del model
    torch.cuda.empty_cache()

if __name__ == "__main__":
    main()

Loading DCLM-micro dataset...
Loading meta-llama/Llama-3.1-8B for perplexity scoring...


  warn(


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Processing batches:   3%|▎         | 144/4770 [05:29<2:56:17,  2.29s/it]