## 1. Import Libraries

**Goal:** Fine-tune an LLM to predict execution times for CUDA kernel configurations

In [None]:
import pandas as pd
import numpy as np
import torch
import re
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import Dataset

# Report the library version and whether a CUDA device is visible, so the
# notebook output records which backend the later cells will run on.
has_gpu = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}", f"GPU available: {has_gpu}", sep="\n")
if has_gpu:
    # Device index 0 is the one reported here.
    print(f"GPU device: {torch.cuda.get_device_name(0)}")

## 2. Configuration

In [None]:
# --- Paths and model selection -------------------------------------------
CSV_FILE = "klaraptor_enriched_data.csv"  # Path to your enriched data
MODEL_NAME = "gpt2"  # or "gpt2-medium", "microsoft/phi-2", "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
OUTPUT_DIR = "cuda_exec_time_predictor_llm"

# --- Training hyper-parameters -------------------------------------------
NUM_EPOCHS = 10
BATCH_SIZE = 4
LEARNING_RATE = 5e-5
MAX_LENGTH = 512  # token length the prompts are padded/truncated to

# --- Optional execution-time filter --------------------------------------
# One of "fast" (<1ms), "medium" (1-100ms), "slow" (>100ms); None keeps all rows.
TIME_REGIME = None

# Echo the effective settings once so they are recorded with the run output.
print(
    "Configuration:\n"
    f"  Data file: {CSV_FILE}\n"
    f"  Model: {MODEL_NAME}\n"
    f"  Output directory: {OUTPUT_DIR}\n"
    f"  Time regime: {TIME_REGIME or 'All'}\n"
    f"  Epochs: {NUM_EPOCHS}\n"
    f"  Batch size: {BATCH_SIZE}"
)

## 3. Define Helper Functions

In [None]:
def create_prompt(row):
    """
    Render one measured configuration as a training example for the LLM.

    The text consists of a feature header (kernel characteristics plus the
    block configuration) followed by a target section holding the measured
    execution time and its natural logarithm.

    Args:
        row: Mapping-like record (dict or DataFrame row) providing the keys
            kernel_name, N, dimensionality, compute_intensity,
            has_shared_memory, global_reads, global_writes, arithmetic_ops,
            memory_ops, block_x, block_y and exec_time.

    Returns:
        The prompt text, ending with a newline. Example:

        ### Kernel: Convolution2D_kernel
        N: 1024
        dimensionality: 2D
        compute_intensity: 10.50
        has_shared_memory: True
        global_reads: 15
        global_writes: 5
        arithmetic_ops: 210
        memory_ops: 20
        Configuration: block_dims=(16, 16, 1), total_threads=256

        ### Predicted Execution Time:
        exec_time: 5.234567 ms
        log_time: 1.6553
    """
    # Derived quantities first: threads per block (the z dimension is fixed
    # to 1 in the rendered text) and the natural log of the measured time.
    thread_count = int(row['block_x'] * row['block_y'])
    ln_time = np.log(row['exec_time'])

    feature_lines = [
        f"### Kernel: {row['kernel_name']}",
        f"N: {int(row['N'])}",
        f"dimensionality: {int(row['dimensionality'])}D",
        f"compute_intensity: {row['compute_intensity']:.2f}",
        f"has_shared_memory: {row['has_shared_memory']}",
        f"global_reads: {int(row['global_reads'])}",
        f"global_writes: {int(row['global_writes'])}",
        f"arithmetic_ops: {int(row['arithmetic_ops'])}",
        f"memory_ops: {int(row['memory_ops'])}",
        f"Configuration: block_dims=({int(row['block_x'])}, {int(row['block_y'])}, 1), total_threads={thread_count}",
    ]
    target_lines = [
        "### Predicted Execution Time:",
        f"exec_time: {row['exec_time']:.6f} ms",
        f"log_time: {ln_time:.4f}",
    ]

    # A blank line separates the features from the target section.
    return "\n".join(feature_lines) + "\n\n" + "\n".join(target_lines) + "\n"

# Smoke-test create_prompt on a hand-written record that carries every key
# the function reads.
test_row = dict(
    kernel_name='Convolution2D_kernel',
    N=1024,
    dimensionality=2,
    compute_intensity=10.5,
    has_shared_memory=True,
    global_reads=15,
    global_writes=5,
    arithmetic_ops=210,
    memory_ops=20,
    block_x=16,
    block_y=16,
    exec_time=5.234567,
)

print("Example prompt:")
example_prompt = create_prompt(test_row)
print(example_prompt)

## 4. Load and Prepare Data

In [None]:
# Load the CSV
df = pd.read_csv(CSV_FILE)

# Handle column name variations. A short name is only renamed when the
# canonical column is absent, so a file that carries both spellings never
# ends up with two columns sharing one label.
column_mapping = {
    'kernel': 'kernel_name',
    'bx': 'block_x',
    'by': 'block_y',
    'bz': 'block_z'
}
df = df.rename(columns={
    short: canonical
    for short, canonical in column_mapping.items()
    if short in df.columns and canonical not in df.columns
})

# Fail early with one clear message instead of a KeyError further down.
# block_x has no meaningful default: a block dimension of 0 would put
# "total_threads=0" into every training prompt.
missing_columns = [
    col for col in ('kernel_name', 'N', 'exec_time', 'block_x')
    if col not in df.columns
]
if missing_columns:
    raise ValueError(
        f"{CSV_FILE} is missing required column(s): {', '.join(missing_columns)}"
    )

# A missing y dimension is treated as 1 (not 0) so that total_threads equals
# block_x, matching the (block_x, 1) shape used for 1D candidate configs.
if 'block_y' not in df.columns:
    df['block_y'] = 1

# The prompts contain log(exec_time) and the evaluation divides by exec_time,
# so rows with a missing or non-positive time cannot be used.
positive_time = df['exec_time'] > 0
if not positive_time.all():
    print(f"Dropping {int((~positive_time).sum()):,} rows with missing or non-positive exec_time")
    df = df[positive_time]

# Apply time filtering if specified
if TIME_REGIME == "fast":
    df = df[df['exec_time'] < 1.0]
    print("Filtering for FAST regime: exec_time < 1ms")
elif TIME_REGIME == "medium":
    df = df[(df['exec_time'] >= 1.0) & (df['exec_time'] < 100.0)]
    print("Filtering for MEDIUM regime: 1ms ≤ exec_time < 100ms")
elif TIME_REGIME == "slow":
    df = df[df['exec_time'] >= 100.0]
    print("Filtering for SLOW regime: exec_time ≥ 100ms")
elif TIME_REGIME:
    # A typo such as "Fast" used to fall through silently and train on all data.
    raise ValueError(
        f"Unknown TIME_REGIME {TIME_REGIME!r}; expected 'fast', 'medium', 'slow' or None"
    )

if df.empty:
    print("⚠ Warning: no configurations left after filtering")

print(f"\nLoaded {len(df):,} total configurations")
print(f"Kernels: {df['kernel_name'].nunique()}")
print(f"Data sizes (N): {sorted(df['N'].unique())}")
print(f"Exec time range: {df['exec_time'].min():.6f} - {df['exec_time'].max():.6f} ms")
print(f"Log exec time range: {np.log(df['exec_time'].min()):.2f} - {np.log(df['exec_time'].max()):.2f}")

if 'gpu' in df.columns:
    # str() keeps the join working for non-string labels; missing values are
    # skipped, which matches what nunique() counts.
    gpu_names = ', '.join(map(str, df['gpu'].dropna().unique()))
    print(f"GPUs: {df['gpu'].nunique()} ({gpu_names})")

# Show sample
df.head()

## 5. Sample Data for Training

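Fine-tuning on every measured configuration is slow, so when the dataset has more than `MAX_SAMPLES` rows we train on a subset. The subset draws up to the same number of configurations from each kernel, so every kernel stays represented.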

In [None]:
# For LLM training we cap the amount of data to keep training time reasonable.
# Every kernel contributes at most the same number of rows.
MAX_SAMPLES = 5000  # Adjust based on your GPU/CPU capacity

if len(df) > MAX_SAMPLES:
    print(f"Sampling {MAX_SAMPLES} examples from {len(df):,} total...")

    # Equal per-kernel quota, computed once instead of once per group.
    # The quota is at least 1 so that a dataset with more kernels than
    # MAX_SAMPLES still yields one row per kernel rather than nothing.
    n_kernels = max(df['kernel_name'].nunique(), 1)
    per_kernel_cap = max(1, MAX_SAMPLES // n_kernels)

    # Iterating the groups (instead of groupby.apply) keeps the kernel_name
    # column in the result regardless of how the installed pandas treats
    # grouping columns inside apply. Kernels with fewer rows than the quota
    # contribute all of their rows, so the total may be below MAX_SAMPLES.
    sampled_groups = [
        group.sample(n=min(len(group), per_kernel_cap), random_state=42)
        for _, group in df.groupby('kernel_name')
    ]
    if sampled_groups:
        df_sampled = pd.concat(sampled_groups).reset_index(drop=True)
    else:
        # No usable kernel names at all: keep the schema, return no rows.
        df_sampled = df.iloc[:0].copy()

    print(f"✓ Sampled {len(df_sampled):,} examples")
else:
    df_sampled = df.copy()
    print(f"Using all {len(df_sampled):,} examples")

# Distribution stats
sampled_kernel_count = df_sampled['kernel_name'].nunique()
print("\nSampled data statistics:")
print(f"  Kernels: {sampled_kernel_count}")
# max(..., 1) avoids a ZeroDivisionError when the sample is empty.
print(f"  Configs per kernel: {len(df_sampled) / max(sampled_kernel_count, 1):.1f} avg")
print(f"  Exec time range: {df_sampled['exec_time'].min():.6f} - {df_sampled['exec_time'].max():.6f} ms")

# Show distribution by kernel
df_sampled.groupby('kernel_name').agg({
    'exec_time': ['count', 'min', 'max', 'mean']
}).round(4).head(10)

## 6. Create Training Prompts

In [None]:
# Render each sampled row into its prompt text and store it in a 'text' column
df_sampled['text'] = df_sampled.apply(create_prompt, axis=1)

# Report the number of examples, then show the first prompt between two rules
print(
    f"Created {len(df_sampled)} training examples",
    "\nExample training prompt:",
    "=" * 70,
    sep="\n",
)
print(df_sampled['text'].iloc[0])
print("=" * 70)

## 7. Load Model and Tokenizer

In [None]:
print(f"Loading model: {MODEL_NAME}")

# Tokenizer for the chosen checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    # Some tokenizers (GPT-2 among them) define no pad token; reuse EOS so
    # that padded batches can be built.
    tokenizer.pad_token = tokenizer.eos_token

# Model weights. No dtype is passed, so the library default precision applies.
# Automatic device placement is requested only when CUDA is available.
if torch.cuda.is_available():
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")
else:
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map=None)

print(
    f"✓ Model loaded: {MODEL_NAME}\n"
    f"  Parameters: {model.num_parameters():,}\n"
    f"  Device: {next(model.parameters()).device}"
)

## 8. Prepare Dataset for Training

In [None]:
# Create HuggingFace dataset from the prompts built for the sampled rows.
# (df_sampled is the frame that received the 'text' column; the previously
# referenced df_optimal is never defined in this notebook.)
dataset = Dataset.from_pandas(df_sampled[['text']])

def tokenize_function(examples):
    """Encode a batch of prompt texts and attach causal-LM labels.

    Args:
        examples: Batch mapping whose 'text' entry holds the prompt strings.

    Returns:
        The tokenizer output, padded and truncated to MAX_LENGTH tokens,
        with an extra 'labels' entry that is a copy of 'input_ids'.
    """
    encoded = tokenizer(
        examples['text'],
        padding='max_length',
        max_length=MAX_LENGTH,
        truncation=True,
    )
    # Causal language modelling: the targets are the input tokens themselves
    encoded['labels'] = encoded['input_ids'].copy()
    return encoded

# Tokenize every prompt in batches; the raw columns are dropped so only the
# tokenizer outputs and labels remain.
print("Tokenizing dataset...")
tokenized_dataset = dataset.map(
    tokenize_function,
    remove_columns=dataset.column_names,
    batched=True,
)

# Hold out 10% of the examples for validation (fixed seed for repeatability)
split_dataset = tokenized_dataset.train_test_split(seed=42, test_size=0.1)

print(
    "✓ Dataset prepared\n"
    f"  Training samples: {len(split_dataset['train'])}\n"
    f"  Validation samples: {len(split_dataset['test'])}"
)

## 9. Configure Training

In [None]:
# Training arguments, grouped by concern
training_args = TrainingArguments(
    # Checkpoint location and retention
    output_dir=OUTPUT_DIR,
    save_steps=100,
    save_total_limit=2,
    push_to_hub=False,
    # Optimisation schedule
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=4,
    # Evaluation and logging cadence
    eval_strategy="steps",
    eval_steps=100,
    logging_steps=50,
    load_best_model_at_end=True,
    report_to="none",  # Disable wandb/tensorboard
    # Precision: fp16 stays off to avoid gradient scaler issues; bfloat16 is
    # enabled only when CUDA is present and reports bf16 support.
    fp16=False,
    bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
)

# Collator for causal language modeling (mlm=False turns masked-LM mode off)
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Trainer wiring: model, arguments, both dataset splits and the collator
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=split_dataset['train'],
    eval_dataset=split_dataset['test'],
)

print("✓ Trainer configured")

## 10. Train the Model

**Note:** This will take some time depending on your hardware.
- CPU: 1-2 hours
- GPU: 10-30 minutes

In [None]:
# Announce the start, run the fine-tuning loop, then confirm completion.
print("Starting training...", "=" * 60, sep="\n")

trainer.train()

print("=" * 60, "✓ Training complete!", sep="\n")

## 11. Save the Model

In [None]:
from pathlib import Path

# Save final model
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# Report the weight files that were actually written instead of assuming a
# fixed name: depending on the library version and model size the weights are
# stored as safetensors or .bin, possibly split into several shards.
saved_weight_files = sorted(
    entry.name
    for entry in Path(OUTPUT_DIR).iterdir()
    if entry.is_file()
    and entry.suffix in {".safetensors", ".bin"}
    and not entry.name.startswith("training_args")
)

print(f"✓ Model saved to: {OUTPUT_DIR}")
if saved_weight_files:
    for weight_file in saved_weight_files:
        print(f"  - Model weights: {OUTPUT_DIR}/{weight_file}")
else:
    print(f"  - Model weights: no weight file found in {OUTPUT_DIR}")
print(f"  - Tokenizer: {OUTPUT_DIR}/tokenizer_config.json")

## 12. Test Prediction Function

In [None]:
# Numeric literal: digits with an optional fraction and optional exponent.
# Unlike a bare ``[\d.]+`` it cannot capture fragments such as "." or
# "1.2.3" that float() would reject.
_NUMBER_PATTERN = r'(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?'
_EXEC_TIME_RE = re.compile(r'exec_time:\s*(' + _NUMBER_PATTERN + r')\s*ms')
_LOG_TIME_RE = re.compile(r'log_time:\s*([-+]?' + _NUMBER_PATTERN + r')')


def _parse_predicted_time(text):
    """Return a positive, finite execution time in ms parsed from *text*, or None."""
    found = _EXEC_TIME_RE.search(text)
    if found:
        value = float(found.group(1))
        if np.isfinite(value) and value > 0:
            return value

    # A time too small for six decimals is rendered as "0.000000 ms" in the
    # training prompts, so fall back to the log-scale line in that case.
    found = _LOG_TIME_RE.search(text)
    if found:
        with np.errstate(over='ignore'):
            value = float(np.exp(float(found.group(1))))
        if np.isfinite(value) and value > 0:
            return value

    return None


def predict_exec_time_with_llm(model, tokenizer, kernel_info, block_x, block_y):
    """
    Use fine-tuned LLM to predict execution time for a given configuration

    The prompt mirrors the feature part of the training prompts (integer
    fields are cast with int() exactly as during training) and stops right
    after the "### Predicted Execution Time:" header so that the model
    completes the target section.

    Args:
        model: causal language model used for generation
        tokenizer: tokenizer matching the model
        kernel_info: dict with kernel characteristics (kernel_name, N,
            dimensionality, compute_intensity, has_shared_memory,
            global_reads, global_writes, arithmetic_ops, memory_ops)
        block_x, block_y: block dimensions to predict for

    Returns:
        (predicted_exec_time, predicted_log_time, generated_text)
        generated_text is the full decoded sequence (prompt + completion).
        If neither "exec_time" nor "log_time" can be parsed from the
        completion, a warning is printed and a rough size-based estimate
        is returned instead.
    """
    block_x, block_y = int(block_x), int(block_y)
    total_threads = block_x * block_y

    # Create input prompt (without the output part)
    prompt = f"""### Kernel: {kernel_info['kernel_name']}
N: {int(kernel_info['N'])}
dimensionality: {int(kernel_info['dimensionality'])}D
compute_intensity: {kernel_info['compute_intensity']:.2f}
has_shared_memory: {kernel_info['has_shared_memory']}
global_reads: {int(kernel_info['global_reads'])}
global_writes: {int(kernel_info['global_writes'])}
arithmetic_ops: {int(kernel_info['arithmetic_ops'])}
memory_ops: {int(kernel_info['memory_ops'])}
Configuration: block_dims=({block_x}, {block_y}, 1), total_threads={total_threads}

### Predicted Execution Time:
"""

    # Tokenize and move the tensors to the device the model is on
    inputs = tokenizer(prompt, return_tensors="pt")
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    prompt_length = inputs['input_ids'].shape[1]

    # Generate in eval mode: a model that has just been trained is still in
    # training mode, which leaves dropout active during generation. The
    # previous mode is restored afterwards so callers see no side effect.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=50,
                do_sample=False,  # Greedy decoding: same prompt, same prediction
                pad_token_id=tokenizer.eos_token_id
            )
    finally:
        if was_training:
            model.train()

    # Decode the whole sequence for the caller, and the completion alone for
    # parsing so that nothing inside the prompt can be mistaken for the answer.
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    exec_time = _parse_predicted_time(completion)
    if exec_time is None:
        print("⚠ Warning: Could not parse LLM output, using fallback estimate")
        # Rough estimate based on problem size
        exec_time = (kernel_info['N'] ** kernel_info['dimensionality']) / 1e6

    log_time = np.log(exec_time)

    return exec_time, log_time, generated_text


def find_optimal_config_with_llm(model, tokenizer, kernel_info, candidate_configs=None):
    """
    Find optimal block configuration by testing multiple configs with LLM

    Args:
        model: causal language model passed through to the predictor
        tokenizer: tokenizer matching the model
        kernel_info: dict with kernel characteristics; its 'dimensionality'
            entry (default 1) selects the built-in candidate list
        candidate_configs: optional iterable of (block_x, block_y) pairs.
            When None, a built-in list is used: x-only shapes for 1D
            kernels, a mix of rectangular shapes otherwise.

    Returns:
        (best_block_x, best_block_y, predicted_exec_time, all_predictions)
        all_predictions is a list of dicts with the keys block_x, block_y,
        predicted_time and log_time, in candidate order.

    Raises:
        ValueError: if candidate_configs is given but contains no pairs.
    """
    # Generate candidate configs
    if candidate_configs is None:
        dim = kernel_info.get('dimensionality', 1)
        if dim == 1:
            candidate_configs = [(32, 1), (64, 1), (128, 1), (256, 1), (512, 1), (1024, 1)]
        else:
            candidate_configs = [
                (8, 8), (16, 8), (16, 16), (32, 8), (32, 16), (32, 32),
                (64, 4), (64, 8), (64, 16), (128, 4), (128, 8), (256, 4)
            ]
    else:
        # Materialise once so generators work and emptiness can be checked.
        candidate_configs = list(candidate_configs)

    if not candidate_configs:
        raise ValueError(
            "candidate_configs must contain at least one (block_x, block_y) pair"
        )

    # Predict for all configs
    predictions = []
    for block_x, block_y in candidate_configs:
        exec_time, log_time, _ = predict_exec_time_with_llm(
            model, tokenizer, kernel_info, block_x, block_y
        )
        predictions.append({
            'block_x': block_x,
            'block_y': block_y,
            'predicted_time': exec_time,
            'log_time': log_time
        })

    def _ranking_key(prediction):
        # NaN compares false against everything and would make min() depend
        # on candidate order; rank any non-finite prediction last instead.
        value = prediction['predicted_time']
        return value if np.isfinite(value) else float('inf')

    # Find best
    best = min(predictions, key=_ranking_key)

    return best['block_x'], best['block_y'], best['predicted_time'], predictions

# Cell-level confirmation that the definitions above executed without error
print("✓ Prediction functions defined")

## 13. Test Predictions

In [None]:
# Two hand-written kernel descriptions used to exercise the config search.
# Each carries the feature keys the prediction prompt reads.
test_cases = [
    dict(
        kernel_name='Convolution2D_kernel',
        N=1024,
        dimensionality=2,
        compute_intensity=10.5,
        has_shared_memory=True,
        global_reads=15,
        global_writes=5,
        arithmetic_ops=210,
        memory_ops=20,
    ),
    dict(
        kernel_name='mm2_kernel1',
        N=2048,
        dimensionality=2,
        compute_intensity=15.2,
        has_shared_memory=True,
        global_reads=20,
        global_writes=10,
        arithmetic_ops=304,
        memory_ops=30,
    ),
]

print("TEST PREDICTIONS - FINDING OPTIMAL CONFIGS")
print("=" * 80)

for test_kernel in test_cases:
    print(f"\nKernel: {test_kernel['kernel_name']}, N={test_kernel['N']}")
    print("-" * 80)

    # Search the default candidate list for this kernel
    bx, by, pred_time, all_preds = find_optimal_config_with_llm(model, tokenizer, test_kernel)

    print("✓ Optimal configuration:")
    print(f"    block_dims: ({bx}, {by}, 1)")
    print(f"    predicted_time: {pred_time:.6f} ms")

    # List the five candidates with the lowest predicted time
    print("\n  Top 5 configurations:")
    ranked_preds = sorted(all_preds, key=lambda entry: entry['predicted_time'])
    for pred in ranked_preds[:5]:
        print(f"    ({pred['block_x']:4d}, {pred['block_y']:4d}): {pred['predicted_time']:.6f} ms")

print("\n" + "=" * 80)

## 14. Evaluate on Validation Set (Optional)

In [None]:
# Compare predicted vs actual execution times on a small random sample.
# NOTE(review): the rows are drawn from df_sampled rather than from
# split_dataset['test'], so they may overlap with rows used for training;
# treat the metrics below as optimistic until that is confirmed.
val_samples = df_sampled.sample(min(20, len(df_sampled)), random_state=42)

predictions_list = []
actuals_list = []

print("Validation Set Predictions")
print("=" * 90)
print(f"{'Kernel':<25} {'Config':<15} {'Predicted':<15} {'Actual':<15} {'Error %'}")
print("=" * 90)

for _, row in val_samples.iterrows():
    kernel_info = {
        'kernel_name': row['kernel_name'],
        'N': int(row['N']),
        'dimensionality': int(row['dimensionality']),
        'compute_intensity': float(row['compute_intensity']),
        # Passed through unchanged so it is rendered exactly as in the
        # training prompts; bool() would turn 0/1 into False/True and the
        # string "False" into True.
        'has_shared_memory': row['has_shared_memory'],
        'global_reads': int(row['global_reads']),
        'global_writes': int(row['global_writes']),
        'arithmetic_ops': int(row['arithmetic_ops']),
        'memory_ops': int(row['memory_ops'])
    }

    bx = int(row['block_x'])
    by = int(row['block_y'])

    pred_time, _, _ = predict_exec_time_with_llm(model, tokenizer, kernel_info, bx, by)
    actual_time = row['exec_time']

    # A zero actual time has no defined relative error.
    if actual_time:
        error_pct = abs(pred_time - actual_time) / abs(actual_time) * 100
    else:
        error_pct = float('nan')

    predictions_list.append(pred_time)
    actuals_list.append(actual_time)

    print(f"{str(row['kernel_name'])[:25]:<25} ({bx:3d},{by:3d})      "
          f"{pred_time:>10.6f} ms  {actual_time:>10.6f} ms  {error_pct:>6.1f}%")

# Calculate metrics
predictions_arr = np.array(predictions_list, dtype=float)
actuals_arr = np.array(actuals_list, dtype=float)

# Every metric defaults to NaN so the report below still prints when a
# metric is undefined (no samples, zero actual times, constant actuals).
mae = mape = r2 = within_10 = within_20 = float('nan')

if actuals_arr.size:
    abs_errors = np.abs(predictions_arr - actuals_arr)
    mae = abs_errors.mean()

    # Relative errors are computed once and only where the actual time is non-zero.
    has_actual = actuals_arr != 0
    rel_errors = abs_errors[has_actual] / np.abs(actuals_arr[has_actual])
    if rel_errors.size:
        mape = rel_errors.mean() * 100
        within_10 = (rel_errors < 0.1).mean() * 100
        within_20 = (rel_errors < 0.2).mean() * 100

    # R² is undefined when all actual values are identical (zero variance).
    total_ss = np.sum((actuals_arr - actuals_arr.mean())**2)
    if total_ss > 0:
        r2 = 1 - np.sum((actuals_arr - predictions_arr)**2) / total_ss

print("=" * 90)
print("Metrics:")
print(f"  MAE: {mae:.6f} ms")
print(f"  MAPE: {mape:.2f}%")
print(f"  R²: {r2:.4f}")
print(f"  Within 10%: {within_10:.1f}%")
print(f"  Within 20%: {within_20:.1f}%")

## 15. Summary

You've successfully:
1. ✓ Loaded the KLARAPTOR dataset with execution time data
2. ✓ Sampled diverse configurations for training
3. ✓ Created prompts that map (kernel features + block config) → execution time
4. ✓ Fine-tuned an LLM to predict execution times
5. ✓ Tested predictions and found optimal block configurations

**Key Differences from Direct Block Prediction:**
- **Input:** Kernel features + proposed block configuration
- **Output:** Predicted execution time (in ms and log scale)
- **Optimization:** Test multiple configs, choose one with lowest predicted time
- **Advantage:** More data (all 15K+ configs vs. only ~200 optimal ones)

**Next Steps:**
- Try different LLM sizes (gpt2-medium for better accuracy)
- Experiment with time filtering (fast/medium/slow regimes)
- Compare with neural network ensemble approach
- Integrate with `grid_block_suggester.py` for end-to-end optimization

**Comparison: LLM vs Neural Network**
- **LLM:** More flexible, can handle text-based reasoning, slower inference
- **Neural Network:** Faster inference, better for numerical prediction, easier to optimize
- **Recommendation:** Use neural network ensemble for production (better performance)