## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import torch
import re
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import Dataset

# Report the PyTorch build and, when CUDA is usable, the name of device 0
cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"GPU available: {cuda_ready}")
if cuda_ready:
    print(f"GPU device: {torch.cuda.get_device_name(0)}")

## 2. Configuration

In [None]:
# --- Data, model and output locations ---
CSV_FILE = "klaraptor_enriched_data.csv"  # Path to your enriched data
MODEL_NAME = "gpt2"  # or "gpt2-medium", "microsoft/phi-2", "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
OUTPUT_DIR = "cuda_block_predictor_llm"
INCLUDE_GPU = False  # True: pick the best config per GPU; False: one best config across GPUs

# --- Training hyperparameters ---
NUM_EPOCHS = 10
BATCH_SIZE = 4
LEARNING_RATE = 5e-5
MAX_LENGTH = 512  # token limit applied when the prompts are tokenized

# Echo the main settings, one per line
print(
    "Configuration:",
    f"  Data file: {CSV_FILE}",
    f"  Model: {MODEL_NAME}",
    f"  Output directory: {OUTPUT_DIR}",
    f"  Epochs: {NUM_EPOCHS}",
    f"  Batch size: {BATCH_SIZE}",
    sep="\n",
)

## 3. Define Helper Functions

In [None]:
def create_prompt(row):
    """
    Render one benchmark row as a complete training example for the LLM.

    `row` is any object indexable by key (a dict or a pandas row) that
    provides: kernel_name, N, dimensionality, compute_intensity,
    has_shared_memory, global_reads, global_writes, arithmetic_ops,
    memory_ops, block_x and block_y.

    Layout of the returned text (every line after the first starts with four
    spaces, and the text ends with a newline followed by four spaces):

        ### Kernel: vectorAdd
            N: 4096
            dimensionality: 1D
            compute_intensity: 12.50
            ...
            memory_ops: 15
        <empty line>
            ### Optimal Configuration:
            block_dims: (256, 1, 1)
            total_threads: 256

    The z block dimension is always written as 1, and total_threads is
    block_x * block_y truncated to an integer.
    """
    pad = "    "  # leading indent carried by every line after the header
    pieces = [
        f"### Kernel: {row['kernel_name']}",
        f"{pad}N: {int(row['N'])}",
        f"{pad}dimensionality: {int(row['dimensionality'])}D",
        f"{pad}compute_intensity: {row['compute_intensity']:.2f}",
        f"{pad}has_shared_memory: {row['has_shared_memory']}",
        f"{pad}global_reads: {int(row['global_reads'])}",
        f"{pad}global_writes: {int(row['global_writes'])}",
        f"{pad}arithmetic_ops: {int(row['arithmetic_ops'])}",
        f"{pad}memory_ops: {int(row['memory_ops'])}",
        "",
        f"{pad}### Optimal Configuration:",
        f"{pad}block_dims: ({int(row['block_x'])}, {int(row['block_y'])}, 1)",
        f"{pad}total_threads: {int(row['block_x'] * row['block_y'])}",
        pad,  # the text ends with an indented, otherwise empty line
    ]
    return "\n".join(pieces)

# Smoke-test the prompt builder on a hand-written 1D kernel description
test_row = dict(
    kernel_name='vectorAdd',
    N=4096,
    dimensionality=1,
    compute_intensity=12.5,
    has_shared_memory=False,
    global_reads=10,
    global_writes=5,
    arithmetic_ops=125,
    memory_ops=15,
    block_x=256,
    block_y=1,
)

print("Example prompt:")
print(create_prompt(test_row))

## 4. Load and Prepare Data

In [None]:
# Load the benchmark results
df = pd.read_csv(CSV_FILE)

# Normalise short column names to the long names used by the later cells.
# A short name is renamed only when its long form is absent, so a CSV that
# carries both spellings does not end up with duplicate column labels.
column_mapping = {
    'kernel': 'kernel_name',
    'bx': 'block_x',
    'by': 'block_y',
    'bz': 'block_z'
}
df = df.rename(columns={
    short: full
    for short, full in column_mapping.items()
    if short in df.columns and full not in df.columns
})

# block_x cannot be defaulted: a placeholder of 0 would silently produce
# "0 threads" training targets, so fail early with a clear message instead.
if 'block_x' not in df.columns:
    raise ValueError(
        f"{CSV_FILE} has neither a 'block_x' nor a 'bx' column; "
        "cannot determine block configurations"
    )
# A missing block_y is treated as a y extent of 1 so that
# block_x * block_y still equals the number of threads per block.
if 'block_y' not in df.columns:
    df['block_y'] = 1

print(f"Loaded {len(df)} total configurations")
print(f"Kernels: {df['kernel_name'].nunique()}")
print(f"Data sizes (N): {sorted(df['N'].unique())}")
if 'gpu' in df.columns:
    # str() keeps the join working when the column holds NaN or non-string ids
    print(f"GPUs: {df['gpu'].nunique()} ({', '.join(map(str, df['gpu'].unique()))})")

# Show sample
df.head()

## 5. Find Optimal Configurations

In [None]:
# Grouping keys: kernel and problem size, plus the GPU when per-GPU
# optimisation is requested and the data actually has a 'gpu' column
group_cols = ['kernel_name', 'N'] + (
    ['gpu'] if INCLUDE_GPU and 'gpu' in df.columns else []
)

# For every group keep the row with the smallest execution time
print("Finding optimal configurations...")
optimal_rows = []

for _, candidates in df.groupby(group_cols):
    timings = candidates['exec_time']
    fastest = candidates.loc[timings.idxmin()].copy()

    # Context columns: gain of the winner over the group's mean time, and
    # how many configurations competed in this group
    fastest['speedup_vs_avg'] = timings.mean() / fastest['exec_time']
    fastest['configs_tested'] = len(candidates)

    optimal_rows.append(fastest)

df_optimal = pd.DataFrame(optimal_rows)

n_optimal = len(df_optimal)
print(f"✓ Found {n_optimal} optimal configurations")
print(f"  Configs per kernel: {n_optimal / df['kernel_name'].nunique():.1f} avg")
print(f"  Avg speedup vs random config: {df_optimal['speedup_vs_avg'].mean():.2f}x")

# Preview the first optimal configurations
df_optimal[['kernel_name', 'N', 'block_x', 'block_y', 'exec_time', 'speedup_vs_avg']].head(10)

## 6. Create Training Prompts

In [None]:
# Render every optimal row into its training text (one prompt per row)
df_optimal['text'] = df_optimal.apply(create_prompt, axis=1)

# Show the count and the first example between two separator rules
rule = "=" * 60
print(
    f"Created {len(df_optimal)} training examples",
    "\nExample training prompt:",
    rule,
    df_optimal['text'].iloc[0],
    rule,
    sep="\n",
)

## 7. Load Model and Tokenizer

In [None]:
print(f"Loading model: {MODEL_NAME}")

# Tokenizer first. When the checkpoint ships without a padding token
# (GPT-2 does), reuse the end-of-sequence token for padding.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# No dtype is requested here, so the weights load in the library's default
# precision. Automatic device placement is only asked for when CUDA is usable.
placement = "auto" if torch.cuda.is_available() else None
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map=placement)

print(f"✓ Model loaded: {MODEL_NAME}")
print(f"  Parameters: {model.num_parameters():,}")
# Device of the first parameter, reported as where the model lives
print(f"  Device: {next(model.parameters()).device}")

## 8. Prepare Dataset for Training

In [None]:
# Create HuggingFace dataset holding only the prompt text
dataset = Dataset.from_pandas(df_optimal[['text']])

def tokenize_function(examples):
    """Tokenize a batch of prompt texts for causal language modeling.

    Sequences are truncated to MAX_LENGTH tokens but deliberately left
    unpadded and without a `labels` column. The
    DataCollatorForLanguageModeling(mlm=False) used for training pads each
    batch to its own longest sequence and builds the labels from input_ids
    (ignoring padded positions in the loss). Padding every example to
    MAX_LENGTH here would therefore only add compute, and labels created
    here would be overwritten by the collator anyway.

    Args:
        examples: batch dict with a 'text' list, as passed by Dataset.map
            with batched=True.

    Returns:
        The tokenizer output for the batch (input_ids, attention_mask, ...).
    """
    return tokenizer(
        examples['text'],
        truncation=True,
        max_length=MAX_LENGTH
    )

# Tokenize dataset; drop the source columns so only model inputs remain
print("Tokenizing dataset...")
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names
)

# Split train/val (90/10), seeded for reproducibility
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

print(f"✓ Dataset prepared")
print(f"  Training samples: {len(split_dataset['train'])}")
print(f"  Validation samples: {len(split_dataset['test'])}")

## 9. Configure Training

In [None]:
# bfloat16 mixed precision is enabled only when a CUDA device reports support
bf16_ok = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # Optimisation schedule
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    # Batching: gradients are accumulated over 4 steps before each update
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=4,
    # Logging, evaluation and checkpointing cadence (in optimizer steps)
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    # Precision: bf16 when available, fp16 off to avoid gradient scaler issues
    bf16=bf16_ok,
    fp16=False,
    # No external services: no hub upload, no wandb/tensorboard reporting
    push_to_hub=False,
    report_to="none",
)

# Collator for causal language modeling (mlm=False means no token masking)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Wire model, arguments, both dataset splits and the collator together
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset['train'],
    eval_dataset=split_dataset['test'],
    data_collator=data_collator,
)

print("✓ Trainer configured")

## 10. Train the Model

**Note:** This will take some time depending on your hardware.
- CPU: 1-2 hours
- GPU: 10-30 minutes

In [None]:
banner = "=" * 60
print("Starting training...", banner, sep="\n")

# Run the fine-tuning loop of the configured trainer
trainer.train()

print(banner, "✓ Training complete!", sep="\n")

## 11. Save the Model

In [None]:
# Save the fine-tuned weights and the tokenizer side by side in OUTPUT_DIR
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"✓ Model saved to: {OUTPUT_DIR}")
# save_pretrained writes safetensors by default in the transformers releases
# that accept the `eval_strategy` argument used by this notebook, so the
# weight file is model.safetensors rather than pytorch_model.bin.
print(f"  - Model weights: {OUTPUT_DIR}/model.safetensors")
print(f"  - Tokenizer: {OUTPUT_DIR}/tokenizer_config.json")

## 12. Test Prediction Function

In [None]:
def predict_with_llm(model, tokenizer, kernel_info: dict):
    """
    Use the fine-tuned LLM to predict a block configuration.

    The prompt mirrors the feature part of the training text (same line
    layout, same four-space indent, integer fields normalised with int())
    and stops right after the "### Optimal Configuration:" header so the
    model completes the configuration.

    Args:
        model: causal language model; must provide `generate`, `parameters`,
            `training`, `eval` and `train`.
        tokenizer: tokenizer matching `model`.
        kernel_info: dict with keys kernel_name, N, dimensionality,
            compute_intensity, has_shared_memory, global_reads,
            global_writes, arithmetic_ops and memory_ops.

    Returns:
        (block_x, block_y, total_threads, generated_text) where
        total_threads is block_x * block_y and generated_text is the decoded
        prompt plus completion. When no "block_dims: (x, y, z)" pattern is
        found in the completion, a warning is printed and the default
        (256, 1, 256, generated_text) is returned.
    """
    # Create input prompt (without the output part). Integer fields go
    # through int() so that e.g. 4096.0 is rendered as "4096".
    prompt = f"""### Kernel: {kernel_info['kernel_name']}
    N: {int(kernel_info['N'])}
    dimensionality: {int(kernel_info['dimensionality'])}D
    compute_intensity: {kernel_info['compute_intensity']:.2f}
    has_shared_memory: {kernel_info['has_shared_memory']}
    global_reads: {int(kernel_info['global_reads'])}
    global_writes: {int(kernel_info['global_writes'])}
    arithmetic_ops: {int(kernel_info['arithmetic_ops'])}
    memory_ops: {int(kernel_info['memory_ops'])}

    ### Optimal Configuration:
    """

    # Tokenize and move the tensors to the device the model is on
    inputs = tokenizer(prompt, return_tensors="pt")
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    prompt_len = inputs["input_ids"].shape[1]

    # Generate in evaluation mode so dropout is disabled (the model is
    # typically still in training mode right after fine-tuning), then
    # restore whatever mode the caller had.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=50,
                temperature=0.1,  # Low temperature for more deterministic output
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )
    finally:
        model.train(was_training)

    # Full text (prompt + completion) is what callers receive; only the newly
    # generated tokens are parsed, so text inside the prompt (e.g. an unusual
    # kernel name) can never be mistaken for the prediction.
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    completion = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

    # Look for a pattern like "block_dims: (256, 1, 1)"; the z value is
    # matched but not returned.
    match = re.search(r'block_dims:\s*\((\d+),\s*(\d+),\s*(\d+)\)', completion)
    if match is None:
        # Fallback to default
        print("⚠ Warning: Could not parse LLM output, using default")
        return 256, 1, 256, generated_text

    block_x = int(match.group(1))
    block_y = int(match.group(2))
    return block_x, block_y, block_x * block_y, generated_text

print("✓ Prediction function defined")

## 13. Test Predictions

In [None]:
# Three hand-written kernels to try: a 2D matrix multiply, a large 1D vector
# add and a 2D convolution. Each tuple follows the order of _case_fields.
_case_fields = (
    'kernel_name', 'N', 'dimensionality', 'compute_intensity',
    'has_shared_memory', 'global_reads', 'global_writes',
    'arithmetic_ops', 'memory_ops',
)
test_cases = [
    dict(zip(_case_fields, values))
    for values in (
        ('matrixMul', 2048, 2, 15.2, True, 20, 10, 304, 30),
        ('vectorAdd', 1048576, 1, 3.5, False, 2, 1, 7, 3),
        ('Convolution2D_kernel', 4096, 2, 8.3, True, 15, 5, 166, 20),
    )
]

heavy_rule = "=" * 70
light_rule = "-" * 70
marker = "### Optimal Configuration:"

print("TEST PREDICTIONS")
print(heavy_rule)

for test_kernel in test_cases:
    print(f"\nKernel: {test_kernel['kernel_name']}, N={test_kernel['N']}")
    print(light_rule)

    bx, by, total, full_output = predict_with_llm(model, tokenizer, test_kernel)

    print("Predicted configuration:")
    print(f"  block_dims: ({bx}, {by}, 1)")
    print(f"  total_threads: {total}")

    # Show at most 200 characters of what follows the last marker
    # (the whole text when the marker is absent)
    _, _, tail = full_output.rpartition(marker)
    output_part = tail[:200]
    print("\nGenerated output:")
    print(output_part.strip())

print("\n" + heavy_rule)

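## 14. Evaluate on Validation Set (Optional)

**Note:** the examples below are sampled from all optimal configurations, including those used for training, so the reported accuracy is an optimistic sanity check rather than a true held-out score.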

In [None]:
# Sample up to 10 optimal configurations and compare predictions with them.
# NOTE: the sample is drawn from df_optimal as a whole, while the train/val
# split was made on the tokenized dataset, so rows seen during training can
# appear here. Treat the accuracy as a sanity check, not a held-out score.
val_samples = df_optimal.sample(min(10, len(df_optimal)), random_state=42)

correct = 0
total = 0

print("Validation Set Predictions")
print("=" * 80)
print(f"{'Kernel':<20} {'N':<8} {'Predicted':<20} {'Actual':<20} {'Match'}")
print("=" * 80)

for _, row in val_samples.iterrows():
    kernel_info = {
        'kernel_name': row['kernel_name'],
        'N': int(row['N']),
        'dimensionality': int(row['dimensionality']),
        'compute_intensity': float(row['compute_intensity']),
        # Passed through unconverted so it is rendered exactly as in the
        # training prompt; bool() would turn 0/1 or string values into
        # "True"/"False" text that the model never saw for this field.
        'has_shared_memory': row['has_shared_memory'],
        'global_reads': int(row['global_reads']),
        'global_writes': int(row['global_writes']),
        'arithmetic_ops': int(row['arithmetic_ops']),
        'memory_ops': int(row['memory_ops'])
    }

    bx_pred, by_pred, total_pred, _ = predict_with_llm(model, tokenizer, kernel_info)
    bx_actual = int(row['block_x'])
    by_actual = int(row['block_y'])

    # A prediction counts only when both block dimensions match exactly
    match = "✓" if (bx_pred == bx_actual and by_pred == by_actual) else "✗"
    if match == "✓":
        correct += 1
    total += 1

    print(f"{row['kernel_name'][:20]:<20} {int(row['N']):<8} "
          f"({bx_pred:4d}, {by_pred:4d})     "
          f"({bx_actual:4d}, {by_actual:4d})     {match}")

print("=" * 80)
if total:
    print(f"Accuracy: {correct}/{total} = {100*correct/total:.1f}%")
else:
    # Nothing was sampled (df_optimal is empty); avoid dividing by zero
    print("Accuracy: n/a (no samples to evaluate)")

## 15. Summary

You've successfully:
1. ✓ Loaded and prepared the KLARAPTOR dataset
2. ✓ Found optimal configurations for each kernel/size combination
3. ✓ Created training prompts in text format
4. ✓ Fine-tuned an LLM to predict block configurations
5. ✓ Tested predictions on new kernel configurations

**Next Steps:**
- Try different model sizes (gpt2-medium, phi-2, TinyLlama)
- Experiment with training parameters (epochs, learning rate)
- Add more features to the prompt (GPU architecture, compute capability)
- Integrate with your grid_block_suggester.py for inference