## 1. Import Libraries

**Goal:** Fine-tune an LLM to predict execution times for CUDA kernel configurations

In [None]:
import pandas as pd
import numpy as np
import torch
import re
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import Dataset

# Report the runtime environment: PyTorch build and, when present, the first CUDA device.
cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"GPU available: {cuda_ready}")
if cuda_ready:
    print(f"GPU device: {torch.cuda.get_device_name(0)}")

PyTorch version: 2.8.0+cu126
GPU available: True
GPU device: Tesla T4


## 2. Configuration

In [None]:
# ---- Inputs / outputs -------------------------------------------------------
CSV_FILE = "klaraptor_enriched_data.csv"  # Path to your enriched data
MODEL_NAME = "Qwen/Qwen1.5-0.5B"  # or "gpt2", "microsoft/phi-2", "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
OUTPUT_DIR = "cuda_exec_time_predictor_llm"

# ---- Training hyper-parameters ----------------------------------------------
NUM_EPOCHS = 3
BATCH_SIZE = 1
LEARNING_RATE = 5e-5
MAX_LENGTH = 256  # token budget per training example (prompt + answer)

# ---- Optional exec-time filter ----------------------------------------------
# "fast" (<1ms), "medium" (1-100ms), "slow" (>100ms), or None to keep every row
TIME_REGIME = None

# Echo the settings so the rendered notebook records what was run.
config_report = [
    "Configuration:",
    f"  Data file: {CSV_FILE}",
    f"  Model: {MODEL_NAME}",
    f"  Output directory: {OUTPUT_DIR}",
    f"  Time regime: {TIME_REGIME if TIME_REGIME else 'All'}",
    f"  Epochs: {NUM_EPOCHS}",
    f"  Batch size: {BATCH_SIZE}",
]
print("\n".join(config_report))

Configuration:
  Data file: klaraptor_enriched_data.csv
  Model: Qwen/Qwen1.5-0.5B
  Output directory: cuda_exec_time_predictor_llm
  Time regime: All
  Epochs: 3
  Batch size: 1


## 3. Define Helper Functions

In [None]:
def make_prediction_prompt(kernel_meta: dict, bx: int, by: int, bz: int) -> str:
    """
    Render the text prompt for one kernel / block-shape pair.

    The prompt has three sections separated by blank lines: the kernel's
    static characteristics, the block dimensions under test, and the
    instruction asking for a single number followed by ' ms'
    (e.g. '12.345 ms'). The returned string ends with a newline so an answer
    can be appended directly after it.

    Args:
        kernel_meta: mapping of kernel characteristics; any missing key falls
            back to the default used below.
        bx, by, bz: block dimensions to describe in the prompt.

    Returns:
        The prompt text, without an answer.
    """
    lookup = kernel_meta.get

    def yes_no(flag_key: str) -> str:
        # A missing flag is treated the same as a falsy one.
        return 'yes' if lookup(flag_key, False) else 'no'

    kernel_section = (
        f"Kernel: {lookup('kernel_name','<kernel>')}\n"
        f"N: {lookup('N', 0)}\n"
        f"Dimensionality: {lookup('dimensionality',1)}D\n"
        f"Compute intensity: {lookup('compute_intensity', 1.0):.2f} FLOPs/byte\n"
        f"Shared memory: {yes_no('has_shared_memory')}\n"
        f"Global reads: {lookup('global_reads', 0)}\n"
        f"Global writes: {lookup('global_writes', 0)}\n"
        f"Arithmetic ops: {lookup('arithmetic_ops', 0)}\n"
        f"Memory ops: {lookup('memory_ops', 0)}\n"
        f"Uses syncthreads: {yes_no('uses_syncthreads')}\n"
    )
    config_section = f"Configuration: block_dims=({bx}, {by}, {bz})\n"
    instruction_section = (
        "### Predicted Execution Time (in milliseconds):\n"
        "Return exactly one floating point number followed by ' ms' (example: '12.345 ms').\n"
        "Do NOT include any extra words, punctuation, or commentary — only the number and ' ms'.\n"
    )

    # Each section already ends in a newline, so joining on "\n" yields the
    # blank separator lines between sections.
    return "\n".join((kernel_section, config_section, instruction_section))


def create_training_prompt(row):
    """
    Build one supervised training example from a dataset row.

    The kernel characteristics and block dimensions are read from ``row``
    (anything indexable by column name), coerced to plain Python types, and
    rendered with ``make_prediction_prompt``. The measured ``exec_time`` is
    then appended as the answer, formatted with three decimals and ' ms'.

    Returns:
        The prompt text followed by the answer line.
    """
    # Column -> Python type used to normalise the value before formatting.
    feature_casts = {
        'N': int,
        'dimensionality': int,
        'compute_intensity': float,
        'has_shared_memory': bool,
        'global_reads': int,
        'global_writes': int,
        'arithmetic_ops': int,
        'memory_ops': int,
        'uses_syncthreads': bool,
    }

    kernel_meta = {'kernel_name': row['kernel_name']}
    for column, cast in feature_casts.items():
        kernel_meta[column] = cast(row[column])

    block_dims = {axis: int(row[axis]) for axis in ('bx', 'by', 'bz')}
    question = make_prediction_prompt(kernel_meta=kernel_meta, **block_dims)

    # The target the model should learn to emit after the prompt.
    answer = f"{row['exec_time']:.3f} ms\n"

    return question + answer


# Smoke-test the prompt builder on a hand-written row
test_row = dict(
    kernel_name='Convolution2D_kernel',
    N=1024,
    dimensionality=2,
    compute_intensity=10.5,
    has_shared_memory=True,
    global_reads=15,
    global_writes=5,
    arithmetic_ops=210,
    memory_ops=20,
    uses_syncthreads=False,
    bx=16,
    by=16,
    bz=1,
    exec_time=5.234567,
)

separator = "=" * 70
print("Example training prompt:")
print(separator)
print(create_training_prompt(test_row))
print(separator)

Example training prompt:
Kernel: Convolution2D_kernel
N: 1024
Dimensionality: 2D
Compute intensity: 10.50 FLOPs/byte
Shared memory: yes
Global reads: 15
Global writes: 5
Arithmetic ops: 210
Memory ops: 20
Uses syncthreads: no

Configuration: block_dims=(16, 16, 1)

### Predicted Execution Time (in milliseconds):
Return exactly one floating point number followed by ' ms' (example: '12.345 ms').
Do NOT include any extra words, punctuation, or commentary — only the number and ' ms'.
5.235 ms



## 4. Load and Prepare Data

In [None]:
# Read the enriched measurements
df = pd.read_csv(CSV_FILE)

print(f"Dataset columns: {list(df.columns)}")

# The prompt helpers expect 'kernel_name'; the CSV calls that column 'kernel'
df = df.rename(columns={'kernel': 'kernel_name'})

# Optional exec-time filter: regime name -> (row predicate, message to print)
regime_rules = {
    "fast": (
        lambda t: t < 1.0,
        "Filtering for FAST regime: exec_time < 1ms",
    ),
    "medium": (
        lambda t: (t >= 1.0) & (t < 100.0),
        "Filtering for MEDIUM regime: 1ms ≤ exec_time < 100ms",
    ),
    "slow": (
        lambda t: t >= 100.0,
        "Filtering for SLOW regime: exec_time ≥ 100ms",
    ),
}
if TIME_REGIME in regime_rules:
    keep_rows, regime_message = regime_rules[TIME_REGIME]
    df = df[keep_rows(df['exec_time'])]
    print(regime_message)

# Summary of what is left after filtering
fastest, slowest = df['exec_time'].min(), df['exec_time'].max()
print(f"\nLoaded {len(df):,} total configurations")
print(f"Kernels: {df['kernel_name'].nunique()}")
print(f"Data sizes (N): {sorted(df['N'].unique())}")
print(f"Exec time range: {fastest:.6f} - {slowest:.6f} ms")
print(f"Log exec time range: {np.log(fastest):.2f} - {np.log(slowest):.2f}")

if 'gpu' in df.columns:
    gpu_names = df['gpu'].unique()
    print(f"GPUs: {df['gpu'].nunique()} ({', '.join(gpu_names)})")

# Show sample
print("\nSample data:")
df.head()

Dataset columns: ['kernel', 'N', 'bx', 'by', 'bz', 'total_threads', 'exec_time', 'gpu', 'dimensionality', 'compute_intensity', 'has_shared_memory', 'global_reads', 'global_writes', 'arithmetic_ops', 'memory_ops', 'control_flow_ops', 'loop_ops', 'uses_syncthreads', 'estimated_flops', 'estimated_memory_bytes', 'uses_threadIdx_x', 'uses_threadIdx_y', 'uses_threadIdx_z', 'uses_blockIdx_x', 'uses_blockIdx_y', 'uses_blockIdx_z', 'uses_blockDim_x', 'uses_blockDim_y', 'uses_blockDim_z']

Loaded 15,961 total configurations
Kernels: 29
Data sizes (N): [np.int64(32), np.int64(64), np.int64(128), np.int64(256), np.int64(512), np.int64(1024), np.int64(2048), np.int64(4096), np.int64(8192), np.int64(16384)]
Exec time range: 0.002600 - 211772.500000 ms
Log exec time range: -5.95 - 12.26
GPUs: 3 (geforce_gtx_1080_ti, tesla_c2075, nvidia_geforce_rtx_2070_super)

Sample data:


Unnamed: 0,kernel_name,N,bx,by,bz,total_threads,exec_time,gpu,dimensionality,compute_intensity,...,estimated_memory_bytes,uses_threadIdx_x,uses_threadIdx_y,uses_threadIdx_z,uses_blockIdx_x,uses_blockIdx_y,uses_blockIdx_z,uses_blockDim_x,uses_blockDim_y,uses_blockDim_z
0,Convolution2D_kernel,64,1,32,1,32,0.0129,geforce_gtx_1080_ti,2,4.45,...,40,1,1,0,1,1,0,1,1,0
1,Convolution2D_kernel,64,2,16,1,32,0.0117,geforce_gtx_1080_ti,2,4.45,...,40,1,1,0,1,1,0,1,1,0
2,Convolution2D_kernel,64,4,8,1,32,0.0121,geforce_gtx_1080_ti,2,4.45,...,40,1,1,0,1,1,0,1,1,0
3,Convolution2D_kernel,64,8,4,1,32,0.0114,geforce_gtx_1080_ti,2,4.45,...,40,1,1,0,1,1,0,1,1,0
4,Convolution2D_kernel,64,16,2,1,32,0.0113,geforce_gtx_1080_ti,2,4.45,...,40,1,1,0,1,1,0,1,1,0


## 5. Sample Data for Training

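Fine-tuning an LLM on every row is slow, so we train on a subset. We draw the same number of randomly chosen configurations from each kernel (at most `MAX_SAMPLES` rows in total) so that every kernel is represented.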

In [None]:
# For LLM training we cap the dataset size to keep training time reasonable.
# Every kernel contributes the same number of rows (an equal per-kernel quota,
# not a share proportional to its size in the full dataset).
MAX_SAMPLES = 500  # Adjust based on your GPU/CPU capacity

if len(df) > MAX_SAMPLES:
    print(f"Sampling {MAX_SAMPLES} examples from {len(df):,} total...")

    # Integer division means the total can land slightly below MAX_SAMPLES.
    per_kernel_quota = MAX_SAMPLES // df['kernel_name'].nunique()

    # Iterate over the groups explicitly instead of groupby(...).apply(...):
    # apply() on a frame that still contains the grouping column emits a pandas
    # deprecation warning. Groups are visited in the same (sorted) order and
    # sampled with the same seed, so the selected rows are unchanged.
    df_sampled = pd.concat(
        [
            kernel_rows.sample(min(len(kernel_rows), per_kernel_quota), random_state=42)
            for _, kernel_rows in df.groupby('kernel_name')
        ]
    ).reset_index(drop=True)

    print(f"✓ Sampled {len(df_sampled):,} examples")
else:
    df_sampled = df.copy()
    print(f"Using all {len(df_sampled):,} examples")

# Distribution stats
print(f"\nSampled data statistics:")
print(f"  Kernels: {df_sampled['kernel_name'].nunique()}")
print(f"  Configs per kernel: {len(df_sampled) / df_sampled['kernel_name'].nunique():.1f} avg")
print(f"  Exec time range: {df_sampled['exec_time'].min():.6f} - {df_sampled['exec_time'].max():.6f} ms")

# Show distribution by kernel
df_sampled.groupby('kernel_name').agg({
    'exec_time': ['count', 'min', 'max', 'mean']
}).round(4).head(10)

Sampling 500 examples from 15,961 total...
✓ Sampled 493 examples

Sampled data statistics:
  Kernels: 29
  Configs per kernel: 17.0 avg
  Exec time range: 0.003800 - 211772.500000 ms


  df_sampled = df.groupby('kernel_name', group_keys=False).apply(


Unnamed: 0_level_0,exec_time,exec_time,exec_time,exec_time
Unnamed: 0_level_1,count,min,max,mean
kernel_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Convolution2D_kernel,17,0.0094,3.1132,0.2527
atax_kernel1,17,0.0308,9.8771,1.6342
atax_kernel2,17,0.0153,9.2928,1.3277
bicg_kernel1,17,0.0232,8.345,1.2438
bicg_kernel2,17,0.0356,9.334,1.5151
convolution3D_kernel,17,0.094,82.3351,14.0677
corr_kernel,17,0.6901,2405.6177,256.4238
covar_kernel,17,0.7965,1908.731,333.9449
fdtd_step1_kernel,17,1.3748,144.2865,15.5945
fdtd_step2_kernel,17,1.3422,149.082,16.3367


## 6. Create Training Prompts

In [None]:
# Render one prompt+answer string per sampled configuration
df_sampled['text'] = df_sampled.apply(create_training_prompt, axis=1)

separator = "=" * 70
first_example = df_sampled['text'].iloc[0]

print(f"Created {len(df_sampled)} training examples")
print("\nExample training prompt:")
print(separator)
print(first_example)
print(separator)

Created 493 training examples

Example training prompt:
Kernel: Convolution2D_kernel
N: 1024
Dimensionality: 2D
Compute intensity: 4.45 FLOPs/byte
Shared memory: no
Global reads: 9
Global writes: 1
Arithmetic ops: 83
Memory ops: 10
Uses syncthreads: no

Configuration: block_dims=(64, 1, 1)

### Predicted Execution Time (in milliseconds):
Return exactly one floating point number followed by ' ms' (example: '12.345 ms').
Do NOT include any extra words, punctuation, or commentary — only the number and ' ms'.
0.295 ms



In [None]:
# Leave the frame as the cell's last expression so the notebook renders it as a
# truncated table instead of print()'s wrapped plain-text dump.
df_sampled

              kernel_name     N  bx  by  bz  total_threads  exec_time  \
0    Convolution2D_kernel  1024  64   1   1             64     0.2948   
1    Convolution2D_kernel   256   8   4   1             32     0.0125   
2    Convolution2D_kernel  2048  64   2   1            128     0.1621   
3    Convolution2D_kernel  1024  32   2   1             64     0.0482   
4    Convolution2D_kernel  2048  16  16   1            256     0.1745   
..                    ...   ...  ..  ..  ..            ...        ...   
488           syrk_kernel    32   2  32   1             64     0.0510   
489           syrk_kernel  2048   8   8   1             64   191.0656   
490           syrk_kernel    64   8   4   1             32     0.0905   
491           syrk_kernel   128   2  16   1             32     0.8134   
492           syrk_kernel   128   8  32   1            256     0.0614   

                     gpu  dimensionality  compute_intensity  ...  \
0            tesla_c2075               2              4

## 7. Load Model and Tokenizer

In [None]:
print(f"Loading model: {MODEL_NAME}")

# Tokenizer first
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Some tokenizers (GPT-2, for example) ship without a pad token; reuse EOS then
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Let the library place the weights when a GPU is present, otherwise pass None.
# No dtype is passed here. NOTE(review): the original comment expected float32
# weights to avoid FP16 gradient issues - confirm that is the library default.
weight_placement = "auto" if torch.cuda.is_available() else None
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map=weight_placement)

first_param_device = next(model.parameters()).device
print(f"✓ Model loaded: {MODEL_NAME}")
print(f"  Parameters: {model.num_parameters():,}")
print(f"  Device: {first_param_device}")

Loading model: Qwen/Qwen1.5-0.5B


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

✓ Model loaded: Qwen/Qwen1.5-0.5B
  Parameters: 463,987,712
  Device: cuda:0


## 8. Prepare Dataset for Training

In [None]:
# Wrap the prompt column in a HuggingFace dataset
dataset = Dataset.from_pandas(df_sampled[['text']])

def tokenize_function(examples):
    """
    Tokenize a batch of prompt strings for causal language modeling.

    Every example is truncated or padded to MAX_LENGTH tokens, and a 'labels'
    entry is added as a copy of 'input_ids'.
    NOTE(review): the data collator configured later may rebuild the labels;
    confirm against its documentation.
    """
    encoded = tokenizer(
        examples['text'],
        padding='max_length',
        max_length=MAX_LENGTH,
        truncation=True,
    )
    encoded['labels'] = encoded['input_ids'].copy()
    return encoded

# Tokenize in batches and drop the raw text column
print("Tokenizing dataset...")
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
)

# 90/10 train/validation split with a fixed seed
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

train_count = len(split_dataset['train'])
val_count = len(split_dataset['test'])
print(f"✓ Dataset prepared")
print(f"  Training samples: {train_count}")
print(f"  Validation samples: {val_count}")
print(f"  Max token length: {MAX_LENGTH}")

Tokenizing dataset...


Map:   0%|          | 0/493 [00:00<?, ? examples/s]

✓ Dataset prepared
  Training samples: 443
  Validation samples: 50
  Max token length: 256


## 9. Configure Training

In [None]:
# Training arguments
#
# Evaluation and checkpointing run once per epoch. With the default sample
# (443 training rows, batch size 1, 16 accumulation steps, 3 epochs) the whole
# run is only ~84 optimizer steps, so a step-based schedule of "every 100
# steps" never fired: no validation loss was logged and
# load_best_model_at_end had no evaluated checkpoint to choose from.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=16,  # effective batch = BATCH_SIZE * 16
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    logging_steps=10,  # frequent enough to report a training loss in every epoch
    eval_strategy="epoch",
    save_strategy="epoch",  # must match eval_strategy for load_best_model_at_end
    save_total_limit=2,
    load_best_model_at_end=True,
    bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),  # Use bfloat16 if supported
    fp16=False,  # Disable fp16 to avoid gradient scaler issues
    push_to_hub=False,
    report_to="none"  # Disable wandb/tensorboard
)

# Data collator for causal language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # We're doing causal LM, not masked LM
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset['train'],
    eval_dataset=split_dataset['test'],
    data_collator=data_collator,
)

print("✓ Trainer configured")

The model is already on multiple devices. Skipping the move to device specified in `args`.


✓ Trainer configured


In [None]:
# memory_summary() returns one multi-line string. Print it so the table is
# rendered line by line instead of being shown as an escaped repr, and skip the
# report when no CUDA device is available.
if torch.cuda.is_available():
    print(torch.cuda.memory_summary())



## 10. Train the Model

**Note:** This will take some time depending on your hardware.
- CPU: 1-2 hours
- GPU: 10-30 minutes

In [None]:
rule = "=" * 60

print("Starting training...")
print(rule)

# Run the fine-tuning loop configured above
trainer.train()

print(rule)
print("✓ Training complete!")

Starting training...


Step,Training Loss,Validation Loss


✓ Training complete!


In [None]:
# Ask PyTorch to release the GPU memory it is caching but no longer using,
# before the save and inference cells run.
torch.cuda.empty_cache()

## 11. Save the Model

In [None]:
# Save final model and tokenizer side by side so the directory can be reloaded
# with from_pretrained(OUTPUT_DIR)
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# The weights are written as model.safetensors (see the archive listing
# produced by the zip cell), not pytorch_model.bin.
print(f"✓ Model saved to: {OUTPUT_DIR}")
print(f"  - Model weights: {OUTPUT_DIR}/model.safetensors")
print(f"  - Tokenizer: {OUTPUT_DIR}/tokenizer_config.json")

✓ Model saved to: cuda_exec_time_predictor_llm
  - Model weights: cuda_exec_time_predictor_llm/pytorch_model.bin
  - Tokenizer: cuda_exec_time_predictor_llm/tokenizer_config.json


## 12. Test Prediction Function

In [None]:
def predict_exec_time_with_llm(model, tokenizer, kernel_info, block_x, block_y, block_z=1):
    """
    Use fine-tuned LLM to predict execution time for a given configuration

    Args:
        model: Fine-tuned LLM model
        tokenizer: Tokenizer
        kernel_info: dict with kernel characteristics
        block_x, block_y, block_z: block dimensions to predict for

    Returns:
        (predicted_exec_time_ms, generated_text) where generated_text is the
        full decoded sequence (prompt followed by the model's continuation).
    """
    # Create input prompt (without the answer part)
    prompt = make_prediction_prompt(kernel_info, block_x, block_y, block_z)

    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt")

    # Get the device the model is on
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Number of prompt tokens; everything after this index is newly generated
    prompt_token_count = inputs["input_ids"].shape[1]

    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=20,  # Only need a few tokens for the number
            temperature=0.1,  # Low temperature for more deterministic output
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode the whole sequence for the caller, and the continuation separately
    # for parsing.
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    completion = tokenizer.decode(outputs[0][prompt_token_count:], skip_special_tokens=True)

    # Parse ONLY the continuation. The prompt itself contains the literal
    # example "12.345 ms", so searching the full text always matched that
    # example and every prediction came back as 12.345.
    # The pattern accepts "12", "12.345" or ".5" but not strings such as "." or
    # "1.2.3" that float() would reject.
    match = re.search(r'(\d+(?:\.\d+)?|\.\d+)\s*ms', completion)
    if match:
        exec_time = float(match.group(1))
    else:
        print(f"⚠ Warning: Could not parse LLM output: {completion[-100:]}")
        # Fallback: rough estimate based on problem size
        exec_time = (kernel_info.get('N', 1024) ** kernel_info.get('dimensionality', 1)) / 1e6

    return exec_time, generated_text


def find_optimal_config_with_llm(model, tokenizer, kernel_info, candidate_configs=None):
    """
    Pick the block configuration with the lowest LLM-predicted execution time.

    When ``candidate_configs`` is None a built-in list is used: 1-D block
    shapes if the kernel's 'dimensionality' is 1 (the default when the key is
    missing), otherwise a fixed set of 2-D shapes.

    Returns:
        (best_block_x, best_block_y, best_block_z, predicted_exec_time, all_predictions)
        where all_predictions holds one dict per candidate, in candidate order.
    """
    if candidate_configs is None:
        if kernel_info.get('dimensionality', 1) == 1:
            candidate_configs = [
                (threads, 1, 1) for threads in (32, 64, 128, 256, 512, 1024)
            ]
        else:
            candidate_configs = [
                (8, 8, 1), (16, 8, 1), (16, 16, 1), (32, 8, 1), (32, 16, 1), (32, 32, 1),
                (64, 4, 1), (64, 8, 1), (64, 16, 1), (128, 4, 1), (128, 8, 1), (256, 4, 1)
            ]

    # One prediction per candidate; only the numeric estimate is kept.
    predictions = [
        {
            'block_x': cand_x,
            'block_y': cand_y,
            'block_z': cand_z,
            'predicted_time': predict_exec_time_with_llm(
                model, tokenizer, kernel_info, cand_x, cand_y, cand_z
            )[0],
        }
        for cand_x, cand_y, cand_z in candidate_configs
    ]

    # min() keeps the earliest candidate when several share the lowest time.
    winner = min(predictions, key=lambda entry: entry['predicted_time'])

    return (
        winner['block_x'],
        winner['block_y'],
        winner['block_z'],
        winner['predicted_time'],
        predictions,
    )

print("✓ Prediction functions defined")

✓ Prediction functions defined


## 13. Test Predictions

In [None]:
# Two hand-written kernels used to exercise the config search
test_cases = [
    dict(
        kernel_name='Convolution2D_kernel',
        N=1024,
        dimensionality=2,
        compute_intensity=4.45,
        has_shared_memory=False,
        global_reads=9,
        global_writes=1,
        arithmetic_ops=83,
        memory_ops=10,
        uses_syncthreads=False,
    ),
    dict(
        kernel_name='mm2_kernel1',
        N=2048,
        dimensionality=2,
        compute_intensity=15.2,
        has_shared_memory=True,
        global_reads=20,
        global_writes=10,
        arithmetic_ops=304,
        memory_ops=30,
        uses_syncthreads=True,
    ),
]

heavy_rule = "=" * 80
light_rule = "-" * 80

print("TEST PREDICTIONS - FINDING OPTIMAL CONFIGS")
print(heavy_rule)

for test_kernel in test_cases:
    print(f"\nKernel: {test_kernel['kernel_name']}, N={test_kernel['N']}")
    print(light_rule)

    bx, by, bz, pred_time, all_preds = find_optimal_config_with_llm(
        model, tokenizer, test_kernel
    )

    print(f"✓ Optimal configuration:")
    print(f"    block_dims: ({bx}, {by}, {bz})")
    print(f"    predicted_time: {pred_time:.6f} ms")

    # Rank every candidate by predicted time and show the five fastest
    ranked = sorted(all_preds, key=lambda entry: entry['predicted_time'])
    print(f"\n  Top 5 configurations:")
    for pred in ranked[:5]:
        print(f"    ({pred['block_x']:4d}, {pred['block_y']:4d}, {pred['block_z']:4d}): {pred['predicted_time']:.6f} ms")

print("\n" + heavy_rule)

TEST PREDICTIONS - FINDING OPTIMAL CONFIGS

Kernel: Convolution2D_kernel, N=1024
--------------------------------------------------------------------------------
✓ Optimal configuration:
    block_dims: (8, 8, 1)
    predicted_time: 12.345000 ms

  Top 5 configurations:
    (   8,    8,    1): 12.345000 ms
    (  16,    8,    1): 12.345000 ms
    (  16,   16,    1): 12.345000 ms
    (  32,    8,    1): 12.345000 ms
    (  32,   16,    1): 12.345000 ms

Kernel: mm2_kernel1, N=2048
--------------------------------------------------------------------------------
✓ Optimal configuration:
    block_dims: (8, 8, 1)
    predicted_time: 12.345000 ms

  Top 5 configurations:
    (   8,    8,    1): 12.345000 ms
    (  16,    8,    1): 12.345000 ms
    (  16,   16,    1): 12.345000 ms
    (  32,    8,    1): 12.345000 ms
    (  32,   16,    1): 12.345000 ms



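## 14. Evaluate on Validation Set (Optional)

**Note:** the rows evaluated below are sampled from `df_sampled`, which also contains the training examples, so these metrics are not a true held-out estimate.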

In [None]:
# Compare predicted vs. actual execution times on up to 20 sampled rows.
# NOTE(review): the rows come from df_sampled, the same frame the training
# prompts were built from, so some of them were likely seen during training.
val_samples = df_sampled.sample(min(20, len(df_sampled)), random_state=42)

# Column -> Python type used to normalise the value before prompting
feature_casts = {
    'N': int,
    'dimensionality': int,
    'compute_intensity': float,
    'has_shared_memory': bool,
    'global_reads': int,
    'global_writes': int,
    'arithmetic_ops': int,
    'memory_ops': int,
    'uses_syncthreads': bool,
}

predictions_list = []
actuals_list = []

table_rule = "=" * 90
print("Validation Set Predictions")
print(table_rule)
print(f"{'Kernel':<25} {'Config':<15} {'Predicted':<15} {'Actual':<15} {'Error %'}")
print(table_rule)

for _, row in val_samples.iterrows():
    kernel_info = {'kernel_name': row['kernel_name']}
    for column, cast in feature_casts.items():
        kernel_info[column] = cast(row[column])

    bx, by, bz = (int(row[axis]) for axis in ('bx', 'by', 'bz'))

    pred_time, _ = predict_exec_time_with_llm(model, tokenizer, kernel_info, bx, by, bz)
    actual_time = row['exec_time']

    # Absolute error relative to the measured time, as a percentage
    error_pct = abs(pred_time - actual_time) / actual_time * 100

    predictions_list.append(pred_time)
    actuals_list.append(actual_time)

    print(f"{row['kernel_name'][:25]:<25} ({bx:3d},{by:3d},{bz:3d})    "
          f"{pred_time:>10.6f} ms  {actual_time:>10.6f} ms  {error_pct:>6.1f}%")

# Aggregate metrics
predictions_arr = np.array(predictions_list)
actuals_arr = np.array(actuals_list)

residuals = actuals_arr - predictions_arr
relative_error = np.abs(residuals / actuals_arr)

mae = np.mean(np.abs(predictions_arr - actuals_arr))
mape = np.mean(relative_error) * 100
r2 = 1 - np.sum(residuals**2) / np.sum((actuals_arr - actuals_arr.mean())**2)

# Share of rows whose relative error is below 10% / 20%
within_10 = (relative_error < 0.1).mean() * 100
within_20 = (relative_error < 0.2).mean() * 100

print(table_rule)
print(f"Metrics:")
print(f"  MAE: {mae:.6f} ms")
print(f"  MAPE: {mape:.2f}%")
print(f"  R²: {r2:.4f}")
print(f"  Within 10%: {within_10:.1f}%")
print(f"  Within 20%: {within_20:.1f}%")

Validation Set Predictions
Kernel                    Config          Predicted       Actual          Error %
std_kernel2               (  2, 16,  1)     12.345000 ms    0.054700 ms  22468.6%
bicg_kernel2              (  8,  8,  1)     12.345000 ms    0.035600 ms  34577.0%
mm2_kernel2               ( 16,  8,  1)     12.345000 ms    6.548600 ms    88.5%
fdtd_step3_kernel         ( 32,  1,  1)     12.345000 ms    1.783200 ms   592.3%
mm3_kernel1               (512,  2,  1)     12.345000 ms    0.752600 ms  1540.3%
gesummv_kernel            (128,  4,  1)     12.345000 ms    1.474100 ms   737.5%
fdtd_step2_kernel         ( 16,  8,  1)     12.345000 ms    1.824900 ms   576.5%
bicg_kernel1              (  2,512,  1)     12.345000 ms    8.345000 ms    47.9%
mm2_kernel2               ( 16, 64,  1)     12.345000 ms    0.142700 ms  8551.0%
Convolution2D_kernel      (  8,  8,  1)     12.345000 ms    0.011800 ms  104518.6%
corr_kernel               (  1, 64,  1)     12.345000 ms    9.104600 ms    35

In [None]:
 !zip -r cuda_exec_time_predictor_llm.zip cuda_exec_time_predictor_llm/

  adding: cuda_exec_time_predictor_llm/ (stored 0%)
  adding: cuda_exec_time_predictor_llm/model.safetensors (deflated 7%)
  adding: cuda_exec_time_predictor_llm/tokenizer_config.json (deflated 70%)
  adding: cuda_exec_time_predictor_llm/added_tokens.json (deflated 36%)
  adding: cuda_exec_time_predictor_llm/checkpoint-84/ (stored 0%)
  adding: cuda_exec_time_predictor_llm/checkpoint-84/model.safetensors (deflated 7%)
  adding: cuda_exec_time_predictor_llm/checkpoint-84/training_args.bin (deflated 53%)
  adding: cuda_exec_time_predictor_llm/checkpoint-84/tokenizer_config.json (deflated 70%)
  adding: cuda_exec_time_predictor_llm/checkpoint-84/rng_state.pth (deflated 27%)
  adding: cuda_exec_time_predictor_llm/checkpoint-84/optimizer.pt (deflated 8%)
  adding: cuda_exec_time_predictor_llm/checkpoint-84/trainer_state.json (deflated 55%)
  adding: cuda_exec_time_predictor_llm/checkpoint-84/added_tokens.json (deflated 36%)
  adding: cuda_exec_time_predictor_llm/checkpoint-84/special_tokens

In [None]:
# Offer the archive as a browser download. google.colab only exists inside
# Colab, so the import stays local to this cell and the step is skipped
# elsewhere instead of raising ImportError.
try:
    from google.colab import files
except ImportError:
    print("Not running in Google Colab; skipping browser download.")
else:
    # Relative path: the archive is created in the current working directory
    # by the zip cell.
    files.download(f"{OUTPUT_DIR}.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>