In [1]:
import json
import os
import subprocess
import tempfile
from pathlib import Path
from typing import List, Dict, Optional, Tuple
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import requests
import time

try:
    from openai import OpenAI
    OPENAI_AVAILABLE = True
except ImportError:
    print("OpenAI package not installed.")
    OPENAI_AVAILABLE = False

CONFIG = {
    'max_projects': 10,
    'min_examples': 10,
    'max_examples': 25,
    'max_tokens_per_example': 1500,
    'train_split': 0.8,
    'target_model': 'gpt-4o-mini-2024-07-18',
    'output_dir': 'o3_fine_tuning_outputs',
    'cbench_path': 'data/CRUST_bench/CBench',
    'rbench_path': 'data/CRUST_bench/RBench'
}

# Create output directory
Path(CONFIG['output_dir']).mkdir(exist_ok=True)

In [2]:
def setup_crust_bench():
    data_dir = Path("data/CRUST-bench")
    zip_url = "https://github.com/anirudhkhatry/CRUST-bench/raw/main/datasets/CRUST_bench.zip"
    zip_path = data_dir / "CRUST_bench.zip"

    if not data_dir.exists():
        print("Creating data directory...")
        data_dir.mkdir(parents=True, exist_ok=True)

    if not zip_path.exists():
        print("Downloading CRUST_bench.zip...")
        response = requests.get(zip_url, stream=True)
        if response.status_code == 200:
            with open(zip_path, "wb") as f:
                f.write(response.content)
            print("Download complete.")
        else:
            raise Exception(f"Failed to download dataset: HTTP {response.status_code}")

    print("Extracting ZIP contents...")
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(data_dir)

    print(f"Dataset extracted to {data_dir}")
    return data_dir

# Setup dataset
try:
    DATASET_PATH = setup_crust_bench()
    CBENCH_PATH = DATASET_PATH / "CRUST_bench" / "CBench"
    RBENCH_PATH = DATASET_PATH / "CRUST_bench" / "RBench"
    print(f"C code path: {CBENCH_PATH}")
    print(f"Rust code path: {RBENCH_PATH}")
except Exception as e:
    print(f"Could not setup dataset: {e}")
    CBENCH_PATH = Path("CBench")
    RBENCH_PATH = Path("RBench")


Extracting ZIP contents...
Dataset extracted to data\CRUST-bench
C code path: data\CRUST-bench\CRUST_bench\CBench
Rust code path: data\CRUST-bench\CRUST_bench\RBench


In [3]:
def calculate_example_quality(c_code, rust_code):
    """
    Score example quality for O3 selection
    Higher scores = better training examples for code translation
    """
    score = 0
    
    c_lower = c_code.lower()
    rust_lower = rust_code.lower()
    
    # Memory management patterns (high value for translation)
    if any(pattern in c_lower for pattern in ['malloc', 'free', 'realloc', 'calloc']):
        score += 3
    
    # Good Rust safety patterns
    if any(pattern in rust_lower for pattern in ['vec<', 'box<', 'result<', 'option<']):
        score += 2
    
    # Idiomatic Rust structures
    if any(pattern in rust_lower for pattern in ['impl', 'pub fn', 'struct', 'enum']):
        score += 2
    
    # Appropriate complexity (not too simple, not too complex)
    c_lines = c_code.count('\n') + 1
    rust_lines = rust_code.count('\n') + 1
    if 20 <= c_lines <= 100 and 10 <= rust_lines <= 80:
        score += 2
    
    # Error handling patterns
    if any(pattern in rust_lower for pattern in ['?', '.unwrap', '.expect', 'match']):
        score += 1
    
    # Documentation (bonus)
    if '///' in rust_code or '/*' in rust_code:
        score += 1
    
    # C-Rust interface compatibility
    if 'extern "c"' in rust_lower or '#[no_mangle]' in rust_lower:
        score += 1

    if 'unsafe' in rust_lower:
        score -= 1
    if '.unwrap()' in rust_lower and not '// safe unwrap' in rust_lower:
        score -= 0.5

    return score

def load_crust_bench_minimal():
    """
    Load minimal high-quality examples from CRUST-bench
    Optimized for O3 cost constraints
    """
    
    cbench_path = Path(CONFIG['cbench_path'])
    rbench_path = Path(CONFIG['rbench_path'])
    
    print(f"Loading CRUST-bench with extreme selectivity...")
    
    # Verify paths
    if not cbench_path.exists() or not rbench_path.exists():
        print(f"CRUST-bench dataset not found!")
        print(f"Expected structure:")
        print(f"  {cbench_path}")
        print(f"  {rbench_path}")
        return create_synthetic_examples()  # Fallback for demo
    
    # Get project intersection
    c_projects = [d.name for d in cbench_path.iterdir() if d.is_dir()]
    r_projects = [d.name for d in rbench_path.iterdir() if d.is_dir()]
    common_projects = set(c_projects) & set(r_projects)
    
    print(f"Dataset overview:")
    print(f"   C projects: {len(c_projects)}")
    print(f"   Rust projects: {len(r_projects)}")
    print(f"   Common projects: {len(common_projects)}")
    
    # Priority projects for quality
    priority_projects = ['CircularBuffer', 'amp', 'FastHamming', 'ted', 'totp']
    selected_projects = [p for p in priority_projects if p in common_projects]
    
    # Fill remaining slots if needed
    remaining = list(common_projects - set(selected_projects))
    selected_projects.extend(remaining[:CONFIG['max_projects'] - len(selected_projects)])
    
    print(f"Selected projects: {selected_projects[:CONFIG['max_projects']]}")
    
    candidates = []
    
    # Process each project
    for project_name in selected_projects[:CONFIG['max_projects']]:
        c_project_dir = cbench_path / project_name
        r_project_dir = rbench_path / project_name
        
        # Load C files with size filtering
        c_files = {}
        for c_file in c_project_dir.rglob("*.c"):
            try:
                with open(c_file, 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read().strip()
                    lines = content.count('\n') + 1
                    # Strict filtering for O3
                    if 20 <= lines <= 150 and len(content) > 200:
                        rel_path = c_file.relative_to(c_project_dir)
                        c_files[str(rel_path)] = content
            except:
                continue
        
        # Load Rust interface files
        interfaces_dir = r_project_dir / "src" / "interfaces"
        rust_files = {}
        
        if interfaces_dir.exists():
            for rust_file in interfaces_dir.rglob("*.rs"):
                try:
                    with open(rust_file, 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read().strip()
                        lines = content.count('\n') + 1
                        if 10 <= lines <= 100 and len(content) > 100:
                            rel_path = rust_file.relative_to(interfaces_dir)
                            rust_files[str(rel_path)] = content
                except:
                    continue
        
        # Create and score candidates
        for c_path, c_content in c_files.items():
            for rust_path, rust_content in rust_files.items():
                estimated_tokens = len(c_content + rust_content) // 4
                
                if estimated_tokens <= CONFIG['max_tokens_per_example']:
                    quality_score = calculate_example_quality(c_content, rust_content)
                    
                    candidates.append({
                        'project': project_name,
                        'c_file': c_path,
                        'rust_file': rust_path,
                        'c_code': c_content,
                        'rust_code': rust_content,
                        'c_lines': c_content.count('\n') + 1,
                        'rust_lines': rust_content.count('\n') + 1,
                        'estimated_tokens': estimated_tokens,
                        'quality_score': quality_score
                    })
    
    # Select best examples by quality
    candidates.sort(key=lambda x: x['quality_score'], reverse=True)
    selected = candidates[:CONFIG['max_examples']]
    
    print(f"Candidate examples: {len(candidates)}")
    print(f"Selected examples: {len(selected)}")
    
    if selected:
        print(f"Quality range: {min(ex['quality_score'] for ex in selected):.1f} - {max(ex['quality_score'] for ex in selected):.1f}")
    
    return selected

def create_synthetic_examples():
    """
    Create synthetic examples if CRUST-bench not available (for demo purposes)
    """
    print("📝 Creating synthetic examples for demonstration...")
    
    synthetic_examples = [
        {
            'project': 'SyntheticCircularBuffer',
            'c_file': 'circular_buffer.c',
            'rust_file': 'circular_buffer.rs',
            'c_code': '''#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

typedef struct {
    int* buffer;
    size_t size;
    size_t head;
    size_t tail;
    bool full;
} CircularBuffer;

CircularBuffer* cb_create(size_t size) {
    CircularBuffer* cb = malloc(sizeof(CircularBuffer));
    if (!cb) return NULL;
    
    cb->buffer = malloc(size * sizeof(int));
    if (!cb->buffer) {
        free(cb);
        return NULL;
    }
    
    cb->size = size;
    cb->head = 0;
    cb->tail = 0;
    cb->full = false;
    
    return cb;
}

bool cb_write(CircularBuffer* cb, int data) {
    if (!cb) return false;
    
    cb->buffer[cb->head] = data;
    
    if (cb->full) {
        cb->tail = (cb->tail + 1) % cb->size;
    }
    
    cb->head = (cb->head + 1) % cb->size;
    cb->full = (cb->head == cb->tail);
    
    return true;
}

void cb_destroy(CircularBuffer* cb) {
    if (cb) {
        free(cb->buffer);
        free(cb);
    }
}''',
            'rust_code': '''use std::vec::Vec;

/// A circular buffer implementation in safe Rust
pub struct CircularBuffer {
    buffer: Vec<i32>,
    head: usize,
    tail: usize,
    full: bool,
}

impl CircularBuffer {
    /// Creates a new circular buffer with the specified capacity
    pub fn new(size: usize) -> Result<Self, &'static str> {
        if size == 0 {
            return Err("Size must be greater than 0");
        }
        
        Ok(CircularBuffer {
            buffer: vec![0; size],
            head: 0,
            tail: 0,
            full: false,
        })
    }
    
    /// Writes data to the buffer
    pub fn write(&mut self, data: i32) -> bool {
        self.buffer[self.head] = data;
        
        if self.full {
            self.tail = (self.tail + 1) % self.buffer.len();
        }
        
        self.head = (self.head + 1) % self.buffer.len();
        self.full = self.head == self.tail;
        
        true
    }
    
    /// Returns the size of the buffer
    pub fn size(&self) -> usize {
        self.buffer.len()
    }
}

impl Drop for CircularBuffer {
    fn drop(&mut self) {
        // Rust automatically handles deallocation
    }
}''',
            'c_lines': 47,
            'rust_lines': 44,
            'estimated_tokens': 875,
            'quality_score': 8.0
        }
    ]
    
    return synthetic_examples

# Load the dataset
print("\nLoading dataset...")
dataset = load_crust_bench_minimal()

if not dataset:
    print("No examples found, using synthetic data for demo")
    dataset = create_synthetic_examples()

print(f"\nSELECTED EXAMPLES FOR O3:")
print("-" * 50)
for i, example in enumerate(dataset, 1):
    print(f"{i}. Project: {example['project']}")
    print(f"   Files: {example['c_file']} → {example['rust_file']}")
    print(f"   Size: {example['c_lines']} C lines → {example['rust_lines']} Rust lines")
    print(f"   Tokens: {example['estimated_tokens']}")
    print(f"   Quality: {example['quality_score']:.1f}")



Loading dataset...
Loading CRUST-bench with extreme selectivity...
Dataset overview:
   C projects: 100
   Rust projects: 100
   Common projects: 80
Selected projects: ['CircularBuffer', 'amp', 'FastHamming', 'ted', 'totp', 'tisp', 'satc', 'razz_simulation', 'NandC', 'rubiksolver']
Candidate examples: 37
Selected examples: 25
Quality range: 4.0 - 10.0

SELECTED EXAMPLES FOR O3:
--------------------------------------------------
1. Project: rubiksolver
   Files: rubik_model_tests.c → hash.rs
   Size: 68 C lines → 29 Rust lines
   Tokens: 908
   Quality: 10.0
2. Project: rubiksolver
   Files: rubik_model_tests.c → heap.rs
   Size: 68 C lines → 30 Rust lines
   Tokens: 835
   Quality: 10.0
3. Project: rubiksolver
   Files: hash\hash_tests.c → hash.rs
   Size: 34 C lines → 29 Rust lines
   Tokens: 572
   Quality: 10.0
4. Project: rubiksolver
   Files: hash\hash_tests.c → heap.rs
   Size: 34 C lines → 30 Rust lines
   Tokens: 500
   Quality: 10.0
5. Project: rubiksolver
   Files: heap\heap

In [6]:
def display_high_quality_examples():
    """
    Display the highest quality C-to-Rust translation examples
    """
    
    print("HIGH-QUALITY TRANSLATION EXAMPLES")
    print("=" * 60)
    
    if not dataset:
        print("No dataset available. Using synthetic example for demonstration.")
        return
    
    # Sort examples by quality score (highest first)
    sorted_examples = sorted(dataset, key=lambda x: x['quality_score'], reverse=True)
    
    # Display top 3 examples
    for i, example in enumerate(sorted_examples[:3], 1):
        print(f"\nEXAMPLE {i}: {example['project'].upper()}")
        print(f"   Quality Score: {example['quality_score']:.1f}/10")
        print(f"   Size: {example['c_lines']} C lines → {example['rust_lines']} Rust lines")
        print(f"   File: {example['c_file']} → {example['rust_file']}")
        print(f"   Tokens: ~{example['estimated_tokens']}")
        
        print(f"\nC CODE:")
        print("─" * 40)
        print(example['c_code'][:500] + "..." if len(example['c_code']) > 500 else example['c_code'])
        
        print(f"\nRUST CODE:")
        print("─" * 40)
        print(example['rust_code'][:500] + "..." if len(example['rust_code']) > 500 else example['rust_code'])
        
        # Analyze what makes this example high quality
        print(f"\nQUALITY INDICATORS:")
        c_lower = example['c_code'].lower()
        rust_lower = example['rust_code'].lower()
        
        indicators = []
        if any(pattern in c_lower for pattern in ['malloc', 'free', 'realloc', 'calloc']):
            indicators.append("• Memory management patterns (malloc/free)")
        if any(pattern in rust_lower for pattern in ['vec<', 'box<', 'result<', 'option<']):
            indicators.append("• Safe Rust types (Vec, Box, Result, Option)")
        if any(pattern in rust_lower for pattern in ['impl', 'pub fn', 'struct', 'enum']):
            indicators.append("• Idiomatic Rust structures")
        if any(pattern in rust_lower for pattern in ['?', '.unwrap', '.expect', 'match']):
            indicators.append("• Proper error handling")
        if 'unsafe' not in rust_lower:
            indicators.append("• Memory-safe implementation")
        if '///' in example['rust_code'] or '/*' in example['rust_code']:
            indicators.append("• Documentation comments")
        
        for indicator in indicators:
            print(f"   {indicator}")
        
        if i < 3:
            print("\n" + "="*60)

def compare_translation_approaches():
    """
    Compare different translation approaches on the same C code
    """
    
    print("\n\nTRANSLATION APPROACH COMPARISON")
    print("=" * 60)
    
    if not dataset:
        print("No dataset available for comparison.")
        return
    
    # Take the highest quality example
    best_example = max(dataset, key=lambda x: x['quality_score'])
    
    print(f"SOURCE: {best_example['project']} - {best_example['c_file']}")
    print(f"ORIGINAL C CODE:")
    print("─" * 40)
    print(best_example['c_code'])
    
    print(f"\nEXPERT RUST TRANSLATION:")
    print("─" * 40)
    print(best_example['rust_code'])
    
    print(f"\nWHAT A FINE-TUNED MODEL SHOULD LEARN:")
    print("─" * 40)
    
    # Analyze key transformations
    c_code = best_example['c_code']
    rust_code = best_example['rust_code']
    
    transformations = []
    
    if 'malloc' in c_code.lower():
        transformations.append("• malloc/free → Vec<T> or Box<T> (automatic memory management)")
    if 'NULL' in c_code:
        transformations.append("• NULL checks → Option<T> (type-safe nullability)")
    if 'struct' in c_code.lower():
        transformations.append("• C structs → Rust structs with ownership semantics")
    if any(word in rust_code.lower() for word in ['pub fn', 'impl']):
        transformations.append("• C functions → Rust methods with proper visibility")
    if 'Result<' in rust_code:
        transformations.append("• Error codes → Result<T, E> (explicit error handling)")
    
    for transformation in transformations:
        print(f"   {transformation}")

def show_quality_distribution():
    """
    Show the distribution of quality scores across all examples
    """
    
    print("\n\nQUALITY SCORE DISTRIBUTION")
    print("=" * 60)
    
    if not dataset:
        print("No dataset available for analysis.")
        return
    
    scores = [ex['quality_score'] for ex in dataset]
    
    print(f"STATISTICS:")
    print(f"   Total Examples: {len(scores)}")
    print(f"   Average Quality: {np.mean(scores):.2f}/10")
    print(f"   Highest Quality: {max(scores):.1f}/10")
    print(f"   Lowest Quality: {min(scores):.1f}/10")
    print(f"   Standard Deviation: {np.std(scores):.2f}")
    
    # Create quality bins
    bins = [0, 2, 4, 6, 8, 10]
    labels = ['Poor (0-2)', 'Below Avg (2-4)', 'Average (4-6)', 'Good (6-8)', 'Excellent (8-10)']
    
    print(f"\nQUALITY DISTRIBUTION:")
    for i in range(len(bins)-1):
        count = sum(1 for score in scores if bins[i] <= score < bins[i+1])
        percentage = (count / len(scores)) * 100
        bar = "█" * max(1, int(percentage / 5))
        print(f"   {labels[i]:<15} │{bar:<20} {count:2d} ({percentage:4.1f}%)")
    
    print(f"\nINSIGHT:")
    high_quality_count = sum(1 for score in scores if score >= 6)
    if high_quality_count > 0:
        print(f"   {high_quality_count} examples ({(high_quality_count/len(scores)*100):.1f}%) are high-quality (6+)")
        print(f"   These form the core of our training dataset")
    else:
        print(f"   Limited high-quality examples - would benefit from data augmentation")

# Execute all example displays
display_high_quality_examples()
compare_translation_approaches()
show_quality_distribution()

HIGH-QUALITY TRANSLATION EXAMPLES

EXAMPLE 1: RUBIKSOLVER
   Quality Score: 10.0/10
   Size: 68 C lines → 29 Rust lines
   File: rubik_model_tests.c → hash.rs
   Tokens: ~908

C CODE:
────────────────────────────────────────
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

#include "rubik_model.h"

int main()
{
  printf("Running rubik_model tests\n");
  
  printf("Testing REAR...\n");
  assert_verbose(REAR(RED) == ORANGE);
  assert_verbose(REAR(GREEN) == YELLOW);
  assert_verbose(REAR(BLUE) == WHITE);
  assert_verbose(REAR(REAR(RED)) == RED);
  assert_verbose(REAR(REAR(GREEN)) == GREEN);
  assert_verbose(REAR(REAR(BLUE)) == BLUE);
  
  // Call function to test private functions(adjacent_cw, adjacent_ccw,....

RUST CODE:
────────────────────────────────────────
/// A simple hash table using a vector of buckets.
/// The table uses a provided hash function (which returns a u32)
/// and a caller‑supplied equality function for comparing elements.
pub struct Hash<T> {
    buckets:

In [7]:
def format_for_o3_instruction_tuning(examples):
    """
    Enhanced formatting for O3's reasoning capabilities
    Optimized for step-by-step thinking and explicit reasoning
    """
    
    print(f"Formatting {len(examples)} examples for O3 reasoning model...")
    
    # Enhanced system prompt for O3's reasoning
    system_prompt = """You are an expert C-to-Rust translator with deep understanding of systems programming and memory safety.

REASONING APPROACH:
1. First analyze the C code structure and identify key patterns
2. Plan the Rust translation strategy step-by-step
3. Consider safety implications and choose appropriate Rust types
4. Implement with clear explanations of design decisions

EXPERTISE AREAS:
- Memory management: malloc/free → Vec<T>, Box<T>, Rc<T>, Arc<T>
- Error handling: NULL checks → Result<T, E>, Option<T>
- Ownership: Manual memory management → Rust ownership system
- Safety: Buffer overflows → bounds checking, safe indexing
- Concurrency: Raw pointers → safe abstractions (Mutex, RwLock)
- API design: C functions → Rust methods, traits, and modules

TRANSLATION PRINCIPLES:
- Maintain functional equivalence
- Leverage Rust's type system for safety
- Use idiomatic Rust patterns
- Provide clear documentation
- Handle edge cases explicitly"""

    formatted_examples = []
    
    for example in examples:
        # O3-optimized user prompt with reasoning structure
        user_prompt = f"""Translate this C code to safe, idiomatic Rust using step-by-step reasoning:

**CONTEXT:**
- Project: {example['project']}
- Source: {example['c_file']} → {example['rust_file']}
- Complexity: {example['c_lines']} lines of C code

**C CODE TO TRANSLATE:**
```c
{example['c_code']}
```

**TRANSLATION REQUIREMENTS:**
1. Maintain all original functionality
2. Use Rust's memory safety features (no unsafe blocks unless absolutely necessary)
3. Follow Rust naming conventions (snake_case for functions/variables)
4. Implement proper error handling with Result<T, E> or Option<T>
5. Add appropriate documentation comments
6. Ensure thread safety where applicable

**PLEASE PROVIDE:**
1. **Analysis**: Key patterns and challenges in the C code
2. **Strategy**: Rust types and patterns you'll use
3. **Implementation**: Final Rust code with comments"""

        # Enhanced assistant response format for O3
        rust_analysis = analyze_c_code_patterns(example['c_code'])
        rust_strategy = plan_rust_translation(example['c_code'])
        
        assistant_response = f"""**ANALYSIS:**
{rust_analysis}

**STRATEGY:**
{rust_strategy}

**IMPLEMENTATION:**
```rust
{example['rust_code']}
```

**KEY IMPROVEMENTS:**
- Eliminated manual memory management
- Added proper error handling
- Ensured memory safety through Rust's type system
- Maintained API compatibility while improving safety"""

        formatted_example = {
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
                {"role": "assistant", "content": assistant_response}
            ]
        }
        
        formatted_examples.append(formatted_example)
    
    return formatted_examples

def analyze_c_code_patterns(c_code):
    """Analyze C code to identify key patterns for O3 reasoning"""
    patterns = []
    
    if 'malloc' in c_code or 'free' in c_code:
        patterns.append("Manual memory allocation/deallocation detected")
    if 'NULL' in c_code:
        patterns.append("NULL pointer checks present - candidate for Option<T>")
    if 'struct' in c_code:
        patterns.append("C structs - will become Rust structs with ownership")
    if 'typedef' in c_code:
        patterns.append("Type definitions - will use Rust type aliases or newtypes")
    if '*' in c_code and 'char' in c_code:
        patterns.append("C strings detected - consider String or &str")
    
    return "\\n".join(f"- {pattern}" for pattern in patterns) if patterns else "- Standard C code patterns"

def plan_rust_translation(c_code):
    """Plan Rust translation strategy for O3 reasoning"""
    strategies = []
    
    if 'malloc' in c_code:
        strategies.append("Replace malloc/free with Vec<T> or Box<T> for automatic memory management")
    if 'NULL' in c_code:
        strategies.append("Use Option<T> for nullable pointers and Result<T, E> for error handling")
    if 'struct' in c_code:
        strategies.append("Convert C structs to Rust structs with proper ownership semantics")
    if 'typedef' in c_code:
        strategies.append("Use Rust type aliases or newtype patterns for type safety")
    
    strategies.append("Add comprehensive error handling")
    strategies.append("Ensure thread safety where applicable")
    strategies.append("Use Rust naming conventions (snake_case)")
    
    return "\\n".join(f"- {strategy}" for strategy in strategies)


# Format examples
formatted_data = format_for_o3_instruction_tuning(dataset)

# Split into train/validation
if len(formatted_data) >= 2:
    split_idx = max(1, int(len(formatted_data) * CONFIG['train_split']))
    train_data = formatted_data[:split_idx]
    val_data = formatted_data[split_idx:]
else:
    train_data = formatted_data
    val_data = []

print(f"\nDataset split for O3:")
print(f"   Training examples: {len(train_data)}")
print(f"   Validation examples: {len(val_data)}")

# Save formatted data
def save_jsonl(data, filepath):
    """Save data in JSONL format"""
    with open(filepath, 'w', encoding='utf-8') as f:
        for example in data:
            f.write(json.dumps(example, ensure_ascii=False) + '\n')
    print(f"Saved {len(data)} examples to {filepath}")

train_file = f"{CONFIG['output_dir']}/train_o3_crust.jsonl"
val_file = f"{CONFIG['output_dir']}/val_o3_crust.jsonl"

save_jsonl(train_data, train_file)
if val_data:
    save_jsonl(val_data, val_file)

# Display example
print(f"\nExample formatted for O3:")
if train_data:
    example = train_data[0]
    print(f"System: {example['messages'][0]['content'][:150]}...")
    print(f"User: {example['messages'][1]['content'][:200]}...")
    print(f"Assistant: {example['messages'][2]['content'][:200]}...")


Formatting 25 examples for O3 reasoning model...

Dataset split for O3:
   Training examples: 20
   Validation examples: 5
Saved 20 examples to finetune_outputs/train_o3_crust.jsonl
Saved 5 examples to finetune_outputs/val_o3_crust.jsonl

Example formatted for O3:
System: You are an expert C-to-Rust translator with deep understanding of systems programming and memory safety.

REASONING APPROACH:
1. First analyze the C c...
User: Translate this C code to safe, idiomatic Rust using step-by-step reasoning:

**CONTEXT:**
- Project: rubiksolver
- Source: rubik_model_tests.c → hash.rs
- Complexity: 68 lines of C code

**C CODE TO T...
Assistant: **ANALYSIS:**
- Manual memory allocation/deallocation detected

**STRATEGY:**
- Add comprehensive error handling\n- Ensure thread safety where applicable\n- Use Rust naming conventions (snake_case)

*...


In [8]:
def analyze_o3_costs():
    """
    Detailed cost analysis for O3 fine-tuning
    """
    
    print(f"O3 COST ANALYSIS")
    print("=" * 40)
    
    # Rough token calculation
    total_tokens = 0
    for example in formatted_data:
        for message in example['messages']:
            total_tokens += len(message['content']) // 4
    
    print(f"Token breakdown:")
    print(f"   Total examples: {len(formatted_data)}")
    print(f"   Total tokens: {total_tokens:,}")
    print(f"   Average tokens per example: {total_tokens // len(formatted_data) if formatted_data else 0:,}")
    
    # O3 cost estimates
    cost_scenarios = {
        'o3-mini (optimistic)': {'training': 0.50, 'inference': 0.15},
        'o3-mini (realistic)': {'training': 2.00, 'inference': 0.50}, 
        'o3 (expensive)': {'training': 10.00, 'inference': 2.00}
    }
    
    print(f"\nCost estimates per 1K tokens:")
    print(f"{'Model':<20} {'Training':<12} {'Inference':<12} {'Total (3 epochs)':<15}")
    print("-" * 65)
    
    for model, costs in cost_scenarios.items():
        training_cost_total = (total_tokens * 3 * costs['training']) / 1000
        inference_cost = costs['inference']
        
        print(f"{model:<20} ${costs['training']:<11.2f} ${inference_cost:<11.2f} ${training_cost_total:<14.2f}")
    
    # Recommendation
    estimated_cost = (total_tokens * 3 * 2.00) / 1000
    
    print(f"\nRECOMMENDED CONFIGURATION:")
    print(f"   Model: o3-mini")
    print(f"   Examples: {len(formatted_data)}")
    print(f"   Epochs: 1 (cost control)")
    print(f"   Estimated cost: ${estimated_cost/3:.2f}")
    
    return estimated_cost

estimated_cost = analyze_o3_costs()

def create_o3_job_config():
    """
    Create O3 fine-tuning job configuration
    """
    
    print(f"\nO3 FINE-TUNING CONFIGURATION")
    print("=" * 40)
    
    # Conservative config for O3
    job_config = {
        'model': CONFIG['target_model'],
        'training_file': train_file,
        'validation_file': val_file if val_data else None,
        'hyperparameters': {
            'n_epochs': 1,
            'learning_rate_multiplier': 0.1,
            'batch_size': 1
        },
        'suffix': f'crust-o3-{int(time.time())}',
        'cost_controls': {
            'max_cost_limit': 50.0,
            'early_stopping': True,
            'monitor_cost': True
        }
    }
    
    print(f"Configuration:")
    for key, value in job_config.items():
        if isinstance(value, dict):
            print(f"   {key}:")
            for subkey, subvalue in value.items():
                print(f"     {subkey}: {subvalue}")
        else:
            print(f"   {key}: {value}")
    
    return job_config

o3_config = create_o3_job_config()

O3 COST ANALYSIS
Token breakdown:
   Total examples: 25
   Total tokens: 32,184
   Average tokens per example: 1,287

Cost estimates per 1K tokens:
Model                Training     Inference    Total (3 epochs)
-----------------------------------------------------------------
o3-mini (optimistic) $0.50        $0.15        $48.28         
o3-mini (realistic)  $2.00        $0.50        $193.10        
o3 (expensive)       $10.00       $2.00        $965.52        

RECOMMENDED CONFIGURATION:
   Model: o3-mini
   Examples: 25
   Epochs: 1 (cost control)
   Estimated cost: $64.37

O3 FINE-TUNING CONFIGURATION
Configuration:
   model: gpt-4o-mini-2024-07-18
   training_file: finetune_outputs/train_o3_crust.jsonl
   validation_file: finetune_outputs/val_o3_crust.jsonl
   hyperparameters:
     n_epochs: 1
     learning_rate_multiplier: 0.1
     batch_size: 1
   suffix: crust-o3-1751062164
   cost_controls:
     max_cost_limit: 50.0
     early_stopping: True
     monitor_cost: True


In [28]:
def create_working_job_config(train_file, val_file=None, output_dir="./"):
    """
    Create a WORKING fine-tuning job configuration that actually works with OpenAI API
    """
    
    print(f"\nWORKING FINE-TUNING CONFIGURATION")
    print("=" * 45)
    
    available_models = [
        'gpt-4o-mini-2024-07-18',
        'gpt-4o-2024-08-06',         # Full GPT-4o (expensive)
        'gpt-3.5-turbo-0125',        # Legacy fallback
        'gpt-3.5-turbo-1106',        # Older legacy
    ]
    
    target_model = 'gpt-4o-mini-2024-07-18'

    timestamp = str(int(time.time()))[-6:]
    
    # VALID OpenAI API configuration
    job_config = {
        'model': target_model,
        'training_file': train_file,
        'validation_file': val_file if val_file else None,
        'output_dir': output_dir,
        'hyperparameters': {
            'n_epochs': 3,
            'learning_rate_multiplier': 2.0,
            'batch_size': 'auto'
        },
        'suffix': f'crust-{timestamp}'
    }
    
    print(f"✓ Using REAL model: {target_model}")
    print(f"✓ Valid hyperparameters only")
    print(f"✓ Compatible with OpenAI API")
    print(f"✓ Output directory: {output_dir}")
    
    print(f"\nConfiguration:")
    for key, value in job_config.items():
        if isinstance(value, dict):
            print(f"   {key}:")
            for subkey, subvalue in value.items():
                print(f"     {subkey}: {subvalue}")
        else:
            print(f"   {key}: {value}")
    
    return job_config

def execute_real_fine_tuning(config, mock_mode=True):
    """
    Execute REAL fine-tuning job that actually works with OpenAI
    """
    
    if mock_mode:
        print(f"\nMOCK EXECUTION (Demo Mode)")
        print("=" * 40)
        print("This shows what WOULD happen with a real API call")
        
        mock_job = {
            'id': 'ftjob-working-demo-12345',
            'object': 'fine_tuning.job',
            'model': config['model'],
            'created_at': int(time.time()),
            'training_file': config['training_file'],
            'validation_file': config.get('validation_file'),
            'hyperparameters': config['hyperparameters'],
            'suffix': config['suffix'],
            'status': 'validating_files'
        }
        
        print(f"Mock job created: {mock_job['id']}")
        print(f"Status: {mock_job['status']}")
        return mock_job
    
    else:
        print(f"\nREAL API EXECUTION")
        print("=" * 30)
        
        if not OPENAI_AVAILABLE:
            print("OpenAI package not available")
            return None
        
        try:
            client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
            
            # Upload training file and get file ID
            print("Uploading training file...")
            with open(config['training_file'], 'rb') as f:
                train_file_obj = client.files.create(file=f, purpose='fine-tune')
            
            print(f"Training file uploaded: {train_file_obj.id}")
            
            # Upload validation file if provided
            val_file_id = None
            if config.get('validation_file'):
                print("Uploading validation file...")
                with open(config['validation_file'], 'rb') as f:
                    val_file_obj = client.files.create(file=f, purpose='fine-tune')
                val_file_id = val_file_obj.id
                print(f"Validation file uploaded: {val_file_id}")
            
            print("Creating fine-tuning job...")
            
            job_params = {
                'training_file': train_file_obj.id,
                'model': config['model'],
                'hyperparameters': config['hyperparameters'],
                'suffix': config['suffix']
            }
            
            if val_file_id:
                job_params['validation_file'] = val_file_id
            
            job = client.fine_tuning.jobs.create(**job_params)
            
            print(f"Fine-tuning job created successfully!")
            print(f"   Job ID: {job.id}")
            print(f"   Status: {job.status}")
            print(f"   Model: {job.model}")
            
            # Save job info with proper error handling
            job_info = {
                'job_id': job.id,
                'created_at': job.created_at,
                'model': job.model,
                'training_file_id': job.training_file,
                'validation_file_id': getattr(job, 'validation_file', None),
                'status': job.status,
                'hyperparameters': job.hyperparameters,
                'suffix': config['suffix'],
                'original_config': config
            }
            
            # Save with error handling
            try:
                output_dir = config.get('output_dir', './')
                os.makedirs(output_dir, exist_ok=True)  # Create directory if needed
                output_file = os.path.join(output_dir, "real_job_info.json")
                
                with open(output_file, 'w') as f:
                    json.dump(job_info, f, indent=2, default=str)
                
                print(f"Job info saved to: {output_file}")
            except Exception as save_error:
                print(f"Job created successfully but couldn't save info: {save_error}")
                print(f"   Job ID: {job.id} (save this!)")
            
            return job
            
        except Exception as e:
            print(f"Error creating job: {e}")
            print("Common issues:")
            print("- Invalid API key")
            print("- Insufficient account tier (need Tier 1+ for GPT-4o mini)")
            print("- File not properly formatted (must be JSONL)")
            print("- Invalid model name")
            print("- Suffix too long (max 18 chars)")
            return None

print("Creating working fine-tuning configuration...")

train_file = "o3_fine_tuning_outputs/train_o3_crust.jsonl"
val_file = "o3_fine_tuning_outputs/val_o3_crust.jsonl"
output_dir = "o3_fine_tuning_outputs"

working_config = create_working_job_config(train_file, val_file, output_dir)
result = execute_real_fine_tuning(working_config, mock_mode=False)

Creating working fine-tuning configuration...

WORKING FINE-TUNING CONFIGURATION
✓ Using REAL model: gpt-4o-mini-2024-07-18
✓ Valid hyperparameters only
✓ Compatible with OpenAI API
✓ Output directory: o3_fine_tuning_outputs

Configuration:
   model: gpt-4o-mini-2024-07-18
   training_file: o3_fine_tuning_outputs/train_o3_crust.jsonl
   validation_file: o3_fine_tuning_outputs/val_o3_crust.jsonl
   output_dir: o3_fine_tuning_outputs
   hyperparameters:
     n_epochs: 3
     learning_rate_multiplier: 2.0
     batch_size: auto
   suffix: crust-917256

REAL API EXECUTION
Uploading training file...
Training file uploaded: file-WUUuiGUNEak78aD7WQARMy
Uploading validation file...
Validation file uploaded: file-7emUVJEFwhnVN2rZoJ8NZJ
Creating fine-tuning job...
Fine-tuning job created successfully!
   Job ID: ftjob-r5JVflLgD5nLf10rqsnpe0Pu
   Status: validating_files
   Model: gpt-4o-mini-2024-07-18
Job info saved to: o3_fine_tuning_outputs\real_job_info.json


In [9]:
def monitor_job_status(job):
    """Check fine-tuning job status and return model ID if complete"""
    
    job_id = job.id if hasattr(job, 'id') else job.get('id')
    if not job_id:
        return None
    
    try:
        client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
        current_job = client.fine_tuning.jobs.retrieve(job_id)
        
        print(f"Job {job_id}: {current_job.status}")
        
        if current_job.status == 'succeeded':
            print(f"Model ready: {current_job.fine_tuned_model}")
            return current_job.fine_tuned_model
        elif current_job.status == 'failed':
            print(f"Job failed: {getattr(current_job, 'error', 'Unknown error')}")
            return None
        else:
            return 'in_progress'
            
    except Exception as e:
        print(f"Error checking status: {e}")
        return None

def test_model(model_id, test_examples, max_tests=3):
    """Test fine-tuned model with C-to-Rust translation"""
    
    if not model_id or model_id == 'in_progress':
        print("Model not ready")
        return []
    
    print(f"\nTesting model: {model_id}")
    
    client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
    results = []
    
    for i, example in enumerate(test_examples[:max_tests], 1):
        print(f"\n--- Test {i}: {example.get('project', 'Unknown')} ---")
        
        c_code = example.get('c_code', '')
        print(f"C Code: {c_code[:100]}...")
        
        prompt = f"""Translate this C code to safe, idiomatic Rust:

```c
{c_code}
```"""
        
        try:
            response = client.chat.completions.create(
                model=model_id,
                messages=[
                    {"role": "system", "content": "You are an expert C-to-Rust translator."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=1500,
                temperature=0.1
            )
            
            generated_rust = response.choices[0].message.content
            print(f"Generated: {generated_rust[:100]}...")
            
            # Quick evaluation
            quality = evaluate_quality(generated_rust)
            compiles = check_compilation(generated_rust)
            
            print(f"Quality: {quality:.2f}, Compiles: {compiles}")
            
            results.append({
                'project': example.get('project'),
                'quality': quality,
                'compiles': compiles,
                'generated_rust': generated_rust
            })
            
        except Exception as e:
            print(f"Error: {e}")
            results.append({'project': example.get('project'), 'error': str(e)})
    
    # Summary
    if results:
        valid = [r for r in results if 'quality' in r]
        if valid:
            avg_quality = sum(r['quality'] for r in valid) / len(valid)
            compile_rate = sum(r['compiles'] for r in valid) / len(valid)
            
            print(f"\nSummary: Quality {avg_quality:.2f}/1.0, Compiles {compile_rate:.1%}")
            
            if avg_quality >= 0.6:
                print("Excellent performance!")
            elif avg_quality >= 0.4:
                print("Good results")
            else:
                print("Needs improvement")
    
    return results

def evaluate_quality(rust_code):
    """Quick quality score for Rust code"""
    
    rust_lower = rust_code.lower()
    score = 0
    
    # Positive patterns
    good_patterns = ['vec<', 'result<', 'option<', '?', 'impl', 'pub fn', 'use std']
    for pattern in good_patterns:
        if pattern in rust_lower:
            score += 0.15
    
    # Negative patterns
    bad_patterns = ['unsafe', '.unwrap()', 'panic!', '*mut', 'malloc']
    for pattern in bad_patterns:
        if pattern in rust_lower:
            score -= 0.2
    
    return max(0, min(1, score))

def check_compilation(rust_code):
    """Check if Rust code compiles"""
    
    try:
        with tempfile.NamedTemporaryFile(mode='w', suffix='.rs', delete=False) as f:
            if 'fn main(' not in rust_code:
                full_code = f"{rust_code}\n\nfn main() {{}}\n"
            else:
                full_code = rust_code
            
            f.write(full_code)
            temp_file = f.name
        
        result = subprocess.run(
            ['rustc', '--crate-type', 'bin', temp_file, '-o', '/dev/null'],
            capture_output=True,
            timeout=10
        )
        
        os.unlink(temp_file)
        return result.returncode == 0
        
    except:
        return False

def evaluate_model(job, dataset):
    """Complete evaluation: check job → test model → results"""
    
    print("Fine-tuned Model Evaluation")
    print("=" * 35)
    
    # Check job status
    model_id = monitor_job_status(job)
    
    if model_id == 'in_progress':
        print("Job still running. Try again later.")
        return None
    
    if not model_id:
        print("Job failed or not found")
        return None
    
    # Test the model
    test_examples = dataset[-3:] if len(dataset) >= 3 else dataset
    results = test_model(model_id, test_examples)
    
    # Save results
    report = {
        'model_id': model_id,
        'timestamp': time.time(),
        'results': results
    }
    
    with open('evaluation_results.json', 'w') as f:
        json.dump(report, f, indent=2, default=str)
    
    print(f"Results saved to evaluation_results.json")
    
    return report

print("Evaluating your fine-tuning job...")

try:
    report = evaluate_model(result, dataset)
    
    if report:
        print(f"\nEvaluation complete!")
        print(f"Model: {report['model_id']}")
    else:
        print(f"\nCheck again later when job completes")
        
except NameError:
    print("Need 'result' from execute_real_fine_tuning() and 'dataset'")
    print("Usage: evaluate_model(your_job_result, your_dataset)")

Evaluating your fine-tuning job...
Need 'result' from execute_real_fine_tuning() and 'dataset'
Usage: evaluate_model(your_job_result, your_dataset)
