In [2]:
import pandas as pd
import re

# Load the raw dataset from parquet into `df`; the cleaning steps further down
# in this cell operate on its 'julia_code' column.
df = pd.read_parquet('baseline_julia.parquet')

def clean_code_block(text):
    """Remove a surrounding markdown ```julia code fence from text.

    Args:
        text: Value to clean. Missing values (anything for which
            ``pd.isna`` is true) are returned unchanged.

    Returns:
        ``text`` without the leading ```julia fence (matched
        case-insensitively) and without the trailing ``` fence. Text that
        carries no fence is returned as is.
    """
    if pd.isna(text):
        return text

    # Opening fence: tolerate whitespace before it, then drop the rest of the
    # fence line together with any blank lines that follow. Only horizontal
    # whitespace is consumed in front of each newline, so the indentation of
    # the first real code line survives (a plain `\s*` here would eat it).
    # The second alternative covers a fence that is not followed by a newline.
    text = re.sub(
        r'^\s*```julia(?:(?:[ \t]*\r?\n)+|[ \t]*)',
        '',
        text,
        flags=re.IGNORECASE,
    )

    # Closing fence: optional newline before it, optional whitespace after it.
    text = re.sub(r'\n?```\s*$', '', text)

    return text

def wrap_testset(text):
    """Wrap Julia test code in a ``@testset "my test" begin ... end`` block.

    Args:
        text: Julia test source, optionally surrounded by a markdown code
            fence. Missing values (anything for which ``pd.isna`` is true)
            are returned unchanged.

    Returns:
        The fence-stripped source. If it already contains ``@testset`` it is
        returned without further changes. Otherwise everything after the
        first ``using Test`` (or the whole text when that statement is
        absent) is placed inside a ``@testset "my test" begin ... end`` block.
    """
    if pd.isna(text):
        return text

    # Strip markdown fences before inspecting the content.
    text = clean_code_block(text)

    # Already wrapped: leave it alone to avoid nesting a second testset.
    if '@testset' in text:
        return text

    if 'using Test' in text:
        # Keep whatever precedes the first `using Test`, then open the testset
        # right after it and put the remaining code inside.
        preamble, body = text.split('using Test', 1)
        # The closing `end` is required to terminate the `begin` block; without
        # it the generated Julia code does not parse.
        wrapped = (
            f'{preamble}using Test\n\n'
            f'@testset "my test" begin\n{body.strip()}\nend'
        )
    else:
        # No `using Test` statement: wrap the entire text.
        wrapped = f'@testset "my test" begin\n{text}\nend'

    return wrapped

# Strip markdown code fences from every entry of the julia_code column
cleaned_code = df['julia_code'].apply(clean_code_block)
df['julia_code'] = cleaned_code

# Wrapping of the julia_test column is currently disabled
#df['julia_test'] = df['julia_test'].apply(wrap_testset)

# Persist the result as parquet (without the index)
df.to_parquet('cleaned_file.parquet', index=False)

# CSV alternative, kept for reference
# df.to_csv('cleaned_file.csv', index=False)

row_count = len(df)
print("Cleaning complete!")
print(f"Processed {row_count} rows")

# Show the first julia_test entry as a quick sanity check
print("\nSample julia_test after processing:")
if row_count > 0:
    sample = df['julia_test'].iloc[0]
else:
    sample = "No data"
print(sample)

Cleaning complete!
Processed 1247 rows

Sample julia_test after processing:
using Test

@testset "my test" begin
@test echo_nums(1, 5) == [1, 2, 3, 4, 5]
@test echo_nums(10, 1) == Int[]
@test echo_nums(5, 5) == [5]
end


In [3]:
import pandas as pd

# Reload the cleaned dataset from 'cleaned_file.parquet'; this rebinds `df`,
# which print_row below reads as a module-level variable.
df = pd.read_parquet('cleaned_file.parquet')

# Function to print a row nicely
def print_row(row_idx):
    """Print one row of the module-level ``df`` in a readable layout.

    Emits a banner with the row number, then the row's ``task_id``,
    ``julia_prompt``, ``julia_code`` and ``julia_test`` values, each of the
    last three under its own heading and an 80-character dashed rule.

    Args:
        row_idx: Positional index of the row, passed to ``df.iloc``.
    """
    heavy_rule = '=' * 80
    light_rule = "-" * 80

    print(f"\n{heavy_rule}")
    print(f"ROW {row_idx}")
    print(f"{heavy_rule}")

    # Looked up after the banner, so the banner is printed even if the
    # index turns out to be invalid.
    record = df.iloc[row_idx]

    print(f"\n📝 task_id: {record['task_id']}")

    sections = (
        ("\n📋 julia_prompt:", 'julia_prompt'),
        ("\n💻 julia_code:", 'julia_code'),
        ("\n🧪 julia_test:", 'julia_test'),
    )
    for heading, column in sections:
        print(heading)
        print(light_rule)
        print(record[column])
    print()

# Preview at most the first five rows of the dataset
total_rows = len(df)
num_rows = min(5, total_rows)
print(f"Total rows in dataset: {total_rows}")
print(f"Showing first {num_rows} rows:\n")

for row_idx in range(num_rows):
    print_row(row_idx)

# Closing banner
banner = '=' * 80
print(f"\n{banner}")
print("Preview complete!")
print(f"{banner}")

Total rows in dataset: 1247
Showing first 5 rows:


ROW 0

📝 task_id: task_0

📋 julia_prompt:
--------------------------------------------------------------------------------
Implement a function `echo_nums(x, y)` that takes two integers, `x` and `y`, and returns a vector of all numerical values within the range from `x` to `y`, inclusive. The function should handle cases where `x` is greater than `y` by returning an empty vector.

💻 julia_code:
--------------------------------------------------------------------------------
function echo_nums(x, y)
    if x > y
        return []
    else
        return collect(x:y)
    end
end

🧪 julia_test:
--------------------------------------------------------------------------------
using Test

@testset "my test" begin
@test echo_nums(1, 5) == [1, 2, 3, 4, 5]
@test echo_nums(10, 1) == Int[]
@test echo_nums(5, 5) == [5]
end


ROW 1

📝 task_id: task_2

📋 julia_prompt:
--------------------------------------------------------------------------------
