In [1]:
import pandas as pd
# Load the source dataset and print its column names as a quick schema check.
df = pd.read_parquet("cargo_test_passed_train.parquet")
print(df.columns)

Index(['task_id', 'rust_prompt', 'rust_code', 'rust_test_list'], dtype='object')


In [2]:
from openai import OpenAI

# Client for an OpenAI-compatible endpoint served on localhost:8001.
# NOTE(review): the api_key is a placeholder string; presumably the local
# server does not validate it — confirm before pointing this at another host.
client = OpenAI(
    base_url="http://localhost:8001/v1",
    api_key="not-needed"
)

def get_model_response(prompt: str) -> str:
    """Send ``prompt`` as a single user turn and return the model's reply text.

    The request goes through the module-level ``client`` with a fixed system
    message and model name. This function never raises; it returns a sentinel
    string instead:

    * ``"[No response returned]"`` when the completion has no choices, no
      message, or empty content.
    * ``"[Error: <exception>]"`` when anything inside the request fails.

    Callers that persist the result should filter these sentinels out.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    try:
        completion = client.chat.completions.create(
            model="Unsloth/Llama-3.3-70B-Instruct",
            messages=conversation,
        )
        # Guard clauses: bail out with the sentinel as soon as a piece is missing.
        if not completion.choices:
            return "[No response returned]"
        reply = completion.choices[0].message
        if not reply or not reply.content:
            return "[No response returned]"
        return reply.content.strip()
    except Exception as err:
        # Best-effort by design: one failed request must not abort a long batch.
        return f"[Error: {err}]"

In [3]:
# Prompt template that asks the model to rewrite a Rust-flavoured task
# description as an R-flavoured one without answering it.
# `{prompt}` is filled in with str.format; the trailing backslash on the first
# line joins it to the next line so no newline is inserted there.
translate_to_R = """The following prompt may have Rust syntax, variable types, and Rust-specific behavior mentioned. \
Translate the question to the appropriate R syntax and replace anything mentioning Rust with R.

Do NOT use or mention:
* Complex type annotations
* Overly specific type constraints
* Advanced R metaprogramming
* External packages
Make sure to use:
* Simple types: integer, numeric, character, vector, list, etc.
Only return the translated question and do not answer the question.

Rust Prompt:
{prompt}

R Prompt:"""

In [4]:
# Translate each Rust prompt in the selected slice into an R prompt and save
# the (task_id, r_prompt) pairs to parquet.
start_idx = 0
end_idx = 2000
# Single source of truth for the output file, so the save call and the
# confirmation message below cannot disagree.
output_path = "translated_r_prompts.parquet"
results = []

for i, row in df.iloc[start_idx:end_idx].iterrows():
    task_id = row["task_id"]
    rust_prompt = row["rust_prompt"]

    final_prompt = translate_to_R.format(prompt=rust_prompt)
    # get_model_response never raises; failed requests come back as
    # "[Error: ...]" / "[No response returned]" strings and are stored as-is.
    r_prompt = get_model_response(final_prompt)

    results.append({
        "task_id": task_id,
        "r_prompt": r_prompt
    })

    print(f"✅ Processed index {i} | Task ID: {task_id}")

output_df = pd.DataFrame(results)
output_df.to_parquet(output_path, index=False)
print(f"✅ Successfully saved {len(output_df)} translated prompts to '{output_path}'")

✅ Processed index 0 | Task ID: task_0
✅ Processed index 1 | Task ID: task_1
✅ Processed index 2 | Task ID: task_2
✅ Processed index 3 | Task ID: task_3
✅ Processed index 4 | Task ID: task_4
✅ Processed index 5 | Task ID: task_8
✅ Processed index 6 | Task ID: task_9
✅ Processed index 7 | Task ID: task_10
✅ Processed index 8 | Task ID: task_11
✅ Processed index 9 | Task ID: task_12
✅ Processed index 10 | Task ID: task_14
✅ Processed index 11 | Task ID: task_15
✅ Processed index 12 | Task ID: task_16
✅ Processed index 13 | Task ID: task_17
✅ Processed index 14 | Task ID: task_18
✅ Processed index 15 | Task ID: task_19
✅ Processed index 16 | Task ID: task_20
✅ Processed index 17 | Task ID: task_21
✅ Processed index 18 | Task ID: task_22
✅ Processed index 19 | Task ID: task_23
✅ Processed index 20 | Task ID: task_24
✅ Processed index 21 | Task ID: task_25
✅ Processed index 22 | Task ID: task_26
✅ Processed index 23 | Task ID: task_27
✅ Processed index 24 | Task ID: task_28
✅ Processed index

In [5]:
# Rebind `df` to the translated prompts written by the translation cell
# (columns: task_id, r_prompt). The original Rust dataframe is no longer in `df`.
df = pd.read_parquet("translated_r_prompts.parquet")

In [6]:
# Prompt template that asks the model for a single base-R function answering
# the question. `{R_prompt}` is filled in with str.format.
R_code_prompt = """You are a pragmatic R programmer. 
Given the following question, write an R function to complete the task. 
Make the code simple and easy to understand. 
The code should be syntactically correct and follow R best practices and should run. 
Try to limit library usage to base R. 
Do not wrap the function in additional scaffolding. Do not add a main function. Respond with only the R function and nothing else.
Question:
{R_prompt}
Code:"""

In [7]:
# Generate an R implementation for every translated prompt in the slice and
# persist the frame with the new `r_code` column.
start_idx = 0
end_idx = 2000

# Every row starts out empty; only rows inside the slice are filled in below.
df["r_code"] = ""

batch = df.iloc[start_idx:end_idx]
for idx, record in batch.iterrows():
    task_id = record["task_id"]
    generated = get_model_response(
        R_code_prompt.format(R_prompt=record["r_prompt"])
    )
    df.at[idx, "r_code"] = generated
    print(f"✅ Processed index {idx} | Task ID: {task_id}")

df.to_parquet("with_r_code.parquet", index=False)
print("💾 Saved updated file with r_code column.")


✅ Processed index 0 | Task ID: task_0
✅ Processed index 1 | Task ID: task_1
✅ Processed index 2 | Task ID: task_2
✅ Processed index 3 | Task ID: task_3
✅ Processed index 4 | Task ID: task_4
✅ Processed index 5 | Task ID: task_8
✅ Processed index 6 | Task ID: task_9
✅ Processed index 7 | Task ID: task_10
✅ Processed index 8 | Task ID: task_11
✅ Processed index 9 | Task ID: task_12
✅ Processed index 10 | Task ID: task_14
✅ Processed index 11 | Task ID: task_15
✅ Processed index 12 | Task ID: task_16
✅ Processed index 13 | Task ID: task_17
✅ Processed index 14 | Task ID: task_18
✅ Processed index 15 | Task ID: task_19
✅ Processed index 16 | Task ID: task_20
✅ Processed index 17 | Task ID: task_21
✅ Processed index 18 | Task ID: task_22
✅ Processed index 19 | Task ID: task_23
✅ Processed index 20 | Task ID: task_24
✅ Processed index 21 | Task ID: task_25
✅ Processed index 22 | Task ID: task_26
✅ Processed index 23 | Task ID: task_27
✅ Processed index 24 | Task ID: task_28
✅ Processed index

In [8]:
# Prompt template that asks the model for three testthat unit tests for a given
# question/function pair. `{R_prompt}` and `{R_code}` are filled in with
# str.format; the braces of the embedded R example are doubled (`{{` / `}}`)
# so that str.format emits them as literal braces.
R_tests_prompt = """You are a pragmatic R programmer. Given the following question and R function, write three unit tests for the function. The tests should be a simple line delimited list of test_that statements using the testthat library.
For example, if the function is:
```R
add_nums <- function(x, y) {{
  x + y
}}
```
The unit tests should be
```R
library(testthat)
test_that("add_nums works correctly", {{
  expect_equal(add_nums(1, 2), 3)
  expect_equal(add_nums(10, 2), 12)
  expect_equal(add_nums(-10, 2), -8)
}})
```
Make the tests simple and easy to understand. The code should be syntactically correct and run without errors. Do not add any other code. Respond with only the test statements (including the `library(testthat)` line) and nothing else.
============
Question:
{R_prompt}
Code:
{R_code}
Unit Tests:"""

In [9]:
# Reload the frame saved after R code generation, so the test-generation step
# starts from the file on disk rather than from in-memory state.
df = pd.read_parquet("with_r_code.parquet")

In [10]:
# Generate testthat unit tests for every (prompt, code) pair in the slice and
# persist the frame with the new `r_test` column.
start_idx = 0
end_idx = 2000

# Every row starts out empty; only rows inside the slice are filled in below.
df["r_test"] = ""

for i, row in df.iloc[start_idx:end_idx].iterrows():
    r_prompt = row["r_prompt"]
    r_code = row["r_code"]
    task_id = row["task_id"]

    final_prompt = R_tests_prompt.format(
        R_prompt=r_prompt,
        R_code=r_code
    )

    # get_model_response never raises; failed requests come back as
    # "[Error: ...]" / "[No response returned]" strings and are stored as-is.
    r_test = get_model_response(final_prompt)

    df.at[i, "r_test"] = r_test

    print(f"✅ Processed index {i} | Task ID: {task_id}")

df.to_parquet("r_set_.parquet", index=False)
# The column written above is `r_test`; the message previously said "julia_test".
print("💾 Saved updated file with r_test column.")


✅ Processed index 0 | Task ID: task_0
✅ Processed index 1 | Task ID: task_1
✅ Processed index 2 | Task ID: task_2
✅ Processed index 3 | Task ID: task_3
✅ Processed index 4 | Task ID: task_4
✅ Processed index 5 | Task ID: task_8
✅ Processed index 6 | Task ID: task_9
✅ Processed index 7 | Task ID: task_10
✅ Processed index 8 | Task ID: task_11
✅ Processed index 9 | Task ID: task_12
✅ Processed index 10 | Task ID: task_14
✅ Processed index 11 | Task ID: task_15
✅ Processed index 12 | Task ID: task_16
✅ Processed index 13 | Task ID: task_17
✅ Processed index 14 | Task ID: task_18
✅ Processed index 15 | Task ID: task_19
✅ Processed index 16 | Task ID: task_20
✅ Processed index 17 | Task ID: task_21
✅ Processed index 18 | Task ID: task_22
✅ Processed index 19 | Task ID: task_23
✅ Processed index 20 | Task ID: task_24
✅ Processed index 21 | Task ID: task_25
✅ Processed index 22 | Task ID: task_26
✅ Processed index 23 | Task ID: task_27
✅ Processed index 24 | Task ID: task_28
✅ Processed index

In [14]:
import pandas as pd
import re

# Read the parquet file written by the test-generation step; it carries the
# r_code and r_test columns that are cleaned below.
df = pd.read_parquet('r_set_.parquet')

def clean_code_block(text):
    """Strip a surrounding Markdown code fence from a model response.

    Parameters
    ----------
    text : str or missing value
        Raw model output, possibly wrapped in fence lines such as
        "```r" ... "```".

    Returns
    -------
    The text without its opening and closing fence. Missing values
    (anything ``pd.isna`` reports as NA) are returned unchanged, and text
    without fences is returned as-is.

    Notes
    -----
    Only a fence at the very start and a fence at the very end are removed;
    prose before or after a fenced block is left untouched.
    """
    if pd.isna(text):
        return text

    # Opening fence: optional leading whitespace, the backticks, an optional
    # language tag, trailing spaces and ONE line break. Consuming the whole
    # tag (not just a leading "r") keeps "```rust" from turning into "ust",
    # and stopping after one newline preserves the first line's indentation.
    text = re.sub(r'^\s*```[A-Za-z0-9_+-]*[ \t]*\r?\n?', '', text)

    # Closing fence at the end of the text (with optional trailing whitespace).
    text = re.sub(r'\n?```\s*$', '', text)

    return text

# Strip Markdown fences from the generated R code and R tests.
df['r_code'] = df['r_code'].apply(clean_code_block)
df['r_test'] = df['r_test'].apply(clean_code_block)

# Save the cleaned dataframe
df.to_parquet('r_dataset.parquet', index=False)


print("Cleaning complete!")
print(f"Processed {len(df)} rows")

# Show the first cleaned test as a spot check; the label names the column
# actually printed (`r_test`), not the Julia column it used to mention.
print("\nSample r_test after processing:")
print(df['r_test'].iloc[0] if len(df) > 0 else "No data")

Cleaning complete!
Processed 2000 rows

Sample julia_test after processing:
library(testthat)
test_that("echo_nums returns correct sequence when x is less than y", {
  expect_equal(echo_nums(1, 3), c(1, 2, 3))
})
test_that("echo_nums returns correct sequence when x is equal to y", {
  expect_equal(echo_nums(5, 5), c(5))
})
test_that("echo_nums returns empty vector when x is greater than y", {
  expect_equal(echo_nums(10, 5), numeric(0))
})
