In [11]:
import random
from pathlib import Path
from typing import List, Dict

import pandas as pd
from datasets import Dataset
from tqdm import tqdm
from transformers import AutoTokenizer

# Seed Python's built-in RNG once, up front, so `random`-based steps are repeatable.
SEED = 42
random.seed(SEED)

In [12]:
def _load_cola_split(path):
    """Read one header-less CoLA TSV file and keep only the text and label columns."""
    full_frame = pd.read_csv(
        path,
        sep="\t",
        header=None,
        names=["source", "label", "first_label", "text"],
    )
    return full_frame[["text", "label"]]


# The in-domain dev file is used as the test split.
train_data = _load_cola_split("cola_data/in_domain_train.tsv")
test_data = _load_cola_split("cola_data/in_domain_dev.tsv")

In [13]:
# Instruction template for the acceptability task. `{sentence}` is filled in
# per example with `str.format`; the surrounding newlines are stripped by the
# code that renders the template.
prompt = (
    "\n"
    'Decide whether the following sentence is grammatically acceptable or not. '
    'If it is grammatically correct, answer "acceptable". '
    'If not, answer "unacceptable". '
    'Only output "acceptable" or "unacceptable", and do not output any other information.\n'
    "\n"
    "Sentence: {sentence}\n"
    "\n"
    "Your answer:\n"
)

In [14]:
def _to_instruction(sentence):
    """Fill the prompt template with one sentence and trim surrounding whitespace."""
    return prompt.format(sentence=sentence).strip()


def _to_answer(label):
    """Map a label to its answer string: 0 -> "unacceptable", anything else -> "acceptable"."""
    return "unacceptable" if label == 0 else "acceptable"


# Both splits receive the same two derived columns, added in place.
for frame in (train_data, test_data):
    frame["instruction"] = frame["text"].apply(_to_instruction)
    frame["output"] = frame["label"].apply(_to_answer)

print(train_data.shape, test_data.shape)

(8551, 4) (527, 4)


In [15]:
# Turn each frame into a list of per-row dicts (one dict per example).
# NOTE: the names are rebound from DataFrames to lists here, so this cell
# cannot be re-run on its own without re-running the cells above it.
train_data, test_data = (
    frame.to_dict(orient="records") for frame in (train_data, test_data)
)

# Print the rendered prompt of the first training example as a sanity check.
print(train_data[0]["instruction"])

Decide whether the following sentence is grammatically acceptable or not. If it is grammatically correct, answer "acceptable". If not, answer "unacceptable". Only output "acceptable" or "unacceptable", and do not output any other information.

Sentence: Our friends won't buy this analysis, let alone the next one we propose.

Your answer:


In [None]:
# Load the Qwen3-0.6B tokenizer.
# NOTE(review): `tokenizer` is not referenced by any of the visible cells — confirm it is still needed.
tokenizer_path = "model/Qwen3-0.6B"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)


def get_rl_data(
    data_source: str,
    data: List[Dict[str, object]],
    split: str = "train",
) -> "Dataset":
    """Convert instruction/output records into an RL-style `Dataset`.

    Each input record becomes one row with three fields:
    ``data_source`` (the tag passed in), ``prompt`` (a single-turn chat
    message list with the instruction as the user message) and
    ``reward_model`` (a dict holding the expected answer under
    ``ground_truth``).

    Args:
        data_source: Tag stored in every row's ``data_source`` field.
        data: Records that each provide at least the keys ``"instruction"``
            and ``"output"``; other keys are ignored.
        split: Split name attached to the resulting dataset. Defaults to
            ``"train"`` (the previously hard-coded value) so existing calls
            behave as before; pass e.g. ``"test"`` for evaluation data so it
            is not labelled as the train split.

    Returns:
        A `Dataset` with one row per input record, in input order.

    Raises:
        KeyError: If a record lacks ``"instruction"`` or ``"output"``.
    """
    rl_data = [
        {
            "data_source": data_source,
            "prompt": [{"content": d["instruction"], "role": "user"}],
            "reward_model": {"ground_truth": d["output"]},
        }
        for d in tqdm(data)  # tqdm only adds a progress bar around the iteration
    ]
    return Dataset.from_list(rl_data, split=split)

In [17]:
# Convert both record lists with the same data-source tag, train first.
ds_train, ds_test = (
    get_rl_data("cola", records) for records in (train_data, test_data)
)
print(ds_train, ds_test)

100%|██████████| 8551/8551 [00:00<00:00, 998426.97it/s]
100%|██████████| 527/527 [00:00<00:00, 796539.89it/s]

Dataset({
    features: ['data_source', 'prompt', 'reward_model'],
    num_rows: 8551
}) Dataset({
    features: ['data_source', 'prompt', 'reward_model'],
    num_rows: 527
})





In [18]:
# Display the first converted training record to check its structure.
ds_train[0]

{'data_source': 'cola',
 'prompt': [{'content': 'Decide whether the following sentence is grammatically acceptable or not. If it is grammatically correct, answer "acceptable". If not, answer "unacceptable". Only output "acceptable" or "unacceptable", and do not output any other information.\n\nSentence: Our friends won\'t buy this analysis, let alone the next one we propose.\n\nYour answer:',
   'role': 'user'}],
 'reward_model': {'ground_truth': 'acceptable'}}

In [19]:
# Make sure the output directory exists before writing, so the export also
# works on a fresh checkout where `data/cola_rl/` has not been created yet.
Path("data/cola_rl").mkdir(parents=True, exist_ok=True)

# Write each split to its own parquet file.
ds_train.to_parquet("data/cola_rl/train.parquet")
ds_test.to_parquet("data/cola_rl/test.parquet")

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

182568