In [None]:
import csv
import random

import pandas as pd

from utils import save_json

# Fixed seed for Python's global `random` generator so reruns are repeatable.
SEED = 42
random.seed(SEED)

In [9]:
# Column names assigned to the headerless, tab-separated input files.
COLA_COLUMNS = ["source", "label", "first_label", "text"]


def load_cola(path):
    """Read one tab-separated split and keep only the sentence and its label.

    Args:
        path: Location of a headerless TSV file with the four COLA_COLUMNS.

    Returns:
        A new DataFrame with the columns ``text`` and ``label``.
    """
    frame = pd.read_csv(
        path,
        sep="\t",
        header=None,
        names=COLA_COLUMNS,
        # Treat double quotes as ordinary characters: with the default quote
        # handling a sentence that begins with `"` loses its quotes or can
        # swallow following rows. Assumes the files are raw, unquoted TSV.
        quoting=csv.QUOTE_NONE,
    )
    # Explicit copy so later column assignments operate on an independent
    # frame instead of a slice of `frame` (avoids SettingWithCopyWarning).
    return frame[["text", "label"]].copy()


train_data = load_cola("cola_data/in_domain_train.tsv")
test_data = load_cola("cola_data/in_domain_dev.tsv")

In [10]:
# Instruction template for the acceptability task. `{sentence}` is the only
# placeholder (filled via str.format); the leading and trailing newlines are
# part of the value.
prompt = (
    "\n"
    "Decide whether the following sentence is grammatically acceptable or not. "
    'If it is grammatically correct, answer "acceptable". '
    'If not, answer "unacceptable". '
    'Only output "acceptable" or "unacceptable", '
    "and do not output any other information.\n"
    "\n"
    "Sentence: {sentence}\n"
    "\n"
    "Your answer:\n"
)

In [11]:
# Avoid losing the model's reasoning ability: every prompt ends with the
# "/no_think" marker and every target starts with an empty <think> block.
# NOTE(review): presumably this mirrors the target model's non-thinking chat
# format — confirm against the model being fine-tuned.
NO_THINK_SUFFIX = "/no_think"
EMPTY_THINK_PREFIX = "<think>\n\n</think>\n\n"


def to_sft_frame(frame):
    """Turn a ``text``/``label`` frame into ``instruction``/``output`` pairs.

    A new DataFrame is built rather than adding columns to ``frame``, so the
    input is left untouched and no assignment happens on a sliced frame.

    Args:
        frame: DataFrame with a ``text`` column (the sentence) and a ``label``
            column (0 means unacceptable; any other value is treated as
            acceptable).

    Returns:
        DataFrame with the same index and the columns ``instruction`` and
        ``output``, in that order.
    """
    instruction = frame["text"].apply(
        lambda sentence: prompt.format(sentence=sentence).strip() + NO_THINK_SUFFIX
    )
    output = frame["label"].apply(
        lambda label: EMPTY_THINK_PREFIX
        + ("unacceptable" if label == 0 else "acceptable")
    )
    return pd.DataFrame({"instruction": instruction, "output": output})


train_data = to_sft_frame(train_data)
test_data = to_sft_frame(test_data)

print(train_data.shape, test_data.shape)

(8551, 2) (527, 2)


In [12]:
# Convert both frames to lists of row dicts (one dict per example), then show
# the first training example as a sanity check.
train_data, test_data = (
    frame.to_dict(orient="records") for frame in (train_data, test_data)
)

print(train_data[0])

{'instruction': 'Decide whether the following sentence is grammatically acceptable or not. If it is grammatically correct, answer "acceptable". If not, answer "unacceptable". Only output "acceptable" or "unacceptable", and do not output any other information.\n\nSentence: Our friends won\'t buy this analysis, let alone the next one we propose.\n\nYour answer:/no_think', 'output': '<think>\n\n</think>\n\nacceptable'}


In [13]:
# Write each split's records to its JSON file via the project's save_json helper.
for json_path, records in (
    ("cola_train.json", train_data),
    ("cola_test.json", test_data),
):
    save_json(json_path, records)