In [None]:
import json
import re

# Discourse cue words/phrases to delete (mask) from the text; extend as needed.
CUE_WORDS = {
    "in fact", "so", "but", "therefore", "thus", "however",
    "moreover", "on the other hand", "in contrast", "as a result",
    "consequently", "hence", "either", "or", "only if"
}

# Build one case-insensitive, word-bounded alternation over the cue phrases.
# - Alternatives are ordered longest-first (ties alphabetical): alternation
#   takes the first alternative that matches, so a longer phrase must come
#   before any shorter cue it starts with. Sorting also makes the pattern
#   independent of set iteration order.
# - The words of a multi-word cue are joined with \s+, so a phrase that is
#   split by a line break or a double space is still matched.
PATTERN = re.compile(
    r'\b('
    + '|'.join(
        r'\s+'.join(re.escape(token) for token in cue.split())
        for cue in sorted(CUE_WORDS, key=lambda cue: (-len(cue), cue))
    )
    + r')\b',
    flags=re.IGNORECASE,
)

def mask_text(text, placeholder=""):
    """Replace every ``PATTERN`` match in ``text`` with ``placeholder``.

    After the substitution, each run of two or more whitespace characters is
    collapsed to a single space and leading/trailing whitespace is stripped.
    ``None`` is passed through unchanged.
    """
    if text is not None:
        masked = PATTERN.sub(placeholder, text)
        # Tidy up the gaps left behind by the removed words.
        text = re.sub(r'\s{2,}', ' ', masked).strip()
    return text

def process_jsonl_file(input_path, output_path, fields_to_mask=("Arg1", "Arg2")):
    """Mask cue words in selected fields of every record of a JSONL file.

    Each non-blank line of ``input_path`` is parsed as JSON. For JSON objects,
    string values stored under the keys in ``fields_to_mask`` are passed
    through ``mask_text``. Every parsed record is then written to
    ``output_path`` as one JSON line, and two summary lines are printed.

    Args:
        input_path: Path of the JSONL file to read (UTF-8).
        output_path: Path of the JSONL file to write (UTF-8, overwritten).
        fields_to_mask: Keys whose string values are masked.

    Returns:
        The number of records in which at least one masked field differs
        from its original text.

    Raises:
        ValueError: If ``output_path`` is the same file as ``input_path``.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    import os  # local import so the function does not rely on other cells

    # Opening the output in 'w' mode truncates it immediately; writing onto
    # the input would therefore wipe the data before a single line is read.
    if os.path.exists(output_path) and os.path.samefile(input_path, output_path):
        raise ValueError(
            f"output_path must differ from input_path: {input_path}"
        )

    affected_count = 0  # number of records changed by masking
    with open(input_path, 'r', encoding='utf-8') as fin, open(output_path, 'w', encoding='utf-8') as fout:
        for line in fin:
            if not line.strip():
                continue
            rec = json.loads(line)
            affected = False  # whether the current record was changed
            # Only JSON objects have named fields; any other JSON value
            # (list, string, number, ...) is copied through untouched.
            if isinstance(rec, dict):
                for field in fields_to_mask:
                    original_text = rec.get(field)
                    if not isinstance(original_text, str):
                        continue
                    modified_text = mask_text(original_text)
                    if modified_text != original_text:
                        rec[field] = modified_text
                        affected = True
            if affected:
                affected_count += 1
            fout.write(json.dumps(rec, ensure_ascii=False) + '\n')
    print(f"Processed {input_path} → {output_path}")
    print(f"Total affected samples: {affected_count}")
    return affected_count

if __name__ == "__main__":
    # Mask the cue words in Arg1/Arg2 of the training split.
    input_file, output_file = (
        "./pdtb3_train_level1_T5.jsonl",
        "./pdtb3_train_level1_T5_mask.jsonl",
    )
    process_jsonl_file(input_file, output_file, fields_to_mask=("Arg1", "Arg2"))
    


Processed ./pdtb3_train_level1_T5.jsonl → ./pdtb3_train_level1_T5_mask.jsonl
Total affected samples: 3985


In [None]:
import json
import os

# Option letter assigned to each level-1 label (A-D, in this order).
label_to_letter = dict(
    zip(("Comparison", "Contingency", "Expansion", "Temporal"), "ABCD")
)

def convert_to_prompt_format(input_path, output_path):
    count = 0
    with open(input_path, 'r', encoding='utf-8') as fin, \
         open(output_path, 'w', encoding='utf-8') as fout:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Skipping invalid JSON line in {input_path}: {e}")
                continue

            arg1 = obj.get("Arg1", "").replace("\n", " ")
            arg2 = obj.get("Arg2", "").replace("\n", " ")
            label = obj.get("Label", "")

            # 如果一级标签不在映射里，跳过
            if label not in label_to_letter:
                # 可选择跳过或映射为 “UNKNOWN”
                continue

            letter = label_to_letter[label]

            prompt_obj = {
                "instruction": "What is the discourse relation between Arg 1 and Arg 2?. Only output one letter: A, B, C, or D.",
                "input": f"Arg1: {arg1}\nArg2: {arg2}\n\nA. Comparison\nB. Contingency\nC. Expansion\nD. Temporal",
                "output": letter
            }

            fout.write(json.dumps(prompt_obj, ensure_ascii=False) + "\n")
            count += 1

    print(f"Converted {input_path} → {output_path}, total lines: {count}")

if __name__ == "__main__":
    # (input, output) pairs for the train/dev/test splits.
    files = [
        ("pdtb3_train_level1.jsonl", "pdtb3_train_instruction.jsonl"),
        ("pdtb3_dev_level1.jsonl",   "pdtb3_dev_instruction.jsonl"),
        ("pdtb3_test_level1.jsonl",  "pdtb3_test_instruction.jsonl"),
    ]
    for inp, outp in files:
        # Report and move on when a split is missing.
        if not os.path.exists(inp):
            print(f"File not found: {inp}")
            continue
        convert_to_prompt_format(inp, outp)
