In [1]:
import pandas as pd
import random
import spacy

random.seed(42)
nlp = spacy.load("en_core_web_sm")

def build_span_mask_and_target_with_multiple_masks(text, base_mask_ratio=0.18):
    doc = nlp(text)
    tokens = [token.text for token in doc]
    pos_tags = [(token.text, token.tag_) for token in doc]

    n = len(tokens)
    if n < 4:
        return None, None, None, None

    if n < 8:
        mask_ratio = 0.10
    elif n > 20:
        mask_ratio = min(0.25, base_mask_ratio + 0.05)
    else:
        mask_ratio = base_mask_ratio

    num_mask = max(1, int(mask_ratio * n))

    important_idxs = [
        i for i, (_, p) in enumerate(pos_tags)
        if p.startswith(('NN', 'VB', 'JJ', 'RB'))
    ]

    all_idxs = list(range(n))
    random.shuffle(all_idxs)

    masked = set()
    spans = []
    sentinel_id = 0

    while len(masked) < num_mask and all_idxs:
        start = next((i for i in all_idxs if i not in masked and i in important_idxs), None)
        if start is None:
            break

        span_len = random.randint(1, min(3, n - start))
        span = list(range(start, start + span_len))
        if any(i in masked or i >= n for i in span):
            all_idxs = [i for i in all_idxs if i not in span]
            continue

        spans.append((sentinel_id, start, span_len))
        masked.update(span)
        sentinel_id += 1
        all_idxs = [i for i in all_idxs if i not in span]

    if not spans:
        return None, None, None, None

    # Masked Title và Target Text chuẩn BART
    masked_out = []
    last = 0
    for sid, start, span_len in sorted(spans, key=lambda x: x[1]):
        masked_out.extend(tokens[last:start])
        masked_out.append(f"<extra_id_{sid}>")
        last = start + span_len
    masked_out.extend(tokens[last:])
    masked_title = " ".join(masked_out)

    target_out = []
    for sid, start, span_len in sorted(spans, key=lambda x: x[1]):
        target_out.append(f"<extra_id_{sid}>")
    target_out.append(f"<extra_id_{len(spans)}>")  # kết thúc
    target_text = " ".join(target_out)

    mask_before_list = []
    mask_after_list = []
    for sid, start, span_len in sorted(spans, key=lambda x: x[1]):
        mask_before_input = ["<mask>"] + tokens[start + span_len:]
        mask_before_target = tokens[:start + span_len]
        mask_after_input = tokens[:start] + ["<mask>"]
        mask_after_target = tokens[start:]
        mask_before_list.append((" ".join(mask_before_input), " ".join(mask_before_target)))
        mask_after_list.append((" ".join(mask_after_input), " ".join(mask_after_target)))

    return masked_title, target_text, mask_before_list, mask_after_list

import re

def is_only_mask_tokens(text):
    tokens = text.strip().split()
    return all(re.fullmatch(r"<extra_id_\d+>", t) for t in tokens)

def process_kaggle_data(train_csv_path, valid_csv_path):
    for split, path in [('train', train_csv_path), ('valid', valid_csv_path)]:
        df = pd.read_csv(path)


        denoise_samples = []

        for idx, row in df.iterrows():
            title = str(row['title']).strip()
            text = str(row['text']).strip()

            masked_title, target_text, mask_before_list, mask_after_list = build_span_mask_and_target_with_multiple_masks(title)
            if masked_title and target_text:
                denoise_samples.append({
                    "content": f"DENOISE: <title> {masked_title} <body> {text}",
                    "title": target_text
                })
                for mask_inp, mask_tgt in mask_before_list:
                    denoise_samples.append({
                        "content": f"DENOISE: <title> {mask_inp} <body> {text}",
                        "title": mask_tgt
                    })
                for mask_inp, mask_tgt in mask_after_list:
                    denoise_samples.append({
                        "content": f"DENOISE: <title> {mask_inp} <body> {text}",
                        "title": mask_tgt
                    })

        # Lọc bỏ các mẫu chỉ có mask tokens, giữ mẫu có ít nhất 1 từ thật
        denoise_samples_filtered = [sample for sample in denoise_samples if not is_only_mask_tokens(sample["title"])]

        pd.DataFrame(denoise_samples_filtered).to_csv(f"/kaggle/working/{split}_denoise_v2_full.csv", index=False, encoding='utf-8')
        print(f"Saved {split}_denoise2.csv ({len(denoise_samples_filtered)} DENOISE samples)")




if __name__ == "__main__":
    train_csv_path = "/kaggle/input/dataset-merged/train_merged_data.csv"
    valid_csv_path = "/kaggle/input/dataset-merged/valid_merged_data.csv"
    process_kaggle_data(train_csv_path, valid_csv_path)


Saved train_denoise2.csv (553472 DENOISE samples)
Saved valid_denoise2.csv (58548 DENOISE samples)
