## 1. 取出训练集和测试集

In [2]:
import os
import math
import pandas as pd

RAW_DIR = "/kaggle/input/traceability/raw"
OUT_DIR = "/kaggle/working/splits"
os.makedirs(OUT_DIR, exist_ok=True)

FILES = {
    "EBT": "full_EBT_link.csv",
    "iTrust": "full_iTrust_link.csv",
    "eTOUR": "full_eTOUR_link.csv",
    "RETRO": "full_RETRO_link.csv",
}

REQUIRED_COLS = ["source_text", "target_text", "label"]

def split_by_label_preserve_order(df: pd.DataFrame, label_value: int):
    """按原始顺序取出指定label的子集，并切分：前80%训练，最后10%测试。"""
    sub = df[df["label"] == label_value].copy()
    n = len(sub)

    # 训练：前80%
    train_n = int(math.floor(n * 0.80))

    # 测试：最后10%（至少0条；若你希望至少1条可改为 max(1, ...)）
    test_n = int(math.floor(n * 0.10))

    train_part = sub.iloc[:train_n].copy()
    test_part = sub.iloc[-test_n:].copy() if test_n > 0 else sub.iloc[0:0].copy()
    return train_part, test_part

def process_one_dataset(name: str, filename: str):
    path = os.path.join(RAW_DIR, filename)
    if not os.path.exists(path):
        raise FileNotFoundError(f"Not found: {path}")

    df = pd.read_csv(path)
    missing = [c for c in REQUIRED_COLS if c not in df.columns]
    if missing:
        raise ValueError(f"{filename} missing columns: {missing}. Got: {list(df.columns)}")

    # label 统一成 int（兼容 '0','1' 或 0.0/1.0）
    df = df.copy()
    df["label"] = pd.to_numeric(df["label"], errors="raise").astype(int)

    # 分别对正负类按原始顺序切分
    pos_train, pos_test = split_by_label_preserve_order(df, 1)
    neg_train, neg_test = split_by_label_preserve_order(df, 0)

    # 组合训练集、测试集（保持你描述的顺序：先1类，再0类；各自内部也保持原始顺序）
    train_set = pd.concat([pos_train, neg_train], ignore_index=True)
    test_set = pd.concat([pos_test, neg_test], ignore_index=True)

    train_out = os.path.join(OUT_DIR, f"{name}_train.csv")
    test_out = os.path.join(OUT_DIR, f"{name}_test.csv")

    train_set.to_csv(train_out, index=False)
    test_set.to_csv(test_out, index=False)

    return {
        "dataset": name,
        "file": filename,
        "pos_total": len(df[df["label"] == 1]),
        "neg_total": len(df[df["label"] == 0]),
        "train_rows": len(train_set),
        "test_rows": len(test_set),
        "train_out": train_out,
        "test_out": test_out,
    }

summaries = []
for ds_name, fn in FILES.items():
    info = process_one_dataset(ds_name, fn)
    summaries.append(info)

summary_df = pd.DataFrame(summaries)
print("Saved splits to:", OUT_DIR)
display(summary_df)


Saved splits to: /kaggle/working/splits


Unnamed: 0,dataset,file,pos_total,neg_total,train_rows,test_rows,train_out,test_out
0,EBT,full_EBT_link.csv,98,98,156,18,/kaggle/working/splits/EBT_train.csv,/kaggle/working/splits/EBT_test.csv
1,iTrust,full_iTrust_link.csv,389,389,622,76,/kaggle/working/splits/iTrust_train.csv,/kaggle/working/splits/iTrust_test.csv
2,eTOUR,full_eTOUR_link.csv,308,308,492,60,/kaggle/working/splits/eTOUR_train.csv,/kaggle/working/splits/eTOUR_test.csv
3,RETRO,full_RETRO_link.csv,158,158,252,30,/kaggle/working/splits/RETRO_train.csv,/kaggle/working/splits/RETRO_test.csv


## 2. 计算每个数据集的训练集和测试集的相似度

In [4]:
import os
import re
import math
import csv
import pandas as pd
from tqdm.auto import tqdm

BASE_DIR = "/kaggle/input/traceability/test/test"
OUT_DIR = "/kaggle/working/jaccard_token3gram_pairs"
os.makedirs(OUT_DIR, exist_ok=True)

DATASETS = ["EBT", "iTrust", "eTOUR", "RETRO"]

def train_path(ds): return os.path.join(BASE_DIR, f"{ds}_train.csv")
def test_path(ds):  return os.path.join(BASE_DIR, f"{ds}_test.csv")

# 如果规模太大，可先限制
MAX_TRAIN = None   # e.g., 2000
MAX_TEST  = None   # e.g., 1000


# ===============================
# Token-level 3-gram utilities
# ===============================
def normalize_and_tokenize(s: str):
    """
    strip() + split()，统一用于需求文本和源代码
    """
    if s is None or (isinstance(s, float) and math.isnan(s)):
        return []
    return str(s).strip().split()

def token_ngrams(tokens, n=3):
    """
    连续 token 3-gram
    """
    if len(tokens) < n:
        return set()
    return {tuple(tokens[i:i+n]) for i in range(len(tokens) - n + 1)}

def jaccard(a: set, b: set) -> float:
    if not a and not b:
        return 1.0
    if not a or not b:
        return 0.0
    return len(a & b) / len(a | b)


# ===============================
# Main computation
# ===============================
def compute_and_save_pairs(ds: str):
    tr_file, te_file = train_path(ds), test_path(ds)
    if not os.path.exists(tr_file) or not os.path.exists(te_file):
        raise FileNotFoundError(f"{ds}: train/test file missing")

    train_df = pd.read_csv(tr_file)
    test_df  = pd.read_csv(te_file)

    if train_df.shape[1] < 2 or test_df.shape[1] < 2:
        raise ValueError(f"{ds}: CSV 至少需要两列（需求文本、源代码）")

    # 第一列：需求文本；第二列：源代码
    train_req  = train_df.iloc[:, 0].tolist()
    train_code = train_df.iloc[:, 1].tolist()
    test_req   = test_df.iloc[:, 0].tolist()
    test_code  = test_df.iloc[:, 1].tolist()

    if MAX_TRAIN is not None:
        train_req, train_code = train_req[:MAX_TRAIN], train_code[:MAX_TRAIN]
    if MAX_TEST is not None:
        test_req, test_code = test_req[:MAX_TEST], test_code[:MAX_TEST]

    # 预计算 token 3-grams
    train_req_grams = [
        token_ngrams(normalize_and_tokenize(x), 3)
        for x in tqdm(train_req, desc=f"{ds} train req token-3gram")
    ]
    train_code_grams = [
        token_ngrams(normalize_and_tokenize(x), 3)
        for x in tqdm(train_code, desc=f"{ds} train code token-3gram")
    ]
    test_req_grams = [
        token_ngrams(normalize_and_tokenize(x), 3)
        for x in tqdm(test_req, desc=f"{ds} test req token-3gram")
    ]
    test_code_grams = [
        token_ngrams(normalize_and_tokenize(x), 3)
        for x in tqdm(test_code, desc=f"{ds} test code token-3gram")
    ]

    n_tr, n_te = len(train_req_grams), len(test_req_grams)
    total_pairs = n_tr * n_te

    out_csv = os.path.join(
        OUT_DIR, f"{ds}_train_test_token3gram_jaccard_pairs.csv"
    )

    sum_avg, written = 0.0, 0

    with open(out_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=["train_idx", "test_idx", "req_sim", "code_sim", "avg_sim"]
        )
        writer.writeheader()

        pbar = tqdm(total=total_pairs, desc=f"{ds} writing pairs")
        for i in range(n_tr):
            for j in range(n_te):
                req_sim = jaccard(train_req_grams[i], test_req_grams[j])
                code_sim = jaccard(train_code_grams[i], test_code_grams[j])
                avg_sim = (req_sim + code_sim) / 2.0

                writer.writerow({
                    "train_idx": i,
                    "test_idx": j,
                    "req_sim": req_sim,
                    "code_sim": code_sim,
                    "avg_sim": avg_sim
                })

                sum_avg += avg_sim
                written += 1
                pbar.update(1)
        pbar.close()

    return {
        "dataset": ds,
        "train_rows": n_tr,
        "test_rows": n_te,
        "pairs": written,
        "avg_similarity": sum_avg / written if written else float("nan"),
        "pairs_csv": out_csv
    }


# ===============================
# Run all datasets
# ===============================
results = []
for ds in DATASETS:
    results.append(compute_and_save_pairs(ds))

res_df = pd.DataFrame(results).sort_values("dataset")
print("Token 3-gram Jaccard pairwise files saved to:", OUT_DIR)
display(res_df)


EBT train req token-3gram:   0%|          | 0/156 [00:00<?, ?it/s]

EBT train code token-3gram:   0%|          | 0/156 [00:00<?, ?it/s]

EBT test req token-3gram:   0%|          | 0/18 [00:00<?, ?it/s]

EBT test code token-3gram:   0%|          | 0/18 [00:00<?, ?it/s]

EBT writing pairs:   0%|          | 0/2808 [00:00<?, ?it/s]

iTrust train req token-3gram:   0%|          | 0/622 [00:00<?, ?it/s]

iTrust train code token-3gram:   0%|          | 0/622 [00:00<?, ?it/s]

iTrust test req token-3gram:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust test code token-3gram:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust writing pairs:   0%|          | 0/47272 [00:00<?, ?it/s]

eTOUR train req token-3gram:   0%|          | 0/492 [00:00<?, ?it/s]

eTOUR train code token-3gram:   0%|          | 0/492 [00:00<?, ?it/s]

eTOUR test req token-3gram:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR test code token-3gram:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR writing pairs:   0%|          | 0/29520 [00:00<?, ?it/s]

RETRO train req token-3gram:   0%|          | 0/252 [00:00<?, ?it/s]

RETRO train code token-3gram:   0%|          | 0/252 [00:00<?, ?it/s]

RETRO test req token-3gram:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO test code token-3gram:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO writing pairs:   0%|          | 0/7560 [00:00<?, ?it/s]

Token 3-gram Jaccard pairwise files saved to: /kaggle/working/jaccard_token3gram_pairs


Unnamed: 0,dataset,train_rows,test_rows,pairs,avg_similarity,pairs_csv
0,EBT,156,18,2808,0.064263,/kaggle/working/jaccard_token3gram_pairs/EBT_t...
3,RETRO,252,30,7560,0.085507,/kaggle/working/jaccard_token3gram_pairs/RETRO...
2,eTOUR,492,60,29520,0.099593,/kaggle/working/jaccard_token3gram_pairs/eTOUR...
1,iTrust,622,76,47272,0.031294,/kaggle/working/jaccard_token3gram_pairs/iTrus...


## 3. 计算gpt-4o的增强数据与测试数据的相似度

In [5]:
import os
import math
import csv
import pandas as pd
from tqdm.auto import tqdm

# =========================
# Paths
# =========================
TEST_DIR = "/kaggle/input/traceability/test/test"
GEN_DIR  = "/kaggle/input/traceability/gpt4o/gpt4o"
OUT_DIR  = "/kaggle/working/test_vs_gen_token3gram_pairs"
os.makedirs(OUT_DIR, exist_ok=True)

DATASETS = ["EBT", "iTrust", "eTOUR", "RETRO"]

# 需要计算的 gen 文件模式（每个文件本身就是一个 gen_set：三列 source_text/target_text/label）
GEN_PATTERNS = [
    "full_pure_gen_requirement_{ds}.xlsx",
    "full_pure_gen_Code_{ds}.xlsx",
    "full_pure_example_gen_requirement_{ds}.xlsx",
    "full_pure_example_gen_Code_{ds}.xlsx",
]

REQUIRED_COLS = ["source_text", "target_text", "label"]

# =========================
# Token 3-gram Jaccard
# =========================
def normalize_and_tokenize(s: str):
    if s is None or (isinstance(s, float) and math.isnan(s)):
        return []
    return str(s).strip().split()

def token_ngrams(tokens, n=3):
    if len(tokens) < n:
        return set()
    return {tuple(tokens[i:i+n]) for i in range(len(tokens) - n + 1)}

def jaccard(a: set, b: set) -> float:
    if not a and not b:
        return 1.0
    if not a or not b:
        return 0.0
    return len(a & b) / len(a | b)

# =========================
# Loaders
# =========================
def load_test_set(ds: str):
    path = os.path.join(TEST_DIR, f"{ds}_test.csv")
    if not os.path.exists(path):
        raise FileNotFoundError(f"Missing test set: {path}")
    df = pd.read_csv(path)

    # test csv: 三列 source_text/target_text/label（或至少前两列是这两个）
    if "source_text" in df.columns and "target_text" in df.columns:
        req = df["source_text"].tolist()
        code = df["target_text"].tolist()
    else:
        if df.shape[1] < 2:
            raise ValueError(f"{ds}_test.csv must have at least 2 columns.")
        req = df.iloc[:, 0].tolist()
        code = df.iloc[:, 1].tolist()

    return req, code, path

def load_gen_xlsx(path: str):
    if not os.path.exists(path):
        raise FileNotFoundError(f"Missing gen file: {path}")
    df = pd.read_excel(path)

    missing = [c for c in REQUIRED_COLS if c not in df.columns]
    if missing:
        raise ValueError(f"{os.path.basename(path)} missing columns: {missing}. "
                         f"Got columns: {list(df.columns)}")

    req = df["source_text"].tolist()
    code = df["target_text"].tolist()
    return req, code, df

# =========================
# Compute & save pairwise (test × genfile)
# =========================
def compute_and_save_test_vs_genfile(ds: str, gen_file: str):
    test_req, test_code, test_path = load_test_set(ds)

    gen_path = os.path.join(GEN_DIR, gen_file)
    gen_req, gen_code, gen_df = load_gen_xlsx(gen_path)

    # 预计算 grams
    test_req_grams  = [token_ngrams(normalize_and_tokenize(x), 3)
                       for x in tqdm(test_req,  desc=f"{ds} test req grams")]
    test_code_grams = [token_ngrams(normalize_and_tokenize(x), 3)
                       for x in tqdm(test_code, desc=f"{ds} test code grams")]

    gen_req_grams   = [token_ngrams(normalize_and_tokenize(x), 3)
                       for x in tqdm(gen_req,  desc=f"{ds} {gen_file} gen req grams")]
    gen_code_grams  = [token_ngrams(normalize_and_tokenize(x), 3)
                       for x in tqdm(gen_code, desc=f"{ds} {gen_file} gen code grams")]

    n_te, n_ge = len(test_req_grams), len(gen_req_grams)
    total_pairs = n_te * n_ge

    tag = os.path.splitext(gen_file)[0]  # 去掉 .xlsx
    out_csv = os.path.join(OUT_DIR, f"{ds}_test_vs_{tag}_token3gram_jaccard_pairs.csv")

    sum_avg = 0.0
    written = 0

    with open(out_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=["test_idx", "gen_idx", "req_sim", "code_sim", "avg_sim"]
        )
        writer.writeheader()

        pbar = tqdm(total=total_pairs, desc=f"{ds} writing {tag}")
        for ti in range(n_te):
            for gi in range(n_ge):
                req_sim = jaccard(test_req_grams[ti], gen_req_grams[gi])
                code_sim = jaccard(test_code_grams[ti], gen_code_grams[gi])
                avg_sim = (req_sim + code_sim) / 2.0

                writer.writerow({
                    "test_idx": ti,
                    "gen_idx": gi,
                    "req_sim": req_sim,
                    "code_sim": code_sim,
                    "avg_sim": avg_sim
                })

                sum_avg += avg_sim
                written += 1
                pbar.update(1)
        pbar.close()

    avg_similarity = sum_avg / written if written else float("nan")

    return {
        "dataset": ds,
        "gen_file": gen_file,
        "test_rows": n_te,
        "gen_rows": n_ge,
        "pairs": written,
        "avg_similarity": avg_similarity,
        "pairs_csv": out_csv,
        "test_path": test_path,
        "gen_path": gen_path,
    }

# =========================
# Run all datasets × all gen files
# =========================
results = []
for ds in DATASETS:
    for pattern in GEN_PATTERNS:
        gen_file = pattern.format(ds=ds)
        results.append(compute_and_save_test_vs_genfile(ds, gen_file))

res_df = pd.DataFrame(results).sort_values(["dataset", "gen_file"])
print("Pairwise CSV saved to:", OUT_DIR)
display(res_df)


EBT test req grams:   0%|          | 0/18 [00:00<?, ?it/s]

EBT test code grams:   0%|          | 0/18 [00:00<?, ?it/s]

EBT full_pure_gen_requirement_EBT.xlsx gen req grams:   0%|          | 0/196 [00:00<?, ?it/s]

EBT full_pure_gen_requirement_EBT.xlsx gen code grams:   0%|          | 0/196 [00:00<?, ?it/s]

EBT writing full_pure_gen_requirement_EBT:   0%|          | 0/3528 [00:00<?, ?it/s]

EBT test req grams:   0%|          | 0/18 [00:00<?, ?it/s]

EBT test code grams:   0%|          | 0/18 [00:00<?, ?it/s]

EBT full_pure_gen_Code_EBT.xlsx gen req grams:   0%|          | 0/196 [00:00<?, ?it/s]

EBT full_pure_gen_Code_EBT.xlsx gen code grams:   0%|          | 0/196 [00:00<?, ?it/s]

EBT writing full_pure_gen_Code_EBT:   0%|          | 0/3528 [00:00<?, ?it/s]

EBT test req grams:   0%|          | 0/18 [00:00<?, ?it/s]

EBT test code grams:   0%|          | 0/18 [00:00<?, ?it/s]

EBT full_pure_example_gen_requirement_EBT.xlsx gen req grams:   0%|          | 0/184 [00:00<?, ?it/s]

EBT full_pure_example_gen_requirement_EBT.xlsx gen code grams:   0%|          | 0/184 [00:00<?, ?it/s]

EBT writing full_pure_example_gen_requirement_EBT:   0%|          | 0/3312 [00:00<?, ?it/s]

EBT test req grams:   0%|          | 0/18 [00:00<?, ?it/s]

EBT test code grams:   0%|          | 0/18 [00:00<?, ?it/s]

EBT full_pure_example_gen_Code_EBT.xlsx gen req grams:   0%|          | 0/184 [00:00<?, ?it/s]

EBT full_pure_example_gen_Code_EBT.xlsx gen code grams:   0%|          | 0/184 [00:00<?, ?it/s]

EBT writing full_pure_example_gen_Code_EBT:   0%|          | 0/3312 [00:00<?, ?it/s]

iTrust test req grams:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust test code grams:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust full_pure_gen_requirement_iTrust.xlsx gen req grams:   0%|          | 0/778 [00:00<?, ?it/s]

iTrust full_pure_gen_requirement_iTrust.xlsx gen code grams:   0%|          | 0/778 [00:00<?, ?it/s]

iTrust writing full_pure_gen_requirement_iTrust:   0%|          | 0/59128 [00:00<?, ?it/s]

iTrust test req grams:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust test code grams:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust full_pure_gen_Code_iTrust.xlsx gen req grams:   0%|          | 0/776 [00:00<?, ?it/s]

iTrust full_pure_gen_Code_iTrust.xlsx gen code grams:   0%|          | 0/776 [00:00<?, ?it/s]

iTrust writing full_pure_gen_Code_iTrust:   0%|          | 0/58976 [00:00<?, ?it/s]

iTrust test req grams:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust test code grams:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust full_pure_example_gen_requirement_iTrust.xlsx gen req grams:   0%|          | 0/794 [00:00<?, ?it/s]

iTrust full_pure_example_gen_requirement_iTrust.xlsx gen code grams:   0%|          | 0/794 [00:00<?, ?it/s]

iTrust writing full_pure_example_gen_requirement_iTrust:   0%|          | 0/60344 [00:00<?, ?it/s]

iTrust test req grams:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust test code grams:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust full_pure_example_gen_Code_iTrust.xlsx gen req grams:   0%|          | 0/790 [00:00<?, ?it/s]

iTrust full_pure_example_gen_Code_iTrust.xlsx gen code grams:   0%|          | 0/790 [00:00<?, ?it/s]

iTrust writing full_pure_example_gen_Code_iTrust:   0%|          | 0/60040 [00:00<?, ?it/s]

eTOUR test req grams:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR test code grams:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR full_pure_gen_requirement_eTOUR.xlsx gen req grams:   0%|          | 0/616 [00:00<?, ?it/s]

eTOUR full_pure_gen_requirement_eTOUR.xlsx gen code grams:   0%|          | 0/616 [00:00<?, ?it/s]

eTOUR writing full_pure_gen_requirement_eTOUR:   0%|          | 0/36960 [00:00<?, ?it/s]

eTOUR test req grams:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR test code grams:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR full_pure_gen_Code_eTOUR.xlsx gen req grams:   0%|          | 0/616 [00:00<?, ?it/s]

eTOUR full_pure_gen_Code_eTOUR.xlsx gen code grams:   0%|          | 0/616 [00:00<?, ?it/s]

eTOUR writing full_pure_gen_Code_eTOUR:   0%|          | 0/36960 [00:00<?, ?it/s]

eTOUR test req grams:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR test code grams:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR full_pure_example_gen_requirement_eTOUR.xlsx gen req grams:   0%|          | 0/602 [00:00<?, ?it/s]

eTOUR full_pure_example_gen_requirement_eTOUR.xlsx gen code grams:   0%|          | 0/602 [00:00<?, ?it/s]

eTOUR writing full_pure_example_gen_requirement_eTOUR:   0%|          | 0/36120 [00:00<?, ?it/s]

eTOUR test req grams:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR test code grams:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR full_pure_example_gen_Code_eTOUR.xlsx gen req grams:   0%|          | 0/602 [00:00<?, ?it/s]

eTOUR full_pure_example_gen_Code_eTOUR.xlsx gen code grams:   0%|          | 0/602 [00:00<?, ?it/s]

eTOUR writing full_pure_example_gen_Code_eTOUR:   0%|          | 0/36120 [00:00<?, ?it/s]

RETRO test req grams:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO test code grams:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO full_pure_gen_requirement_RETRO.xlsx gen req grams:   0%|          | 0/316 [00:00<?, ?it/s]

RETRO full_pure_gen_requirement_RETRO.xlsx gen code grams:   0%|          | 0/316 [00:00<?, ?it/s]

RETRO writing full_pure_gen_requirement_RETRO:   0%|          | 0/9480 [00:00<?, ?it/s]

RETRO test req grams:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO test code grams:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO full_pure_gen_Code_RETRO.xlsx gen req grams:   0%|          | 0/316 [00:00<?, ?it/s]

RETRO full_pure_gen_Code_RETRO.xlsx gen code grams:   0%|          | 0/316 [00:00<?, ?it/s]

RETRO writing full_pure_gen_Code_RETRO:   0%|          | 0/9480 [00:00<?, ?it/s]

RETRO test req grams:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO test code grams:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO full_pure_example_gen_requirement_RETRO.xlsx gen req grams:   0%|          | 0/290 [00:00<?, ?it/s]

RETRO full_pure_example_gen_requirement_RETRO.xlsx gen code grams:   0%|          | 0/290 [00:00<?, ?it/s]

RETRO writing full_pure_example_gen_requirement_RETRO:   0%|          | 0/8700 [00:00<?, ?it/s]

RETRO test req grams:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO test code grams:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO full_pure_example_gen_Code_RETRO.xlsx gen req grams:   0%|          | 0/290 [00:00<?, ?it/s]

RETRO full_pure_example_gen_Code_RETRO.xlsx gen code grams:   0%|          | 0/290 [00:00<?, ?it/s]

RETRO writing full_pure_example_gen_Code_RETRO:   0%|          | 0/8700 [00:00<?, ?it/s]

Pairwise CSV saved to: /kaggle/working/test_vs_gen_token3gram_pairs


Unnamed: 0,dataset,gen_file,test_rows,gen_rows,pairs,avg_similarity,pairs_csv,test_path,gen_path
3,EBT,full_pure_example_gen_Code_EBT.xlsx,18,184,3312,0.043748,/kaggle/working/test_vs_gen_token3gram_pairs/E...,/kaggle/input/traceability/test/test/EBT_test.csv,/kaggle/input/traceability/gpt4o/gpt4o/full_pu...
2,EBT,full_pure_example_gen_requirement_EBT.xlsx,18,184,3312,0.027313,/kaggle/working/test_vs_gen_token3gram_pairs/E...,/kaggle/input/traceability/test/test/EBT_test.csv,/kaggle/input/traceability/gpt4o/gpt4o/full_pu...
1,EBT,full_pure_gen_Code_EBT.xlsx,18,196,3528,0.042921,/kaggle/working/test_vs_gen_token3gram_pairs/E...,/kaggle/input/traceability/test/test/EBT_test.csv,/kaggle/input/traceability/gpt4o/gpt4o/full_pu...
0,EBT,full_pure_gen_requirement_EBT.xlsx,18,196,3528,0.025457,/kaggle/working/test_vs_gen_token3gram_pairs/E...,/kaggle/input/traceability/test/test/EBT_test.csv,/kaggle/input/traceability/gpt4o/gpt4o/full_pu...
15,RETRO,full_pure_example_gen_Code_RETRO.xlsx,30,290,8700,0.069817,/kaggle/working/test_vs_gen_token3gram_pairs/R...,/kaggle/input/traceability/test/test/RETRO_tes...,/kaggle/input/traceability/gpt4o/gpt4o/full_pu...
14,RETRO,full_pure_example_gen_requirement_RETRO.xlsx,30,290,8700,0.028758,/kaggle/working/test_vs_gen_token3gram_pairs/R...,/kaggle/input/traceability/test/test/RETRO_tes...,/kaggle/input/traceability/gpt4o/gpt4o/full_pu...
13,RETRO,full_pure_gen_Code_RETRO.xlsx,30,316,9480,0.070389,/kaggle/working/test_vs_gen_token3gram_pairs/R...,/kaggle/input/traceability/test/test/RETRO_tes...,/kaggle/input/traceability/gpt4o/gpt4o/full_pu...
12,RETRO,full_pure_gen_requirement_RETRO.xlsx,30,316,9480,0.020851,/kaggle/working/test_vs_gen_token3gram_pairs/R...,/kaggle/input/traceability/test/test/RETRO_tes...,/kaggle/input/traceability/gpt4o/gpt4o/full_pu...
11,eTOUR,full_pure_example_gen_Code_eTOUR.xlsx,60,602,36120,0.077035,/kaggle/working/test_vs_gen_token3gram_pairs/e...,/kaggle/input/traceability/test/test/eTOUR_tes...,/kaggle/input/traceability/gpt4o/gpt4o/full_pu...
10,eTOUR,full_pure_example_gen_requirement_eTOUR.xlsx,60,602,36120,0.027218,/kaggle/working/test_vs_gen_token3gram_pairs/e...,/kaggle/input/traceability/test/test/eTOUR_tes...,/kaggle/input/traceability/gpt4o/gpt4o/full_pu...


## 4. 计算claude的增强数据与测试数据的相似度

In [6]:
import os
import math
import csv
import pandas as pd
from tqdm.auto import tqdm

# =========================
# Paths
# =========================
TEST_DIR = "/kaggle/input/traceability/test/test"
GEN_DIR  = "/kaggle/input/traceability/claude3/claude3"
OUT_DIR  = "/kaggle/working/test_vs_gen_token3gram_pairs"
os.makedirs(OUT_DIR, exist_ok=True)

DATASETS = ["EBT", "iTrust", "eTOUR", "RETRO"]

# 需要计算的 gen 文件模式（每个文件本身就是一个 gen_set：三列 source_text/target_text/label）
GEN_PATTERNS = [
    "full_pure_gen_requirement_{ds}.xlsx",
    "full_pure_gen_Code_{ds}.xlsx",
    "full_pure_example_gen_requirement_{ds}.xlsx",
    "full_pure_example_gen_Code_{ds}.xlsx",
]

REQUIRED_COLS = ["source_text", "target_text", "label"]

# =========================
# Token 3-gram Jaccard
# =========================
def normalize_and_tokenize(s: str):
    if s is None or (isinstance(s, float) and math.isnan(s)):
        return []
    return str(s).strip().split()

def token_ngrams(tokens, n=3):
    if len(tokens) < n:
        return set()
    return {tuple(tokens[i:i+n]) for i in range(len(tokens) - n + 1)}

def jaccard(a: set, b: set) -> float:
    if not a and not b:
        return 1.0
    if not a or not b:
        return 0.0
    return len(a & b) / len(a | b)

# =========================
# Loaders
# =========================
def load_test_set(ds: str):
    path = os.path.join(TEST_DIR, f"{ds}_test.csv")
    if not os.path.exists(path):
        raise FileNotFoundError(f"Missing test set: {path}")
    df = pd.read_csv(path)

    # test csv: 三列 source_text/target_text/label（或至少前两列是这两个）
    if "source_text" in df.columns and "target_text" in df.columns:
        req = df["source_text"].tolist()
        code = df["target_text"].tolist()
    else:
        if df.shape[1] < 2:
            raise ValueError(f"{ds}_test.csv must have at least 2 columns.")
        req = df.iloc[:, 0].tolist()
        code = df.iloc[:, 1].tolist()

    return req, code, path

def load_gen_xlsx(path: str):
    if not os.path.exists(path):
        raise FileNotFoundError(f"Missing gen file: {path}")
    df = pd.read_excel(path)

    missing = [c for c in REQUIRED_COLS if c not in df.columns]
    if missing:
        raise ValueError(f"{os.path.basename(path)} missing columns: {missing}. "
                         f"Got columns: {list(df.columns)}")

    req = df["source_text"].tolist()
    code = df["target_text"].tolist()
    return req, code, df

# =========================
# Compute & save pairwise (test × genfile)
# =========================
def compute_and_save_test_vs_genfile(ds: str, gen_file: str):
    test_req, test_code, test_path = load_test_set(ds)

    gen_path = os.path.join(GEN_DIR, gen_file)
    gen_req, gen_code, gen_df = load_gen_xlsx(gen_path)

    # 预计算 grams
    test_req_grams  = [token_ngrams(normalize_and_tokenize(x), 3)
                       for x in tqdm(test_req,  desc=f"{ds} test req grams")]
    test_code_grams = [token_ngrams(normalize_and_tokenize(x), 3)
                       for x in tqdm(test_code, desc=f"{ds} test code grams")]

    gen_req_grams   = [token_ngrams(normalize_and_tokenize(x), 3)
                       for x in tqdm(gen_req,  desc=f"{ds} {gen_file} gen req grams")]
    gen_code_grams  = [token_ngrams(normalize_and_tokenize(x), 3)
                       for x in tqdm(gen_code, desc=f"{ds} {gen_file} gen code grams")]

    n_te, n_ge = len(test_req_grams), len(gen_req_grams)
    total_pairs = n_te * n_ge

    tag = os.path.splitext(gen_file)[0]  # 去掉 .xlsx
    out_csv = os.path.join(OUT_DIR, f"{ds}_test_vs_{tag}_token3gram_jaccard_pairs.csv")

    sum_avg = 0.0
    written = 0

    with open(out_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=["test_idx", "gen_idx", "req_sim", "code_sim", "avg_sim"]
        )
        writer.writeheader()

        pbar = tqdm(total=total_pairs, desc=f"{ds} writing {tag}")
        for ti in range(n_te):
            for gi in range(n_ge):
                req_sim = jaccard(test_req_grams[ti], gen_req_grams[gi])
                code_sim = jaccard(test_code_grams[ti], gen_code_grams[gi])
                avg_sim = (req_sim + code_sim) / 2.0

                writer.writerow({
                    "test_idx": ti,
                    "gen_idx": gi,
                    "req_sim": req_sim,
                    "code_sim": code_sim,
                    "avg_sim": avg_sim
                })

                sum_avg += avg_sim
                written += 1
                pbar.update(1)
        pbar.close()

    avg_similarity = sum_avg / written if written else float("nan")

    return {
        "dataset": ds,
        "gen_file": gen_file,
        "test_rows": n_te,
        "gen_rows": n_ge,
        "pairs": written,
        "avg_similarity": avg_similarity,
        "pairs_csv": out_csv,
        "test_path": test_path,
        "gen_path": gen_path,
    }

# =========================
# Run all datasets × all gen files
# =========================
results = []
for ds in DATASETS:
    for pattern in GEN_PATTERNS:
        gen_file = pattern.format(ds=ds)
        results.append(compute_and_save_test_vs_genfile(ds, gen_file))

res_df = pd.DataFrame(results).sort_values(["dataset", "gen_file"])
print("Pairwise CSV saved to:", OUT_DIR)
display(res_df)


EBT test req grams:   0%|          | 0/18 [00:00<?, ?it/s]

EBT test code grams:   0%|          | 0/18 [00:00<?, ?it/s]

EBT full_pure_gen_requirement_EBT.xlsx gen req grams:   0%|          | 0/196 [00:00<?, ?it/s]

EBT full_pure_gen_requirement_EBT.xlsx gen code grams:   0%|          | 0/196 [00:00<?, ?it/s]

EBT writing full_pure_gen_requirement_EBT:   0%|          | 0/3528 [00:00<?, ?it/s]

EBT test req grams:   0%|          | 0/18 [00:00<?, ?it/s]

EBT test code grams:   0%|          | 0/18 [00:00<?, ?it/s]

EBT full_pure_gen_Code_EBT.xlsx gen req grams:   0%|          | 0/196 [00:00<?, ?it/s]

EBT full_pure_gen_Code_EBT.xlsx gen code grams:   0%|          | 0/196 [00:00<?, ?it/s]

EBT writing full_pure_gen_Code_EBT:   0%|          | 0/3528 [00:00<?, ?it/s]

EBT test req grams:   0%|          | 0/18 [00:00<?, ?it/s]

EBT test code grams:   0%|          | 0/18 [00:00<?, ?it/s]

EBT full_pure_example_gen_requirement_EBT.xlsx gen req grams:   0%|          | 0/184 [00:00<?, ?it/s]

EBT full_pure_example_gen_requirement_EBT.xlsx gen code grams:   0%|          | 0/184 [00:00<?, ?it/s]

EBT writing full_pure_example_gen_requirement_EBT:   0%|          | 0/3312 [00:00<?, ?it/s]

EBT test req grams:   0%|          | 0/18 [00:00<?, ?it/s]

EBT test code grams:   0%|          | 0/18 [00:00<?, ?it/s]

EBT full_pure_example_gen_Code_EBT.xlsx gen req grams:   0%|          | 0/184 [00:00<?, ?it/s]

EBT full_pure_example_gen_Code_EBT.xlsx gen code grams:   0%|          | 0/184 [00:00<?, ?it/s]

EBT writing full_pure_example_gen_Code_EBT:   0%|          | 0/3312 [00:00<?, ?it/s]

iTrust test req grams:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust test code grams:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust full_pure_gen_requirement_iTrust.xlsx gen req grams:   0%|          | 0/778 [00:00<?, ?it/s]

iTrust full_pure_gen_requirement_iTrust.xlsx gen code grams:   0%|          | 0/778 [00:00<?, ?it/s]

iTrust writing full_pure_gen_requirement_iTrust:   0%|          | 0/59128 [00:00<?, ?it/s]

iTrust test req grams:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust test code grams:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust full_pure_gen_Code_iTrust.xlsx gen req grams:   0%|          | 0/778 [00:00<?, ?it/s]

iTrust full_pure_gen_Code_iTrust.xlsx gen code grams:   0%|          | 0/778 [00:00<?, ?it/s]

iTrust writing full_pure_gen_Code_iTrust:   0%|          | 0/59128 [00:00<?, ?it/s]

iTrust test req grams:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust test code grams:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust full_pure_example_gen_requirement_iTrust.xlsx gen req grams:   0%|          | 0/794 [00:00<?, ?it/s]

iTrust full_pure_example_gen_requirement_iTrust.xlsx gen code grams:   0%|          | 0/794 [00:00<?, ?it/s]

iTrust writing full_pure_example_gen_requirement_iTrust:   0%|          | 0/60344 [00:00<?, ?it/s]

iTrust test req grams:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust test code grams:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust full_pure_example_gen_Code_iTrust.xlsx gen req grams:   0%|          | 0/794 [00:00<?, ?it/s]

iTrust full_pure_example_gen_Code_iTrust.xlsx gen code grams:   0%|          | 0/794 [00:00<?, ?it/s]

iTrust writing full_pure_example_gen_Code_iTrust:   0%|          | 0/60344 [00:00<?, ?it/s]

eTOUR test req grams:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR test code grams:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR full_pure_gen_requirement_eTOUR.xlsx gen req grams:   0%|          | 0/616 [00:00<?, ?it/s]

eTOUR full_pure_gen_requirement_eTOUR.xlsx gen code grams:   0%|          | 0/616 [00:00<?, ?it/s]

eTOUR writing full_pure_gen_requirement_eTOUR:   0%|          | 0/36960 [00:00<?, ?it/s]

eTOUR test req grams:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR test code grams:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR full_pure_gen_Code_eTOUR.xlsx gen req grams:   0%|          | 0/616 [00:00<?, ?it/s]

eTOUR full_pure_gen_Code_eTOUR.xlsx gen code grams:   0%|          | 0/616 [00:00<?, ?it/s]

eTOUR writing full_pure_gen_Code_eTOUR:   0%|          | 0/36960 [00:00<?, ?it/s]

eTOUR test req grams:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR test code grams:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR full_pure_example_gen_requirement_eTOUR.xlsx gen req grams:   0%|          | 0/602 [00:00<?, ?it/s]

eTOUR full_pure_example_gen_requirement_eTOUR.xlsx gen code grams:   0%|          | 0/602 [00:00<?, ?it/s]

eTOUR writing full_pure_example_gen_requirement_eTOUR:   0%|          | 0/36120 [00:00<?, ?it/s]

eTOUR test req grams:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR test code grams:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR full_pure_example_gen_Code_eTOUR.xlsx gen req grams:   0%|          | 0/602 [00:00<?, ?it/s]

eTOUR full_pure_example_gen_Code_eTOUR.xlsx gen code grams:   0%|          | 0/602 [00:00<?, ?it/s]

eTOUR writing full_pure_example_gen_Code_eTOUR:   0%|          | 0/36120 [00:00<?, ?it/s]

RETRO test req grams:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO test code grams:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO full_pure_gen_requirement_RETRO.xlsx gen req grams:   0%|          | 0/316 [00:00<?, ?it/s]

RETRO full_pure_gen_requirement_RETRO.xlsx gen code grams:   0%|          | 0/316 [00:00<?, ?it/s]

RETRO writing full_pure_gen_requirement_RETRO:   0%|          | 0/9480 [00:00<?, ?it/s]

RETRO test req grams:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO test code grams:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO full_pure_gen_Code_RETRO.xlsx gen req grams:   0%|          | 0/316 [00:00<?, ?it/s]

RETRO full_pure_gen_Code_RETRO.xlsx gen code grams:   0%|          | 0/316 [00:00<?, ?it/s]

RETRO writing full_pure_gen_Code_RETRO:   0%|          | 0/9480 [00:00<?, ?it/s]

RETRO test req grams:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO test code grams:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO full_pure_example_gen_requirement_RETRO.xlsx gen req grams:   0%|          | 0/262 [00:00<?, ?it/s]

RETRO full_pure_example_gen_requirement_RETRO.xlsx gen code grams:   0%|          | 0/262 [00:00<?, ?it/s]

RETRO writing full_pure_example_gen_requirement_RETRO:   0%|          | 0/7860 [00:00<?, ?it/s]

RETRO test req grams:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO test code grams:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO full_pure_example_gen_Code_RETRO.xlsx gen req grams:   0%|          | 0/290 [00:00<?, ?it/s]

RETRO full_pure_example_gen_Code_RETRO.xlsx gen code grams:   0%|          | 0/290 [00:00<?, ?it/s]

RETRO writing full_pure_example_gen_Code_RETRO:   0%|          | 0/8700 [00:00<?, ?it/s]

Pairwise CSV saved to: /kaggle/working/test_vs_gen_token3gram_pairs


Unnamed: 0,dataset,gen_file,test_rows,gen_rows,pairs,avg_similarity,pairs_csv,test_path,gen_path
3,EBT,full_pure_example_gen_Code_EBT.xlsx,18,184,3312,0.043738,/kaggle/working/test_vs_gen_token3gram_pairs/E...,/kaggle/input/traceability/test/test/EBT_test.csv,/kaggle/input/traceability/claude3/claude3/ful...
2,EBT,full_pure_example_gen_requirement_EBT.xlsx,18,184,3312,0.027579,/kaggle/working/test_vs_gen_token3gram_pairs/E...,/kaggle/input/traceability/test/test/EBT_test.csv,/kaggle/input/traceability/claude3/claude3/ful...
1,EBT,full_pure_gen_Code_EBT.xlsx,18,196,3528,0.042793,/kaggle/working/test_vs_gen_token3gram_pairs/E...,/kaggle/input/traceability/test/test/EBT_test.csv,/kaggle/input/traceability/claude3/claude3/ful...
0,EBT,full_pure_gen_requirement_EBT.xlsx,18,196,3528,0.02545,/kaggle/working/test_vs_gen_token3gram_pairs/E...,/kaggle/input/traceability/test/test/EBT_test.csv,/kaggle/input/traceability/claude3/claude3/ful...
15,RETRO,full_pure_example_gen_Code_RETRO.xlsx,30,290,8700,0.070915,/kaggle/working/test_vs_gen_token3gram_pairs/R...,/kaggle/input/traceability/test/test/RETRO_tes...,/kaggle/input/traceability/claude3/claude3/ful...
14,RETRO,full_pure_example_gen_requirement_RETRO.xlsx,30,262,7860,0.045003,/kaggle/working/test_vs_gen_token3gram_pairs/R...,/kaggle/input/traceability/test/test/RETRO_tes...,/kaggle/input/traceability/claude3/claude3/ful...
13,RETRO,full_pure_gen_Code_RETRO.xlsx,30,316,9480,0.071001,/kaggle/working/test_vs_gen_token3gram_pairs/R...,/kaggle/input/traceability/test/test/RETRO_tes...,/kaggle/input/traceability/claude3/claude3/ful...
12,RETRO,full_pure_gen_requirement_RETRO.xlsx,30,316,9480,0.022032,/kaggle/working/test_vs_gen_token3gram_pairs/R...,/kaggle/input/traceability/test/test/RETRO_tes...,/kaggle/input/traceability/claude3/claude3/ful...
11,eTOUR,full_pure_example_gen_Code_eTOUR.xlsx,60,602,36120,0.077469,/kaggle/working/test_vs_gen_token3gram_pairs/e...,/kaggle/input/traceability/test/test/eTOUR_tes...,/kaggle/input/traceability/claude3/claude3/ful...
10,eTOUR,full_pure_example_gen_requirement_eTOUR.xlsx,60,602,36120,0.03087,/kaggle/working/test_vs_gen_token3gram_pairs/e...,/kaggle/input/traceability/test/test/eTOUR_tes...,/kaggle/input/traceability/claude3/claude3/ful...


## 5. 计算gemini的增强数据与测试数据的相似度

In [7]:
import os
import math
import csv
import pandas as pd
from tqdm.auto import tqdm

# =========================
# Paths
# =========================
TEST_DIR = "/kaggle/input/traceability/test/test"
GEN_DIR  = "/kaggle/input/traceability/gemini/gemini"
OUT_DIR  = "/kaggle/working/test_vs_gen_token3gram_pairs"
os.makedirs(OUT_DIR, exist_ok=True)

DATASETS = ["EBT", "iTrust", "eTOUR", "RETRO"]

# 需要计算的 gen 文件模式（每个文件本身就是一个 gen_set：三列 source_text/target_text/label）
GEN_PATTERNS = [
    "full_pure_gen_requirement_{ds}.xlsx",
    "full_pure_gen_Code_{ds}.xlsx",
    "full_pure_example_gen_requirement_{ds}.xlsx",
    "full_pure_example_gen_Code_{ds}.xlsx",
]

REQUIRED_COLS = ["source_text", "target_text", "label"]

# =========================
# Token 3-gram Jaccard
# =========================
def normalize_and_tokenize(s: str):
    if s is None or (isinstance(s, float) and math.isnan(s)):
        return []
    return str(s).strip().split()

def token_ngrams(tokens, n=3):
    if len(tokens) < n:
        return set()
    return {tuple(tokens[i:i+n]) for i in range(len(tokens) - n + 1)}

def jaccard(a: set, b: set) -> float:
    if not a and not b:
        return 1.0
    if not a or not b:
        return 0.0
    return len(a & b) / len(a | b)

# =========================
# Loaders
# =========================
def load_test_set(ds: str):
    path = os.path.join(TEST_DIR, f"{ds}_test.csv")
    if not os.path.exists(path):
        raise FileNotFoundError(f"Missing test set: {path}")
    df = pd.read_csv(path)

    # test csv: 三列 source_text/target_text/label（或至少前两列是这两个）
    if "source_text" in df.columns and "target_text" in df.columns:
        req = df["source_text"].tolist()
        code = df["target_text"].tolist()
    else:
        if df.shape[1] < 2:
            raise ValueError(f"{ds}_test.csv must have at least 2 columns.")
        req = df.iloc[:, 0].tolist()
        code = df.iloc[:, 1].tolist()

    return req, code, path

def load_gen_xlsx(path: str):
    if not os.path.exists(path):
        raise FileNotFoundError(f"Missing gen file: {path}")
    df = pd.read_excel(path)

    missing = [c for c in REQUIRED_COLS if c not in df.columns]
    if missing:
        raise ValueError(f"{os.path.basename(path)} missing columns: {missing}. "
                         f"Got columns: {list(df.columns)}")

    req = df["source_text"].tolist()
    code = df["target_text"].tolist()
    return req, code, df

# =========================
# Compute & save pairwise (test × genfile)
# =========================
def compute_and_save_test_vs_genfile(ds: str, gen_file: str):
    test_req, test_code, test_path = load_test_set(ds)

    gen_path = os.path.join(GEN_DIR, gen_file)
    gen_req, gen_code, gen_df = load_gen_xlsx(gen_path)

    # 预计算 grams
    test_req_grams  = [token_ngrams(normalize_and_tokenize(x), 3)
                       for x in tqdm(test_req,  desc=f"{ds} test req grams")]
    test_code_grams = [token_ngrams(normalize_and_tokenize(x), 3)
                       for x in tqdm(test_code, desc=f"{ds} test code grams")]

    gen_req_grams   = [token_ngrams(normalize_and_tokenize(x), 3)
                       for x in tqdm(gen_req,  desc=f"{ds} {gen_file} gen req grams")]
    gen_code_grams  = [token_ngrams(normalize_and_tokenize(x), 3)
                       for x in tqdm(gen_code, desc=f"{ds} {gen_file} gen code grams")]

    n_te, n_ge = len(test_req_grams), len(gen_req_grams)
    total_pairs = n_te * n_ge

    tag = os.path.splitext(gen_file)[0]  # 去掉 .xlsx
    out_csv = os.path.join(OUT_DIR, f"{ds}_test_vs_{tag}_token3gram_jaccard_pairs.csv")

    sum_avg = 0.0
    written = 0

    with open(out_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=["test_idx", "gen_idx", "req_sim", "code_sim", "avg_sim"]
        )
        writer.writeheader()

        pbar = tqdm(total=total_pairs, desc=f"{ds} writing {tag}")
        for ti in range(n_te):
            for gi in range(n_ge):
                req_sim = jaccard(test_req_grams[ti], gen_req_grams[gi])
                code_sim = jaccard(test_code_grams[ti], gen_code_grams[gi])
                avg_sim = (req_sim + code_sim) / 2.0

                writer.writerow({
                    "test_idx": ti,
                    "gen_idx": gi,
                    "req_sim": req_sim,
                    "code_sim": code_sim,
                    "avg_sim": avg_sim
                })

                sum_avg += avg_sim
                written += 1
                pbar.update(1)
        pbar.close()

    avg_similarity = sum_avg / written if written else float("nan")

    return {
        "dataset": ds,
        "gen_file": gen_file,
        "test_rows": n_te,
        "gen_rows": n_ge,
        "pairs": written,
        "avg_similarity": avg_similarity,
        "pairs_csv": out_csv,
        "test_path": test_path,
        "gen_path": gen_path,
    }

# =========================
# Run all datasets × all gen files
# =========================
results = []
for ds in DATASETS:
    for pattern in GEN_PATTERNS:
        gen_file = pattern.format(ds=ds)
        results.append(compute_and_save_test_vs_genfile(ds, gen_file))

res_df = pd.DataFrame(results).sort_values(["dataset", "gen_file"])
print("Pairwise CSV saved to:", OUT_DIR)
display(res_df)


EBT test req grams:   0%|          | 0/18 [00:00<?, ?it/s]

EBT test code grams:   0%|          | 0/18 [00:00<?, ?it/s]

EBT full_pure_gen_requirement_EBT.xlsx gen req grams:   0%|          | 0/196 [00:00<?, ?it/s]

EBT full_pure_gen_requirement_EBT.xlsx gen code grams:   0%|          | 0/196 [00:00<?, ?it/s]

EBT writing full_pure_gen_requirement_EBT:   0%|          | 0/3528 [00:00<?, ?it/s]

EBT test req grams:   0%|          | 0/18 [00:00<?, ?it/s]

EBT test code grams:   0%|          | 0/18 [00:00<?, ?it/s]

EBT full_pure_gen_Code_EBT.xlsx gen req grams:   0%|          | 0/176 [00:00<?, ?it/s]

EBT full_pure_gen_Code_EBT.xlsx gen code grams:   0%|          | 0/176 [00:00<?, ?it/s]

EBT writing full_pure_gen_Code_EBT:   0%|          | 0/3168 [00:00<?, ?it/s]

EBT test req grams:   0%|          | 0/18 [00:00<?, ?it/s]

EBT test code grams:   0%|          | 0/18 [00:00<?, ?it/s]

EBT full_pure_example_gen_requirement_EBT.xlsx gen req grams:   0%|          | 0/184 [00:00<?, ?it/s]

EBT full_pure_example_gen_requirement_EBT.xlsx gen code grams:   0%|          | 0/184 [00:00<?, ?it/s]

EBT writing full_pure_example_gen_requirement_EBT:   0%|          | 0/3312 [00:00<?, ?it/s]

EBT test req grams:   0%|          | 0/18 [00:00<?, ?it/s]

EBT test code grams:   0%|          | 0/18 [00:00<?, ?it/s]

EBT full_pure_example_gen_Code_EBT.xlsx gen req grams:   0%|          | 0/178 [00:00<?, ?it/s]

EBT full_pure_example_gen_Code_EBT.xlsx gen code grams:   0%|          | 0/178 [00:00<?, ?it/s]

EBT writing full_pure_example_gen_Code_EBT:   0%|          | 0/3204 [00:00<?, ?it/s]

iTrust test req grams:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust test code grams:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust full_pure_gen_requirement_iTrust.xlsx gen req grams:   0%|          | 0/764 [00:00<?, ?it/s]

iTrust full_pure_gen_requirement_iTrust.xlsx gen code grams:   0%|          | 0/764 [00:00<?, ?it/s]

iTrust writing full_pure_gen_requirement_iTrust:   0%|          | 0/58064 [00:00<?, ?it/s]

iTrust test req grams:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust test code grams:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust full_pure_gen_Code_iTrust.xlsx gen req grams:   0%|          | 0/692 [00:00<?, ?it/s]

iTrust full_pure_gen_Code_iTrust.xlsx gen code grams:   0%|          | 0/692 [00:00<?, ?it/s]

iTrust writing full_pure_gen_Code_iTrust:   0%|          | 0/52592 [00:00<?, ?it/s]

iTrust test req grams:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust test code grams:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust full_pure_example_gen_requirement_iTrust.xlsx gen req grams:   0%|          | 0/474 [00:00<?, ?it/s]

iTrust full_pure_example_gen_requirement_iTrust.xlsx gen code grams:   0%|          | 0/474 [00:00<?, ?it/s]

iTrust writing full_pure_example_gen_requirement_iTrust:   0%|          | 0/36024 [00:00<?, ?it/s]

iTrust test req grams:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust test code grams:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust full_pure_example_gen_Code_iTrust.xlsx gen req grams:   0%|          | 0/758 [00:00<?, ?it/s]

iTrust full_pure_example_gen_Code_iTrust.xlsx gen code grams:   0%|          | 0/758 [00:00<?, ?it/s]

iTrust writing full_pure_example_gen_Code_iTrust:   0%|          | 0/57608 [00:00<?, ?it/s]

eTOUR test req grams:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR test code grams:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR full_pure_gen_requirement_eTOUR.xlsx gen req grams:   0%|          | 0/616 [00:00<?, ?it/s]

eTOUR full_pure_gen_requirement_eTOUR.xlsx gen code grams:   0%|          | 0/616 [00:00<?, ?it/s]

eTOUR writing full_pure_gen_requirement_eTOUR:   0%|          | 0/36960 [00:00<?, ?it/s]

eTOUR test req grams:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR test code grams:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR full_pure_gen_Code_eTOUR.xlsx gen req grams:   0%|          | 0/598 [00:00<?, ?it/s]

eTOUR full_pure_gen_Code_eTOUR.xlsx gen code grams:   0%|          | 0/598 [00:00<?, ?it/s]

eTOUR writing full_pure_gen_Code_eTOUR:   0%|          | 0/35880 [00:00<?, ?it/s]

eTOUR test req grams:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR test code grams:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR full_pure_example_gen_requirement_eTOUR.xlsx gen req grams:   0%|          | 0/600 [00:00<?, ?it/s]

eTOUR full_pure_example_gen_requirement_eTOUR.xlsx gen code grams:   0%|          | 0/600 [00:00<?, ?it/s]

eTOUR writing full_pure_example_gen_requirement_eTOUR:   0%|          | 0/36000 [00:00<?, ?it/s]

eTOUR test req grams:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR test code grams:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR full_pure_example_gen_Code_eTOUR.xlsx gen req grams:   0%|          | 0/596 [00:00<?, ?it/s]

eTOUR full_pure_example_gen_Code_eTOUR.xlsx gen code grams:   0%|          | 0/596 [00:00<?, ?it/s]

eTOUR writing full_pure_example_gen_Code_eTOUR:   0%|          | 0/35760 [00:00<?, ?it/s]

RETRO test req grams:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO test code grams:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO full_pure_gen_requirement_RETRO.xlsx gen req grams:   0%|          | 0/316 [00:00<?, ?it/s]

RETRO full_pure_gen_requirement_RETRO.xlsx gen code grams:   0%|          | 0/316 [00:00<?, ?it/s]

RETRO writing full_pure_gen_requirement_RETRO:   0%|          | 0/9480 [00:00<?, ?it/s]

RETRO test req grams:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO test code grams:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO full_pure_gen_Code_RETRO.xlsx gen req grams:   0%|          | 0/540 [00:00<?, ?it/s]

RETRO full_pure_gen_Code_RETRO.xlsx gen code grams:   0%|          | 0/540 [00:00<?, ?it/s]

RETRO writing full_pure_gen_Code_RETRO:   0%|          | 0/16200 [00:00<?, ?it/s]

RETRO test req grams:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO test code grams:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO full_pure_example_gen_requirement_RETRO.xlsx gen req grams:   0%|          | 0/208 [00:00<?, ?it/s]

RETRO full_pure_example_gen_requirement_RETRO.xlsx gen code grams:   0%|          | 0/208 [00:00<?, ?it/s]

RETRO writing full_pure_example_gen_requirement_RETRO:   0%|          | 0/6240 [00:00<?, ?it/s]

RETRO test req grams:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO test code grams:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO full_pure_example_gen_Code_RETRO.xlsx gen req grams:   0%|          | 0/286 [00:00<?, ?it/s]

RETRO full_pure_example_gen_Code_RETRO.xlsx gen code grams:   0%|          | 0/286 [00:00<?, ?it/s]

RETRO writing full_pure_example_gen_Code_RETRO:   0%|          | 0/8580 [00:00<?, ?it/s]

Pairwise CSV saved to: /kaggle/working/test_vs_gen_token3gram_pairs


Unnamed: 0,dataset,gen_file,test_rows,gen_rows,pairs,avg_similarity,pairs_csv,test_path,gen_path
3,EBT,full_pure_example_gen_Code_EBT.xlsx,18,178,3204,0.043963,/kaggle/working/test_vs_gen_token3gram_pairs/E...,/kaggle/input/traceability/test/test/EBT_test.csv,/kaggle/input/traceability/gemini/gemini/full_...
2,EBT,full_pure_example_gen_requirement_EBT.xlsx,18,184,3312,0.030074,/kaggle/working/test_vs_gen_token3gram_pairs/E...,/kaggle/input/traceability/test/test/EBT_test.csv,/kaggle/input/traceability/gemini/gemini/full_...
1,EBT,full_pure_gen_Code_EBT.xlsx,18,176,3168,0.040814,/kaggle/working/test_vs_gen_token3gram_pairs/E...,/kaggle/input/traceability/test/test/EBT_test.csv,/kaggle/input/traceability/gemini/gemini/full_...
0,EBT,full_pure_gen_requirement_EBT.xlsx,18,196,3528,0.025988,/kaggle/working/test_vs_gen_token3gram_pairs/E...,/kaggle/input/traceability/test/test/EBT_test.csv,/kaggle/input/traceability/gemini/gemini/full_...
15,RETRO,full_pure_example_gen_Code_RETRO.xlsx,30,286,8580,0.071021,/kaggle/working/test_vs_gen_token3gram_pairs/R...,/kaggle/input/traceability/test/test/RETRO_tes...,/kaggle/input/traceability/gemini/gemini/full_...
14,RETRO,full_pure_example_gen_requirement_RETRO.xlsx,30,208,6240,0.031953,/kaggle/working/test_vs_gen_token3gram_pairs/R...,/kaggle/input/traceability/test/test/RETRO_tes...,/kaggle/input/traceability/gemini/gemini/full_...
13,RETRO,full_pure_gen_Code_RETRO.xlsx,30,540,16200,0.045278,/kaggle/working/test_vs_gen_token3gram_pairs/R...,/kaggle/input/traceability/test/test/RETRO_tes...,/kaggle/input/traceability/gemini/gemini/full_...
12,RETRO,full_pure_gen_requirement_RETRO.xlsx,30,316,9480,0.023063,/kaggle/working/test_vs_gen_token3gram_pairs/R...,/kaggle/input/traceability/test/test/RETRO_tes...,/kaggle/input/traceability/gemini/gemini/full_...
11,eTOUR,full_pure_example_gen_Code_eTOUR.xlsx,60,596,35760,0.076595,/kaggle/working/test_vs_gen_token3gram_pairs/e...,/kaggle/input/traceability/test/test/eTOUR_tes...,/kaggle/input/traceability/gemini/gemini/full_...
10,eTOUR,full_pure_example_gen_requirement_eTOUR.xlsx,60,600,36000,0.028104,/kaggle/working/test_vs_gen_token3gram_pairs/e...,/kaggle/input/traceability/test/test/eTOUR_tes...,/kaggle/input/traceability/gemini/gemini/full_...


## 6. 计算gpt-3.5的增强数据与测试数据的相似度

In [8]:
import os
import math
import csv
import pandas as pd
from tqdm.auto import tqdm

# =========================
# Paths
# =========================
TEST_DIR = "/kaggle/input/traceability/test/test"
GEN_DIR  = "/kaggle/input/traceability/gpt3.5/gpt3.5"
OUT_DIR  = "/kaggle/working/test_vs_gen_token3gram_pairs"
os.makedirs(OUT_DIR, exist_ok=True)

DATASETS = ["EBT", "iTrust", "eTOUR", "RETRO"]

# 需要计算的 gen 文件模式（每个文件本身就是一个 gen_set：三列 source_text/target_text/label）
GEN_PATTERNS = [
    "full_pure_gen_requirement_{ds}.xlsx",
    "full_pure_gen_Code_{ds}.xlsx",
    "full_pure_example_gen_requirement_{ds}.xlsx",
    "full_pure_example_gen_Code_{ds}.xlsx",
]

REQUIRED_COLS = ["source_text", "target_text", "label"]

# =========================
# Token 3-gram Jaccard
# =========================
def normalize_and_tokenize(s: str):
    if s is None or (isinstance(s, float) and math.isnan(s)):
        return []
    return str(s).strip().split()

def token_ngrams(tokens, n=3):
    if len(tokens) < n:
        return set()
    return {tuple(tokens[i:i+n]) for i in range(len(tokens) - n + 1)}

def jaccard(a: set, b: set) -> float:
    if not a and not b:
        return 1.0
    if not a or not b:
        return 0.0
    return len(a & b) / len(a | b)

# =========================
# Loaders
# =========================
def load_test_set(ds: str):
    path = os.path.join(TEST_DIR, f"{ds}_test.csv")
    if not os.path.exists(path):
        raise FileNotFoundError(f"Missing test set: {path}")
    df = pd.read_csv(path)

    # test csv: 三列 source_text/target_text/label（或至少前两列是这两个）
    if "source_text" in df.columns and "target_text" in df.columns:
        req = df["source_text"].tolist()
        code = df["target_text"].tolist()
    else:
        if df.shape[1] < 2:
            raise ValueError(f"{ds}_test.csv must have at least 2 columns.")
        req = df.iloc[:, 0].tolist()
        code = df.iloc[:, 1].tolist()

    return req, code, path

def load_gen_xlsx(path: str):
    if not os.path.exists(path):
        raise FileNotFoundError(f"Missing gen file: {path}")
    df = pd.read_excel(path)

    missing = [c for c in REQUIRED_COLS if c not in df.columns]
    if missing:
        raise ValueError(f"{os.path.basename(path)} missing columns: {missing}. "
                         f"Got columns: {list(df.columns)}")

    req = df["source_text"].tolist()
    code = df["target_text"].tolist()
    return req, code, df

# =========================
# Compute & save pairwise (test × genfile)
# =========================
def compute_and_save_test_vs_genfile(ds: str, gen_file: str):
    test_req, test_code, test_path = load_test_set(ds)

    gen_path = os.path.join(GEN_DIR, gen_file)
    gen_req, gen_code, gen_df = load_gen_xlsx(gen_path)

    # 预计算 grams
    test_req_grams  = [token_ngrams(normalize_and_tokenize(x), 3)
                       for x in tqdm(test_req,  desc=f"{ds} test req grams")]
    test_code_grams = [token_ngrams(normalize_and_tokenize(x), 3)
                       for x in tqdm(test_code, desc=f"{ds} test code grams")]

    gen_req_grams   = [token_ngrams(normalize_and_tokenize(x), 3)
                       for x in tqdm(gen_req,  desc=f"{ds} {gen_file} gen req grams")]
    gen_code_grams  = [token_ngrams(normalize_and_tokenize(x), 3)
                       for x in tqdm(gen_code, desc=f"{ds} {gen_file} gen code grams")]

    n_te, n_ge = len(test_req_grams), len(gen_req_grams)
    total_pairs = n_te * n_ge

    tag = os.path.splitext(gen_file)[0]  # 去掉 .xlsx
    out_csv = os.path.join(OUT_DIR, f"{ds}_test_vs_{tag}_token3gram_jaccard_pairs.csv")

    sum_avg = 0.0
    written = 0

    with open(out_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=["test_idx", "gen_idx", "req_sim", "code_sim", "avg_sim"]
        )
        writer.writeheader()

        pbar = tqdm(total=total_pairs, desc=f"{ds} writing {tag}")
        for ti in range(n_te):
            for gi in range(n_ge):
                req_sim = jaccard(test_req_grams[ti], gen_req_grams[gi])
                code_sim = jaccard(test_code_grams[ti], gen_code_grams[gi])
                avg_sim = (req_sim + code_sim) / 2.0

                writer.writerow({
                    "test_idx": ti,
                    "gen_idx": gi,
                    "req_sim": req_sim,
                    "code_sim": code_sim,
                    "avg_sim": avg_sim
                })

                sum_avg += avg_sim
                written += 1
                pbar.update(1)
        pbar.close()

    avg_similarity = sum_avg / written if written else float("nan")

    return {
        "dataset": ds,
        "gen_file": gen_file,
        "test_rows": n_te,
        "gen_rows": n_ge,
        "pairs": written,
        "avg_similarity": avg_similarity,
        "pairs_csv": out_csv,
        "test_path": test_path,
        "gen_path": gen_path,
    }

# =========================
# Run all datasets × all gen files
# =========================
results = []
for ds in DATASETS:
    for pattern in GEN_PATTERNS:
        gen_file = pattern.format(ds=ds)
        results.append(compute_and_save_test_vs_genfile(ds, gen_file))

res_df = pd.DataFrame(results).sort_values(["dataset", "gen_file"])
print("Pairwise CSV saved to:", OUT_DIR)
display(res_df)


EBT test req grams:   0%|          | 0/18 [00:00<?, ?it/s]

EBT test code grams:   0%|          | 0/18 [00:00<?, ?it/s]

EBT full_pure_gen_requirement_EBT.xlsx gen req grams:   0%|          | 0/196 [00:00<?, ?it/s]

EBT full_pure_gen_requirement_EBT.xlsx gen code grams:   0%|          | 0/196 [00:00<?, ?it/s]

EBT writing full_pure_gen_requirement_EBT:   0%|          | 0/3528 [00:00<?, ?it/s]

EBT test req grams:   0%|          | 0/18 [00:00<?, ?it/s]

EBT test code grams:   0%|          | 0/18 [00:00<?, ?it/s]

EBT full_pure_gen_Code_EBT.xlsx gen req grams:   0%|          | 0/166 [00:00<?, ?it/s]

EBT full_pure_gen_Code_EBT.xlsx gen code grams:   0%|          | 0/166 [00:00<?, ?it/s]

EBT writing full_pure_gen_Code_EBT:   0%|          | 0/2988 [00:00<?, ?it/s]

EBT test req grams:   0%|          | 0/18 [00:00<?, ?it/s]

EBT test code grams:   0%|          | 0/18 [00:00<?, ?it/s]

EBT full_pure_example_gen_requirement_EBT.xlsx gen req grams:   0%|          | 0/184 [00:00<?, ?it/s]

EBT full_pure_example_gen_requirement_EBT.xlsx gen code grams:   0%|          | 0/184 [00:00<?, ?it/s]

EBT writing full_pure_example_gen_requirement_EBT:   0%|          | 0/3312 [00:00<?, ?it/s]

EBT test req grams:   0%|          | 0/18 [00:00<?, ?it/s]

EBT test code grams:   0%|          | 0/18 [00:00<?, ?it/s]

EBT full_pure_example_gen_Code_EBT.xlsx gen req grams:   0%|          | 0/180 [00:00<?, ?it/s]

EBT full_pure_example_gen_Code_EBT.xlsx gen code grams:   0%|          | 0/180 [00:00<?, ?it/s]

EBT writing full_pure_example_gen_Code_EBT:   0%|          | 0/3240 [00:00<?, ?it/s]

iTrust test req grams:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust test code grams:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust full_pure_gen_requirement_iTrust.xlsx gen req grams:   0%|          | 0/688 [00:00<?, ?it/s]

iTrust full_pure_gen_requirement_iTrust.xlsx gen code grams:   0%|          | 0/688 [00:00<?, ?it/s]

iTrust writing full_pure_gen_requirement_iTrust:   0%|          | 0/52288 [00:00<?, ?it/s]

iTrust test req grams:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust test code grams:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust full_pure_gen_Code_iTrust.xlsx gen req grams:   0%|          | 0/772 [00:00<?, ?it/s]

iTrust full_pure_gen_Code_iTrust.xlsx gen code grams:   0%|          | 0/772 [00:00<?, ?it/s]

iTrust writing full_pure_gen_Code_iTrust:   0%|          | 0/58672 [00:00<?, ?it/s]

iTrust test req grams:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust test code grams:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust full_pure_example_gen_requirement_iTrust.xlsx gen req grams:   0%|          | 0/786 [00:00<?, ?it/s]

iTrust full_pure_example_gen_requirement_iTrust.xlsx gen code grams:   0%|          | 0/786 [00:00<?, ?it/s]

iTrust writing full_pure_example_gen_requirement_iTrust:   0%|          | 0/59736 [00:00<?, ?it/s]

iTrust test req grams:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust test code grams:   0%|          | 0/76 [00:00<?, ?it/s]

iTrust full_pure_example_gen_Code_iTrust.xlsx gen req grams:   0%|          | 0/738 [00:00<?, ?it/s]

iTrust full_pure_example_gen_Code_iTrust.xlsx gen code grams:   0%|          | 0/738 [00:00<?, ?it/s]

iTrust writing full_pure_example_gen_Code_iTrust:   0%|          | 0/56088 [00:00<?, ?it/s]

eTOUR test req grams:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR test code grams:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR full_pure_gen_requirement_eTOUR.xlsx gen req grams:   0%|          | 0/554 [00:00<?, ?it/s]

eTOUR full_pure_gen_requirement_eTOUR.xlsx gen code grams:   0%|          | 0/554 [00:00<?, ?it/s]

eTOUR writing full_pure_gen_requirement_eTOUR:   0%|          | 0/33240 [00:00<?, ?it/s]

eTOUR test req grams:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR test code grams:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR full_pure_gen_Code_eTOUR.xlsx gen req grams:   0%|          | 0/602 [00:00<?, ?it/s]

eTOUR full_pure_gen_Code_eTOUR.xlsx gen code grams:   0%|          | 0/602 [00:00<?, ?it/s]

eTOUR writing full_pure_gen_Code_eTOUR:   0%|          | 0/36120 [00:00<?, ?it/s]

eTOUR test req grams:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR test code grams:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR full_pure_example_gen_requirement_eTOUR.xlsx gen req grams:   0%|          | 0/544 [00:00<?, ?it/s]

eTOUR full_pure_example_gen_requirement_eTOUR.xlsx gen code grams:   0%|          | 0/544 [00:00<?, ?it/s]

eTOUR writing full_pure_example_gen_requirement_eTOUR:   0%|          | 0/32640 [00:00<?, ?it/s]

eTOUR test req grams:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR test code grams:   0%|          | 0/60 [00:00<?, ?it/s]

eTOUR full_pure_example_gen_Code_eTOUR.xlsx gen req grams:   0%|          | 0/600 [00:00<?, ?it/s]

eTOUR full_pure_example_gen_Code_eTOUR.xlsx gen code grams:   0%|          | 0/600 [00:00<?, ?it/s]

eTOUR writing full_pure_example_gen_Code_eTOUR:   0%|          | 0/36000 [00:00<?, ?it/s]

RETRO test req grams:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO test code grams:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO full_pure_gen_requirement_RETRO.xlsx gen req grams:   0%|          | 0/282 [00:00<?, ?it/s]

RETRO full_pure_gen_requirement_RETRO.xlsx gen code grams:   0%|          | 0/282 [00:00<?, ?it/s]

RETRO writing full_pure_gen_requirement_RETRO:   0%|          | 0/8460 [00:00<?, ?it/s]

RETRO test req grams:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO test code grams:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO full_pure_gen_Code_RETRO.xlsx gen req grams:   0%|          | 0/316 [00:00<?, ?it/s]

RETRO full_pure_gen_Code_RETRO.xlsx gen code grams:   0%|          | 0/316 [00:00<?, ?it/s]

RETRO writing full_pure_gen_Code_RETRO:   0%|          | 0/9480 [00:00<?, ?it/s]

RETRO test req grams:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO test code grams:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO full_pure_example_gen_requirement_RETRO.xlsx gen req grams:   0%|          | 0/282 [00:00<?, ?it/s]

RETRO full_pure_example_gen_requirement_RETRO.xlsx gen code grams:   0%|          | 0/282 [00:00<?, ?it/s]

RETRO writing full_pure_example_gen_requirement_RETRO:   0%|          | 0/8460 [00:00<?, ?it/s]

RETRO test req grams:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO test code grams:   0%|          | 0/30 [00:00<?, ?it/s]

RETRO full_pure_example_gen_Code_RETRO.xlsx gen req grams:   0%|          | 0/288 [00:00<?, ?it/s]

RETRO full_pure_example_gen_Code_RETRO.xlsx gen code grams:   0%|          | 0/288 [00:00<?, ?it/s]

RETRO writing full_pure_example_gen_Code_RETRO:   0%|          | 0/8640 [00:00<?, ?it/s]

Pairwise CSV saved to: /kaggle/working/test_vs_gen_token3gram_pairs


Unnamed: 0,dataset,gen_file,test_rows,gen_rows,pairs,avg_similarity,pairs_csv,test_path,gen_path
3,EBT,full_pure_example_gen_Code_EBT.xlsx,18,180,3240,0.043626,/kaggle/working/test_vs_gen_token3gram_pairs/E...,/kaggle/input/traceability/test/test/EBT_test.csv,/kaggle/input/traceability/gpt3.5/gpt3.5/full_...
2,EBT,full_pure_example_gen_requirement_EBT.xlsx,18,184,3312,0.027291,/kaggle/working/test_vs_gen_token3gram_pairs/E...,/kaggle/input/traceability/test/test/EBT_test.csv,/kaggle/input/traceability/gpt3.5/gpt3.5/full_...
1,EBT,full_pure_gen_Code_EBT.xlsx,18,166,2988,0.040173,/kaggle/working/test_vs_gen_token3gram_pairs/E...,/kaggle/input/traceability/test/test/EBT_test.csv,/kaggle/input/traceability/gpt3.5/gpt3.5/full_...
0,EBT,full_pure_gen_requirement_EBT.xlsx,18,196,3528,0.025347,/kaggle/working/test_vs_gen_token3gram_pairs/E...,/kaggle/input/traceability/test/test/EBT_test.csv,/kaggle/input/traceability/gpt3.5/gpt3.5/full_...
15,RETRO,full_pure_example_gen_Code_RETRO.xlsx,30,288,8640,0.074324,/kaggle/working/test_vs_gen_token3gram_pairs/R...,/kaggle/input/traceability/test/test/RETRO_tes...,/kaggle/input/traceability/gpt3.5/gpt3.5/full_...
14,RETRO,full_pure_example_gen_requirement_RETRO.xlsx,30,282,8460,0.030039,/kaggle/working/test_vs_gen_token3gram_pairs/R...,/kaggle/input/traceability/test/test/RETRO_tes...,/kaggle/input/traceability/gpt3.5/gpt3.5/full_...
13,RETRO,full_pure_gen_Code_RETRO.xlsx,30,316,9480,0.069521,/kaggle/working/test_vs_gen_token3gram_pairs/R...,/kaggle/input/traceability/test/test/RETRO_tes...,/kaggle/input/traceability/gpt3.5/gpt3.5/full_...
12,RETRO,full_pure_gen_requirement_RETRO.xlsx,30,282,8460,0.022254,/kaggle/working/test_vs_gen_token3gram_pairs/R...,/kaggle/input/traceability/test/test/RETRO_tes...,/kaggle/input/traceability/gpt3.5/gpt3.5/full_...
11,eTOUR,full_pure_example_gen_Code_eTOUR.xlsx,60,600,36000,0.076572,/kaggle/working/test_vs_gen_token3gram_pairs/e...,/kaggle/input/traceability/test/test/eTOUR_tes...,/kaggle/input/traceability/gpt3.5/gpt3.5/full_...
10,eTOUR,full_pure_example_gen_requirement_eTOUR.xlsx,60,544,32640,0.039311,/kaggle/working/test_vs_gen_token3gram_pairs/e...,/kaggle/input/traceability/test/test/eTOUR_tes...,/kaggle/input/traceability/gpt3.5/gpt3.5/full_...
