In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.metrics import accuracy_score, f1_score, adjusted_rand_score, normalized_mutual_info_score, recall_score
from scipy.stats import entropy
from collections import defaultdict
import warnings
from sklearn.exceptions import ConvergenceWarning

# Add this near the top of your script or notebook
warnings.filterwarnings("ignore", category=ConvergenceWarning)

## Function

In [7]:
total_labeled_targets=[50, 100, 250, 500]

def label_propagation(X, y, strategy="uniform", 
                             initial_labels_per_class=5, seed=42, repeats=5):
    total_labeled_targets = [50, 100, 250, 500]
    n_classes = len(np.unique(y))
    total_samples = len(y)

    all_runs_metrics = defaultdict(list)

    for r in range(repeats):
        rng = np.random.RandomState(seed + r)
        labeled_mask = np.zeros(total_samples, dtype=bool)

        for cls in range(n_classes):
            cls_indices = np.where(y == cls)[0]
            labeled_indices = rng.choice(cls_indices, initial_labels_per_class, replace=False)
            labeled_mask[labeled_indices] = True

        y_partial = -np.ones(total_samples)
        y_partial[labeled_mask] = y[labeled_mask]
        queried = labeled_mask.sum()
        batch_sizes = [10, 30, 50]
        batch_idx = 0
        evaluated_targets = set()

        while queried <= total_labeled_targets[-1]:
            batch_size = batch_sizes[min(batch_idx, len(batch_sizes) - 1)]
            model = LabelPropagation(kernel='rbf', gamma=20, max_iter=1000)
            model.fit(X, y_partial)
            probs = model.label_distributions_
            score = entropy(probs.T)
            unlabeled_indices = np.where(~labeled_mask)[0]

            # === Evaluation ===
            for target in total_labeled_targets:
                if queried >= target and target not in evaluated_targets:
                    pred = model.transduction_
                    all_runs_metrics[target, "Accuracy"].append(accuracy_score(y, pred))
                    all_runs_metrics[target, "ARI"].append(adjusted_rand_score(y, pred))
                    all_runs_metrics[target, "NMI"].append(normalized_mutual_info_score(y, pred))
                    all_runs_metrics[target, "F1"].append(f1_score(y, pred, average="macro"))
                    all_runs_metrics[target, "Recall"].append(recall_score(y, pred, average="macro"))
                    evaluated_targets.add(target)
                    batch_idx += 1

            if queried > total_labeled_targets[-1]:
                break

            # === Query selection ===
            if strategy == "entropy":
                query_indices = unlabeled_indices[np.argsort(score[unlabeled_indices])[-batch_size:]]

            elif strategy == "uniform":
                query_indices = []
                per_class_quota = batch_size // n_classes
                for cls in range(n_classes):
                    cls_unlabeled = np.setdiff1d(np.where(y == cls)[0], np.where(labeled_mask)[0])
                    if len(cls_unlabeled) == 0:
                        continue
                    cls_entropy_scores = score[cls_unlabeled]
                    top_cls_indices = cls_unlabeled[np.argsort(cls_entropy_scores)[-min(per_class_quota, len(cls_unlabeled)):]]
                    query_indices.extend(top_cls_indices)
                remaining = batch_size - len(query_indices)
                if remaining > 0:
                    remaining_indices = np.setdiff1d(unlabeled_indices, query_indices)
                    top_remaining = remaining_indices[np.argsort(score[remaining_indices])[-remaining:]]
                    query_indices.extend(top_remaining)

            elif strategy == "random":
                query_indices = rng.choice(unlabeled_indices, batch_size, replace=False)
            
            elif strategy == "random_then_entropy_1":
                if queried < 100:
                    query_indices = rng.choice(unlabeled_indices, batch_size, replace=False)
                else:
                    query_indices = []
                    per_class_quota = batch_size // n_classes
                    for cls in range(n_classes):
                        cls_unlabeled = np.setdiff1d(np.where(y == cls)[0], np.where(labeled_mask)[0])
                        if len(cls_unlabeled) == 0:
                            continue
                        cls_entropy_scores = score[cls_unlabeled]
                        top_cls_indices = cls_unlabeled[np.argsort(cls_entropy_scores)[-min(per_class_quota, len(cls_unlabeled)):]]
                        query_indices.extend(top_cls_indices)
                    remaining = batch_size - len(query_indices)
                    if remaining > 0:
                        remaining_indices = np.setdiff1d(unlabeled_indices, query_indices)
                        top_remaining = remaining_indices[np.argsort(score[remaining_indices])[-remaining:]]
                        query_indices.extend(top_remaining)
                    
            elif strategy == "random_then_entropy_2":
                if queried < 250:
                    query_indices = rng.choice(unlabeled_indices, batch_size, replace=False)
                else:
                    query_indices = []
                    per_class_quota = batch_size // n_classes
                    for cls in range(n_classes):
                        cls_unlabeled = np.setdiff1d(np.where(y == cls)[0], np.where(labeled_mask)[0])
                        if len(cls_unlabeled) == 0:
                            continue
                        cls_entropy_scores = score[cls_unlabeled]
                        top_cls_indices = cls_unlabeled[np.argsort(cls_entropy_scores)[-min(per_class_quota, len(cls_unlabeled)):]]
                        query_indices.extend(top_cls_indices)
                    remaining = batch_size - len(query_indices)
                    if remaining > 0:
                        remaining_indices = np.setdiff1d(unlabeled_indices, query_indices)
                        top_remaining = remaining_indices[np.argsort(score[remaining_indices])[-remaining:]]
                        query_indices.extend(top_remaining)
                
            else:
                print(strategy)
                raise ValueError("Invalid strategy.")

            labeled_mask[query_indices] = True
            y_partial[query_indices] = y[query_indices]
            queried += len(query_indices)

    final_results = []
    for target in total_labeled_targets:
        metrics = {
            "Labeled": target,
            "Accuracy": np.mean(all_runs_metrics[(target, "Accuracy")]),
            "ARI": np.mean(all_runs_metrics[(target, "ARI")]),
            "NMI": np.mean(all_runs_metrics[(target, "NMI")]),
            "F1": np.mean(all_runs_metrics[(target, "F1")]),
            "Recall": np.mean(all_runs_metrics[(target, "Recall")]),
        }
        final_results.append(metrics)

    return final_results

In [8]:
def label_spreading(X, y, strategy="uniform", 
                                             initial_labels_per_class=5, seed=42, repeats=5):
    total_labeled_targets = [50, 100, 250, 500]
    n_classes = len(np.unique(y))
    total_samples = len(y)

    all_runs_metrics = defaultdict(list)

    for r in range(repeats):
        rng = np.random.RandomState(seed + r)
        labeled_mask = np.zeros(total_samples, dtype=bool)

        for cls in range(n_classes):
            cls_indices = np.where(y == cls)[0]
            labeled_indices = rng.choice(cls_indices, initial_labels_per_class, replace=False)
            labeled_mask[labeled_indices] = True

        y_partial = -np.ones(total_samples)
        y_partial[labeled_mask] = y[labeled_mask]
        queried = labeled_mask.sum()
        batch_sizes = [10, 30, 50]
        batch_idx = 0
        evaluated_targets = set()

        while queried <= total_labeled_targets[-1]:
            batch_size = batch_sizes[min(batch_idx, len(batch_sizes) - 1)]
            model = LabelSpreading(kernel='rbf', gamma=20, max_iter=1000)
            model.fit(X, y_partial)
            probs = model.label_distributions_
            score = entropy(probs.T)
            unlabeled_indices = np.where(~labeled_mask)[0]

            for target in total_labeled_targets:
                if queried >= target and target not in evaluated_targets:
                    pred = model.transduction_
                    all_runs_metrics[target, "Accuracy"].append(accuracy_score(y, pred))
                    all_runs_metrics[target, "ARI"].append(adjusted_rand_score(y, pred))
                    all_runs_metrics[target, "NMI"].append(normalized_mutual_info_score(y, pred))
                    all_runs_metrics[target, "F1"].append(f1_score(y, pred, average="macro"))
                    all_runs_metrics[target, "Recall"].append(recall_score(y, pred, average="macro"))
                    evaluated_targets.add(target)
                    batch_idx += 1

            if queried > total_labeled_targets[-1]:
                break

            if strategy == "entropy":
                query_indices = unlabeled_indices[np.argsort(score[unlabeled_indices])[-batch_size:]]

            elif strategy == "uniform":
                query_indices = []
                per_class_quota = batch_size // n_classes
                for cls in range(n_classes):
                    cls_unlabeled = np.setdiff1d(np.where(y == cls)[0], np.where(labeled_mask)[0])
                    if len(cls_unlabeled) == 0:
                        continue
                    cls_entropy_scores = score[cls_unlabeled]
                    top_cls_indices = cls_unlabeled[np.argsort(cls_entropy_scores)[-min(per_class_quota, len(cls_unlabeled)):]]
                    query_indices.extend(top_cls_indices)
                remaining = batch_size - len(query_indices)
                if remaining > 0:
                    remaining_indices = np.setdiff1d(unlabeled_indices, query_indices)
                    top_remaining = remaining_indices[np.argsort(score[remaining_indices])[-remaining:]]
                    query_indices.extend(top_remaining)

            elif strategy == "random":
                query_indices = rng.choice(unlabeled_indices, batch_size, replace=False)

            elif strategy == "random_then_entropy_1":
                if queried < 100:
                    query_indices = rng.choice(unlabeled_indices, batch_size, replace=False)
                else:
                    query_indices = []
                    per_class_quota = batch_size // n_classes
                    for cls in range(n_classes):
                        cls_unlabeled = np.setdiff1d(np.where(y == cls)[0], np.where(labeled_mask)[0])
                        if len(cls_unlabeled) == 0:
                            continue
                        cls_entropy_scores = score[cls_unlabeled]
                        top_cls_indices = cls_unlabeled[np.argsort(cls_entropy_scores)[-min(per_class_quota, len(cls_unlabeled)):]]
                        query_indices.extend(top_cls_indices)
                    remaining = batch_size - len(query_indices)
                    if remaining > 0:
                        remaining_indices = np.setdiff1d(unlabeled_indices, query_indices)
                        top_remaining = remaining_indices[np.argsort(score[remaining_indices])[-remaining:]]
                        query_indices.extend(top_remaining)

            elif strategy == "random_then_entropy_2":
                if queried < 250:
                    query_indices = rng.choice(unlabeled_indices, batch_size, replace=False)
                else:
                    query_indices = []
                    per_class_quota = batch_size // n_classes
                    for cls in range(n_classes):
                        cls_unlabeled = np.setdiff1d(np.where(y == cls)[0], np.where(labeled_mask)[0])
                        if len(cls_unlabeled) == 0:
                            continue
                        cls_entropy_scores = score[cls_unlabeled]
                        top_cls_indices = cls_unlabeled[np.argsort(cls_entropy_scores)[-min(per_class_quota, len(cls_unlabeled)):]]
                        query_indices.extend(top_cls_indices)
                    remaining = batch_size - len(query_indices)
                    if remaining > 0:
                        remaining_indices = np.setdiff1d(unlabeled_indices, query_indices)
                        top_remaining = remaining_indices[np.argsort(score[remaining_indices])[-remaining:]]
                        query_indices.extend(top_remaining)

            else:
                raise ValueError(f"Invalid strategy: {strategy}")

            labeled_mask[query_indices] = True
            y_partial[query_indices] = y[query_indices]
            queried += len(query_indices)

    final_results = []
    for target in total_labeled_targets:
        metrics = {
            "Labeled": target,
            "Accuracy": np.mean(all_runs_metrics[(target, "Accuracy")]),
            "ARI": np.mean(all_runs_metrics[(target, "ARI")]),
            "NMI": np.mean(all_runs_metrics[(target, "NMI")]),
            "F1": np.mean(all_runs_metrics[(target, "F1")]),
            "Recall": np.mean(all_runs_metrics[(target, "Recall")]),
        }
        final_results.append(metrics)

    return final_results

In [9]:
def run_active_learning(file_path, repeats=5, func="label_propagation"):
    df = pd.read_pickle(file_path)
    X = np.vstack(df["embedding"].values)
    y = pd.factorize(df["cluster"])[0]

    strategies = [
        ("random", "Random Sampling"),
        ("entropy", "Entropy-based"),
        ("uniform", "Class-balance Sampling"),
        ("random_then_entropy_1", "Random → Entropy (start <100)"),
        ("random_then_entropy_2", "Random → Entropy (start <250)")
    ]

    # Select the active learning simulation function
    if func == "label_propagation":
        simulate = label_propagation
    elif func == "label_spreading":
        simulate = label_spreading
    else:
        raise ValueError(f"Invalid func: {func}. Must be 'label_propagation' or 'label_spreading'.")

    results = {}
    for strategy_key, _ in strategies:
        results[strategy_key] = simulate(X, y, strategy=strategy_key, repeats=repeats)

    return results



def print_results(model_name, results_dict):
    print(f"=== {model_name} Accuracy Results ===\n")

    strategy_names = {
        "random": "Random Sampling",
        "entropy": "Entropy-based",
        "uniform": "Class-balance Sampling",
        "random_then_entropy_1": "Random → Entropy (start <100)",
        "random_then_entropy_2": "Random → Entropy (start <250)"
    }

    def print_al_metrics(name, metrics):
        print(f"\nActive Learning ({name}):")
        for m in metrics:
            print(f"  {m['Labeled']} labels → Acc: {m['Accuracy']:.3f} | "
                  f"ARI: {m['ARI']:.3f} | NMI: {m['NMI']:.3f} | "
                  f"F1: {m['F1']:.3f} | Recall: {m['Recall']:.3f}")

    for key, name in strategy_names.items():
        if key in results_dict:
            print_al_metrics(name, results_dict[key])

def save_results_to_csv(model_name, results_dict, func, programming_language="c", out_file="results_c.csv"):
    """
    Save active learning results into a CSV file with a long-format structure.
    """
    rows = []
    for strategy, metrics in results_dict.items():
        for m in metrics:
            rows.append({
                "programming_language": programming_language,
                "func": func,   # label_propagation or label_spreading
                "embedding": model_name,  # CodeBERT, GraphCodeBERT, etc.
                "strategy": strategy,
                "labeled": m["Labeled"],
                "accuracy": m["Accuracy"],
                "ari": m["ARI"],
                "nmi": m["NMI"],
                "f1": m["F1"],
                "recall": m["Recall"]
            })

    df = pd.DataFrame(rows)

    # Append if file exists, else create
    try:
        existing = pd.read_csv(out_file)
        df = pd.concat([existing, df], ignore_index=True)
    except FileNotFoundError:
        pass

    df.to_csv(out_file, index=False)
    print(f"Results saved to {out_file}")


def plot_accuracy(model_name, results_dict, func, programming_language="java", save_csv=True):
    plt.figure(figsize=(10, 6))

    strategy_styles = {
        "random": ("Random", "o", "-."),
        "entropy": ("Entropy", "o", "--"),
        "uniform": ("Uniform", "s", "--"),
        "random_then_entropy_1": ("Random → Entropy (<100)", "D", "-"),
        "random_then_entropy_2": ("Random → Entropy (<250)", "P", "-")
    }

    for key, (label, marker, style) in strategy_styles.items():
        if key in results_dict:
            xs = [m["Labeled"] for m in results_dict[key]]
            ys = [m["Accuracy"] for m in results_dict[key]]
            plt.plot(xs, ys, label=label, marker=marker, linestyle=style)

    plt.title(f"{model_name} - Accuracy vs. Labeled Samples")
    plt.xlabel("# Labeled Samples")
    plt.ylabel("Accuracy")
    plt.ylim(0, 1.05)
    plt.xticks([50, 100, 250, 500])
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    
    save_path = f"{model_name}_{func}_{programming_language}.png"
    plt.savefig(save_path, dpi=300, bbox_inches="tight")
    plt.close()
    
    if save_csv:
        save_results_to_csv(model_name, results_dict, func, programming_language)

## CodeBERT

### Label Propagation

In [10]:
codeBert_pp = run_active_learning("embeddings/cpp_CodeBERT_embeddings.pkl", func="label_propagation")

In [11]:
print_results("CodeBERT", codeBert_pp)

=== CodeBERT Accuracy Results ===


Active Learning (Random Sampling):
  50 labels → Acc: 0.407 | ARI: 0.154 | NMI: 0.272 | F1: 0.402 | Recall: 0.414
  100 labels → Acc: 0.542 | ARI: 0.271 | NMI: 0.398 | F1: 0.539 | Recall: 0.550
  250 labels → Acc: 0.724 | ARI: 0.499 | NMI: 0.582 | F1: 0.721 | Recall: 0.724
  500 labels → Acc: 0.873 | ARI: 0.744 | NMI: 0.777 | F1: 0.871 | Recall: 0.872

Active Learning (Entropy-based):
  50 labels → Acc: 0.407 | ARI: 0.154 | NMI: 0.272 | F1: 0.402 | Recall: 0.414
  100 labels → Acc: 0.447 | ARI: 0.186 | NMI: 0.302 | F1: 0.445 | Recall: 0.452
  250 labels → Acc: 0.457 | ARI: 0.193 | NMI: 0.317 | F1: 0.454 | Recall: 0.461
  500 labels → Acc: 0.496 | ARI: 0.227 | NMI: 0.355 | F1: 0.493 | Recall: 0.502

Active Learning (Class-balance Sampling):
  50 labels → Acc: 0.407 | ARI: 0.154 | NMI: 0.272 | F1: 0.402 | Recall: 0.414
  100 labels → Acc: 0.476 | ARI: 0.208 | NMI: 0.333 | F1: 0.475 | Recall: 0.486
  250 labels → Acc: 0.543 | ARI: 0.275 | NMI: 0.407 | F

In [12]:
plot_accuracy("CodeBERT", codeBert_pp, func="label_propagation")

Results saved to results_c.csv


### Label Spreading

In [13]:
codeBert_sp= run_active_learning("embeddings/cpp_CodeBERT_embeddings.pkl", func="label_spreading")
print_results("CodeBERT", codeBert_sp)

=== CodeBERT Accuracy Results ===


Active Learning (Random Sampling):
  50 labels → Acc: 0.391 | ARI: 0.139 | NMI: 0.261 | F1: 0.388 | Recall: 0.400
  100 labels → Acc: 0.531 | ARI: 0.258 | NMI: 0.389 | F1: 0.530 | Recall: 0.539
  250 labels → Acc: 0.720 | ARI: 0.491 | NMI: 0.575 | F1: 0.717 | Recall: 0.720
  500 labels → Acc: 0.868 | ARI: 0.731 | NMI: 0.771 | F1: 0.868 | Recall: 0.867

Active Learning (Entropy-based):
  50 labels → Acc: 0.391 | ARI: 0.139 | NMI: 0.261 | F1: 0.388 | Recall: 0.400
  100 labels → Acc: 0.435 | ARI: 0.176 | NMI: 0.300 | F1: 0.435 | Recall: 0.440
  250 labels → Acc: 0.463 | ARI: 0.190 | NMI: 0.336 | F1: 0.469 | Recall: 0.468
  500 labels → Acc: 0.501 | ARI: 0.214 | NMI: 0.367 | F1: 0.508 | Recall: 0.505

Active Learning (Class-balance Sampling):
  50 labels → Acc: 0.391 | ARI: 0.139 | NMI: 0.261 | F1: 0.388 | Recall: 0.400
  100 labels → Acc: 0.458 | ARI: 0.184 | NMI: 0.345 | F1: 0.468 | Recall: 0.469
  250 labels → Acc: 0.515 | ARI: 0.236 | NMI: 0.396 | F

In [14]:
plot_accuracy("CodeBERT", codeBert_sp, func="label_spreading")

Results saved to results_c.csv


## GraphCodeBERT

### Label Propagation

In [15]:
graphCodeBert_pp = run_active_learning("embeddings/cpp_GraphCodeBERT_embeddings.pkl", func="label_propagation")
print_results("GraphCodeBERT", graphCodeBert_pp)

=== GraphCodeBERT Accuracy Results ===


Active Learning (Random Sampling):
  50 labels → Acc: 0.242 | ARI: 0.011 | NMI: 0.233 | F1: 0.284 | Recall: 0.255
  100 labels → Acc: 0.410 | ARI: 0.059 | NMI: 0.404 | F1: 0.486 | Recall: 0.418
  250 labels → Acc: 0.629 | ARI: 0.235 | NMI: 0.594 | F1: 0.693 | Recall: 0.631
  500 labels → Acc: 0.823 | ARI: 0.600 | NMI: 0.767 | F1: 0.847 | Recall: 0.824

Active Learning (Entropy-based):
  50 labels → Acc: 0.242 | ARI: 0.011 | NMI: 0.233 | F1: 0.284 | Recall: 0.255
  100 labels → Acc: 0.458 | ARI: 0.161 | NMI: 0.493 | F1: 0.461 | Recall: 0.430
  250 labels → Acc: 0.708 | ARI: 0.512 | NMI: 0.752 | F1: 0.692 | Recall: 0.689
  500 labels → Acc: 0.934 | ARI: 0.880 | NMI: 0.919 | F1: 0.931 | Recall: 0.930

Active Learning (Class-balance Sampling):
  50 labels → Acc: 0.242 | ARI: 0.011 | NMI: 0.233 | F1: 0.284 | Recall: 0.255
  100 labels → Acc: 0.464 | ARI: 0.083 | NMI: 0.456 | F1: 0.551 | Recall: 0.475
  250 labels → Acc: 0.791 | ARI: 0.508 | NMI: 0.75

In [16]:
plot_accuracy("GraphCodeBERT", graphCodeBert_pp, func="label_propagation")

Results saved to results_c.csv


### Label Spreading

In [17]:
graphCodeBert_sp = run_active_learning("embeddings/cpp_GraphCodeBERT_embeddings.pkl", func="label_spreading")
print_results("GraphCodeBERT", graphCodeBert_sp)

=== GraphCodeBERT Accuracy Results ===


Active Learning (Random Sampling):
  50 labels → Acc: 0.242 | ARI: 0.011 | NMI: 0.233 | F1: 0.284 | Recall: 0.255
  100 labels → Acc: 0.410 | ARI: 0.059 | NMI: 0.404 | F1: 0.486 | Recall: 0.418
  250 labels → Acc: 0.628 | ARI: 0.237 | NMI: 0.591 | F1: 0.692 | Recall: 0.630
  500 labels → Acc: 0.823 | ARI: 0.600 | NMI: 0.767 | F1: 0.847 | Recall: 0.824

Active Learning (Entropy-based):
  50 labels → Acc: 0.242 | ARI: 0.011 | NMI: 0.233 | F1: 0.284 | Recall: 0.255
  100 labels → Acc: 0.458 | ARI: 0.161 | NMI: 0.493 | F1: 0.461 | Recall: 0.430
  250 labels → Acc: 0.708 | ARI: 0.512 | NMI: 0.752 | F1: 0.692 | Recall: 0.689
  500 labels → Acc: 0.934 | ARI: 0.880 | NMI: 0.919 | F1: 0.931 | Recall: 0.930

Active Learning (Class-balance Sampling):
  50 labels → Acc: 0.242 | ARI: 0.011 | NMI: 0.233 | F1: 0.284 | Recall: 0.255
  100 labels → Acc: 0.465 | ARI: 0.085 | NMI: 0.454 | F1: 0.551 | Recall: 0.476
  250 labels → Acc: 0.791 | ARI: 0.508 | NMI: 0.75

In [18]:
plot_accuracy("GraphCodeBERT", graphCodeBert_sp, func="label_spreading")

Results saved to results_c.csv


## UniXcoder

### Label Propogation

In [19]:
uniXcoder_pp = run_active_learning("embeddings/cpp_UniXcoder_embeddings.pkl", func="label_propagation")
print_results("UniXcoder", uniXcoder_pp)

=== UniXcoder Accuracy Results ===


Active Learning (Random Sampling):
  50 labels → Acc: 0.237 | ARI: 0.010 | NMI: 0.225 | F1: 0.275 | Recall: 0.249
  100 labels → Acc: 0.402 | ARI: 0.054 | NMI: 0.396 | F1: 0.476 | Recall: 0.411
  250 labels → Acc: 0.616 | ARI: 0.218 | NMI: 0.584 | F1: 0.683 | Recall: 0.619
  500 labels → Acc: 0.817 | ARI: 0.581 | NMI: 0.761 | F1: 0.843 | Recall: 0.819

Active Learning (Entropy-based):
  50 labels → Acc: 0.237 | ARI: 0.010 | NMI: 0.225 | F1: 0.275 | Recall: 0.249
  100 labels → Acc: 0.444 | ARI: 0.146 | NMI: 0.478 | F1: 0.444 | Recall: 0.415
  250 labels → Acc: 0.677 | ARI: 0.459 | NMI: 0.729 | F1: 0.660 | Recall: 0.656
  500 labels → Acc: 0.936 | ARI: 0.884 | NMI: 0.926 | F1: 0.932 | Recall: 0.932

Active Learning (Class-balance Sampling):
  50 labels → Acc: 0.237 | ARI: 0.010 | NMI: 0.225 | F1: 0.275 | Recall: 0.249
  100 labels → Acc: 0.458 | ARI: 0.079 | NMI: 0.452 | F1: 0.544 | Recall: 0.471
  250 labels → Acc: 0.782 | ARI: 0.493 | NMI: 0.742 | 

In [20]:
plot_accuracy("UniXcoder", uniXcoder_pp, func="label_propagation")

Results saved to results_c.csv


### Label Spreading

In [21]:
uniXcoder_sp = run_active_learning("embeddings/cpp_UniXcoder_embeddings.pkl", func="label_spreading")
print_results("UniXcoder", uniXcoder_sp)

=== UniXcoder Accuracy Results ===


Active Learning (Random Sampling):
  50 labels → Acc: 0.237 | ARI: 0.010 | NMI: 0.225 | F1: 0.275 | Recall: 0.249
  100 labels → Acc: 0.402 | ARI: 0.054 | NMI: 0.396 | F1: 0.477 | Recall: 0.411
  250 labels → Acc: 0.616 | ARI: 0.218 | NMI: 0.584 | F1: 0.683 | Recall: 0.619
  500 labels → Acc: 0.817 | ARI: 0.581 | NMI: 0.761 | F1: 0.843 | Recall: 0.819

Active Learning (Entropy-based):
  50 labels → Acc: 0.237 | ARI: 0.010 | NMI: 0.225 | F1: 0.275 | Recall: 0.249
  100 labels → Acc: 0.444 | ARI: 0.146 | NMI: 0.478 | F1: 0.444 | Recall: 0.415
  250 labels → Acc: 0.677 | ARI: 0.459 | NMI: 0.729 | F1: 0.660 | Recall: 0.656
  500 labels → Acc: 0.936 | ARI: 0.884 | NMI: 0.926 | F1: 0.932 | Recall: 0.932

Active Learning (Class-balance Sampling):
  50 labels → Acc: 0.237 | ARI: 0.010 | NMI: 0.225 | F1: 0.275 | Recall: 0.249
  100 labels → Acc: 0.458 | ARI: 0.079 | NMI: 0.452 | F1: 0.544 | Recall: 0.471
  250 labels → Acc: 0.782 | ARI: 0.493 | NMI: 0.742 | 

In [22]:
plot_accuracy("UniXcoder", uniXcoder_sp, func="label_spreading")

Results saved to results_c.csv


## CodeT5

### Label Propagation

In [23]:
codeT5_pp = run_active_learning("embeddings/cpp_CodeT5_embeddings.pkl", func="label_propagation")
print_results("CodeT5", codeT5_pp)

=== CodeT5 Accuracy Results ===


Active Learning (Random Sampling):
  50 labels → Acc: 0.473 | ARI: 0.214 | NMI: 0.348 | F1: 0.470 | Recall: 0.482
  100 labels → Acc: 0.606 | ARI: 0.344 | NMI: 0.464 | F1: 0.602 | Recall: 0.614
  250 labels → Acc: 0.782 | ARI: 0.586 | NMI: 0.654 | F1: 0.780 | Recall: 0.784
  500 labels → Acc: 0.906 | ARI: 0.804 | NMI: 0.827 | F1: 0.906 | Recall: 0.906

Active Learning (Entropy-based):
  50 labels → Acc: 0.473 | ARI: 0.214 | NMI: 0.348 | F1: 0.470 | Recall: 0.482
  100 labels → Acc: 0.494 | ARI: 0.235 | NMI: 0.364 | F1: 0.490 | Recall: 0.504
  250 labels → Acc: 0.520 | ARI: 0.256 | NMI: 0.389 | F1: 0.518 | Recall: 0.533
  500 labels → Acc: 0.567 | ARI: 0.303 | NMI: 0.430 | F1: 0.566 | Recall: 0.577

Active Learning (Class-balance Sampling):
  50 labels → Acc: 0.473 | ARI: 0.214 | NMI: 0.348 | F1: 0.470 | Recall: 0.482
  100 labels → Acc: 0.575 | ARI: 0.312 | NMI: 0.441 | F1: 0.577 | Recall: 0.589
  250 labels → Acc: 0.632 | ARI: 0.375 | NMI: 0.503 | F1:

In [24]:
plot_accuracy("CodeT5", codeT5_pp, func="label_propagation")

Results saved to results_c.csv


### Label Spreading

In [25]:
codeT5_sp = run_active_learning("embeddings/cpp_CodeT5_embeddings.pkl", func="label_spreading")
print_results("CodeT5", codeT5_sp)

=== CodeT5 Accuracy Results ===


Active Learning (Random Sampling):
  50 labels → Acc: 0.474 | ARI: 0.219 | NMI: 0.353 | F1: 0.471 | Recall: 0.484
  100 labels → Acc: 0.599 | ARI: 0.334 | NMI: 0.463 | F1: 0.598 | Recall: 0.608
  250 labels → Acc: 0.776 | ARI: 0.575 | NMI: 0.650 | F1: 0.776 | Recall: 0.778
  500 labels → Acc: 0.899 | ARI: 0.787 | NMI: 0.819 | F1: 0.900 | Recall: 0.900

Active Learning (Entropy-based):
  50 labels → Acc: 0.474 | ARI: 0.219 | NMI: 0.353 | F1: 0.471 | Recall: 0.484
  100 labels → Acc: 0.473 | ARI: 0.197 | NMI: 0.360 | F1: 0.479 | Recall: 0.478
  250 labels → Acc: 0.501 | ARI: 0.210 | NMI: 0.387 | F1: 0.521 | Recall: 0.510
  500 labels → Acc: 0.538 | ARI: 0.241 | NMI: 0.420 | F1: 0.559 | Recall: 0.547

Active Learning (Class-balance Sampling):
  50 labels → Acc: 0.474 | ARI: 0.219 | NMI: 0.353 | F1: 0.471 | Recall: 0.484
  100 labels → Acc: 0.530 | ARI: 0.249 | NMI: 0.414 | F1: 0.548 | Recall: 0.539
  250 labels → Acc: 0.587 | ARI: 0.311 | NMI: 0.467 | F1:

In [26]:
plot_accuracy("CodeT5", codeT5_sp, func="label_spreading")

Results saved to results_c.csv


## InCoder

### Label Propagation

In [27]:
InCoder_pp = run_active_learning("embeddings/cpp_InCoder_embeddings.pkl", func="label_propagation")
print_results("InCoder", InCoder_pp)

=== InCoder Accuracy Results ===


Active Learning (Random Sampling):
  50 labels → Acc: 0.618 | ARI: 0.377 | NMI: 0.525 | F1: 0.616 | Recall: 0.624
  100 labels → Acc: 0.737 | ARI: 0.527 | NMI: 0.627 | F1: 0.734 | Recall: 0.738
  250 labels → Acc: 0.870 | ARI: 0.748 | NMI: 0.783 | F1: 0.866 | Recall: 0.867
  500 labels → Acc: 0.940 | ARI: 0.877 | NMI: 0.887 | F1: 0.938 | Recall: 0.939

Active Learning (Entropy-based):
  50 labels → Acc: 0.618 | ARI: 0.377 | NMI: 0.525 | F1: 0.616 | Recall: 0.624
  100 labels → Acc: 0.680 | ARI: 0.461 | NMI: 0.579 | F1: 0.675 | Recall: 0.678
  250 labels → Acc: 0.707 | ARI: 0.489 | NMI: 0.596 | F1: 0.701 | Recall: 0.704
  500 labels → Acc: 0.760 | ARI: 0.562 | NMI: 0.654 | F1: 0.757 | Recall: 0.756

Active Learning (Class-balance Sampling):
  50 labels → Acc: 0.618 | ARI: 0.377 | NMI: 0.525 | F1: 0.616 | Recall: 0.624
  100 labels → Acc: 0.732 | ARI: 0.518 | NMI: 0.629 | F1: 0.730 | Recall: 0.735
  250 labels → Acc: 0.790 | ARI: 0.606 | NMI: 0.696 | F1

In [28]:
plot_accuracy("InCoder", InCoder_pp, func="label_propagation")

Results saved to results_c.csv


### Label Spreading

In [29]:
InCoder_sp = run_active_learning("embeddings/cpp_InCoder_embeddings.pkl", func="label_spreading")
print_results("InCoder", InCoder_sp)

=== InCoder Accuracy Results ===


Active Learning (Random Sampling):
  50 labels → Acc: 0.615 | ARI: 0.369 | NMI: 0.518 | F1: 0.616 | Recall: 0.622
  100 labels → Acc: 0.715 | ARI: 0.486 | NMI: 0.609 | F1: 0.718 | Recall: 0.720
  250 labels → Acc: 0.861 | ARI: 0.727 | NMI: 0.773 | F1: 0.860 | Recall: 0.860
  500 labels → Acc: 0.933 | ARI: 0.856 | NMI: 0.880 | F1: 0.934 | Recall: 0.934

Active Learning (Entropy-based):
  50 labels → Acc: 0.615 | ARI: 0.369 | NMI: 0.518 | F1: 0.616 | Recall: 0.622
  100 labels → Acc: 0.675 | ARI: 0.438 | NMI: 0.576 | F1: 0.675 | Recall: 0.675
  250 labels → Acc: 0.681 | ARI: 0.418 | NMI: 0.571 | F1: 0.690 | Recall: 0.678
  500 labels → Acc: 0.709 | ARI: 0.455 | NMI: 0.601 | F1: 0.719 | Recall: 0.708

Active Learning (Class-balance Sampling):
  50 labels → Acc: 0.615 | ARI: 0.369 | NMI: 0.518 | F1: 0.616 | Recall: 0.622
  100 labels → Acc: 0.687 | ARI: 0.442 | NMI: 0.590 | F1: 0.691 | Recall: 0.692
  250 labels → Acc: 0.716 | ARI: 0.474 | NMI: 0.617 | F1

In [30]:
plot_accuracy("InCoder", InCoder_sp, func="label_spreading")

Results saved to results_c.csv
