In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.metrics import accuracy_score, f1_score, adjusted_rand_score, normalized_mutual_info_score, recall_score
from scipy.stats import entropy
from collections import defaultdict
import warnings
from sklearn.exceptions import ConvergenceWarning

# Add this near the top of your script or notebook
warnings.filterwarnings("ignore", category=ConvergenceWarning)

## Function

In [7]:
total_labeled_targets=[50, 100, 250, 500]

def label_propagation(X, y, strategy="uniform", 
                             initial_labels_per_class=5, seed=42, repeats=5):
    total_labeled_targets = [50, 100, 250, 500]
    n_classes = len(np.unique(y))
    total_samples = len(y)

    all_runs_metrics = defaultdict(list)

    for r in range(repeats):
        rng = np.random.RandomState(seed + r)
        labeled_mask = np.zeros(total_samples, dtype=bool)

        for cls in range(n_classes):
            cls_indices = np.where(y == cls)[0]
            labeled_indices = rng.choice(cls_indices, initial_labels_per_class, replace=False)
            labeled_mask[labeled_indices] = True

        y_partial = -np.ones(total_samples)
        y_partial[labeled_mask] = y[labeled_mask]
        queried = labeled_mask.sum()
        batch_sizes = [10, 30, 50]
        batch_idx = 0
        evaluated_targets = set()

        while queried <= total_labeled_targets[-1]:
            batch_size = batch_sizes[min(batch_idx, len(batch_sizes) - 1)]
            model = LabelPropagation(kernel='rbf', gamma=20, max_iter=1000)
            model.fit(X, y_partial)
            probs = model.label_distributions_
            score = entropy(probs.T)
            unlabeled_indices = np.where(~labeled_mask)[0]

            # === Evaluation ===
            for target in total_labeled_targets:
                if queried >= target and target not in evaluated_targets:
                    pred = model.transduction_
                    all_runs_metrics[target, "Accuracy"].append(accuracy_score(y, pred))
                    all_runs_metrics[target, "ARI"].append(adjusted_rand_score(y, pred))
                    all_runs_metrics[target, "NMI"].append(normalized_mutual_info_score(y, pred))
                    all_runs_metrics[target, "F1"].append(f1_score(y, pred, average="macro"))
                    all_runs_metrics[target, "Recall"].append(recall_score(y, pred, average="macro"))
                    evaluated_targets.add(target)
                    batch_idx += 1

            if queried > total_labeled_targets[-1]:
                break

            # === Query selection ===
            if strategy == "entropy":
                query_indices = unlabeled_indices[np.argsort(score[unlabeled_indices])[-batch_size:]]

            elif strategy == "uniform":
                query_indices = []
                per_class_quota = batch_size // n_classes
                for cls in range(n_classes):
                    cls_unlabeled = np.setdiff1d(np.where(y == cls)[0], np.where(labeled_mask)[0])
                    if len(cls_unlabeled) == 0:
                        continue
                    cls_entropy_scores = score[cls_unlabeled]
                    top_cls_indices = cls_unlabeled[np.argsort(cls_entropy_scores)[-min(per_class_quota, len(cls_unlabeled)):]]
                    query_indices.extend(top_cls_indices)
                remaining = batch_size - len(query_indices)
                if remaining > 0:
                    remaining_indices = np.setdiff1d(unlabeled_indices, query_indices)
                    top_remaining = remaining_indices[np.argsort(score[remaining_indices])[-remaining:]]
                    query_indices.extend(top_remaining)

            elif strategy == "random":
                query_indices = rng.choice(unlabeled_indices, batch_size, replace=False)
            
            elif strategy == "random_then_entropy_1":
                if queried < 100:
                    query_indices = rng.choice(unlabeled_indices, batch_size, replace=False)
                else:
                    query_indices = []
                    per_class_quota = batch_size // n_classes
                    for cls in range(n_classes):
                        cls_unlabeled = np.setdiff1d(np.where(y == cls)[0], np.where(labeled_mask)[0])
                        if len(cls_unlabeled) == 0:
                            continue
                        cls_entropy_scores = score[cls_unlabeled]
                        top_cls_indices = cls_unlabeled[np.argsort(cls_entropy_scores)[-min(per_class_quota, len(cls_unlabeled)):]]
                        query_indices.extend(top_cls_indices)
                    remaining = batch_size - len(query_indices)
                    if remaining > 0:
                        remaining_indices = np.setdiff1d(unlabeled_indices, query_indices)
                        top_remaining = remaining_indices[np.argsort(score[remaining_indices])[-remaining:]]
                        query_indices.extend(top_remaining)
                    
            elif strategy == "random_then_entropy_2":
                if queried < 250:
                    query_indices = rng.choice(unlabeled_indices, batch_size, replace=False)
                else:
                    query_indices = []
                    per_class_quota = batch_size // n_classes
                    for cls in range(n_classes):
                        cls_unlabeled = np.setdiff1d(np.where(y == cls)[0], np.where(labeled_mask)[0])
                        if len(cls_unlabeled) == 0:
                            continue
                        cls_entropy_scores = score[cls_unlabeled]
                        top_cls_indices = cls_unlabeled[np.argsort(cls_entropy_scores)[-min(per_class_quota, len(cls_unlabeled)):]]
                        query_indices.extend(top_cls_indices)
                    remaining = batch_size - len(query_indices)
                    if remaining > 0:
                        remaining_indices = np.setdiff1d(unlabeled_indices, query_indices)
                        top_remaining = remaining_indices[np.argsort(score[remaining_indices])[-remaining:]]
                        query_indices.extend(top_remaining)
                
            else:
                print(strategy)
                raise ValueError("Invalid strategy.")

            labeled_mask[query_indices] = True
            y_partial[query_indices] = y[query_indices]
            queried += len(query_indices)

    final_results = []
    for target in total_labeled_targets:
        metrics = {
            "Labeled": target,
            "Accuracy": np.mean(all_runs_metrics[(target, "Accuracy")]),
            "ARI": np.mean(all_runs_metrics[(target, "ARI")]),
            "NMI": np.mean(all_runs_metrics[(target, "NMI")]),
            "F1": np.mean(all_runs_metrics[(target, "F1")]),
            "Recall": np.mean(all_runs_metrics[(target, "Recall")]),
        }
        final_results.append(metrics)

    return final_results

In [8]:
def label_spreading(X, y, strategy="uniform", 
                                             initial_labels_per_class=5, seed=42, repeats=5):
    total_labeled_targets = [50, 100, 250, 500]
    n_classes = len(np.unique(y))
    total_samples = len(y)

    all_runs_metrics = defaultdict(list)

    for r in range(repeats):
        rng = np.random.RandomState(seed + r)
        labeled_mask = np.zeros(total_samples, dtype=bool)

        for cls in range(n_classes):
            cls_indices = np.where(y == cls)[0]
            labeled_indices = rng.choice(cls_indices, initial_labels_per_class, replace=False)
            labeled_mask[labeled_indices] = True

        y_partial = -np.ones(total_samples)
        y_partial[labeled_mask] = y[labeled_mask]
        queried = labeled_mask.sum()
        batch_sizes = [10, 30, 50]
        batch_idx = 0
        evaluated_targets = set()

        while queried <= total_labeled_targets[-1]:
            batch_size = batch_sizes[min(batch_idx, len(batch_sizes) - 1)]
            model = LabelSpreading(kernel='rbf', gamma=20, max_iter=1000)
            model.fit(X, y_partial)
            probs = model.label_distributions_
            score = entropy(probs.T)
            unlabeled_indices = np.where(~labeled_mask)[0]

            for target in total_labeled_targets:
                if queried >= target and target not in evaluated_targets:
                    pred = model.transduction_
                    all_runs_metrics[target, "Accuracy"].append(accuracy_score(y, pred))
                    all_runs_metrics[target, "ARI"].append(adjusted_rand_score(y, pred))
                    all_runs_metrics[target, "NMI"].append(normalized_mutual_info_score(y, pred))
                    all_runs_metrics[target, "F1"].append(f1_score(y, pred, average="macro"))
                    all_runs_metrics[target, "Recall"].append(recall_score(y, pred, average="macro"))
                    evaluated_targets.add(target)
                    batch_idx += 1

            if queried > total_labeled_targets[-1]:
                break

            if strategy == "entropy":
                query_indices = unlabeled_indices[np.argsort(score[unlabeled_indices])[-batch_size:]]

            elif strategy == "uniform":
                query_indices = []
                per_class_quota = batch_size // n_classes
                for cls in range(n_classes):
                    cls_unlabeled = np.setdiff1d(np.where(y == cls)[0], np.where(labeled_mask)[0])
                    if len(cls_unlabeled) == 0:
                        continue
                    cls_entropy_scores = score[cls_unlabeled]
                    top_cls_indices = cls_unlabeled[np.argsort(cls_entropy_scores)[-min(per_class_quota, len(cls_unlabeled)):]]
                    query_indices.extend(top_cls_indices)
                remaining = batch_size - len(query_indices)
                if remaining > 0:
                    remaining_indices = np.setdiff1d(unlabeled_indices, query_indices)
                    top_remaining = remaining_indices[np.argsort(score[remaining_indices])[-remaining:]]
                    query_indices.extend(top_remaining)

            elif strategy == "random":
                query_indices = rng.choice(unlabeled_indices, batch_size, replace=False)

            elif strategy == "random_then_entropy_1":
                if queried < 100:
                    query_indices = rng.choice(unlabeled_indices, batch_size, replace=False)
                else:
                    query_indices = []
                    per_class_quota = batch_size // n_classes
                    for cls in range(n_classes):
                        cls_unlabeled = np.setdiff1d(np.where(y == cls)[0], np.where(labeled_mask)[0])
                        if len(cls_unlabeled) == 0:
                            continue
                        cls_entropy_scores = score[cls_unlabeled]
                        top_cls_indices = cls_unlabeled[np.argsort(cls_entropy_scores)[-min(per_class_quota, len(cls_unlabeled)):]]
                        query_indices.extend(top_cls_indices)
                    remaining = batch_size - len(query_indices)
                    if remaining > 0:
                        remaining_indices = np.setdiff1d(unlabeled_indices, query_indices)
                        top_remaining = remaining_indices[np.argsort(score[remaining_indices])[-remaining:]]
                        query_indices.extend(top_remaining)

            elif strategy == "random_then_entropy_2":
                if queried < 250:
                    query_indices = rng.choice(unlabeled_indices, batch_size, replace=False)
                else:
                    query_indices = []
                    per_class_quota = batch_size // n_classes
                    for cls in range(n_classes):
                        cls_unlabeled = np.setdiff1d(np.where(y == cls)[0], np.where(labeled_mask)[0])
                        if len(cls_unlabeled) == 0:
                            continue
                        cls_entropy_scores = score[cls_unlabeled]
                        top_cls_indices = cls_unlabeled[np.argsort(cls_entropy_scores)[-min(per_class_quota, len(cls_unlabeled)):]]
                        query_indices.extend(top_cls_indices)
                    remaining = batch_size - len(query_indices)
                    if remaining > 0:
                        remaining_indices = np.setdiff1d(unlabeled_indices, query_indices)
                        top_remaining = remaining_indices[np.argsort(score[remaining_indices])[-remaining:]]
                        query_indices.extend(top_remaining)

            else:
                raise ValueError(f"Invalid strategy: {strategy}")

            labeled_mask[query_indices] = True
            y_partial[query_indices] = y[query_indices]
            queried += len(query_indices)

    final_results = []
    for target in total_labeled_targets:
        metrics = {
            "Labeled": target,
            "Accuracy": np.mean(all_runs_metrics[(target, "Accuracy")]),
            "ARI": np.mean(all_runs_metrics[(target, "ARI")]),
            "NMI": np.mean(all_runs_metrics[(target, "NMI")]),
            "F1": np.mean(all_runs_metrics[(target, "F1")]),
            "Recall": np.mean(all_runs_metrics[(target, "Recall")]),
        }
        final_results.append(metrics)

    return final_results

In [None]:
def run_active_learning(file_path, repeats=5, func="label_propagation"):
    df = pd.read_pickle(file_path)
    X = np.vstack(df["embedding"].values)
    y = pd.factorize(df["cluster"])[0]

    strategies = [
        ("random", "Random Sampling"),
        ("entropy", "Entropy-based"),
        ("uniform", "Class-balance Sampling"),
        ("random_then_entropy_1", "Random → Entropy (start <100)"),
        ("random_then_entropy_2", "Random → Entropy (start <250)")
    ]

    # Select the active learning simulation function
    if func == "label_propagation":
        simulate = label_propagation
    elif func == "label_spreading":
        simulate = label_spreading
    else:
        raise ValueError(f"Invalid func: {func}. Must be 'label_propagation' or 'label_spreading'.")

    results = {}
    for strategy_key, _ in strategies:
        results[strategy_key] = simulate(X, y, strategy=strategy_key, repeats=repeats)

    return results



def print_results(model_name, results_dict):
    print(f"=== {model_name} Accuracy Results ===\n")

    strategy_names = {
        "random": "Random Sampling",
        "entropy": "Entropy-based",
        "uniform": "Class-balance Sampling",
        "random_then_entropy_1": "Random → Entropy (start <100)",
        "random_then_entropy_2": "Random → Entropy (start <250)"
    }

    def print_al_metrics(name, metrics):
        print(f"\nActive Learning ({name}):")
        for m in metrics:
            print(f"  {m['Labeled']} labels → Acc: {m['Accuracy']:.3f} | "
                  f"ARI: {m['ARI']:.3f} | NMI: {m['NMI']:.3f} | "
                  f"F1: {m['F1']:.3f} | Recall: {m['Recall']:.3f}")

    for key, name in strategy_names.items():
        if key in results_dict:
            print_al_metrics(name, results_dict[key])

def save_results_to_csv(model_name, results_dict, func, programming_language="java", out_file="results_java.csv"):
    """
    Save active learning results into a CSV file with a long-format structure.
    """
    rows = []
    for strategy, metrics in results_dict.items():
        for m in metrics:
            rows.append({
                "programming_language": programming_language,
                "func": func,   # label_propagation or label_spreading
                "embedding": model_name,  # CodeBERT, GraphCodeBERT, etc.
                "strategy": strategy,
                "labeled": m["Labeled"],
                "accuracy": m["Accuracy"],
                "ari": m["ARI"],
                "nmi": m["NMI"],
                "f1": m["F1"],
                "recall": m["Recall"]
            })

    df = pd.DataFrame(rows)

    # Append if file exists, else create
    try:
        existing = pd.read_csv(out_file)
        df = pd.concat([existing, df], ignore_index=True)
    except FileNotFoundError:
        pass

    df.to_csv(out_file, index=False)
    print(f"Results saved to {out_file}")


def plot_accuracy(model_name, results_dict, func, programming_language="java", save_csv=True):
    plt.figure(figsize=(10, 6))

    strategy_styles = {
        "random": ("Random", "o", "-."),
        "entropy": ("Entropy", "o", "--"),
        "uniform": ("Uniform", "s", "--"),
        "random_then_entropy_1": ("Random → Entropy (<100)", "D", "-"),
        "random_then_entropy_2": ("Random → Entropy (<250)", "P", "-")
    }

    for key, (label, marker, style) in strategy_styles.items():
        if key in results_dict:
            xs = [m["Labeled"] for m in results_dict[key]]
            ys = [m["Accuracy"] for m in results_dict[key]]
            plt.plot(xs, ys, label=label, marker=marker, linestyle=style)

    plt.title(f"{model_name} - Accuracy vs. Labeled Samples")
    plt.xlabel("# Labeled Samples")
    plt.ylabel("Accuracy")
    plt.ylim(0, 1.05)
    plt.xticks([50, 100, 250, 500])
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    
    save_path = f"{model_name}_{func}_{programming_language}.png"
    plt.savefig(save_path, dpi=300, bbox_inches="tight")
    plt.close()
    
    if save_csv:
        save_results_to_csv(model_name, results_dict, func, programming_language)

## CodeBERT

### Label Propagation

In [10]:
codeBert_pp = run_active_learning("embeddings/java_CodeBERT_embeddings.pkl", func="label_propagation")

In [11]:
print_results("CodeBERT", codeBert_pp)

=== CodeBERT Accuracy Results ===


Active Learning (Random Sampling):
  50 labels → Acc: 0.499 | ARI: 0.258 | NMI: 0.386 | F1: 0.501 | Recall: 0.497
  100 labels → Acc: 0.655 | ARI: 0.416 | NMI: 0.527 | F1: 0.656 | Recall: 0.659
  250 labels → Acc: 0.795 | ARI: 0.614 | NMI: 0.679 | F1: 0.792 | Recall: 0.792
  500 labels → Acc: 0.908 | ARI: 0.817 | NMI: 0.833 | F1: 0.901 | Recall: 0.902

Active Learning (Entropy-based):
  50 labels → Acc: 0.499 | ARI: 0.258 | NMI: 0.386 | F1: 0.501 | Recall: 0.497
  100 labels → Acc: 0.556 | ARI: 0.312 | NMI: 0.437 | F1: 0.557 | Recall: 0.555
  250 labels → Acc: 0.585 | ARI: 0.340 | NMI: 0.455 | F1: 0.579 | Recall: 0.582
  500 labels → Acc: 0.627 | ARI: 0.387 | NMI: 0.498 | F1: 0.621 | Recall: 0.624

Active Learning (Class-balance Sampling):
  50 labels → Acc: 0.499 | ARI: 0.258 | NMI: 0.386 | F1: 0.501 | Recall: 0.497
  100 labels → Acc: 0.600 | ARI: 0.350 | NMI: 0.466 | F1: 0.595 | Recall: 0.597
  250 labels → Acc: 0.672 | ARI: 0.437 | NMI: 0.554 | F

In [12]:
plot_accuracy("CodeBERT", codeBert_pp, func="label_propagation")

Results saved to results.csv


### Label Spreading

In [13]:
codeBert_sp= run_active_learning("embeddings/java_CodeBERT_embeddings.pkl", func="label_spreading")
print_results("CodeBERT", codeBert_sp)

=== CodeBERT Accuracy Results ===


Active Learning (Random Sampling):
  50 labels → Acc: 0.472 | ARI: 0.235 | NMI: 0.373 | F1: 0.480 | Recall: 0.475
  100 labels → Acc: 0.623 | ARI: 0.376 | NMI: 0.488 | F1: 0.621 | Recall: 0.630
  250 labels → Acc: 0.779 | ARI: 0.590 | NMI: 0.655 | F1: 0.773 | Recall: 0.778
  500 labels → Acc: 0.891 | ARI: 0.789 | NMI: 0.814 | F1: 0.884 | Recall: 0.886

Active Learning (Entropy-based):
  50 labels → Acc: 0.472 | ARI: 0.235 | NMI: 0.373 | F1: 0.480 | Recall: 0.475
  100 labels → Acc: 0.514 | ARI: 0.252 | NMI: 0.409 | F1: 0.520 | Recall: 0.520
  250 labels → Acc: 0.521 | ARI: 0.242 | NMI: 0.425 | F1: 0.529 | Recall: 0.529
  500 labels → Acc: 0.548 | ARI: 0.266 | NMI: 0.442 | F1: 0.553 | Recall: 0.554

Active Learning (Class-balance Sampling):
  50 labels → Acc: 0.472 | ARI: 0.235 | NMI: 0.373 | F1: 0.480 | Recall: 0.475
  100 labels → Acc: 0.509 | ARI: 0.213 | NMI: 0.402 | F1: 0.521 | Recall: 0.521
  250 labels → Acc: 0.576 | ARI: 0.285 | NMI: 0.477 | F

In [14]:
plot_accuracy("CodeBERT", codeBert_sp, func="label_spreading")

Results saved to results.csv


## GraphCodeBERT

### Label Propagation

In [15]:
graphCodeBert_pp = run_active_learning("embeddings/java_GraphCodeBERT_embeddings.pkl", func="label_propagation")
print_results("GraphCodeBERT", graphCodeBert_pp)

=== GraphCodeBERT Accuracy Results ===


Active Learning (Random Sampling):
  50 labels → Acc: 0.373 | ARI: 0.064 | NMI: 0.382 | F1: 0.406 | Recall: 0.371
  100 labels → Acc: 0.531 | ARI: 0.150 | NMI: 0.527 | F1: 0.591 | Recall: 0.539
  250 labels → Acc: 0.704 | ARI: 0.347 | NMI: 0.669 | F1: 0.749 | Recall: 0.703
  500 labels → Acc: 0.858 | ARI: 0.668 | NMI: 0.807 | F1: 0.870 | Recall: 0.851

Active Learning (Entropy-based):
  50 labels → Acc: 0.373 | ARI: 0.064 | NMI: 0.382 | F1: 0.406 | Recall: 0.371
  100 labels → Acc: 0.574 | ARI: 0.255 | NMI: 0.613 | F1: 0.591 | Recall: 0.566
  250 labels → Acc: 0.849 | ARI: 0.714 | NMI: 0.849 | F1: 0.842 | Recall: 0.834
  500 labels → Acc: 0.984 | ARI: 0.971 | NMI: 0.976 | F1: 0.979 | Recall: 0.979

Active Learning (Class-balance Sampling):
  50 labels → Acc: 0.373 | ARI: 0.064 | NMI: 0.382 | F1: 0.406 | Recall: 0.371
  100 labels → Acc: 0.591 | ARI: 0.207 | NMI: 0.583 | F1: 0.651 | Recall: 0.599
  250 labels → Acc: 0.861 | ARI: 0.678 | NMI: 0.81

In [16]:
plot_accuracy("GraphCodeBERT", graphCodeBert_pp, func="label_propagation")

Results saved to results.csv


### Label Spreading

In [17]:
graphCodeBert_sp = run_active_learning("embeddings/java_GraphCodeBERT_embeddings.pkl", func="label_spreading")
print_results("GraphCodeBERT", graphCodeBert_sp)

=== GraphCodeBERT Accuracy Results ===


Active Learning (Random Sampling):
  50 labels → Acc: 0.373 | ARI: 0.064 | NMI: 0.382 | F1: 0.406 | Recall: 0.371
  100 labels → Acc: 0.531 | ARI: 0.150 | NMI: 0.527 | F1: 0.591 | Recall: 0.539
  250 labels → Acc: 0.704 | ARI: 0.347 | NMI: 0.669 | F1: 0.749 | Recall: 0.703
  500 labels → Acc: 0.858 | ARI: 0.668 | NMI: 0.807 | F1: 0.870 | Recall: 0.851

Active Learning (Entropy-based):
  50 labels → Acc: 0.373 | ARI: 0.064 | NMI: 0.382 | F1: 0.406 | Recall: 0.371
  100 labels → Acc: 0.574 | ARI: 0.255 | NMI: 0.613 | F1: 0.591 | Recall: 0.566
  250 labels → Acc: 0.849 | ARI: 0.714 | NMI: 0.849 | F1: 0.842 | Recall: 0.834
  500 labels → Acc: 0.984 | ARI: 0.971 | NMI: 0.976 | F1: 0.979 | Recall: 0.979

Active Learning (Class-balance Sampling):
  50 labels → Acc: 0.373 | ARI: 0.064 | NMI: 0.382 | F1: 0.406 | Recall: 0.371
  100 labels → Acc: 0.591 | ARI: 0.207 | NMI: 0.583 | F1: 0.651 | Recall: 0.599
  250 labels → Acc: 0.861 | ARI: 0.678 | NMI: 0.81

In [18]:
plot_accuracy("GraphCodeBERT", graphCodeBert_sp, func="label_spreading")

Results saved to results.csv


## UniXcoder

### Label Propogation

In [19]:
uniXcoder_pp = run_active_learning("embeddings/java_UniXcoder_embeddings.pkl", func="label_propagation")
print_results("UniXcoder", uniXcoder_pp)

=== UniXcoder Accuracy Results ===


Active Learning (Random Sampling):
  50 labels → Acc: 0.341 | ARI: 0.045 | NMI: 0.339 | F1: 0.375 | Recall: 0.340
  100 labels → Acc: 0.506 | ARI: 0.122 | NMI: 0.497 | F1: 0.569 | Recall: 0.513
  250 labels → Acc: 0.688 | ARI: 0.315 | NMI: 0.650 | F1: 0.736 | Recall: 0.685
  500 labels → Acc: 0.851 | ARI: 0.650 | NMI: 0.799 | F1: 0.865 | Recall: 0.845

Active Learning (Entropy-based):
  50 labels → Acc: 0.341 | ARI: 0.045 | NMI: 0.339 | F1: 0.375 | Recall: 0.340
  100 labels → Acc: 0.500 | ARI: 0.170 | NMI: 0.536 | F1: 0.525 | Recall: 0.499
  250 labels → Acc: 0.760 | ARI: 0.526 | NMI: 0.772 | F1: 0.773 | Recall: 0.759
  500 labels → Acc: 0.970 | ARI: 0.947 | NMI: 0.964 | F1: 0.965 | Recall: 0.968

Active Learning (Class-balance Sampling):
  50 labels → Acc: 0.341 | ARI: 0.045 | NMI: 0.339 | F1: 0.375 | Recall: 0.340
  100 labels → Acc: 0.570 | ARI: 0.181 | NMI: 0.561 | F1: 0.633 | Recall: 0.578
  250 labels → Acc: 0.851 | ARI: 0.651 | NMI: 0.803 | 

In [20]:
plot_accuracy("UniXcoder", uniXcoder_pp, func="label_propagation")

Results saved to results.csv


### Label Spreading

In [21]:
uniXcoder_sp = run_active_learning("embeddings/java_UniXcoder_embeddings.pkl", func="label_spreading")
print_results("UniXcoder", uniXcoder_sp)

=== UniXcoder Accuracy Results ===


Active Learning (Random Sampling):
  50 labels → Acc: 0.341 | ARI: 0.045 | NMI: 0.339 | F1: 0.375 | Recall: 0.340
  100 labels → Acc: 0.508 | ARI: 0.123 | NMI: 0.501 | F1: 0.573 | Recall: 0.515
  250 labels → Acc: 0.688 | ARI: 0.315 | NMI: 0.650 | F1: 0.736 | Recall: 0.685
  500 labels → Acc: 0.851 | ARI: 0.650 | NMI: 0.799 | F1: 0.865 | Recall: 0.845

Active Learning (Entropy-based):
  50 labels → Acc: 0.341 | ARI: 0.045 | NMI: 0.339 | F1: 0.375 | Recall: 0.340
  100 labels → Acc: 0.500 | ARI: 0.170 | NMI: 0.536 | F1: 0.525 | Recall: 0.499
  250 labels → Acc: 0.760 | ARI: 0.526 | NMI: 0.772 | F1: 0.773 | Recall: 0.759
  500 labels → Acc: 0.970 | ARI: 0.947 | NMI: 0.964 | F1: 0.965 | Recall: 0.968

Active Learning (Class-balance Sampling):
  50 labels → Acc: 0.341 | ARI: 0.045 | NMI: 0.339 | F1: 0.375 | Recall: 0.340
  100 labels → Acc: 0.570 | ARI: 0.181 | NMI: 0.561 | F1: 0.633 | Recall: 0.578
  250 labels → Acc: 0.851 | ARI: 0.651 | NMI: 0.803 | 

In [22]:
plot_accuracy("UniXcoder", uniXcoder_sp, func="label_spreading")

Results saved to results.csv


## CodeT5

### Label Propagation

In [23]:
codeT5_pp = run_active_learning("embeddings/java_CodeT5_embeddings.pkl", func="label_propagation")
print_results("CodeT5", codeT5_pp)

=== CodeT5 Accuracy Results ===


Active Learning (Random Sampling):
  50 labels → Acc: 0.539 | ARI: 0.297 | NMI: 0.418 | F1: 0.535 | Recall: 0.538
  100 labels → Acc: 0.708 | ARI: 0.495 | NMI: 0.584 | F1: 0.700 | Recall: 0.710
  250 labels → Acc: 0.827 | ARI: 0.675 | NMI: 0.724 | F1: 0.818 | Recall: 0.821
  500 labels → Acc: 0.925 | ARI: 0.853 | NMI: 0.864 | F1: 0.917 | Recall: 0.918

Active Learning (Entropy-based):
  50 labels → Acc: 0.539 | ARI: 0.297 | NMI: 0.418 | F1: 0.535 | Recall: 0.538
  100 labels → Acc: 0.568 | ARI: 0.333 | NMI: 0.461 | F1: 0.562 | Recall: 0.569
  250 labels → Acc: 0.593 | ARI: 0.362 | NMI: 0.493 | F1: 0.582 | Recall: 0.593
  500 labels → Acc: 0.656 | ARI: 0.438 | NMI: 0.552 | F1: 0.646 | Recall: 0.654

Active Learning (Class-balance Sampling):
  50 labels → Acc: 0.539 | ARI: 0.297 | NMI: 0.418 | F1: 0.535 | Recall: 0.538
  100 labels → Acc: 0.624 | ARI: 0.396 | NMI: 0.513 | F1: 0.614 | Recall: 0.623
  250 labels → Acc: 0.696 | ARI: 0.485 | NMI: 0.601 | F1:

In [24]:
plot_accuracy("CodeT5", codeT5_pp, func="label_propagation")

Results saved to results.csv


### Label Spreading

In [25]:
codeT5_sp = run_active_learning("embeddings/java_CodeT5_embeddings.pkl", func="label_spreading")
print_results("CodeT5", codeT5_sp)

=== CodeT5 Accuracy Results ===


Active Learning (Random Sampling):
  50 labels → Acc: 0.530 | ARI: 0.287 | NMI: 0.423 | F1: 0.529 | Recall: 0.532
  100 labels → Acc: 0.677 | ARI: 0.440 | NMI: 0.556 | F1: 0.673 | Recall: 0.681
  250 labels → Acc: 0.809 | ARI: 0.641 | NMI: 0.704 | F1: 0.803 | Recall: 0.806
  500 labels → Acc: 0.914 | ARI: 0.834 | NMI: 0.852 | F1: 0.905 | Recall: 0.907

Active Learning (Entropy-based):
  50 labels → Acc: 0.530 | ARI: 0.287 | NMI: 0.423 | F1: 0.529 | Recall: 0.532
  100 labels → Acc: 0.517 | ARI: 0.246 | NMI: 0.418 | F1: 0.518 | Recall: 0.522
  250 labels → Acc: 0.524 | ARI: 0.237 | NMI: 0.416 | F1: 0.523 | Recall: 0.529
  500 labels → Acc: 0.567 | ARI: 0.286 | NMI: 0.462 | F1: 0.566 | Recall: 0.570

Active Learning (Class-balance Sampling):
  50 labels → Acc: 0.530 | ARI: 0.287 | NMI: 0.423 | F1: 0.529 | Recall: 0.532
  100 labels → Acc: 0.525 | ARI: 0.243 | NMI: 0.434 | F1: 0.531 | Recall: 0.532
  250 labels → Acc: 0.605 | ARI: 0.318 | NMI: 0.522 | F1:

In [26]:
plot_accuracy("CodeT5", codeT5_sp, func="label_spreading")

Results saved to results.csv


## InCoder

### Label Propagation

In [27]:
InCoder_pp = run_active_learning("embeddings/java_InCoder_embeddings.pkl", func="label_propagation")
print_results("InCoder", InCoder_pp)

=== InCoder Accuracy Results ===


Active Learning (Random Sampling):
  50 labels → Acc: 0.700 | ARI: 0.521 | NMI: 0.630 | F1: 0.687 | Recall: 0.692
  100 labels → Acc: 0.816 | ARI: 0.671 | NMI: 0.741 | F1: 0.806 | Recall: 0.815
  250 labels → Acc: 0.899 | ARI: 0.808 | NMI: 0.837 | F1: 0.891 | Recall: 0.894
  500 labels → Acc: 0.958 | ARI: 0.917 | NMI: 0.919 | F1: 0.953 | Recall: 0.955

Active Learning (Entropy-based):
  50 labels → Acc: 0.700 | ARI: 0.521 | NMI: 0.630 | F1: 0.687 | Recall: 0.692
  100 labels → Acc: 0.740 | ARI: 0.560 | NMI: 0.664 | F1: 0.729 | Recall: 0.734
  250 labels → Acc: 0.770 | ARI: 0.602 | NMI: 0.698 | F1: 0.761 | Recall: 0.767
  500 labels → Acc: 0.814 | ARI: 0.666 | NMI: 0.745 | F1: 0.805 | Recall: 0.814

Active Learning (Class-balance Sampling):
  50 labels → Acc: 0.700 | ARI: 0.521 | NMI: 0.630 | F1: 0.687 | Recall: 0.692
  100 labels → Acc: 0.814 | ARI: 0.669 | NMI: 0.740 | F1: 0.803 | Recall: 0.810
  250 labels → Acc: 0.848 | ARI: 0.720 | NMI: 0.794 | F1

In [28]:
plot_accuracy("InCoder", InCoder_pp, func="label_propagation")

Results saved to results.csv


### Label Spreading

In [29]:
InCoder_sp = run_active_learning("embeddings/java_InCoder_embeddings.pkl", func="label_spreading")
print_results("InCoder", InCoder_sp)

=== InCoder Accuracy Results ===


Active Learning (Random Sampling):
  50 labels → Acc: 0.658 | ARI: 0.447 | NMI: 0.600 | F1: 0.658 | Recall: 0.651
  100 labels → Acc: 0.782 | ARI: 0.616 | NMI: 0.714 | F1: 0.776 | Recall: 0.784
  250 labels → Acc: 0.868 | ARI: 0.749 | NMI: 0.801 | F1: 0.861 | Recall: 0.867
  500 labels → Acc: 0.942 | ARI: 0.886 | NMI: 0.898 | F1: 0.936 | Recall: 0.940

Active Learning (Entropy-based):
  50 labels → Acc: 0.658 | ARI: 0.447 | NMI: 0.600 | F1: 0.658 | Recall: 0.651
  100 labels → Acc: 0.687 | ARI: 0.475 | NMI: 0.650 | F1: 0.689 | Recall: 0.688
  250 labels → Acc: 0.712 | ARI: 0.527 | NMI: 0.674 | F1: 0.705 | Recall: 0.716
  500 labels → Acc: 0.743 | ARI: 0.574 | NMI: 0.701 | F1: 0.733 | Recall: 0.747

Active Learning (Class-balance Sampling):
  50 labels → Acc: 0.658 | ARI: 0.447 | NMI: 0.600 | F1: 0.658 | Recall: 0.651
  100 labels → Acc: 0.701 | ARI: 0.500 | NMI: 0.665 | F1: 0.697 | Recall: 0.706
  250 labels → Acc: 0.770 | ARI: 0.587 | NMI: 0.717 | F1

In [30]:
plot_accuracy("InCoder", InCoder_sp, func="label_spreading")

Results saved to results.csv
