In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.semi_supervised import LabelPropagation
from sklearn.metrics import (
    adjusted_rand_score,
    normalized_mutual_info_score,
    f1_score,
    accuracy_score,
    recall_score
)
import matplotlib.pyplot as plt

In [2]:
EMBEDDING_FILES = [
    "embeddings\java_CodeBERT_embeddings.pkl",
    "embeddings\java_GraphCodeBERT_embeddings.pkl",
    "embeddings\java_UniXcoder_embeddings.pkl",
    "embeddings\java_CodeT5_embeddings.pkl",
    "embeddings\java_InCoder_embeddings.pkl"
]
LABEL_COUNTS = [5, 10, 25, 50]
SEED = 42
REPEATS = 5 

In [3]:
def evaluate_label_propagation(embeddings, labels, num_labels_per_class, repeats=5, seed=SEED):
    unique_labels = np.unique(labels)
    n_classes = len(unique_labels)

    sss = StratifiedShuffleSplit(
        n_splits=repeats, train_size=num_labels_per_class * n_classes, random_state=seed
    )

    ari_scores, nmi_scores = [], []
    f1_scores, acc_scores, recall_scores = [], [], []

    for train_index, _ in sss.split(embeddings, labels):
        y_partial = -np.ones(len(labels))
        y_partial[train_index] = labels[train_index]

        model = LabelPropagation(kernel='rbf', gamma=20, max_iter=1000)
        model.fit(embeddings, y_partial)
        pred = model.transduction_

        ari_scores.append(adjusted_rand_score(labels, pred))
        nmi_scores.append(normalized_mutual_info_score(labels, pred))
        f1_scores.append(f1_score(labels, pred, average='macro'))
        acc_scores.append(accuracy_score(labels, pred))
        recall_scores.append(recall_score(labels, pred, average='macro'))

    # Store all metrics
    metrics = {
        'ARI': (np.mean(ari_scores), np.std(ari_scores)),
        'NMI': (np.mean(nmi_scores), np.std(nmi_scores)),
        'F1': (np.mean(f1_scores), np.std(f1_scores)),
        'Accuracy': (np.mean(acc_scores), np.std(acc_scores)),
        'Recall': (np.mean(recall_scores), np.std(recall_scores))
    }

    return metrics

In [4]:
results = {}

for file in EMBEDDING_FILES:
    df = pd.read_pickle(file)
    X = np.vstack(df["embedding"].values)
    y = pd.factorize(df["cluster"])[0]  # convert clusters to int labels

    model_name = file.split("_")[1]
    results[model_name] = {}

    print(f"\nEvaluating: {model_name}")

    for k in LABEL_COUNTS:
        print(f"→ {k} samples per cluster")
        metrics = evaluate_label_propagation(X, y, num_labels_per_class=k)

        # Unpack metrics into results dict
        results[model_name][k] = {
            metric: (mean, std) for metric, (mean, std) in metrics.items()
        }

        # Print summary
        print(" | ".join(
            f"{metric}: {mean:.3f} ± {std:.3f}"
            for metric, (mean, std) in results[model_name][k].items()
        ))


Evaluating: CodeBERT
→ 5 samples per cluster


KeyboardInterrupt: 

## active learning

In [19]:
from scipy.stats import entropy
from sklearn.metrics.pairwise import euclidean_distances

def simulate_active_learning(
    X, y, strategy="entropy", initial_labels_per_class=5,
        total_labeled_targets_lst = [100, 250, 500], seed=SEED
):
    rng = np.random.RandomState(seed)
    n_classes = len(np.unique(y))
    total_samples = len(y)

    # === Initialize labeled set ===
    labeled_mask = np.zeros(total_samples, dtype=bool)
    for cls in range(n_classes):
        cls_indices = np.where(y == cls)[0]
        labeled_indices = rng.choice(cls_indices, initial_labels_per_class, replace=False)
        labeled_mask[labeled_indices] = True

    y_partial = -np.ones(total_samples)
    y_partial[labeled_mask] = y[labeled_mask]

    all_metrics = []
    queried = labeled_mask.sum()
    batch_sizes = [10, 30, 50]
    batch_idx = 0
    
    while queried < total_labeled_targets_lst[-1]:
        batch_size = batch_sizes[min(batch_idx, len(batch_sizes) - 1)]
        model = LabelPropagation(kernel='rbf', gamma=20, max_iter=1000)
        model.fit(X, y_partial)
        probs = model.label_distributions_

        # === Sampling Strategy ===
        if strategy == "entropy":
            score = entropy(probs.T)
        elif strategy == "euclidean_distance":
            labeled_X = X[labeled_mask]
            dist = euclidean_distances(X, labeled_X)
            score = dist.min(axis=1)
        else:
            raise ValueError("Invalid strategy.")

        unlabeled_indices = np.where(~labeled_mask)[0]
        query_indices = unlabeled_indices[np.argsort(score[unlabeled_indices])[-batch_size:]]
        labeled_mask[query_indices] = True
        y_partial[query_indices] = y[query_indices]
        queried += batch_size
        
        if queried in total_labeled_targets_lst:
            batch_idx+=1
            # Final evaluation at current target
            pred = model.transduction_
            metrics = {
                "Labeled": queried,
                "ARI": adjusted_rand_score(y, pred),
                "NMI": normalized_mutual_info_score(y, pred),
                "Accuracy": accuracy_score(y, pred),
                "F1": f1_score(y, pred, average="macro"),
                "Recall": recall_score(y, pred, average="macro")
            }
            all_metrics.append(metrics)

    return all_metrics

In [22]:
total_labeled_targets = [100, 250, 500]

# CodeBERT
file = "embeddings\java_CodeBERT_embeddings.pkl"

df = pd.read_pickle(file)
X = np.vstack(df["embedding"].values)
y = pd.factorize(df["cluster"])[0]

model_name = file.split("_")[1]
results[model_name] = {}
    
print(f"\nEvaluating: {model_name}")
al_entropy = simulate_active_learning(X, y, strategy="entropy", total_labeled_targets_lst=total_labeled_targets, seed=SEED)
al_distance = simulate_active_learning(X, y, strategy="euclidean_distance", total_labeled_targets_lst=total_labeled_targets, seed=SEED)
    
for i, total_k in enumerate(total_labeled_targets):
    print(f"Active Learning with total samples of {total_k} (Entropy): " + " | ".join(
        f"{k}: {v:.3f}" for k, v in al_entropy[i].items() if k != "Labeled"
    ))
    
    print(f"Active Learning with total samples of {total_k} (Euclidean Distance): " + " | ".join(
        f"{k}: {v:.3f}" for k, v in al_distance[i].items() if k != "Labeled"
    ))


Evaluating: CodeBERT




Active Learning with total samples of 100 (Entropy): ARI: 0.332 | NMI: 0.471 | Accuracy: 0.582 | F1: 0.589 | Recall: 0.580
Active Learning with total samples of 100 (Euclidean Distance): ARI: 0.288 | NMI: 0.428 | Accuracy: 0.537 | F1: 0.544 | Recall: 0.535
Active Learning with total samples of 250 (Entropy): ARI: 0.337 | NMI: 0.464 | Accuracy: 0.596 | F1: 0.583 | Recall: 0.587
Active Learning with total samples of 250 (Euclidean Distance): ARI: 0.315 | NMI: 0.450 | Accuracy: 0.562 | F1: 0.565 | Recall: 0.562
Active Learning with total samples of 500 (Entropy): ARI: 0.367 | NMI: 0.483 | Accuracy: 0.625 | F1: 0.619 | Recall: 0.624
Active Learning with total samples of 500 (Euclidean Distance): ARI: 0.353 | NMI: 0.494 | Accuracy: 0.598 | F1: 0.602 | Recall: 0.600




In [23]:
# GraphCodeBERT
file = "embeddings\java_GraphCodeBERT_embeddings.pkl"

df = pd.read_pickle(file)
X = np.vstack(df["embedding"].values)
y = pd.factorize(df["cluster"])[0]

model_name = file.split("_")[1]
results[model_name] = {}
    
print(f"\nEvaluating: {model_name}")
al_entropy = simulate_active_learning(X, y, strategy="entropy", total_labeled_targets_lst=total_labeled_targets, seed=SEED)
al_distance = simulate_active_learning(X, y, strategy="euclidean_distance", total_labeled_targets_lst=total_labeled_targets, seed=SEED)
    
for i, total_k in enumerate(total_labeled_targets):
    print(f"Active Learning with total samples of {total_k} (Entropy): " + " | ".join(
        f"{k}: {v:.3f}" for k, v in al_entropy[i].items() if k != "Labeled"
    ))
    
    print(f"Active Learning with total samples of {total_k} (Euclidean Distance): " + " | ".join(
        f"{k}: {v:.3f}" for k, v in al_distance[i].items() if k != "Labeled"
    ))


Evaluating: GraphCodeBERT
Active Learning with total samples of 100 (Entropy): ARI: 0.143 | NMI: 0.515 | Accuracy: 0.497 | F1: 0.542 | Recall: 0.507
Active Learning with total samples of 100 (Euclidean Distance): ARI: 0.067 | NMI: 0.392 | Accuracy: 0.386 | F1: 0.424 | Recall: 0.388
Active Learning with total samples of 250 (Entropy): ARI: 0.570 | NMI: 0.784 | Accuracy: 0.797 | F1: 0.817 | Recall: 0.804
Active Learning with total samples of 250 (Euclidean Distance): ARI: 0.083 | NMI: 0.428 | Accuracy: 0.421 | F1: 0.469 | Recall: 0.426
Active Learning with total samples of 500 (Entropy): ARI: 0.967 | NMI: 0.973 | Accuracy: 0.981 | F1: 0.976 | Recall: 0.982
Active Learning with total samples of 500 (Euclidean Distance): ARI: 0.114 | NMI: 0.477 | Accuracy: 0.471 | F1: 0.521 | Recall: 0.474


In [24]:
# UniXcoder
file = "embeddings\java_UniXcoder_embeddings.pkl"

df = pd.read_pickle(file)
X = np.vstack(df["embedding"].values)
y = pd.factorize(df["cluster"])[0]

model_name = file.split("_")[1]
results[model_name] = {}
    
print(f"\nEvaluating: {model_name}")
al_entropy = simulate_active_learning(X, y, strategy="entropy", total_labeled_targets_lst=total_labeled_targets, seed=SEED)
al_distance = simulate_active_learning(X, y, strategy="euclidean_distance", total_labeled_targets_lst=total_labeled_targets, seed=SEED)
    
for i, total_k in enumerate(total_labeled_targets):
    print(f"Active Learning with total samples of {total_k} (Entropy): " + " | ".join(
        f"{k}: {v:.3f}" for k, v in al_entropy[i].items() if k != "Labeled"
    ))
    
    print(f"Active Learning with total samples of {total_k} (Euclidean Distance): " + " | ".join(
        f"{k}: {v:.3f}" for k, v in al_distance[i].items() if k != "Labeled"
    ))


Evaluating: UniXcoder
Active Learning with total samples of 100 (Entropy): ARI: 0.134 | NMI: 0.498 | Accuracy: 0.475 | F1: 0.509 | Recall: 0.470
Active Learning with total samples of 100 (Euclidean Distance): ARI: 0.045 | NMI: 0.345 | Accuracy: 0.347 | F1: 0.390 | Recall: 0.355
Active Learning with total samples of 250 (Entropy): ARI: 0.716 | NMI: 0.840 | Accuracy: 0.866 | F1: 0.867 | Recall: 0.856
Active Learning with total samples of 250 (Euclidean Distance): ARI: 0.053 | NMI: 0.372 | Accuracy: 0.374 | F1: 0.424 | Recall: 0.381
Active Learning with total samples of 500 (Entropy): ARI: 0.969 | NMI: 0.975 | Accuracy: 0.982 | F1: 0.977 | Recall: 0.983
Active Learning with total samples of 500 (Euclidean Distance): ARI: 0.077 | NMI: 0.428 | Accuracy: 0.434 | F1: 0.490 | Recall: 0.437


In [25]:
# CodeT5
file = "embeddings\java_CodeT5_embeddings.pkl"

df = pd.read_pickle(file)
X = np.vstack(df["embedding"].values)
y = pd.factorize(df["cluster"])[0]

model_name = file.split("_")[1]
results[model_name] = {}
    
print(f"\nEvaluating: {model_name}")
al_entropy = simulate_active_learning(X, y, strategy="entropy", total_labeled_targets_lst=total_labeled_targets, seed=SEED)
al_distance = simulate_active_learning(X, y, strategy="euclidean_distance", total_labeled_targets_lst=total_labeled_targets, seed=SEED)
    
for i, total_k in enumerate(total_labeled_targets):
    print(f"Active Learning with total samples of {total_k} (Entropy): " + " | ".join(
        f"{k}: {v:.3f}" for k, v in al_entropy[i].items() if k != "Labeled"
    ))
    
    print(f"Active Learning with total samples of {total_k} (Euclidean Distance): " + " | ".join(
        f"{k}: {v:.3f}" for k, v in al_distance[i].items() if k != "Labeled"
    ))


Evaluating: CodeT5




Active Learning with total samples of 100 (Entropy): ARI: 0.350 | NMI: 0.492 | Accuracy: 0.595 | F1: 0.587 | Recall: 0.594
Active Learning with total samples of 100 (Euclidean Distance): ARI: 0.343 | NMI: 0.473 | Accuracy: 0.598 | F1: 0.591 | Recall: 0.596
Active Learning with total samples of 250 (Entropy): ARI: 0.352 | NMI: 0.498 | Accuracy: 0.593 | F1: 0.585 | Recall: 0.593
Active Learning with total samples of 250 (Euclidean Distance): ARI: 0.372 | NMI: 0.509 | Accuracy: 0.627 | F1: 0.627 | Recall: 0.627
Active Learning with total samples of 500 (Entropy): ARI: 0.468 | NMI: 0.582 | Accuracy: 0.679 | F1: 0.668 | Recall: 0.675
Active Learning with total samples of 500 (Euclidean Distance): ARI: 0.437 | NMI: 0.565 | Accuracy: 0.671 | F1: 0.675 | Recall: 0.671


In [26]:
# InCoder
file = "embeddings\java_InCoder_embeddings.pkl"

df = pd.read_pickle(file)
X = np.vstack(df["embedding"].values)
y = pd.factorize(df["cluster"])[0]

model_name = file.split("_")[1]
results[model_name] = {}
    
print(f"\nEvaluating: {model_name}")
al_entropy = simulate_active_learning(X, y, strategy="entropy", total_labeled_targets_lst=total_labeled_targets, seed=SEED)
al_distance = simulate_active_learning(X, y, strategy="euclidean_distance", total_labeled_targets_lst=total_labeled_targets, seed=SEED)
    
for i, total_k in enumerate(total_labeled_targets):
    print(f"Active Learning with total samples of {total_k} (Entropy): " + " | ".join(
        f"{k}: {v:.3f}" for k, v in al_entropy[i].items() if k != "Labeled"
    ))
    
    print(f"Active Learning with total samples of {total_k} (Euclidean Distance): " + " | ".join(
        f"{k}: {v:.3f}" for k, v in al_distance[i].items() if k != "Labeled"
    ))


Evaluating: InCoder
Active Learning with total samples of 100 (Entropy): ARI: 0.537 | NMI: 0.651 | Accuracy: 0.744 | F1: 0.736 | Recall: 0.742
Active Learning with total samples of 100 (Euclidean Distance): ARI: 0.534 | NMI: 0.649 | Accuracy: 0.737 | F1: 0.728 | Recall: 0.733
Active Learning with total samples of 250 (Entropy): ARI: 0.589 | NMI: 0.686 | Accuracy: 0.777 | F1: 0.772 | Recall: 0.776
Active Learning with total samples of 250 (Euclidean Distance): ARI: 0.551 | NMI: 0.665 | Accuracy: 0.748 | F1: 0.739 | Recall: 0.746
Active Learning with total samples of 500 (Entropy): ARI: 0.667 | NMI: 0.740 | Accuracy: 0.823 | F1: 0.816 | Recall: 0.821
Active Learning with total samples of 500 (Euclidean Distance): ARI: 0.626 | NMI: 0.720 | Accuracy: 0.788 | F1: 0.777 | Recall: 0.780
