In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "singleton"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "singleton" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r") as file:
        code = file.read()

    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/880 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/871 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/469 [00:00<?, ?B/s]



KeyboardInterrupt: 

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Singleton with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "singleton"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "singleton" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r") as file:
        code = file.read()

    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nons (54).java, Predicted Label: 0, True Label: 0
File: singleton (25).java, Predicted Label: 0, True Label: 1
File: nons (27).java, Predicted Label: 1, True Label: 0
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (19).java, Predicted Label: 1, True Label: 0
File: nons (34).java, Predicted Label: 0, True Label: 0
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: singleton (9).java, Predicted Label: 1, True Label: 1
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: singleton (14).java, Predicted Label: 1, True Label: 1
File: nons (33).java, Predicted Label: 0, True Label: 0
File: nons (3).java, Predicted Label: 0, True Label: 0
File: nons (30).java, Predicted Label: 0, True Label: 0
File: singleton (1).java, Predicted Label: 0, True Label: 1
File: singleton (3).java, Predicted Label: 0, True Label: 1
File: nons (53).java, Predicted Label: 1, True Label: 0
File: singleton (2

In [None]:
#Singleton with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "singleton"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "singleton" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r") as file:
        code = file.read()

    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nons (50).java, Predicted Label: 1, True Label: 0
File: nons (54).java, Predicted Label: 0, True Label: 0
File: singleton (25).java, Predicted Label: 0, True Label: 1
File: nons (27).java, Predicted Label: 1, True Label: 0
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (19).java, Predicted Label: 1, True Label: 0
File: nons (34).java, Predicted Label: 0, True Label: 0
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (4).java, Predicted Label: 1, True Label: 0
File: singleton (9).java, Predicted Label: 1, True Label: 1
File: nons (44).java, Predicted Label: 0, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: singleton (14).java, Predicted Label: 1, True Label: 1
File: nons (33).java, Predicted Label: 0, True Label: 0
File: nons (3).java, Predicted Label: 0, True Label: 0
File: nons (30).java, Predicted Label: 0, True Label: 0
File: singleton (1).java, P

In [None]:
#Singleton with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "singleton"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "singleton" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r") as file:
        code = file.read()

    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nons (50).java, Predicted Label: 1, True Label: 0
File: nons (29).java, Predicted Label: 0, True Label: 0
File: nons (25).java, Predicted Label: 1, True Label: 0
File: nons (18).java, Predicted Label: 1, True Label: 0
File: singleton (25).java, Predicted Label: 1, True Label: 1
File: nons (42).java, Predicted Label: 1, True Label: 0
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (55).java, Predicted Label: 0, True Label: 0
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (49).java, Predicted Label: 0, True Label: 0
File: singleton (9).java, Predicted Label: 1, True Label: 1
File: nons (7).java, Predicted Label: 1, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: singleton (14).java, Predicted Label: 1, True Label: 1
File: nons (41).java, Predicted Label: 0, True Label: 0
File: nons (1).java, Predicted Label: 0, True Label: 0
File: nons (30).java, Predi

In [None]:
#Singleton with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "singleton"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "singleton" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r") as file:
        code = file.read()

    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nons (12).java, Predicted Label: 0, True Label: 0
File: nons (50).java, Predicted Label: 1, True Label: 0
File: singleton (25).java, Predicted Label: 0, True Label: 1
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (4).java, Predicted Label: 1, True Label: 0
File: singleton (9).java, Predicted Label: 1, True Label: 1
File: nons (44).java, Predicted Label: 0, True Label: 0
File: nons (38).java, Predicted Label: 0, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: singleton (14).java, Predicted Label: 1, True Label: 1
File: nons (41).java, Predicted Label: 0, True Label: 0
File: singleton (1).java, Predicted Label: 0, True Label: 1
File: nons (22).java, Predicted Label: 1, True Label: 0
File: singleton (3).java, Predicted Label: 1, True Label: 1
File: singleton (23).java, Predicted Label: 1, True Label: 1
File: singlet

In [None]:
#Singleton with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "singleton"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "singleton" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nons (54).java, Predicted Label: 0, True Label: 0
File: nons (25).java, Predicted Label: 0, True Label: 0
File: singleton (25).java, Predicted Label: 0, True Label: 1
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (19).java, Predicted Label: 1, True Label: 0
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (49).java, Predicted Label: 0, True Label: 0
File: nons (61).java, Predicted Label: 0, True Label: 0
File: singleton (9).java, Predicted Label: 0, True Label: 1
File: nons (7).java, Predicted Label: 0, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: singleton (14).java, Predicted Label: 1, True Label: 1
File: nons (41).java, Predicted Label: 0, True Label: 0
File: nons (30).java, Predicted Label: 1, True Label: 0
File: singleton (1).java, Predicted Label: 0, True Label: 1
File: nons (22).java, Predicted Label: 0, True Label: 0
File: singleton (3).ja

In [None]:
#Builder with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "builder"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "builder" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nonb (14).java, Predicted Label: 0, True Label: 0
File: builder (2).java, Predicted Label: 1, True Label: 1
File: nonb (104).java, Predicted Label: 0, True Label: 0
File: nonb (102).java, Predicted Label: 0, True Label: 0
File: builder (5).java, Predicted Label: 1, True Label: 1
File: builder (6).java, Predicted Label: 1, True Label: 1
File: nonb (110).java, Predicted Label: 0, True Label: 0
File: builder (3).java, Predicted Label: 0, True Label: 1
File: nonb (86).java, Predicted Label: 0, True Label: 0
File: nonb (81).java, Predicted Label: 0, True Label: 0
File: nonb (107).java, Predicted Label: 0, True Label: 0
File: builder (7).java, Predicted Label: 1, True Label: 1
File: nonb (92).java, Predicted Label: 0, True Label: 0
File: builder (9).java, Predicted Label: 1, True Label: 1
File: nonb (116).java, Predicted Label: 1, True Label: 0
File: nonb (38).java, Predicted Label: 0, True Label: 0
File: builder (8).java, Predicted Lab

In [None]:
#Builder with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "builder"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "builder" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: builder (2).java, Predicted Label: 0, True Label: 1
File: nonb (121).java, Predicted Label: 0, True Label: 0
File: nonb (21).java, Predicted Label: 1, True Label: 0
File: builder (5).java, Predicted Label: 1, True Label: 1
File: builder (6).java, Predicted Label: 1, True Label: 1
File: builder (3).java, Predicted Label: 0, True Label: 1
File: nonb (85).java, Predicted Label: 0, True Label: 0
File: nonb (120).java, Predicted Label: 0, True Label: 0
File: nonb (107).java, Predicted Label: 1, True Label: 0
File: builder (7).java, Predicted Label: 1, True Label: 1
File: nonb (80).java, Predicted Label: 0, True Label: 0
File: builder (9).java, Predicted Label: 1, True Label: 1
File: nonb (15).java, Predicted Label: 0, True Label: 0
File: nonb (101).java, Predicted Label: 0, True Label: 0
File: nonb (129).java, Predicted Label: 0, True Label: 0
File: nonb (42).java, Predicted Label: 0, True Label: 0
File: nonb (16).java, Predicted Label

In [None]:
#Builder with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "builder"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "builder" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nonb (14).java, Predicted Label: 0, True Label: 0
File: builder (2).java, Predicted Label: 1, True Label: 1
File: builder (5).java, Predicted Label: 1, True Label: 1
File: builder (6).java, Predicted Label: 1, True Label: 1
File: nonb (110).java, Predicted Label: 0, True Label: 0
File: nonb (18).java, Predicted Label: 0, True Label: 0
File: nonb (17).java, Predicted Label: 0, True Label: 0
File: builder (3).java, Predicted Label: 1, True Label: 1
File: builder (7).java, Predicted Label: 1, True Label: 1
File: nonb (92).java, Predicted Label: 1, True Label: 0
File: builder (9).java, Predicted Label: 1, True Label: 1
File: nonb (13).java, Predicted Label: 0, True Label: 0
File: nonb (119).java, Predicted Label: 1, True Label: 0
File: builder (8).java, Predicted Label: 1, True Label: 1
File: nonb (23).java, Predicted Label: 0, True Label: 0
File: builder (1).java, Predicted Label: 0, True Label: 1
File: builder (4).java, Predicted La

In [None]:
#Builder with different settings

In [None]:
#Builder with different settings

In [None]:
#Factory Method with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "factorymethod"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "factorymethod" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nonfm (6).java, Predicted Label: 0, True Label: 0
File: nonfm (5).java, Predicted Label: 0, True Label: 0
File: nonfm (13).java, Predicted Label: 0, True Label: 0
File: factorymethod (1).java, Predicted Label: 1, True Label: 1
File: factorymethod (9).java, Predicted Label: 1, True Label: 1
File: nonfm (12).java, Predicted Label: 0, True Label: 0
File: nonfm (4).java, Predicted Label: 0, True Label: 0
File: factorymethod (5).java, Predicted Label: 1, True Label: 1
File: nonfm (2).java, Predicted Label: 0, True Label: 0
File: factorymethod (2).java, Predicted Label: 1, True Label: 1
File: nonfm (18).java, Predicted Label: 0, True Label: 0
File: nonfm (1).java, Predicted Label: 0, True Label: 0
File: nonfm (8).java, Predicted Label: 0, True Label: 0
File: factorymethod (10).java, Predicted Label: 1, True Label: 1
File: nonfm (11).java, Predicted Label: 0, True Label: 0
File: factorymethod (7).java, Predicted Label: 0, True Label: 1
F

In [None]:
#Builder with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "builder"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "builder" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: builder (2).java, Predicted Label: 1, True Label: 1
File: nonb (89).java, Predicted Label: 0, True Label: 0
File: nonb (28).java, Predicted Label: 0, True Label: 0
File: nonb (36).java, Predicted Label: 0, True Label: 0
File: builder (5).java, Predicted Label: 1, True Label: 1
File: builder (6).java, Predicted Label: 1, True Label: 1
File: nonb (44).java, Predicted Label: 0, True Label: 0
File: builder (3).java, Predicted Label: 0, True Label: 1
File: nonb (87).java, Predicted Label: 0, True Label: 0
File: nonb (120).java, Predicted Label: 0, True Label: 0
File: nonb (69).java, Predicted Label: 0, True Label: 0
File: builder (7).java, Predicted Label: 1, True Label: 1
File: builder (9).java, Predicted Label: 1, True Label: 1
File: nonb (129).java, Predicted Label: 0, True Label: 0
File: nonb (38).java, Predicted Label: 0, True Label: 0
File: nonb (114).java, Predicted Label: 0, True Label: 0
File: builder (8).java, Predicted Label

In [None]:
#Factory Method with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "factorymethod"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "factorymethod" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nonfm (6).java, Predicted Label: 0, True Label: 0
File: nonfm (5).java, Predicted Label: 0, True Label: 0
File: factorymethod (1).java, Predicted Label: 1, True Label: 1
File: factorymethod (9).java, Predicted Label: 1, True Label: 1
File: nonfm (12).java, Predicted Label: 1, True Label: 0
File: nonfm (4).java, Predicted Label: 0, True Label: 0
File: factorymethod (5).java, Predicted Label: 1, True Label: 1
File: nonfm (2).java, Predicted Label: 0, True Label: 0
File: factorymethod (2).java, Predicted Label: 1, True Label: 1
File: nonfm (1).java, Predicted Label: 0, True Label: 0
File: nonfm (8).java, Predicted Label: 0, True Label: 0
File: factorymethod (10).java, Predicted Label: 1, True Label: 1
File: nonfm (11).java, Predicted Label: 1, True Label: 0
File: factorymethod (7).java, Predicted Label: 0, True Label: 1
File: factorymethod (4).java, Predicted Label: 0, True Label: 1
File: factorymethod (3).java, Predicted Label: 1, T

In [None]:
#Factory Method with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "factorymethod"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "factorymethod" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nonfm (38).java, Predicted Label: 0, True Label: 0
File: nonfm (13).java, Predicted Label: 0, True Label: 0
File: nonfm (29).java, Predicted Label: 0, True Label: 0
File: factorymethod (1).java, Predicted Label: 1, True Label: 1
File: factorymethod (9).java, Predicted Label: 1, True Label: 1
File: nonfm (24).java, Predicted Label: 0, True Label: 0
File: factorymethod (5).java, Predicted Label: 1, True Label: 1
File: nonfm (2).java, Predicted Label: 0, True Label: 0
File: factorymethod (2).java, Predicted Label: 1, True Label: 1
File: nonfm (1).java, Predicted Label: 0, True Label: 0
File: factorymethod (10).java, Predicted Label: 1, True Label: 1
File: factorymethod (7).java, Predicted Label: 0, True Label: 1
File: nonfm (17).java, Predicted Label: 0, True Label: 0
File: factorymethod (4).java, Predicted Label: 0, True Label: 1
File: factorymethod (3).java, Predicted Label: 1, True Label: 1
File: nonfm (34).java, Predicted Label: 

In [None]:
#Factory Method with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "factorymethod"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "factorymethod" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nonfm (77).java, Predicted Label: 0, True Label: 0
File: nonfm (80).java, Predicted Label: 0, True Label: 0
File: factorymethod (1).java, Predicted Label: 0, True Label: 1
File: factorymethod (9).java, Predicted Label: 1, True Label: 1
File: nonfm (79).java, Predicted Label: 0, True Label: 0
File: factorymethod (5).java, Predicted Label: 1, True Label: 1
File: factorymethod (2).java, Predicted Label: 1, True Label: 1
File: nonfm (75).java, Predicted Label: 0, True Label: 0
File: nonfm (72).java, Predicted Label: 0, True Label: 0
File: factorymethod (10).java, Predicted Label: 1, True Label: 1
File: factorymethod (7).java, Predicted Label: 0, True Label: 1
File: nonfm (78).java, Predicted Label: 0, True Label: 0
File: factorymethod (4).java, Predicted Label: 1, True Label: 1
File: nonfm (82).java, Predicted Label: 1, True Label: 0
File: nonfm (74).java, Predicted Label: 0, True Label: 0
File: factorymethod (3).java, Predicted Label

In [None]:
#Factory Method with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "factorymethod"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "factorymethod" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nonfm (46).java, Predicted Label: 1, True Label: 0
File: nonfm (37).java, Predicted Label: 0, True Label: 0
File: nonfm (80).java, Predicted Label: 1, True Label: 0
File: factorymethod (1).java, Predicted Label: 0, True Label: 1
File: factorymethod (9).java, Predicted Label: 1, True Label: 1
File: nonfm (79).java, Predicted Label: 0, True Label: 0
File: nonfm (4).java, Predicted Label: 0, True Label: 0
File: factorymethod (5).java, Predicted Label: 1, True Label: 1
File: factorymethod (2).java, Predicted Label: 1, True Label: 1
File: factorymethod (10).java, Predicted Label: 1, True Label: 1
File: factorymethod (7).java, Predicted Label: 0, True Label: 1
File: nonfm (23).java, Predicted Label: 0, True Label: 0
File: nonfm (26).java, Predicted Label: 1, True Label: 0
File: nonfm (33).java, Predicted Label: 1, True Label: 0
File: nonfm (17).java, Predicted Label: 0, True Label: 0
File: factorymethod (4).java, Predicted Label: 0, Tru

In [None]:
#Prototype with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "prototype"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "prototype" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: prototype (27).java, Predicted Label: 1, True Label: 1
File: nonp (53).java, Predicted Label: 0, True Label: 0
File: nonp (23).java, Predicted Label: 0, True Label: 0
File: nonp (43).java, Predicted Label: 0, True Label: 0
File: nonp (35).java, Predicted Label: 0, True Label: 0
File: nonp (51).java, Predicted Label: 0, True Label: 0
File: nonp (59).java, Predicted Label: 0, True Label: 0
File: nonp (5).java, Predicted Label: 0, True Label: 0
File: nonp (31).java, Predicted Label: 0, True Label: 0
File: prototype (13).java, Predicted Label: 1, True Label: 1
File: prototype (22).java, Predicted Label: 0, True Label: 1
File: nonp (7).java, Predicted Label: 0, True Label: 0
File: nonp (13).java, Predicted Label: 0, True Label: 0
File: nonp (29).java, Predicted Label: 0, True Label: 0
File: prototype (15).java, Predicted Label: 1, True Label: 1
File: nonp (44).java, Predicted Label: 0, True Label: 0
File: nonp (8).java, Predicted Label

In [None]:
#Prototype with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "prototype"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "prototype" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: prototype (27).java, Predicted Label: 1, True Label: 1
File: nonp (23).java, Predicted Label: 0, True Label: 0
File: nonp (35).java, Predicted Label: 0, True Label: 0
File: nonp (5).java, Predicted Label: 0, True Label: 0
File: nonp (31).java, Predicted Label: 0, True Label: 0
File: prototype (13).java, Predicted Label: 1, True Label: 1
File: prototype (22).java, Predicted Label: 0, True Label: 1
File: nonp (7).java, Predicted Label: 0, True Label: 0
File: nonp (13).java, Predicted Label: 0, True Label: 0
File: nonp (29).java, Predicted Label: 0, True Label: 0
File: prototype (15).java, Predicted Label: 1, True Label: 1
File: nonp (8).java, Predicted Label: 0, True Label: 0
File: nonp (37).java, Predicted Label: 0, True Label: 0
File: nonp (21).java, Predicted Label: 0, True Label: 0
File: nonp (36).java, Predicted Label: 0, True Label: 0
File: prototype (14).java, Predicted Label: 1, True Label: 1
File: nonp (28).java, Predicted 

In [None]:
#Prototype with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "prototype"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "prototype" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: prototype (27).java, Predicted Label: 1, True Label: 1
File: nonp (53).java, Predicted Label: 0, True Label: 0
File: nonp (23).java, Predicted Label: 0, True Label: 0
File: nonp (31).java, Predicted Label: 0, True Label: 0
File: prototype (13).java, Predicted Label: 1, True Label: 1
File: prototype (22).java, Predicted Label: 0, True Label: 1
File: prototype (15).java, Predicted Label: 1, True Label: 1
File: nonp (8).java, Predicted Label: 0, True Label: 0
File: nonp (37).java, Predicted Label: 1, True Label: 0
File: nonp (21).java, Predicted Label: 0, True Label: 0
File: nonp (58).java, Predicted Label: 1, True Label: 0
File: prototype (14).java, Predicted Label: 0, True Label: 1
File: nonp (28).java, Predicted Label: 0, True Label: 0
File: prototype (16).java, Predicted Label: 1, True Label: 1
File: prototype (18).java, Predicted Label: 0, True Label: 1
File: prototype (32).java, Predicted Label: 1, True Label: 1
File: prototype

In [None]:
#Prototype with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "prototype"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "prototype" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: prototype (27).java, Predicted Label: 1, True Label: 1
File: nonp (43).java, Predicted Label: 0, True Label: 0
File: nonp (35).java, Predicted Label: 0, True Label: 0
File: nonp (51).java, Predicted Label: 0, True Label: 0
File: nonp (5).java, Predicted Label: 0, True Label: 0
File: prototype (13).java, Predicted Label: 1, True Label: 1
File: prototype (22).java, Predicted Label: 0, True Label: 1
File: nonp (7).java, Predicted Label: 0, True Label: 0
File: nonp (13).java, Predicted Label: 0, True Label: 0
File: nonp (29).java, Predicted Label: 0, True Label: 0
File: prototype (15).java, Predicted Label: 1, True Label: 1
File: nonp (37).java, Predicted Label: 1, True Label: 0
File: nonp (21).java, Predicted Label: 0, True Label: 0
File: prototype (14).java, Predicted Label: 1, True Label: 1
File: prototype (16).java, Predicted Label: 1, True Label: 1
File: prototype (18).java, Predicted Label: 0, True Label: 1
File: nonp (22).java,

In [None]:
#Prototype with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "prototype"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "prototype" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: prototype (27).java, Predicted Label: 1, True Label: 1
File: nonp (23).java, Predicted Label: 0, True Label: 0
File: nonp (5).java, Predicted Label: 0, True Label: 0
File: nonp (31).java, Predicted Label: 0, True Label: 0
File: prototype (13).java, Predicted Label: 1, True Label: 1
File: prototype (22).java, Predicted Label: 0, True Label: 1
File: nonp (7).java, Predicted Label: 0, True Label: 0
File: nonp (13).java, Predicted Label: 0, True Label: 0
File: nonp (29).java, Predicted Label: 0, True Label: 0
File: prototype (15).java, Predicted Label: 1, True Label: 1
File: nonp (8).java, Predicted Label: 0, True Label: 0
File: nonp (21).java, Predicted Label: 0, True Label: 0
File: prototype (14).java, Predicted Label: 1, True Label: 1
File: nonp (28).java, Predicted Label: 0, True Label: 0
File: prototype (16).java, Predicted Label: 1, True Label: 1
File: prototype (18).java, Predicted Label: 0, True Label: 1
File: nonp (22).java, 

In [None]:
#Abstract Factory with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "abstractfactory"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "abstractfactory" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nonab (22).java, Predicted Label: 0, True Label: 0
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: nonab (19).java, Predicted Label: 0, True Label: 0
File: nonab (30).java, Predicted Label: 0, True Label: 0
File: abstractfactory (10).java, Predicted Label: 1, True Label: 1
File: abstractfactory (9).java, Predicted Label: 1, True Label: 1
File: nonab (70).java, Predicted Label: 0, True Label: 0
File: abstractfactory (13).java, Predicted Label: 1, True Label: 1
File: nonab (2).java, Predicted Label: 0, True Label: 0
File: abstractfactory (16).java, Predicted Label: 1, True Label: 1
File: abstractfactory (4).java, Predicted Label: 0, True Label: 1
File: abstractfactory (5).java, Predicted Label: 1, True Label: 1
File: nonab (3).java, Predicted Label: 0, True Label: 0
File: abstractfactory (1).java, Predicted Label: 1, True Label: 1
File: abstractfactory (12).java, Predicted Label: 1, True Label: 1
File: nonab 

In [None]:
#Abstract Factory with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "abstractfactory"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "abstractfactory" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nonab (11).java, Predicted Label: 1, True Label: 0
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: nonab (5).java, Predicted Label: 0, True Label: 0
File: nonab (7).java, Predicted Label: 0, True Label: 0
File: nonab (15).java, Predicted Label: 0, True Label: 0
File: nonab (1).java, Predicted Label: 1, True Label: 0
File: abstractfactory (10).java, Predicted Label: 1, True Label: 1
File: nonab (10).java, Predicted Label: 0, True Label: 0
File: abstractfactory (9).java, Predicted Label: 1, True Label: 1
File: abstractfactory (13).java, Predicted Label: 1, True Label: 1
File: nonab (2).java, Predicted Label: 0, True Label: 0
File: abstractfactory (16).java, Predicted Label: 1, True Label: 1
File: nonab (8).java, Predicted Label: 0, True Label: 0
File: abstractfactory (4).java, Predicted Label: 1, True Label: 1
File: abstractfactory (5).java, Predicted Label: 1, True Label: 1
File: nonab (3).java, Predicted La

In [None]:
#Abstract Factory with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "abstractfactory"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "abstractfactory" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: abstractfactory (10).java, Predicted Label: 1, True Label: 1
File: nonab (10).java, Predicted Label: 0, True Label: 0
File: abstractfactory (9).java, Predicted Label: 1, True Label: 1
File: nonab (51).java, Predicted Label: 0, True Label: 0
File: abstractfactory (13).java, Predicted Label: 1, True Label: 1
File: nonab (69).java, Predicted Label: 0, True Label: 0
File: abstractfactory (16).java, Predicted Label: 1, True Label: 1
File: abstractfactory (4).java, Predicted Label: 0, True Label: 1
File: abstractfactory (5).java, Predicted Label: 1, True Label: 1
File: abstractfactory (1).java, Predicted Label: 1, True Label: 1
File: abstractfactory (12).java, Predicted Label: 1, True Label: 1
File: nonab (14).java, Predicted Label: 1, True Label: 0
File: nonab (36).java, Predicted Label: 0, True Label: 0
File: nonab (42).java, Predicted Label: 0, True Label: 0
File: abst

In [None]:
#Abstract Factory with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "abstractfactory"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "abstractfactory" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nonab (11).java, Predicted Label: 1, True Label: 0
File: nonab (48).java, Predicted Label: 1, True Label: 0
File: nonab (22).java, Predicted Label: 0, True Label: 0
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: abstractfactory (10).java, Predicted Label: 1, True Label: 1
File: abstractfactory (9).java, Predicted Label: 1, True Label: 1
File: abstractfactory (13).java, Predicted Label: 1, True Label: 1
File: abstractfactory (16).java, Predicted Label: 1, True Label: 1
File: abstractfactory (4).java, Predicted Label: 0, True Label: 1
File: abstractfactory (5).java, Predicted Label: 1, True Label: 1
File: abstractfactory (1).java, Predicted Label: 1, True Label: 1
File: abstractfactory (12).java, Predicted Label: 1, True Label: 1
File: nonab (59).java, Predicted Label: 1, True Label: 0
File: abstractfactory (3).java, Predicted Label: 1, True Label: 1
File: nonab (32).java, Predicted Label: 1, True Label: 0
F

In [None]:
#Abstract factory with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "abstractfactory"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "abstractfactory" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nonab (11).java, Predicted Label: 0, True Label: 0
File: nonab (48).java, Predicted Label: 1, True Label: 0
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: nonab (5).java, Predicted Label: 0, True Label: 0
File: nonab (65).java, Predicted Label: 0, True Label: 0
File: nonab (7).java, Predicted Label: 0, True Label: 0
File: abstractfactory (10).java, Predicted Label: 1, True Label: 1
File: abstractfactory (9).java, Predicted Label: 1, True Label: 1
File: abstractfactory (13).java, Predicted Label: 1, True Label: 1
File: abstractfactory (16).java, Predicted Label: 1, True Label: 1
File: nonab (8).java, Predicted Label: 0, True Label: 0
File: abstractfactory (4).java, Predicted Label: 1, True Label: 1
File: abstractfactory (5).java, Predicted Label: 1, True Label: 1
File: nonab (3).java, Predicted Label: 0, True Label: 0
File: abstractfactory (1).java, Predicted Label: 1, True Label: 1
File: abstractfactory (1

In [None]:
#tsne plot for programs

In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

# Function to get embeddings for a given design pattern
def get_embeddings_for_pattern(pattern, model, tokenizer):
    directory = os.path.join("all_design_patterns", pattern.lower())
    files = [file for file in os.listdir(directory) if os.path.isfile(os.path.join(directory, file))]

    embeddings = []
    true_labels = []

    for file in files:
        with open(os.path.join(directory, file), "r", encoding="ISO-8859-1") as f:
            code = f.read()

        # Tokenize and encode the Java program
        inputs = tokenizer(code, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        program_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

        embeddings.append(program_embedding)
        true_labels.append(pattern)

    return np.array(embeddings), np.array(true_labels)

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Get embeddings for each design pattern
patterns = ["Singleton", "Prototype", "AbstractFactory", "Builder", "FactoryMethod"]
all_embeddings = []
all_labels = []

# Custom color palette for each design pattern with higher contrast
color_palette = ["red", "green", "orange", "blue", "purple"]

# Custom markers for each design pattern
markers = ["o", "s", "D", "^", "P"]

for i, pattern in enumerate(patterns):
    pattern_embeddings, pattern_labels = get_embeddings_for_pattern(pattern, model, tokenizer)
    all_embeddings.append(pattern_embeddings)
    all_labels.append(pattern_labels)

# Concatenate the embeddings and labels
all_embeddings = np.concatenate(all_embeddings, axis=0)
all_labels = np.concatenate(all_labels)

# Perform t-SNE
tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(all_embeddings)

# Create a scatter plot for t-SNE visualization with custom symbols
plt.figure(figsize=(20, 16))

for i, pattern in enumerate(patterns):
    indices = all_labels == pattern
    sns.scatterplot(x=tsne_results[indices, 0], y=tsne_results[indices, 1], marker=markers[i], color=color_palette[i], s=200, label=pattern)

# Increase font sizes for better visibility
plt.title('t-SNE Visualization for CodeGPT on Different Design Patterns', fontsize=30)
plt.xlabel('t-SNE Dimension 1', fontsize=25)
plt.ylabel('t-SNE Dimension 2', fontsize=25)
plt.legend(title='Design Pattern', loc='upper right', fontsize=22)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.grid(True)

# Save the t-SNE plot as a PDF file
plt.savefig('tsne_plot_codegpt.pdf', format='pdf')
plt.show()


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


**Builder using different settings**

In [None]:
import os
import torch
import numpy as np
import time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")  # Move model to GPU if available
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "/content/builder"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to("cuda" if torch.cuda.is_available() else "cpu")

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move tensor back to CPU for NumPy
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "builder" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class
    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()

    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Choose distance metric: 'cosine' or 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Start the timer for training time
start_training_time = time.time()

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# End training time and record
end_training_time = time.time()
training_time = end_training_time - start_training_time

# Start the timer for prediction time
start_prediction_time = time.time()

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# End prediction time and record
end_prediction_time = time.time()
prediction_time = end_prediction_time - start_prediction_time

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Calculate total execution time (training + prediction)
execution_time = training_time + prediction_time

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"\nPrecision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")
print(f"\nTraining Time: {training_time:.2f} seconds")
print(f"Prediction Time: {prediction_time:.2f} seconds")
print(f"Total Execution Time: {execution_time:.2f} seconds")


K-Nearest Neighbors (KNN) Classification Results:
File: builder (9).java, Predicted Label: 1, True Label: 1
File: nonb (1).java, Predicted Label: 1, True Label: 0
File: nonb (2).java, Predicted Label: 0, True Label: 0
File: nonb (5).java, Predicted Label: 0, True Label: 0
File: builder (8).java, Predicted Label: 1, True Label: 1
File: builder (3).java, Predicted Label: 1, True Label: 1
File: builder (5).java, Predicted Label: 1, True Label: 1
File: nonb (9).java, Predicted Label: 0, True Label: 0
File: builder (1).java, Predicted Label: 0, True Label: 1
File: nonb (3).java, Predicted Label: 0, True Label: 0
File: nonb (7).java, Predicted Label: 0, True Label: 0
File: nonb (4).java, Predicted Label: 0, True Label: 0
File: builder (4).java, Predicted Label: 0, True Label: 1
File: builder (2).java, Predicted Label: 1, True Label: 1
File: nonb (6).java, Predicted Label: 0, True Label: 0
File: builder (6).java, Predicted Label: 1, True Label: 1
File: builder (7).java, Predicted Label: 1, Tr

In [None]:
import os
import torch
import numpy as np
import time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
import random

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU if available
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "/content/builder"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move tensor back to CPU
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Separate positive and negative classes
positive_files = [file for file in java_files if "builder" in file]
negative_files = [file for file in java_files if "builder" not in file]

# Randomly select the same number of positive examples as negative examples
num_negative = len(negative_files)
n_runs = 10  # Number of iterations for calculating mean times
k = 5  # Number of neighbors for KNN

training_times = []
prediction_times = []
total_times = []

for _ in range(n_runs):
    # Sample positive files and combine with negative ones
    # Separate positive and negative classes
  positive_files = [file for file in java_files if "builder" in file]
  negative_files = [file for file in java_files if "builder" not in file]

# Randomly select the same number of positive examples as negative examples
num_negative = len(negative_files)
num_positive = len(positive_files)

# Ensure the number of samples doesn't exceed the available examples
num_samples = min(num_positive, num_negative)

n_runs = 10  # Number of iterations for calculating mean times
k = 5  # Number of neighbors for KNN

training_times = []
prediction_times = []
total_times = []

for _ in range(n_runs):
    # Sample positive files and combine with negative ones
    positive_files_sampled = random.sample(positive_files, num_samples)
    negative_files_sampled = random.sample(negative_files, num_samples)  # Adjust this line

    selected_files = positive_files_sampled + negative_files_sampled
    random.shuffle(selected_files)

    # Store the embeddings of each program and define true labels
    program_embeddings = []
    true_labels = []

    for java_file in selected_files:
        label = 1 if "builder" in java_file else 0  # 1 for positive, 0 for negative
        true_labels.append(label)

        with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
            code = file.read()

        # Calculate the embedding for the entire program
        program_embedding = get_program_embedding(code)
        program_embeddings.append(program_embedding)

    # Convert the list of embeddings to a NumPy array
    program_embeddings = np.array(program_embeddings)

    # Choose distance metric: 'cosine' or 'euclidean'
    distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

    # Start the timer for training time
    start_training_time = time.time()

    # Calculate the distance matrix based on the chosen metric
    if distance_metric == 'cosine':
        distance_matrix = cosine_distances(program_embeddings)
    elif distance_metric == 'euclidean':
        distance_matrix = euclidean_distances(program_embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {distance_metric}")

    # Get k-nearest neighbors indices for each program
    knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

    # End training time and record
    end_training_time = time.time()
    training_time = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
    training_times.append(training_time)

    # Start the timer for prediction time
    start_prediction_time = time.time()

    # Initialize an array to store predicted labels
    predicted_labels = []

    # Predict labels for each program based on the majority label of neighbors
    for indices in knn_indices:
        neighbor_labels = [true_labels[i] for i in indices]
        predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
        predicted_labels.append(predicted_label)

    # End prediction time and record
    end_prediction_time = time.time()
    prediction_time = (end_prediction_time - start_prediction_time) * 1000  # Convert to milliseconds
    prediction_times.append(prediction_time)

    # Calculate total time for this iteration
    total_time = training_time + prediction_time
    total_times.append(total_time)

# Calculate the mean of training, prediction, and total times
mean_training_time = np.mean(training_times)
mean_prediction_time = np.mean(prediction_times)
mean_total_time = np.mean(total_times)

# Calculate precision, recall, and f1-score
predicted_labels = np.array(predicted_labels)
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the mean times and performance metrics
print(f"\nMean Training Time: {mean_training_time:.2f} ms")
print(f"Mean Prediction Time: {mean_prediction_time:.2f} ms")
print(f"Mean Total Time: {mean_total_time:.2f} ms")

print(f"\nPrecision (KNN): {precision_knn:.2f}")
print(f"Recall (KNN): {recall_knn:.2f}")
print(f"F1 Score (KNN): {f1_knn:.2f}")



Mean Training Time: 0.85 ms
Mean Prediction Time: 0.09 ms
Mean Total Time: 0.93 ms

Precision (KNN): 0.62
Recall (KNN): 0.61
F1 Score (KNN): 0.60


In [None]:
import shutil

# Replace 'your_directory_path' with the path to the directory you want to delete
shutil.rmtree('/content/builder')


In [None]:
import os
import torch
import numpy as np
import time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
import random

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU if available
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "/content/singleton"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move tensor back to CPU
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Separate positive and negative classes
positive_files = [file for file in java_files if "singleton" in file]
negative_files = [file for file in java_files if "singleton" not in file]

# Randomly select the same number of positive examples as negative examples
num_negative = len(negative_files)
n_runs = 10  # Number of iterations for calculating mean times
k = 5  # Number of neighbors for KNN

training_times = []
prediction_times = []
total_times = []

for _ in range(n_runs):
    # Sample positive files and combine with negative ones
    # Separate positive and negative classes
  positive_files = [file for file in java_files if "singleton" in file]
  negative_files = [file for file in java_files if "singleton" not in file]

# Randomly select the same number of positive examples as negative examples
num_negative = len(negative_files)
num_positive = len(positive_files)

# Ensure the number of samples doesn't exceed the available examples
num_samples = min(num_positive, num_negative)

n_runs = 10  # Number of iterations for calculating mean times
k = 5  # Number of neighbors for KNN

training_times = []
prediction_times = []
total_times = []

for _ in range(n_runs):
    # Sample positive files and combine with negative ones
    positive_files_sampled = random.sample(positive_files, num_samples)
    negative_files_sampled = random.sample(negative_files, num_samples)  # Adjust this line

    selected_files = positive_files_sampled + negative_files_sampled
    random.shuffle(selected_files)

    # Store the embeddings of each program and define true labels
    program_embeddings = []
    true_labels = []

    for java_file in selected_files:
        label = 1 if "singleton" in java_file else 0  # 1 for positive, 0 for negative
        true_labels.append(label)

        with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
            code = file.read()

        # Calculate the embedding for the entire program
        program_embedding = get_program_embedding(code)
        program_embeddings.append(program_embedding)

    # Convert the list of embeddings to a NumPy array
    program_embeddings = np.array(program_embeddings)

    # Choose distance metric: 'cosine' or 'euclidean'
    distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

    # Start the timer for training time
    start_training_time = time.time()

    # Calculate the distance matrix based on the chosen metric
    if distance_metric == 'cosine':
        distance_matrix = cosine_distances(program_embeddings)
    elif distance_metric == 'euclidean':
        distance_matrix = euclidean_distances(program_embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {distance_metric}")

    # Get k-nearest neighbors indices for each program
    knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

    # End training time and record
    end_training_time = time.time()
    training_time = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
    training_times.append(training_time)

    # Start the timer for prediction time
    start_prediction_time = time.time()

    # Initialize an array to store predicted labels
    predicted_labels = []

    # Predict labels for each program based on the majority label of neighbors
    for indices in knn_indices:
        neighbor_labels = [true_labels[i] for i in indices]
        predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
        predicted_labels.append(predicted_label)

    # End prediction time and record
    end_prediction_time = time.time()
    prediction_time = (end_prediction_time - start_prediction_time) * 1000  # Convert to milliseconds
    prediction_times.append(prediction_time)

    # Calculate total time for this iteration
    total_time = training_time + prediction_time
    total_times.append(total_time)

# Calculate the mean of training, prediction, and total times
mean_training_time = np.mean(training_times)
mean_prediction_time = np.mean(prediction_times)
mean_total_time = np.mean(total_times)

# Calculate precision, recall, and f1-score
predicted_labels = np.array(predicted_labels)
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the mean times and performance metrics
print(f"\nMean Training Time: {mean_training_time:.2f} ms")
print(f"Mean Prediction Time: {mean_prediction_time:.2f} ms")
print(f"Mean Total Time: {mean_total_time:.2f} ms")

print(f"\nPrecision (KNN): {precision_knn:.2f}")
print(f"Recall (KNN): {recall_knn:.2f}")
print(f"F1 Score (KNN): {f1_knn:.2f}")





Mean Training Time: 0.99 ms
Mean Prediction Time: 0.19 ms
Mean Total Time: 1.18 ms

Precision (KNN): 0.58
Recall (KNN): 0.58
F1 Score (KNN): 0.58


**prototype time calculation**

In [None]:
import os
import torch
import numpy as np
import time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
import random

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU if available
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "/content/prototype"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move tensor back to CPU
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Separate positive and negative classes
positive_files = [file for file in java_files if "prototype" in file]
negative_files = [file for file in java_files if "prototype" not in file]

# Randomly select the same number of positive examples as negative examples
num_negative = len(negative_files)
n_runs = 10  # Number of iterations for calculating mean times
k = 5  # Number of neighbors for KNN

training_times = []
prediction_times = []
total_times = []

for _ in range(n_runs):
    # Sample positive files and combine with negative ones
    # Separate positive and negative classes
  positive_files = [file for file in java_files if "prototype" in file]
  negative_files = [file for file in java_files if "prototype" not in file]

# Randomly select the same number of positive examples as negative examples
num_negative = len(negative_files)
num_positive = len(positive_files)

# Ensure the number of samples doesn't exceed the available examples
num_samples = min(num_positive, num_negative)

n_runs = 10  # Number of iterations for calculating mean times
k = 5  # Number of neighbors for KNN

training_times = []
prediction_times = []
total_times = []

for _ in range(n_runs):
    # Sample positive files and combine with negative ones
    positive_files_sampled = random.sample(positive_files, num_samples)
    negative_files_sampled = random.sample(negative_files, num_samples)  # Adjust this line

    selected_files = positive_files_sampled + negative_files_sampled
    random.shuffle(selected_files)

    # Store the embeddings of each program and define true labels
    program_embeddings = []
    true_labels = []

    for java_file in selected_files:
        label = 1 if "prototype" in java_file else 0  # 1 for positive, 0 for negative
        true_labels.append(label)

        with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
            code = file.read()

        # Calculate the embedding for the entire program
        program_embedding = get_program_embedding(code)
        program_embeddings.append(program_embedding)

    # Convert the list of embeddings to a NumPy array
    program_embeddings = np.array(program_embeddings)

    # Choose distance metric: 'cosine' or 'euclidean'
    distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

    # Start the timer for training time
    start_training_time = time.time()

    # Calculate the distance matrix based on the chosen metric
    if distance_metric == 'cosine':
        distance_matrix = cosine_distances(program_embeddings)
    elif distance_metric == 'euclidean':
        distance_matrix = euclidean_distances(program_embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {distance_metric}")

    # Get k-nearest neighbors indices for each program
    knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

    # End training time and record
    end_training_time = time.time()
    training_time = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
    training_times.append(training_time)

    # Start the timer for prediction time
    start_prediction_time = time.time()

    # Initialize an array to store predicted labels
    predicted_labels = []

    # Predict labels for each program based on the majority label of neighbors
    for indices in knn_indices:
        neighbor_labels = [true_labels[i] for i in indices]
        predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
        predicted_labels.append(predicted_label)

    # End prediction time and record
    end_prediction_time = time.time()
    prediction_time = (end_prediction_time - start_prediction_time) * 1000  # Convert to milliseconds
    prediction_times.append(prediction_time)

    # Calculate total time for this iteration
    total_time = training_time + prediction_time
    total_times.append(total_time)

# Calculate the mean of training, prediction, and total times
mean_training_time = np.mean(training_times)
mean_prediction_time = np.mean(prediction_times)
mean_total_time = np.mean(total_times)

# Calculate precision, recall, and f1-score
predicted_labels = np.array(predicted_labels)
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the mean times and performance metrics
print(f"\nMean Training Time: {mean_training_time:.2f} ms")
print(f"Mean Prediction Time: {mean_prediction_time:.2f} ms")
print(f"Mean Total Time: {mean_total_time:.2f} ms")

print(f"\nPrecision (KNN): {precision_knn:.2f}")
print(f"Recall (KNN): {recall_knn:.2f}")
print(f"F1 Score (KNN): {f1_knn:.2f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/880 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/871 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/469 [00:00<?, ?B/s]




Mean Training Time: 2.00 ms
Mean Prediction Time: 0.24 ms
Mean Total Time: 2.24 ms

Precision (KNN): 0.88
Recall (KNN): 0.88
F1 Score (KNN): 0.87


In [None]:
import os
import torch
import numpy as np
import time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
import random

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU if available
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "/content/prototype"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move tensor back to CPU
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Separate positive and negative classes
positive_files = [file for file in java_files if "prototype" in file]
negative_files = [file for file in java_files if "prototype" not in file]

# Ensure the number of samples doesn't exceed the available examples
num_samples = min(len(positive_files), len(negative_files))

n_runs = 10  # Number of iterations for calculating mean times
k = 5  # Number of neighbors for KNN

training_times = []
prediction_times = []
total_times = []

for _ in range(n_runs):
    # Sample positive and negative files
    positive_files_sampled = random.sample(positive_files, num_samples)
    negative_files_sampled = random.sample(negative_files, num_samples)

    selected_files = positive_files_sampled + negative_files_sampled
    random.shuffle(selected_files)

    # Store the embeddings of each program and define true labels
    program_embeddings = []
    true_labels = []

    start_training_time = time.time()  # Start timer for training

    # Generate embeddings for each program
    for java_file in selected_files:
        label = 1 if "prototype" in java_file else 0  # 1 for positive, 0 for negative
        true_labels.append(label)

        with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
            code = file.read()

        # Calculate the embedding for the entire program
        program_embedding = get_program_embedding(code)
        program_embeddings.append(program_embedding)

    # Convert the list of embeddings to a NumPy array
    program_embeddings = np.array(program_embeddings)

    # Choose distance metric: 'cosine' or 'euclidean'
    distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

    # Calculate the distance matrix based on the chosen metric
    if distance_metric == 'cosine':
        distance_matrix = cosine_distances(program_embeddings)
    elif distance_metric == 'euclidean':
        distance_matrix = euclidean_distances(program_embeddings)

    end_training_time = time.time()  # End timer for training
    training_time = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
    training_times.append(training_time)

    start_prediction_time = time.time()  # Start timer for prediction

    # Get k-nearest neighbors indices for each program
    knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

    # Initialize an array to store predicted labels
    predicted_labels = []

    # Predict labels for each program based on the majority label of neighbors
    for indices in knn_indices:
        neighbor_labels = [true_labels[i] for i in indices]
        predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
        predicted_labels.append(predicted_label)

    end_prediction_time = time.time()  # End timer for prediction
    prediction_time = (end_prediction_time - start_prediction_time) * 1000  # Convert to milliseconds
    prediction_times.append(prediction_time)

    # Calculate total time for this iteration
    total_time = training_time + prediction_time
    total_times.append(total_time)

# Calculate the mean of training, prediction, and total times
mean_training_time = np.mean(training_times)
mean_prediction_time = np.mean(prediction_times)
mean_total_time = np.mean(total_times)

# Calculate precision, recall, and f1-score
predicted_labels = np.array(predicted_labels)
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the mean times and performance metrics
print(f"\nMean Training Time (Embedding + KNN): {mean_training_time:.2f} ms")
print(f"Mean Prediction Time: {mean_prediction_time:.2f} ms")
print(f"Mean Total Time: {mean_total_time:.2f} ms")

print(f"\nPrecision (KNN): {precision_knn:.2f}")
print(f"Recall (KNN): {recall_knn:.2f}")
print(f"F1 Score (KNN): {f1_knn:.2f}")





Mean Training Time (Embedding + KNN): 98066.57 ms
Mean Prediction Time: 0.31 ms
Mean Total Time: 98066.88 ms

Precision (KNN): 0.82
Recall (KNN): 0.81
F1 Score (KNN): 0.81


In [None]:
import os
import torch
import numpy as np
import time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
import random

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU if available
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "/content/singleton"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move tensor back to CPU
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Separate positive and negative classes
positive_files = [file for file in java_files if "singleton" in file]
negative_files = [file for file in java_files if "singleton" not in file]

# Ensure the number of samples doesn't exceed the available examples
num_samples = min(len(positive_files), len(negative_files))

n_runs = 10  # Number of iterations for calculating mean times
k = 5  # Number of neighbors for KNN

training_times = []
prediction_times = []
total_times = []

for _ in range(n_runs):
    # Sample positive and negative files
    positive_files_sampled = random.sample(positive_files, num_samples)
    negative_files_sampled = random.sample(negative_files, num_samples)

    selected_files = positive_files_sampled + negative_files_sampled
    random.shuffle(selected_files)

    # Store the embeddings of each program and define true labels
    program_embeddings = []
    true_labels = []

    start_training_time = time.time()  # Start timer for training

    # Generate embeddings for each program
    for java_file in selected_files:
        label = 1 if "singleton" in java_file else 0  # 1 for positive, 0 for negative
        true_labels.append(label)

        with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
            code = file.read()

        # Calculate the embedding for the entire program
        program_embedding = get_program_embedding(code)
        program_embeddings.append(program_embedding)

    # Convert the list of embeddings to a NumPy array
    program_embeddings = np.array(program_embeddings)

    # Choose distance metric: 'cosine' or 'euclidean'
    distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

    # Calculate the distance matrix based on the chosen metric
    if distance_metric == 'cosine':
        distance_matrix = cosine_distances(program_embeddings)
    elif distance_metric == 'euclidean':
        distance_matrix = euclidean_distances(program_embeddings)

    end_training_time = time.time()  # End timer for training
    training_time = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
    training_times.append(training_time)

    start_prediction_time = time.time()  # Start timer for prediction

    # Get k-nearest neighbors indices for each program
    knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

    # Initialize an array to store predicted labels
    predicted_labels = []

    # Predict labels for each program based on the majority label of neighbors
    for indices in knn_indices:
        neighbor_labels = [true_labels[i] for i in indices]
        predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
        predicted_labels.append(predicted_label)

    end_prediction_time = time.time()  # End timer for prediction
    prediction_time = (end_prediction_time - start_prediction_time) * 1000  # Convert to milliseconds
    prediction_times.append(prediction_time)

    # Calculate total time for this iteration
    total_time = training_time + prediction_time
    total_times.append(total_time)

# Calculate the mean of training, prediction, and total times
mean_training_time = np.mean(training_times)
mean_prediction_time = np.mean(prediction_times)
mean_total_time = np.mean(total_times)

# Calculate precision, recall, and f1-score
predicted_labels = np.array(predicted_labels)
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the mean times and performance metrics
print(f"\nMean Training Time (Embedding + KNN): {mean_training_time:.2f} ms")
print(f"Mean Prediction Time: {mean_prediction_time:.2f} ms")
print(f"Mean Total Time: {mean_total_time:.2f} ms")

print(f"\nPrecision (KNN): {precision_knn:.2f}")
print(f"Recall (KNN): {recall_knn:.2f}")
print(f"F1 Score (KNN): {f1_knn:.2f}")



Mean Training Time (Embedding + KNN): 86958.03 ms
Mean Prediction Time: 0.40 ms
Mean Total Time: 86958.44 ms

Precision (KNN): 0.64
Recall (KNN): 0.64
F1 Score (KNN): 0.64


In [None]:
import os
import torch
import numpy as np
import time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
import random

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU if available
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "/content/singleton"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move tensor back to CPU
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Separate positive and negative classes
positive_files = [file for file in java_files if "singleton" in file]
negative_files = [file for file in java_files if "singleton" not in file]

# Ensure the number of samples doesn't exceed the available examples
num_samples = min(len(positive_files), len(negative_files))

n_runs = 10  # Number of iterations for calculating mean times
k = 5  # Number of neighbors for KNN

training_times = []
prediction_times = []
total_times = []

for _ in range(n_runs):
    # Sample positive and negative files
    positive_files_sampled = random.sample(positive_files, num_samples)
    negative_files_sampled = random.sample(negative_files, num_samples)

    selected_files = positive_files_sampled + negative_files_sampled
    random.shuffle(selected_files)

    # Store the embeddings of each program and define true labels
    program_embeddings = []
    true_labels = []

    start_training_time = time.time()  # Start timer for training

    # Generate embeddings for each program
    for java_file in selected_files:
        label = 1 if "singleton" in java_file else 0  # 1 for positive, 0 for negative
        true_labels.append(label)

        with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
            code = file.read()

        # Calculate the embedding for the entire program
        program_embedding = get_program_embedding(code)
        program_embeddings.append(program_embedding)

    # Convert the list of embeddings to a NumPy array
    program_embeddings = np.array(program_embeddings)

    # Choose distance metric: 'cosine' or 'euclidean'
    distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

    # Calculate the distance matrix based on the chosen metric
    if distance_metric == 'cosine':
        distance_matrix = cosine_distances(program_embeddings)
    elif distance_metric == 'euclidean':
        distance_matrix = euclidean_distances(program_embeddings)

    end_training_time = time.time()  # End timer for training
    training_time = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
    training_times.append(training_time)

    start_prediction_time = time.time()  # Start timer for prediction

    # Get k-nearest neighbors indices for each program
    knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

    # Initialize an array to store predicted labels
    predicted_labels = []

    # Predict labels for each program based on the majority label of neighbors
    for indices in knn_indices:
        neighbor_labels = [true_labels[i] for i in indices]
        predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
        predicted_labels.append(predicted_label)

    end_prediction_time = time.time()  # End timer for prediction
    prediction_time = (end_prediction_time - start_prediction_time) * 1000  # Convert to milliseconds
    prediction_times.append(prediction_time)

    # Calculate total time for this iteration
    total_time = training_time + prediction_time
    total_times.append(total_time)

# Calculate the mean of training, prediction, and total times
mean_training_time = np.mean(training_times)
mean_prediction_time = np.mean(prediction_times)
mean_total_time = np.mean(total_times)

# Calculate precision, recall, and f1-score
predicted_labels = np.array(predicted_labels)
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the mean times and performance metrics
print(f"\nMean Training Time (Embedding + KNN): {mean_training_time:.2f} ms")
print(f"Mean Prediction Time: {mean_prediction_time:.2f} ms")
print(f"Mean Total Time: {mean_total_time:.2f} ms")

print(f"\nPrecision (KNN): {precision_knn:.2f}")
print(f"Recall (KNN): {recall_knn:.2f}")
print(f"F1 Score (KNN): {f1_knn:.2f}")





Mean Training Time (Embedding + KNN): 86963.55 ms
Mean Prediction Time: 0.41 ms
Mean Total Time: 86963.96 ms

Precision (KNN): 0.64
Recall (KNN): 0.64
F1 Score (KNN): 0.64


In [None]:
import os
import torch
import numpy as np
import time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import random

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU if available
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directories containing your Java programs (train/test)
train_directory = "/content/singleton"  # Change to your dataset directory
unseen_directory = "/content/unseen"  # Directory for unseen examples

train_files = [file for file in os.listdir(train_directory) if file.endswith(".java")]
unseen_files = [file for file in os.listdir(unseen_directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
            with torch.no_grad():
                outputs = model(**inputs)
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move tensor back to CPU
            line_embeddings.append(line_embedding)
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Separate positive and negative classes
positive_files = [file for file in train_files if "singleton" in file]
negative_files = [file for file in train_files if "singleton" not in file]

num_samples = min(len(positive_files), len(negative_files))
n_runs = 3  # Number of iterations for calculating mean times
k = 5  # Number of neighbors for KNN

training_times = []
prediction_times = []
precision_scores = []
recall_scores = []
f1_scores = []

for _ in range(n_runs):
    # Sample positive and negative files
    positive_files_sampled = random.sample(positive_files, num_samples)
    negative_files_sampled = random.sample(negative_files, num_samples)

    selected_files = positive_files_sampled + negative_files_sampled
    random.shuffle(selected_files)

    # Store the embeddings of each program and define true labels
    program_embeddings = []
    true_labels = []

    start_training_time = time.time()  # Start timer for training

    for java_file in selected_files:
        label = 1 if "singleton" in java_file else 0  # 1 for positive, 0 for negative
        true_labels.append(label)
        with open(os.path.join(train_directory, java_file), "r", encoding="ISO-8859-1") as file:
            code = file.read()
        program_embedding = get_program_embedding(code)
        program_embeddings.append(program_embedding)

    program_embeddings = np.array(program_embeddings)

    # Train the k-NN model
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(program_embeddings, true_labels)

    end_training_time = time.time()  # End timer for training
    training_time = (end_training_time - start_training_time) * 1e6  # Convert to microseconds
    training_times.append(training_time)

    # Predict for the train files (optional: validation prediction during training)
    predicted_labels = knn_model.predict(program_embeddings)
    precision = precision_score(true_labels, predicted_labels, average='weighted')
    recall = recall_score(true_labels, predicted_labels, average='weighted')
    f1 = f1_score(true_labels, predicted_labels, average='weighted')

    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

    # Prediction for unseen examples
    start_prediction_time = time.time()  # Start timer for unseen prediction
    unseen_embeddings = []

    for unseen_file in unseen_files:
        with open(os.path.join(unseen_directory, unseen_file), "r", encoding="ISO-8859-1") as file:
            unseen_code = file.read()
        unseen_embedding = get_program_embedding(unseen_code)
        unseen_embeddings.append(unseen_embedding)

    unseen_embeddings = np.array(unseen_embeddings)
    unseen_predictions = knn_model.predict(unseen_embeddings)

    end_prediction_time = time.time()  # End timer for prediction
    prediction_time = (end_prediction_time - start_prediction_time) * 1e6  # Convert to microseconds
    prediction_times.append(prediction_time)

# Calculate mean and std for training and prediction times
mean_training_time = np.mean(training_times)
mean_prediction_time = np.mean(prediction_times)
std_training_time = np.std(training_times)
std_prediction_time = np.std(prediction_times)

# Calculate mean and std for precision, recall, and F1
mean_precision = np.mean(precision_scores)
mean_recall = np.mean(recall_scores)
mean_f1 = np.mean(f1_scores)

std_precision = np.std(precision_scores)
std_recall = np.std(recall_scores)
std_f1 = np.std(f1_scores)

# Print the results
print(f"Mean Training Time: {mean_training_time:.2f} µs (±{std_training_time:.2f} µs)")
print(f"Mean Prediction Time: {mean_prediction_time:.2f} µs (±{std_prediction_time:.2f} µs)")
print(f"\nMean Precision: {mean_precision:.2f} (±{std_precision:.2f})")
print(f"Mean Recall: {mean_recall:.2f} (±{std_recall:.2f})")
print(f"Mean F1 Score: {mean_f1:.2f} (±{std_f1:.2f})")


Mean Training Time: 879079682.59 µs (±54750488.66 µs)
Mean Prediction Time: 371080019.71 µs (±1371153.02 µs)

Mean Precision: 0.81 (±0.01)
Mean Recall: 0.79 (±0.01)
Mean F1 Score: 0.79 (±0.01)


In [None]:
import shutil

# Replace 'your_directory_path' with the path to the directory you want to delete
shutil.rmtree('/content/singleton')


**Builder, standard devation, trainig and prediction time**

In [None]:
import os
import torch
import numpy as np
import time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import random

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU if available
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directories containing your Java programs (train/test)
train_directory = "/content/builder"  # Change to your dataset directory
unseen_directory = "/content/unseen"  # Directory for unseen examples

train_files = [file for file in os.listdir(train_directory) if file.endswith(".java")]
unseen_files = [file for file in os.listdir(unseen_directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
            with torch.no_grad():
                outputs = model(**inputs)
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move tensor back to CPU
            line_embeddings.append(line_embedding)
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Separate positive and negative classes
positive_files = [file for file in train_files if "builder" in file]
negative_files = [file for file in train_files if "builder" not in file]

num_samples = min(len(positive_files), len(negative_files))
n_runs = 3  # Number of iterations for calculating mean times
k = 5  # Number of neighbors for KNN

training_times = []
prediction_times = []
precision_scores = []
recall_scores = []
f1_scores = []

for _ in range(n_runs):
    # Sample positive and negative files
    positive_files_sampled = random.sample(positive_files, num_samples)
    negative_files_sampled = random.sample(negative_files, num_samples)

    selected_files = positive_files_sampled + negative_files_sampled
    random.shuffle(selected_files)

    # Store the embeddings of each program and define true labels
    program_embeddings = []
    true_labels = []

    start_training_time = time.time()  # Start timer for training

    for java_file in selected_files:
        label = 1 if "builder" in java_file else 0  # 1 for positive, 0 for negative
        true_labels.append(label)
        with open(os.path.join(train_directory, java_file), "r", encoding="ISO-8859-1") as file:
            code = file.read()
        program_embedding = get_program_embedding(code)
        program_embeddings.append(program_embedding)

    program_embeddings = np.array(program_embeddings)

    # Train the k-NN model
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(program_embeddings, true_labels)

    end_training_time = time.time()  # End timer for training
    training_time = (end_training_time - start_training_time) * 1e6  # Convert to microseconds
    training_times.append(training_time)

    # Predict for the train files (optional: validation prediction during training)
    predicted_labels = knn_model.predict(program_embeddings)
    precision = precision_score(true_labels, predicted_labels, average='weighted')
    recall = recall_score(true_labels, predicted_labels, average='weighted')
    f1 = f1_score(true_labels, predicted_labels, average='weighted')

    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

    # Prediction for unseen examples
    start_prediction_time = time.time()  # Start timer for unseen prediction
    unseen_embeddings = []

    for unseen_file in unseen_files:
        with open(os.path.join(unseen_directory, unseen_file), "r", encoding="ISO-8859-1") as file:
            unseen_code = file.read()
        unseen_embedding = get_program_embedding(unseen_code)
        unseen_embeddings.append(unseen_embedding)

    unseen_embeddings = np.array(unseen_embeddings)
    unseen_predictions = knn_model.predict(unseen_embeddings)

    end_prediction_time = time.time()  # End timer for prediction
    prediction_time = (end_prediction_time - start_prediction_time) * 1e6  # Convert to microseconds
    prediction_times.append(prediction_time)

# Calculate mean and std for training and prediction times
mean_training_time = np.mean(training_times)
mean_prediction_time = np.mean(prediction_times)
std_training_time = np.std(training_times)
std_prediction_time = np.std(prediction_times)

# Calculate mean and std for precision, recall, and F1
mean_precision = np.mean(precision_scores)
mean_recall = np.mean(recall_scores)
mean_f1 = np.mean(f1_scores)

std_precision = np.std(precision_scores)
std_recall = np.std(recall_scores)
std_f1 = np.std(f1_scores)

# Print the results
print(f"Mean Training Time: {mean_training_time:.2f} µs (±{std_training_time:.2f} µs)")
print(f"Mean Prediction Time: {mean_prediction_time:.2f} µs (±{std_prediction_time:.2f} µs)")
print(f"\nMean Precision: {mean_precision:.2f} (±{std_precision:.2f})")
print(f"Mean Recall: {mean_recall:.2f} (±{std_recall:.2f})")
print(f"Mean F1 Score: {mean_f1:.2f} (±{std_f1:.2f})")




Mean Training Time: 383745267.31 µs (±17093668.05 µs)
Mean Prediction Time: 374819315.75 µs (±304521.64 µs)

Mean Precision: 0.82 (±0.06)
Mean Recall: 0.81 (±0.05)
Mean F1 Score: 0.81 (±0.05)


In [None]:
import shutil

# Replace 'your_directory_path' with the path to the directory you want to delete
shutil.rmtree('/content/builder')


**Abstract Factory, standard deviation, training and prediction time**

In [None]:
import os
import torch
import numpy as np
import time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import random

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU if available
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directories containing your Java programs (train/test)
train_directory = "/content/abstractfactory"  # Change to your dataset directory
unseen_directory = "/content/unseen"  # Directory for unseen examples

train_files = [file for file in os.listdir(train_directory) if file.endswith(".java")]
unseen_files = [file for file in os.listdir(unseen_directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
            with torch.no_grad():
                outputs = model(**inputs)
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move tensor back to CPU
            line_embeddings.append(line_embedding)
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Separate positive and negative classes
positive_files = [file for file in train_files if "abstractfactory" in file]
negative_files = [file for file in train_files if "abstractfactory" not in file]

num_samples = min(len(positive_files), len(negative_files))
n_runs = 10  # Number of iterations for calculating mean times
k = 5  # Number of neighbors for KNN

training_times = []
prediction_times = []
precision_scores = []
recall_scores = []
f1_scores = []

for _ in range(n_runs):
    # Sample positive and negative files
    positive_files_sampled = random.sample(positive_files, num_samples)
    negative_files_sampled = random.sample(negative_files, num_samples)

    selected_files = positive_files_sampled + negative_files_sampled
    random.shuffle(selected_files)

    # Store the embeddings of each program and define true labels
    program_embeddings = []
    true_labels = []

    start_training_time = time.time()  # Start timer for training

    for java_file in selected_files:
        label = 1 if "abstractfactory" in java_file else 0  # 1 for positive, 0 for negative
        true_labels.append(label)
        with open(os.path.join(train_directory, java_file), "r", encoding="ISO-8859-1") as file:
            code = file.read()
        program_embedding = get_program_embedding(code)
        program_embeddings.append(program_embedding)

    program_embeddings = np.array(program_embeddings)

    # Train the k-NN model
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(program_embeddings, true_labels)

    end_training_time = time.time()  # End timer for training
    training_time = (end_training_time - start_training_time) * 1e6  # Convert to microseconds
    training_times.append(training_time)

    # Predict for the train files (optional: validation prediction during training)
    predicted_labels = knn_model.predict(program_embeddings)
    precision = precision_score(true_labels, predicted_labels, average='weighted')
    recall = recall_score(true_labels, predicted_labels, average='weighted')
    f1 = f1_score(true_labels, predicted_labels, average='weighted')

    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

    # Prediction for unseen examples
    start_prediction_time = time.time()  # Start timer for unseen prediction
    unseen_embeddings = []

    for unseen_file in unseen_files:
        with open(os.path.join(unseen_directory, unseen_file), "r", encoding="ISO-8859-1") as file:
            unseen_code = file.read()
        unseen_embedding = get_program_embedding(unseen_code)
        unseen_embeddings.append(unseen_embedding)

    unseen_embeddings = np.array(unseen_embeddings)
    unseen_predictions = knn_model.predict(unseen_embeddings)

    end_prediction_time = time.time()  # End timer for prediction
    prediction_time = (end_prediction_time - start_prediction_time) * 1e6  # Convert to microseconds
    prediction_times.append(prediction_time)

# Calculate mean and std for training and prediction times
mean_training_time = np.mean(training_times)
mean_prediction_time = np.mean(prediction_times)
std_training_time = np.std(training_times)
std_prediction_time = np.std(prediction_times)

# Calculate mean and std for precision, recall, and F1
mean_precision = np.mean(precision_scores)
mean_recall = np.mean(recall_scores)
mean_f1 = np.mean(f1_scores)

std_precision = np.std(precision_scores)
std_recall = np.std(recall_scores)
std_f1 = np.std(f1_scores)

# Print the results
print(f"Mean Training Time: {mean_training_time:.2f} µs (±{std_training_time:.2f} µs)")
print(f"Mean Prediction Time: {mean_prediction_time:.2f} µs (±{std_prediction_time:.2f} µs)")
print(f"\nMean Precision: {mean_precision:.2f} (±{std_precision:.2f})")
print(f"Mean Recall: {mean_recall:.2f} (±{std_recall:.2f})")
print(f"Mean F1 Score: {mean_f1:.2f} (±{std_f1:.2f})")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/880 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/871 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/469 [00:00<?, ?B/s]



Mean Training Time: 41361883.04 µs (±3487329.56 µs)
Mean Prediction Time: 27230329.75 µs (±278788.76 µs)

Mean Precision: 0.93 (±0.02)
Mean Recall: 0.91 (±0.03)
Mean F1 Score: 0.91 (±0.03)


In [None]:
import shutil

# Replace 'your_directory_path' with the path to the directory you want to delete
shutil.rmtree('/content/abstractfactory')


**Factory method, standard deviation, training, prediction time**

In [None]:
import os
import torch
import numpy as np
import time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import random

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU if available
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directories containing your Java programs (train/test)
train_directory = "/content/factorymethod"  # Change to your dataset directory
unseen_directory = "/content/unseen"  # Directory for unseen examples

train_files = [file for file in os.listdir(train_directory) if file.endswith(".java")]
unseen_files = [file for file in os.listdir(unseen_directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
            with torch.no_grad():
                outputs = model(**inputs)
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move tensor back to CPU
            line_embeddings.append(line_embedding)
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Separate positive and negative classes
positive_files = [file for file in train_files if "factorymethod" in file]
negative_files = [file for file in train_files if "factorymethod" not in file]

num_samples = min(len(positive_files), len(negative_files))
n_runs = 10  # Number of iterations for calculating mean times
k = 5  # Number of neighbors for KNN

training_times = []
prediction_times = []
precision_scores = []
recall_scores = []
f1_scores = []

for _ in range(n_runs):
    # Sample positive and negative files
    positive_files_sampled = random.sample(positive_files, num_samples)
    negative_files_sampled = random.sample(negative_files, num_samples)

    selected_files = positive_files_sampled + negative_files_sampled
    random.shuffle(selected_files)

    # Store the embeddings of each program and define true labels
    program_embeddings = []
    true_labels = []

    start_training_time = time.time()  # Start timer for training

    for java_file in selected_files:
        label = 1 if "factorymethod" in java_file else 0  # 1 for positive, 0 for negative
        true_labels.append(label)
        with open(os.path.join(train_directory, java_file), "r", encoding="ISO-8859-1") as file:
            code = file.read()
        program_embedding = get_program_embedding(code)
        program_embeddings.append(program_embedding)

    program_embeddings = np.array(program_embeddings)

    # Train the k-NN model
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(program_embeddings, true_labels)

    end_training_time = time.time()  # End timer for training
    training_time = (end_training_time - start_training_time) * 1e6  # Convert to microseconds
    training_times.append(training_time)

    # Predict for the train files (optional: validation prediction during training)
    predicted_labels = knn_model.predict(program_embeddings)
    precision = precision_score(true_labels, predicted_labels, average='weighted')
    recall = recall_score(true_labels, predicted_labels, average='weighted')
    f1 = f1_score(true_labels, predicted_labels, average='weighted')

    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

    # Prediction for unseen examples
    start_prediction_time = time.time()  # Start timer for unseen prediction
    unseen_embeddings = []

    for unseen_file in unseen_files:
        with open(os.path.join(unseen_directory, unseen_file), "r", encoding="ISO-8859-1") as file:
            unseen_code = file.read()
        unseen_embedding = get_program_embedding(unseen_code)
        unseen_embeddings.append(unseen_embedding)

    unseen_embeddings = np.array(unseen_embeddings)
    unseen_predictions = knn_model.predict(unseen_embeddings)

    end_prediction_time = time.time()  # End timer for prediction
    prediction_time = (end_prediction_time - start_prediction_time) * 1e6  # Convert to microseconds
    prediction_times.append(prediction_time)

# Calculate mean and std for training and prediction times
mean_training_time = np.mean(training_times)
mean_prediction_time = np.mean(prediction_times)
std_training_time = np.std(training_times)
std_prediction_time = np.std(prediction_times)

# Calculate mean and std for precision, recall, and F1
mean_precision = np.mean(precision_scores)
mean_recall = np.mean(recall_scores)
mean_f1 = np.mean(f1_scores)

std_precision = np.std(precision_scores)
std_recall = np.std(recall_scores)
std_f1 = np.std(f1_scores)

# Print the results
print(f"Mean Training Time: {mean_training_time:.2f} µs (±{std_training_time:.2f} µs)")
print(f"Mean Prediction Time: {mean_prediction_time:.2f} µs (±{std_prediction_time:.2f} µs)")
print(f"\nMean Precision: {mean_precision:.2f} (±{std_precision:.2f})")
print(f"Mean Recall: {mean_recall:.2f} (±{std_recall:.2f})")
print(f"Mean F1 Score: {mean_f1:.2f} (±{std_f1:.2f})")




Mean Training Time: 39734963.35 µs (±5854448.09 µs)
Mean Prediction Time: 31581929.37 µs (±2720914.01 µs)

Mean Precision: 0.78 (±0.07)
Mean Recall: 0.77 (±0.07)
Mean F1 Score: 0.77 (±0.07)


In [None]:
import shutil

# Replace 'your_directory_path' with the path to the directory you want to delete
shutil.rmtree('/content/factorymethod')


**Prototype, standard deviation, training and prediction time**

In [None]:
import os
import torch
import numpy as np
import time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import random

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU if available
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directories containing your Java programs (train/test)
train_directory = "/content/prototype"  # Change to your dataset directory
unseen_directory = "/content/unseen"  # Directory for unseen examples

train_files = [file for file in os.listdir(train_directory) if file.endswith(".java")]
unseen_files = [file for file in os.listdir(unseen_directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
            with torch.no_grad():
                outputs = model(**inputs)
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move tensor back to CPU
            line_embeddings.append(line_embedding)
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Separate positive and negative classes
positive_files = [file for file in train_files if "prototype" in file]
negative_files = [file for file in train_files if "prototype" not in file]

num_samples = min(len(positive_files), len(negative_files))
n_runs = 10  # Number of iterations for calculating mean times
k = 5  # Number of neighbors for KNN

training_times = []
prediction_times = []
precision_scores = []
recall_scores = []
f1_scores = []

for _ in range(n_runs):
    # Sample positive and negative files
    positive_files_sampled = random.sample(positive_files, num_samples)
    negative_files_sampled = random.sample(negative_files, num_samples)

    selected_files = positive_files_sampled + negative_files_sampled
    random.shuffle(selected_files)

    # Store the embeddings of each program and define true labels
    program_embeddings = []
    true_labels = []

    start_training_time = time.time()  # Start timer for training

    for java_file in selected_files:
        label = 1 if "prototype" in java_file else 0  # 1 for positive, 0 for negative
        true_labels.append(label)
        with open(os.path.join(train_directory, java_file), "r", encoding="ISO-8859-1") as file:
            code = file.read()
        program_embedding = get_program_embedding(code)
        program_embeddings.append(program_embedding)

    program_embeddings = np.array(program_embeddings)

    # Train the k-NN model
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(program_embeddings, true_labels)

    end_training_time = time.time()  # End timer for training
    training_time = (end_training_time - start_training_time) * 1e6  # Convert to microseconds
    training_times.append(training_time)

    # Predict for the train files (optional: validation prediction during training)
    predicted_labels = knn_model.predict(program_embeddings)
    precision = precision_score(true_labels, predicted_labels, average='weighted')
    recall = recall_score(true_labels, predicted_labels, average='weighted')
    f1 = f1_score(true_labels, predicted_labels, average='weighted')

    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

    # Prediction for unseen examples
    start_prediction_time = time.time()  # Start timer for unseen prediction
    unseen_embeddings = []

    for unseen_file in unseen_files:
        with open(os.path.join(unseen_directory, unseen_file), "r", encoding="ISO-8859-1") as file:
            unseen_code = file.read()
        unseen_embedding = get_program_embedding(unseen_code)
        unseen_embeddings.append(unseen_embedding)

    unseen_embeddings = np.array(unseen_embeddings)
    unseen_predictions = knn_model.predict(unseen_embeddings)

    end_prediction_time = time.time()  # End timer for prediction
    prediction_time = (end_prediction_time - start_prediction_time) * 1e6  # Convert to microseconds
    prediction_times.append(prediction_time)

# Calculate mean and std for training and prediction times
mean_training_time = np.mean(training_times)
mean_prediction_time = np.mean(prediction_times)
std_training_time = np.std(training_times)
std_prediction_time = np.std(prediction_times)

# Calculate mean and std for precision, recall, and F1
mean_precision = np.mean(precision_scores)
mean_recall = np.mean(recall_scores)
mean_f1 = np.mean(f1_scores)

std_precision = np.std(precision_scores)
std_recall = np.std(recall_scores)
std_f1 = np.std(f1_scores)

# Print the results
print(f"Mean Training Time: {mean_training_time:.2f} µs (±{std_training_time:.2f} µs)")
print(f"Mean Prediction Time: {mean_prediction_time:.2f} µs (±{std_prediction_time:.2f} µs)")
print(f"\nMean Precision: {mean_precision:.2f} (±{std_precision:.2f})")
print(f"Mean Recall: {mean_recall:.2f} (±{std_recall:.2f})")
print(f"Mean F1 Score: {mean_f1:.2f} (±{std_f1:.2f})")




Mean Training Time: 90519440.17 µs (±7306010.01 µs)
Mean Prediction Time: 29134436.46 µs (±333912.74 µs)

Mean Precision: 0.92 (±0.02)
Mean Recall: 0.92 (±0.02)
Mean F1 Score: 0.92 (±0.02)
