In [1]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "singleton"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "singleton" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r") as file:
        code = file.read()

    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nons (54).java, Predicted Label: 0, True Label: 0
File: nons (29).java, Predicted Label: 0, True Label: 0
File: nons (25).java, Predicted Label: 1, True Label: 0
File: singleton (25).java, Predicted Label: 0, True Label: 1
File: nons (27).java, Predicted Label: 1, True Label: 0
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (34).java, Predicted Label: 0, True Label: 0
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (61).java, Predicted Label: 1, True Label: 0
File: nons (68).java, Predicted Label: 1, True Label: 0
File: singleton (9).java, Predicted Label: 1, True Label: 1
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: singleton (14).java, Predicted Label: 1, True Label: 1
File: nons (1).java, Predicted Label: 0, True Label: 0
File: nons (30).java, Predicted Label: 1, True Label: 0
File: singleton (1).java, Predicted Label: 0, True Label: 1
File: singleton (3).ja

In [1]:
#Singleton with different settings

In [2]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "singleton"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "singleton" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r") as file:
        code = file.read()

    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nons (54).java, Predicted Label: 0, True Label: 0
File: singleton (25).java, Predicted Label: 0, True Label: 1
File: nons (27).java, Predicted Label: 1, True Label: 0
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (19).java, Predicted Label: 1, True Label: 0
File: nons (34).java, Predicted Label: 0, True Label: 0
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: singleton (9).java, Predicted Label: 1, True Label: 1
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: singleton (14).java, Predicted Label: 1, True Label: 1
File: nons (33).java, Predicted Label: 0, True Label: 0
File: nons (3).java, Predicted Label: 0, True Label: 0
File: nons (30).java, Predicted Label: 0, True Label: 0
File: singleton (1).java, Predicted Label: 0, True Label: 1
File: singleton (3).java, Predicted Label: 0, True Label: 1
File: nons (53).java, Predicted Label: 1, True Label: 0
File: singleton (2

In [None]:
#Singleton with different settings

In [3]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "singleton"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "singleton" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r") as file:
        code = file.read()

    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nons (50).java, Predicted Label: 1, True Label: 0
File: nons (54).java, Predicted Label: 0, True Label: 0
File: singleton (25).java, Predicted Label: 0, True Label: 1
File: nons (27).java, Predicted Label: 1, True Label: 0
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (19).java, Predicted Label: 1, True Label: 0
File: nons (34).java, Predicted Label: 0, True Label: 0
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (4).java, Predicted Label: 1, True Label: 0
File: singleton (9).java, Predicted Label: 1, True Label: 1
File: nons (44).java, Predicted Label: 0, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: singleton (14).java, Predicted Label: 1, True Label: 1
File: nons (33).java, Predicted Label: 0, True Label: 0
File: nons (3).java, Predicted Label: 0, True Label: 0
File: nons (30).java, Predicted Label: 0, True Label: 0
File: singleton (1).java, P

In [None]:
#Singleton with different settings

In [4]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "singleton"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "singleton" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r") as file:
        code = file.read()

    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nons (50).java, Predicted Label: 1, True Label: 0
File: nons (29).java, Predicted Label: 0, True Label: 0
File: nons (25).java, Predicted Label: 1, True Label: 0
File: nons (18).java, Predicted Label: 1, True Label: 0
File: singleton (25).java, Predicted Label: 1, True Label: 1
File: nons (42).java, Predicted Label: 1, True Label: 0
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (55).java, Predicted Label: 0, True Label: 0
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (49).java, Predicted Label: 0, True Label: 0
File: singleton (9).java, Predicted Label: 1, True Label: 1
File: nons (7).java, Predicted Label: 1, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: singleton (14).java, Predicted Label: 1, True Label: 1
File: nons (41).java, Predicted Label: 0, True Label: 0
File: nons (1).java, Predicted Label: 0, True Label: 0
File: nons (30).java, Predi

In [None]:
#Singleton with different settings

In [5]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "singleton"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "singleton" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r") as file:
        code = file.read()

    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nons (12).java, Predicted Label: 0, True Label: 0
File: nons (50).java, Predicted Label: 1, True Label: 0
File: singleton (25).java, Predicted Label: 0, True Label: 1
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (4).java, Predicted Label: 1, True Label: 0
File: singleton (9).java, Predicted Label: 1, True Label: 1
File: nons (44).java, Predicted Label: 0, True Label: 0
File: nons (38).java, Predicted Label: 0, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: singleton (14).java, Predicted Label: 1, True Label: 1
File: nons (41).java, Predicted Label: 0, True Label: 0
File: singleton (1).java, Predicted Label: 0, True Label: 1
File: nons (22).java, Predicted Label: 1, True Label: 0
File: singleton (3).java, Predicted Label: 1, True Label: 1
File: singleton (23).java, Predicted Label: 1, True Label: 1
File: singlet

In [None]:
#Singleton with different settings

In [10]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "singleton"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "singleton" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nons (54).java, Predicted Label: 0, True Label: 0
File: nons (25).java, Predicted Label: 0, True Label: 0
File: singleton (25).java, Predicted Label: 0, True Label: 1
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (19).java, Predicted Label: 1, True Label: 0
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (49).java, Predicted Label: 0, True Label: 0
File: nons (61).java, Predicted Label: 0, True Label: 0
File: singleton (9).java, Predicted Label: 0, True Label: 1
File: nons (7).java, Predicted Label: 0, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: singleton (14).java, Predicted Label: 1, True Label: 1
File: nons (41).java, Predicted Label: 0, True Label: 0
File: nons (30).java, Predicted Label: 1, True Label: 0
File: singleton (1).java, Predicted Label: 0, True Label: 1
File: nons (22).java, Predicted Label: 0, True Label: 0
File: singleton (3).ja

In [None]:
#Builder with different settings

In [11]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "builder"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "builder" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nonb (14).java, Predicted Label: 0, True Label: 0
File: builder (2).java, Predicted Label: 1, True Label: 1
File: nonb (104).java, Predicted Label: 0, True Label: 0
File: nonb (102).java, Predicted Label: 0, True Label: 0
File: builder (5).java, Predicted Label: 1, True Label: 1
File: builder (6).java, Predicted Label: 1, True Label: 1
File: nonb (110).java, Predicted Label: 0, True Label: 0
File: builder (3).java, Predicted Label: 0, True Label: 1
File: nonb (86).java, Predicted Label: 0, True Label: 0
File: nonb (81).java, Predicted Label: 0, True Label: 0
File: nonb (107).java, Predicted Label: 0, True Label: 0
File: builder (7).java, Predicted Label: 1, True Label: 1
File: nonb (92).java, Predicted Label: 0, True Label: 0
File: builder (9).java, Predicted Label: 1, True Label: 1
File: nonb (116).java, Predicted Label: 1, True Label: 0
File: nonb (38).java, Predicted Label: 0, True Label: 0
File: builder (8).java, Predicted Lab

In [None]:
#Builder with different settings

In [12]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "builder"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "builder" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: builder (2).java, Predicted Label: 0, True Label: 1
File: nonb (121).java, Predicted Label: 0, True Label: 0
File: nonb (21).java, Predicted Label: 1, True Label: 0
File: builder (5).java, Predicted Label: 1, True Label: 1
File: builder (6).java, Predicted Label: 1, True Label: 1
File: builder (3).java, Predicted Label: 0, True Label: 1
File: nonb (85).java, Predicted Label: 0, True Label: 0
File: nonb (120).java, Predicted Label: 0, True Label: 0
File: nonb (107).java, Predicted Label: 1, True Label: 0
File: builder (7).java, Predicted Label: 1, True Label: 1
File: nonb (80).java, Predicted Label: 0, True Label: 0
File: builder (9).java, Predicted Label: 1, True Label: 1
File: nonb (15).java, Predicted Label: 0, True Label: 0
File: nonb (101).java, Predicted Label: 0, True Label: 0
File: nonb (129).java, Predicted Label: 0, True Label: 0
File: nonb (42).java, Predicted Label: 0, True Label: 0
File: nonb (16).java, Predicted Label

In [13]:
#Builder with different settings

In [15]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "builder"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "builder" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nonb (14).java, Predicted Label: 0, True Label: 0
File: builder (2).java, Predicted Label: 1, True Label: 1
File: builder (5).java, Predicted Label: 1, True Label: 1
File: builder (6).java, Predicted Label: 1, True Label: 1
File: nonb (110).java, Predicted Label: 0, True Label: 0
File: nonb (18).java, Predicted Label: 0, True Label: 0
File: nonb (17).java, Predicted Label: 0, True Label: 0
File: builder (3).java, Predicted Label: 1, True Label: 1
File: builder (7).java, Predicted Label: 1, True Label: 1
File: nonb (92).java, Predicted Label: 1, True Label: 0
File: builder (9).java, Predicted Label: 1, True Label: 1
File: nonb (13).java, Predicted Label: 0, True Label: 0
File: nonb (119).java, Predicted Label: 1, True Label: 0
File: builder (8).java, Predicted Label: 1, True Label: 1
File: nonb (23).java, Predicted Label: 0, True Label: 0
File: builder (1).java, Predicted Label: 0, True Label: 1
File: builder (4).java, Predicted La

In [2]:
#Builder with different settings

In [4]:
#Builder with different settings

In [None]:
#Factory Method with different settings

In [16]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "factorymethod"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "factorymethod" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nonfm (6).java, Predicted Label: 0, True Label: 0
File: nonfm (5).java, Predicted Label: 0, True Label: 0
File: nonfm (13).java, Predicted Label: 0, True Label: 0
File: factorymethod (1).java, Predicted Label: 1, True Label: 1
File: factorymethod (9).java, Predicted Label: 1, True Label: 1
File: nonfm (12).java, Predicted Label: 0, True Label: 0
File: nonfm (4).java, Predicted Label: 0, True Label: 0
File: factorymethod (5).java, Predicted Label: 1, True Label: 1
File: nonfm (2).java, Predicted Label: 0, True Label: 0
File: factorymethod (2).java, Predicted Label: 1, True Label: 1
File: nonfm (18).java, Predicted Label: 0, True Label: 0
File: nonfm (1).java, Predicted Label: 0, True Label: 0
File: nonfm (8).java, Predicted Label: 0, True Label: 0
File: factorymethod (10).java, Predicted Label: 1, True Label: 1
File: nonfm (11).java, Predicted Label: 0, True Label: 0
File: factorymethod (7).java, Predicted Label: 0, True Label: 1
F

In [6]:
#Builder with different settings

In [7]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "builder"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "builder" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: builder (2).java, Predicted Label: 1, True Label: 1
File: nonb (89).java, Predicted Label: 0, True Label: 0
File: nonb (28).java, Predicted Label: 0, True Label: 0
File: nonb (36).java, Predicted Label: 0, True Label: 0
File: builder (5).java, Predicted Label: 1, True Label: 1
File: builder (6).java, Predicted Label: 1, True Label: 1
File: nonb (44).java, Predicted Label: 0, True Label: 0
File: builder (3).java, Predicted Label: 0, True Label: 1
File: nonb (87).java, Predicted Label: 0, True Label: 0
File: nonb (120).java, Predicted Label: 0, True Label: 0
File: nonb (69).java, Predicted Label: 0, True Label: 0
File: builder (7).java, Predicted Label: 1, True Label: 1
File: builder (9).java, Predicted Label: 1, True Label: 1
File: nonb (129).java, Predicted Label: 0, True Label: 0
File: nonb (38).java, Predicted Label: 0, True Label: 0
File: nonb (114).java, Predicted Label: 0, True Label: 0
File: builder (8).java, Predicted Label

In [None]:
#Factory Method with different settings

In [17]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "factorymethod"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "factorymethod" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nonfm (6).java, Predicted Label: 0, True Label: 0
File: nonfm (5).java, Predicted Label: 0, True Label: 0
File: factorymethod (1).java, Predicted Label: 1, True Label: 1
File: factorymethod (9).java, Predicted Label: 1, True Label: 1
File: nonfm (12).java, Predicted Label: 1, True Label: 0
File: nonfm (4).java, Predicted Label: 0, True Label: 0
File: factorymethod (5).java, Predicted Label: 1, True Label: 1
File: nonfm (2).java, Predicted Label: 0, True Label: 0
File: factorymethod (2).java, Predicted Label: 1, True Label: 1
File: nonfm (1).java, Predicted Label: 0, True Label: 0
File: nonfm (8).java, Predicted Label: 0, True Label: 0
File: factorymethod (10).java, Predicted Label: 1, True Label: 1
File: nonfm (11).java, Predicted Label: 1, True Label: 0
File: factorymethod (7).java, Predicted Label: 0, True Label: 1
File: factorymethod (4).java, Predicted Label: 0, True Label: 1
File: factorymethod (3).java, Predicted Label: 1, T

In [None]:
#Factory Method with different settings

In [18]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "factorymethod"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "factorymethod" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nonfm (38).java, Predicted Label: 0, True Label: 0
File: nonfm (13).java, Predicted Label: 0, True Label: 0
File: nonfm (29).java, Predicted Label: 0, True Label: 0
File: factorymethod (1).java, Predicted Label: 1, True Label: 1
File: factorymethod (9).java, Predicted Label: 1, True Label: 1
File: nonfm (24).java, Predicted Label: 0, True Label: 0
File: factorymethod (5).java, Predicted Label: 1, True Label: 1
File: nonfm (2).java, Predicted Label: 0, True Label: 0
File: factorymethod (2).java, Predicted Label: 1, True Label: 1
File: nonfm (1).java, Predicted Label: 0, True Label: 0
File: factorymethod (10).java, Predicted Label: 1, True Label: 1
File: factorymethod (7).java, Predicted Label: 0, True Label: 1
File: nonfm (17).java, Predicted Label: 0, True Label: 0
File: factorymethod (4).java, Predicted Label: 0, True Label: 1
File: factorymethod (3).java, Predicted Label: 1, True Label: 1
File: nonfm (34).java, Predicted Label: 

In [8]:
#Factory Method with different settings

In [9]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "factorymethod"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "factorymethod" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nonfm (77).java, Predicted Label: 0, True Label: 0
File: nonfm (80).java, Predicted Label: 0, True Label: 0
File: factorymethod (1).java, Predicted Label: 0, True Label: 1
File: factorymethod (9).java, Predicted Label: 1, True Label: 1
File: nonfm (79).java, Predicted Label: 0, True Label: 0
File: factorymethod (5).java, Predicted Label: 1, True Label: 1
File: factorymethod (2).java, Predicted Label: 1, True Label: 1
File: nonfm (75).java, Predicted Label: 0, True Label: 0
File: nonfm (72).java, Predicted Label: 0, True Label: 0
File: factorymethod (10).java, Predicted Label: 1, True Label: 1
File: factorymethod (7).java, Predicted Label: 0, True Label: 1
File: nonfm (78).java, Predicted Label: 0, True Label: 0
File: factorymethod (4).java, Predicted Label: 1, True Label: 1
File: nonfm (82).java, Predicted Label: 1, True Label: 0
File: nonfm (74).java, Predicted Label: 0, True Label: 0
File: factorymethod (3).java, Predicted Label

In [None]:
#Factory Method with different settings

In [10]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "factorymethod"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "factorymethod" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nonfm (46).java, Predicted Label: 1, True Label: 0
File: nonfm (37).java, Predicted Label: 0, True Label: 0
File: nonfm (80).java, Predicted Label: 1, True Label: 0
File: factorymethod (1).java, Predicted Label: 0, True Label: 1
File: factorymethod (9).java, Predicted Label: 1, True Label: 1
File: nonfm (79).java, Predicted Label: 0, True Label: 0
File: nonfm (4).java, Predicted Label: 0, True Label: 0
File: factorymethod (5).java, Predicted Label: 1, True Label: 1
File: factorymethod (2).java, Predicted Label: 1, True Label: 1
File: factorymethod (10).java, Predicted Label: 1, True Label: 1
File: factorymethod (7).java, Predicted Label: 0, True Label: 1
File: nonfm (23).java, Predicted Label: 0, True Label: 0
File: nonfm (26).java, Predicted Label: 1, True Label: 0
File: nonfm (33).java, Predicted Label: 1, True Label: 0
File: nonfm (17).java, Predicted Label: 0, True Label: 0
File: factorymethod (4).java, Predicted Label: 0, Tru

In [None]:
#Prototype with different settings

In [19]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "prototype"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "prototype" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: prototype (27).java, Predicted Label: 1, True Label: 1
File: nonp (53).java, Predicted Label: 0, True Label: 0
File: nonp (23).java, Predicted Label: 0, True Label: 0
File: nonp (43).java, Predicted Label: 0, True Label: 0
File: nonp (35).java, Predicted Label: 0, True Label: 0
File: nonp (51).java, Predicted Label: 0, True Label: 0
File: nonp (59).java, Predicted Label: 0, True Label: 0
File: nonp (5).java, Predicted Label: 0, True Label: 0
File: nonp (31).java, Predicted Label: 0, True Label: 0
File: prototype (13).java, Predicted Label: 1, True Label: 1
File: prototype (22).java, Predicted Label: 0, True Label: 1
File: nonp (7).java, Predicted Label: 0, True Label: 0
File: nonp (13).java, Predicted Label: 0, True Label: 0
File: nonp (29).java, Predicted Label: 0, True Label: 0
File: prototype (15).java, Predicted Label: 1, True Label: 1
File: nonp (44).java, Predicted Label: 0, True Label: 0
File: nonp (8).java, Predicted Label

In [None]:
#Prototype with different settings

In [20]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "prototype"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "prototype" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: prototype (27).java, Predicted Label: 1, True Label: 1
File: nonp (23).java, Predicted Label: 0, True Label: 0
File: nonp (35).java, Predicted Label: 0, True Label: 0
File: nonp (5).java, Predicted Label: 0, True Label: 0
File: nonp (31).java, Predicted Label: 0, True Label: 0
File: prototype (13).java, Predicted Label: 1, True Label: 1
File: prototype (22).java, Predicted Label: 0, True Label: 1
File: nonp (7).java, Predicted Label: 0, True Label: 0
File: nonp (13).java, Predicted Label: 0, True Label: 0
File: nonp (29).java, Predicted Label: 0, True Label: 0
File: prototype (15).java, Predicted Label: 1, True Label: 1
File: nonp (8).java, Predicted Label: 0, True Label: 0
File: nonp (37).java, Predicted Label: 0, True Label: 0
File: nonp (21).java, Predicted Label: 0, True Label: 0
File: nonp (36).java, Predicted Label: 0, True Label: 0
File: prototype (14).java, Predicted Label: 1, True Label: 1
File: nonp (28).java, Predicted 

In [None]:
#Prototype with different settings

In [21]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "prototype"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "prototype" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: prototype (27).java, Predicted Label: 1, True Label: 1
File: nonp (53).java, Predicted Label: 0, True Label: 0
File: nonp (23).java, Predicted Label: 0, True Label: 0
File: nonp (31).java, Predicted Label: 0, True Label: 0
File: prototype (13).java, Predicted Label: 1, True Label: 1
File: prototype (22).java, Predicted Label: 0, True Label: 1
File: prototype (15).java, Predicted Label: 1, True Label: 1
File: nonp (8).java, Predicted Label: 0, True Label: 0
File: nonp (37).java, Predicted Label: 1, True Label: 0
File: nonp (21).java, Predicted Label: 0, True Label: 0
File: nonp (58).java, Predicted Label: 1, True Label: 0
File: prototype (14).java, Predicted Label: 0, True Label: 1
File: nonp (28).java, Predicted Label: 0, True Label: 0
File: prototype (16).java, Predicted Label: 1, True Label: 1
File: prototype (18).java, Predicted Label: 0, True Label: 1
File: prototype (32).java, Predicted Label: 1, True Label: 1
File: prototype

In [None]:
#Prototype with different settings

In [22]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "prototype"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "prototype" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: prototype (27).java, Predicted Label: 1, True Label: 1
File: nonp (43).java, Predicted Label: 0, True Label: 0
File: nonp (35).java, Predicted Label: 0, True Label: 0
File: nonp (51).java, Predicted Label: 0, True Label: 0
File: nonp (5).java, Predicted Label: 0, True Label: 0
File: prototype (13).java, Predicted Label: 1, True Label: 1
File: prototype (22).java, Predicted Label: 0, True Label: 1
File: nonp (7).java, Predicted Label: 0, True Label: 0
File: nonp (13).java, Predicted Label: 0, True Label: 0
File: nonp (29).java, Predicted Label: 0, True Label: 0
File: prototype (15).java, Predicted Label: 1, True Label: 1
File: nonp (37).java, Predicted Label: 1, True Label: 0
File: nonp (21).java, Predicted Label: 0, True Label: 0
File: prototype (14).java, Predicted Label: 1, True Label: 1
File: prototype (16).java, Predicted Label: 1, True Label: 1
File: prototype (18).java, Predicted Label: 0, True Label: 1
File: nonp (22).java,

In [None]:
#Prototype with different settings

In [1]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "prototype"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "prototype" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: prototype (27).java, Predicted Label: 1, True Label: 1
File: nonp (23).java, Predicted Label: 0, True Label: 0
File: nonp (5).java, Predicted Label: 0, True Label: 0
File: nonp (31).java, Predicted Label: 0, True Label: 0
File: prototype (13).java, Predicted Label: 1, True Label: 1
File: prototype (22).java, Predicted Label: 0, True Label: 1
File: nonp (7).java, Predicted Label: 0, True Label: 0
File: nonp (13).java, Predicted Label: 0, True Label: 0
File: nonp (29).java, Predicted Label: 0, True Label: 0
File: prototype (15).java, Predicted Label: 1, True Label: 1
File: nonp (8).java, Predicted Label: 0, True Label: 0
File: nonp (21).java, Predicted Label: 0, True Label: 0
File: prototype (14).java, Predicted Label: 1, True Label: 1
File: nonp (28).java, Predicted Label: 0, True Label: 0
File: prototype (16).java, Predicted Label: 1, True Label: 1
File: prototype (18).java, Predicted Label: 0, True Label: 1
File: nonp (22).java, 

In [None]:
#Abstract Factory with different settings

In [23]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "abstractfactory"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "abstractfactory" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nonab (22).java, Predicted Label: 0, True Label: 0
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: nonab (19).java, Predicted Label: 0, True Label: 0
File: nonab (30).java, Predicted Label: 0, True Label: 0
File: abstractfactory (10).java, Predicted Label: 1, True Label: 1
File: abstractfactory (9).java, Predicted Label: 1, True Label: 1
File: nonab (70).java, Predicted Label: 0, True Label: 0
File: abstractfactory (13).java, Predicted Label: 1, True Label: 1
File: nonab (2).java, Predicted Label: 0, True Label: 0
File: abstractfactory (16).java, Predicted Label: 1, True Label: 1
File: abstractfactory (4).java, Predicted Label: 0, True Label: 1
File: abstractfactory (5).java, Predicted Label: 1, True Label: 1
File: nonab (3).java, Predicted Label: 0, True Label: 0
File: abstractfactory (1).java, Predicted Label: 1, True Label: 1
File: abstractfactory (12).java, Predicted Label: 1, True Label: 1
File: nonab 

In [None]:
#Abstract Factory with different settings

In [1]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "abstractfactory"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "abstractfactory" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nonab (11).java, Predicted Label: 1, True Label: 0
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: nonab (5).java, Predicted Label: 0, True Label: 0
File: nonab (7).java, Predicted Label: 0, True Label: 0
File: nonab (15).java, Predicted Label: 0, True Label: 0
File: nonab (1).java, Predicted Label: 1, True Label: 0
File: abstractfactory (10).java, Predicted Label: 1, True Label: 1
File: nonab (10).java, Predicted Label: 0, True Label: 0
File: abstractfactory (9).java, Predicted Label: 1, True Label: 1
File: abstractfactory (13).java, Predicted Label: 1, True Label: 1
File: nonab (2).java, Predicted Label: 0, True Label: 0
File: abstractfactory (16).java, Predicted Label: 1, True Label: 1
File: nonab (8).java, Predicted Label: 0, True Label: 0
File: abstractfactory (4).java, Predicted Label: 1, True Label: 1
File: abstractfactory (5).java, Predicted Label: 1, True Label: 1
File: nonab (3).java, Predicted La

In [None]:
#Abstract Factory with different settings

In [2]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "abstractfactory"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "abstractfactory" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: abstractfactory (10).java, Predicted Label: 1, True Label: 1
File: nonab (10).java, Predicted Label: 0, True Label: 0
File: abstractfactory (9).java, Predicted Label: 1, True Label: 1
File: nonab (51).java, Predicted Label: 0, True Label: 0
File: abstractfactory (13).java, Predicted Label: 1, True Label: 1
File: nonab (69).java, Predicted Label: 0, True Label: 0
File: abstractfactory (16).java, Predicted Label: 1, True Label: 1
File: abstractfactory (4).java, Predicted Label: 0, True Label: 1
File: abstractfactory (5).java, Predicted Label: 1, True Label: 1
File: abstractfactory (1).java, Predicted Label: 1, True Label: 1
File: abstractfactory (12).java, Predicted Label: 1, True Label: 1
File: nonab (14).java, Predicted Label: 1, True Label: 0
File: nonab (36).java, Predicted Label: 0, True Label: 0
File: nonab (42).java, Predicted Label: 0, True Label: 0
File: abst

In [None]:
#Abstract Factory with different settings

In [3]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "abstractfactory"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "abstractfactory" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nonab (11).java, Predicted Label: 1, True Label: 0
File: nonab (48).java, Predicted Label: 1, True Label: 0
File: nonab (22).java, Predicted Label: 0, True Label: 0
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: abstractfactory (10).java, Predicted Label: 1, True Label: 1
File: abstractfactory (9).java, Predicted Label: 1, True Label: 1
File: abstractfactory (13).java, Predicted Label: 1, True Label: 1
File: abstractfactory (16).java, Predicted Label: 1, True Label: 1
File: abstractfactory (4).java, Predicted Label: 0, True Label: 1
File: abstractfactory (5).java, Predicted Label: 1, True Label: 1
File: abstractfactory (1).java, Predicted Label: 1, True Label: 1
File: abstractfactory (12).java, Predicted Label: 1, True Label: 1
File: nonab (59).java, Predicted Label: 1, True Label: 0
File: abstractfactory (3).java, Predicted Label: 1, True Label: 1
File: nonab (32).java, Predicted Label: 1, True Label: 0
F

In [2]:
#Abstract factory with different settings

In [3]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Path to the directory containing your Java programs
directory = "abstractfactory"  # Change to your dataset directory
java_files = [file for file in os.listdir(directory) if file.endswith(".java")]

# Function to calculate the embedding of each line and take the mean
def get_program_embedding(program_text):
    lines = program_text.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Skip empty lines
            # Tokenize the line
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)

            # Get the model embeddings for the line
            with torch.no_grad():
                outputs = model(**inputs)

            # Use the 'last_hidden_state' attribute for embeddings
            line_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(line_embedding)

    # Calculate the mean embedding for all lines
    program_embedding = np.mean(line_embeddings, axis=0)
    return program_embedding

# Store the embeddings of each program
program_embeddings = []

# Define the true labels for each program
true_labels = []

for java_file in java_files:
    if "abstractfactory" in java_file:
        label = 1  # Positive class
    else:
        label = 0  # Negative class

    true_labels.append(label)

    with open(os.path.join(directory, java_file), "r", encoding="ISO-8859-1") as file:
        code = file.read()


    # Calculate the embedding for the entire program
    program_embedding = get_program_embedding(code)
    program_embeddings.append(program_embedding)

# Convert the list of embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'cosine'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
if distance_metric == 'cosine':
    distance_matrix = cosine_distances(program_embeddings)
elif distance_metric == 'euclidean':
    distance_matrix = euclidean_distances(program_embeddings)
else:
    raise ValueError(f"Unrecognized metric: {distance_metric}")

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision_knn = precision_score(true_labels, predicted_labels, average='weighted')
recall_knn = recall_score(true_labels, predicted_labels, average='weighted')
f1_knn = f1_score(true_labels, predicted_labels, average='weighted')

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision (KNN): {precision_knn:.2f}, Recall (KNN): {recall_knn:.2f}, F1 Score (KNN): {f1_knn:.2f}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


K-Nearest Neighbors (KNN) Classification Results:
File: nonab (11).java, Predicted Label: 0, True Label: 0
File: nonab (48).java, Predicted Label: 1, True Label: 0
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: nonab (5).java, Predicted Label: 0, True Label: 0
File: nonab (65).java, Predicted Label: 0, True Label: 0
File: nonab (7).java, Predicted Label: 0, True Label: 0
File: abstractfactory (10).java, Predicted Label: 1, True Label: 1
File: abstractfactory (9).java, Predicted Label: 1, True Label: 1
File: abstractfactory (13).java, Predicted Label: 1, True Label: 1
File: abstractfactory (16).java, Predicted Label: 1, True Label: 1
File: nonab (8).java, Predicted Label: 0, True Label: 0
File: abstractfactory (4).java, Predicted Label: 1, True Label: 1
File: abstractfactory (5).java, Predicted Label: 1, True Label: 1
File: nonab (3).java, Predicted Label: 0, True Label: 0
File: abstractfactory (1).java, Predicted Label: 1, True Label: 1
File: abstractfactory (1

In [None]:
#tsne plot for programs

In [1]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

# Function to get embeddings for a given design pattern
def get_embeddings_for_pattern(pattern, model, tokenizer):
    directory = os.path.join("all_design_patterns", pattern.lower())
    files = [file for file in os.listdir(directory) if os.path.isfile(os.path.join(directory, file))]

    embeddings = []
    true_labels = []

    for file in files:
        with open(os.path.join(directory, file), "r", encoding="ISO-8859-1") as f:
            code = f.read()

        # Tokenize and encode the Java program
        inputs = tokenizer(code, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        program_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

        embeddings.append(program_embedding)
        true_labels.append(pattern)

    return np.array(embeddings), np.array(true_labels)

# Load the CodeGPT model and tokenizer
model_name = "AISE-TUDelft/CodeGPT-Multilingual"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Get embeddings for each design pattern
patterns = ["Singleton", "Prototype", "AbstractFactory", "Builder", "FactoryMethod"]
all_embeddings = []
all_labels = []

# Custom color palette for each design pattern with higher contrast
color_palette = ["red", "green", "orange", "blue", "purple"]

# Custom markers for each design pattern
markers = ["o", "s", "D", "^", "P"]

for i, pattern in enumerate(patterns):
    pattern_embeddings, pattern_labels = get_embeddings_for_pattern(pattern, model, tokenizer)
    all_embeddings.append(pattern_embeddings)
    all_labels.append(pattern_labels)

# Concatenate the embeddings and labels
all_embeddings = np.concatenate(all_embeddings, axis=0)
all_labels = np.concatenate(all_labels)

# Perform t-SNE
tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(all_embeddings)

# Create a scatter plot for t-SNE visualization with custom symbols
plt.figure(figsize=(20, 16))

for i, pattern in enumerate(patterns):
    indices = all_labels == pattern
    sns.scatterplot(x=tsne_results[indices, 0], y=tsne_results[indices, 1], marker=markers[i], color=color_palette[i], s=200, label=pattern)

# Increase font sizes for better visibility
plt.title('t-SNE Visualization for CodeGPT on Different Design Patterns', fontsize=30)
plt.xlabel('t-SNE Dimension 1', fontsize=25)
plt.ylabel('t-SNE Dimension 2', fontsize=25)
plt.legend(title='Design Pattern', loc='upper right', fontsize=22)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.grid(True)

# Save the t-SNE plot as a PDF file
plt.savefig('tsne_plot_codegpt.pdf', format='pdf')
plt.show()


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
