In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "singleton"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "singleton" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and F1 score
precision = precision_score(true_labels, predicted_labels, average='weighted')
recall = recall_score(true_labels, predicted_labels, average='weighted')
f1 = f1_score(true_labels, predicted_labels, average='weighted')

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "singleton"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "singleton" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nons (54).java, Actual Label: 0, Predicted Label: 0
File: nons (25).java, Actual Label: 0, Predicted Label: 1
File: nons (18).java, Actual Label: 0, Predicted Label: 0
File: singleton (25).java, Actual Label: 1, Predicted Label: 0
File: nons (27).java, Actual Label: 0, Predicted Label: 0
File: singleton (18).java, Actual Label: 1, Predicted Label: 1
File: nons (19).java, Actual Label: 0, Predicted Label: 1
File: nons (55).java, Actual Label: 0, Predicted Label: 0
File: nons (34).java, Actual Label: 0, Predicted Label: 0
File: singleton (24).java, Actual Label: 1, Predicted Label: 0
File: nons (68).java, Actual Label: 0, Predicted Label: 0
File: singleton (9).java, Actual Label: 1, Predicted Label: 0
File: singleton (10).java, Actual Label: 1, Predicted Label: 1
File: singleton (14).java, Actual Label: 1, Predicted Label: 1
File: nons (33).java, Actual Label: 0, Predicted Label: 0
File: nons (3).java, Actual Label: 0, Predicted Label: 0
File: nons (30).java, Actual Label: 0, Predi

In [None]:
Singleton with different settings

In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "singleton"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "singleton" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nons (54).java, Actual Label: 0, Predicted Label: 0
File: nons (25).java, Actual Label: 0, Predicted Label: 1
File: nons (18).java, Actual Label: 0, Predicted Label: 0
File: singleton (25).java, Actual Label: 1, Predicted Label: 0
File: nons (27).java, Actual Label: 0, Predicted Label: 0
File: singleton (18).java, Actual Label: 1, Predicted Label: 1
File: nons (19).java, Actual Label: 0, Predicted Label: 1
File: nons (55).java, Actual Label: 0, Predicted Label: 0
File: nons (34).java, Actual Label: 0, Predicted Label: 0
File: singleton (24).java, Actual Label: 1, Predicted Label: 0
File: nons (68).java, Actual Label: 0, Predicted Label: 0
File: singleton (9).java, Actual Label: 1, Predicted Label: 0
File: singleton (10).java, Actual Label: 1, Predicted Label: 1
File: singleton (14).java, Actual Label: 1, Predicted Label: 1
File: nons (33).java, Actual Label: 0, Predicted Label: 0
File: nons (3).java, Actual Label: 0, Predicted Label: 0
File: nons (30).java, Actual Label: 0, Predi

In [None]:
Singleton with different settings

In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "singleton"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "singleton" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nons (50).java, Actual Label: 0, Predicted Label: 1
File: singleton (25).java, Actual Label: 1, Predicted Label: 0
File: singleton (18).java, Actual Label: 1, Predicted Label: 1
File: singleton (24).java, Actual Label: 1, Predicted Label: 1
File: nons (4).java, Actual Label: 0, Predicted Label: 0
File: singleton (9).java, Actual Label: 1, Predicted Label: 1
File: nons (44).java, Actual Label: 0, Predicted Label: 0
File: nons (38).java, Actual Label: 0, Predicted Label: 0
File: singleton (10).java, Actual Label: 1, Predicted Label: 1
File: singleton (14).java, Actual Label: 1, Predicted Label: 1
File: nons (41).java, Actual Label: 0, Predicted Label: 0
File: singleton (1).java, Actual Label: 1, Predicted Label: 0
File: singleton (3).java, Actual Label: 1, Predicted Label: 1
File: singleton (23).java, Actual Label: 1, Predicted Label: 1
File: singleton (16).java, Actual Label: 1, Predicted Label: 1
File: singleton (13).java, Actual Label: 1, Predicted Label: 1
File: singleton (11).

In [None]:
Singleton with different settings

In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "singleton"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "singleton" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nons (54).java, Actual Label: 0, Predicted Label: 0
File: singleton (25).java, Actual Label: 1, Predicted Label: 0
File: nons (27).java, Actual Label: 0, Predicted Label: 0
File: singleton (18).java, Actual Label: 1, Predicted Label: 1
File: nons (19).java, Actual Label: 0, Predicted Label: 1
File: nons (34).java, Actual Label: 0, Predicted Label: 0
File: singleton (24).java, Actual Label: 1, Predicted Label: 1
File: singleton (9).java, Actual Label: 1, Predicted Label: 0
File: singleton (10).java, Actual Label: 1, Predicted Label: 1
File: singleton (14).java, Actual Label: 1, Predicted Label: 1
File: nons (33).java, Actual Label: 0, Predicted Label: 0
File: nons (3).java, Actual Label: 0, Predicted Label: 0
File: nons (30).java, Actual Label: 0, Predicted Label: 0
File: singleton (1).java, Actual Label: 1, Predicted Label: 0
File: singleton (3).java, Actual Label: 1, Predicted Label: 0
File: nons (53).java, Actual Label: 0, Predicted Label: 0
File: singleton (23).java, Actual La

In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "singleton"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "singleton" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nons (54).java, Actual Label: 0, Predicted Label: 1
File: singleton (25).java, Actual Label: 1, Predicted Label: 0
File: nons (27).java, Actual Label: 0, Predicted Label: 1
File: singleton (18).java, Actual Label: 1, Predicted Label: 1
File: nons (19).java, Actual Label: 0, Predicted Label: 1
File: nons (34).java, Actual Label: 0, Predicted Label: 0
File: singleton (24).java, Actual Label: 1, Predicted Label: 1
File: singleton (9).java, Actual Label: 1, Predicted Label: 0
File: singleton (10).java, Actual Label: 1, Predicted Label: 1
File: singleton (14).java, Actual Label: 1, Predicted Label: 1
File: nons (3).java, Actual Label: 0, Predicted Label: 0
File: singleton (1).java, Actual Label: 1, Predicted Label: 0
File: singleton (3).java, Actual Label: 1, Predicted Label: 0
File: nons (53).java, Actual Label: 0, Predicted Label: 0
File: singleton (23).java, Actual Label: 1, Predicted Label: 0
File: nons (17).java, Actual Label: 0, Predicted Label: 0
File: singleton (16).java, Actu

In [None]:
Builder with different settings

In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "builder"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "builder" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nonb (14).java, Actual Label: 0, Predicted Label: 0
File: builder (2).java, Actual Label: 1, Predicted Label: 1
File: nonb (41).java, Actual Label: 0, Predicted Label: 0
File: nonb (5).java, Actual Label: 0, Predicted Label: 0
File: builder (5).java, Actual Label: 1, Predicted Label: 1
File: builder (6).java, Actual Label: 1, Predicted Label: 1
File: nonb (110).java, Actual Label: 0, Predicted Label: 0
File: nonb (18).java, Actual Label: 0, Predicted Label: 0
File: nonb (17).java, Actual Label: 0, Predicted Label: 0
File: builder (3).java, Actual Label: 1, Predicted Label: 1
File: nonb (85).java, Actual Label: 0, Predicted Label: 1
File: builder (7).java, Actual Label: 1, Predicted Label: 1
File: nonb (92).java, Actual Label: 0, Predicted Label: 1
File: builder (9).java, Actual Label: 1, Predicted Label: 1
File: nonb (13).java, Actual Label: 0, Predicted Label: 0
File: builder (8).java, Actual Label: 1, Predicted Label: 1
File: nonb (23).java, Actual Label: 0, Predicted Label: 0


In [None]:
#Builder with different settings

In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "builder"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "builder" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nonb (14).java, Actual Label: 0, Predicted Label: 0
File: builder (2).java, Actual Label: 1, Predicted Label: 1
File: builder (5).java, Actual Label: 1, Predicted Label: 0
File: builder (6).java, Actual Label: 1, Predicted Label: 1
File: nonb (110).java, Actual Label: 0, Predicted Label: 0
File: nonb (18).java, Actual Label: 0, Predicted Label: 0
File: nonb (17).java, Actual Label: 0, Predicted Label: 0
File: builder (3).java, Actual Label: 1, Predicted Label: 1
File: builder (7).java, Actual Label: 1, Predicted Label: 0
File: builder (9).java, Actual Label: 1, Predicted Label: 1
File: nonb (13).java, Actual Label: 0, Predicted Label: 0
File: nonb (119).java, Actual Label: 0, Predicted Label: 1
File: builder (8).java, Actual Label: 1, Predicted Label: 0
File: nonb (23).java, Actual Label: 0, Predicted Label: 0
File: builder (1).java, Actual Label: 1, Predicted Label: 0
File: builder (4).java, Actual Label: 1, Predicted Label: 0
File: nonb (93).java, Actual Label: 0, Predicted Lab

In [None]:
#Builder with different settings

In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "builder"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "builder" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: builder (2).java, Actual Label: 1, Predicted Label: 1
File: nonb (89).java, Actual Label: 0, Predicted Label: 0
File: nonb (49).java, Actual Label: 0, Predicted Label: 0
File: nonb (28).java, Actual Label: 0, Predicted Label: 0
File: nonb (36).java, Actual Label: 0, Predicted Label: 0
File: builder (5).java, Actual Label: 1, Predicted Label: 1
File: nonb (126).java, Actual Label: 0, Predicted Label: 0
File: builder (6).java, Actual Label: 1, Predicted Label: 1
File: nonb (44).java, Actual Label: 0, Predicted Label: 0
File: builder (3).java, Actual Label: 1, Predicted Label: 1
File: nonb (120).java, Actual Label: 0, Predicted Label: 0
File: nonb (69).java, Actual Label: 0, Predicted Label: 0
File: builder (7).java, Actual Label: 1, Predicted Label: 1
File: builder (9).java, Actual Label: 1, Predicted Label: 1
File: nonb (129).java, Actual Label: 0, Predicted Label: 0
File: nonb (114).java, Actual Label: 0, Predicted Label: 1
File: builder (8).java, Actual Label: 1, Predicted Label

In [None]:
#Builder with different settings

In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "builder"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "builder" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nonb (14).java, Actual Label: 0, Predicted Label: 0
File: builder (2).java, Actual Label: 1, Predicted Label: 1
File: builder (5).java, Actual Label: 1, Predicted Label: 0
File: builder (6).java, Actual Label: 1, Predicted Label: 1
File: nonb (110).java, Actual Label: 0, Predicted Label: 0
File: nonb (18).java, Actual Label: 0, Predicted Label: 0
File: nonb (17).java, Actual Label: 0, Predicted Label: 0
File: builder (3).java, Actual Label: 1, Predicted Label: 1
File: builder (7).java, Actual Label: 1, Predicted Label: 0
File: nonb (92).java, Actual Label: 0, Predicted Label: 1
File: builder (9).java, Actual Label: 1, Predicted Label: 1
File: nonb (13).java, Actual Label: 0, Predicted Label: 0
File: nonb (119).java, Actual Label: 0, Predicted Label: 1
File: builder (8).java, Actual Label: 1, Predicted Label: 0
File: nonb (23).java, Actual Label: 0, Predicted Label: 0
File: builder (1).java, Actual Label: 1, Predicted Label: 0
File: builder (4).java, Actual Label: 1, Predicted Lab

In [None]:
#Builder with different settings

In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "builder"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "builder" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: builder (2).java, Actual Label: 1, Predicted Label: 1
File: nonb (50).java, Actual Label: 0, Predicted Label: 0
File: nonb (36).java, Actual Label: 0, Predicted Label: 1
File: builder (5).java, Actual Label: 1, Predicted Label: 1
File: builder (6).java, Actual Label: 1, Predicted Label: 1
File: nonb (9).java, Actual Label: 0, Predicted Label: 0
File: builder (3).java, Actual Label: 1, Predicted Label: 1
File: nonb (87).java, Actual Label: 0, Predicted Label: 0
File: nonb (29).java, Actual Label: 0, Predicted Label: 0
File: builder (7).java, Actual Label: 1, Predicted Label: 1
File: builder (9).java, Actual Label: 1, Predicted Label: 1
File: nonb (123).java, Actual Label: 0, Predicted Label: 0
File: nonb (129).java, Actual Label: 0, Predicted Label: 0
File: nonb (114).java, Actual Label: 0, Predicted Label: 1
File: builder (8).java, Actual Label: 1, Predicted Label: 1
File: builder (1).java, Actual Label: 1, Predicted Label: 0
File: builder (4).java, Actual Label: 1, Predicted Lab

In [None]:
#abstract factory with different settings

In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "abstractfactory"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "abstractfactory" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nonfm (34).java, Actual Label: 0, Predicted Label: 1
File: abstractfactory (2).java, Actual Label: 1, Predicted Label: 1
File: nonfm (10).java, Actual Label: 0, Predicted Label: 0
File: nonfm (37).java, Actual Label: 0, Predicted Label: 0
File: nonfm (13).java, Actual Label: 0, Predicted Label: 1
File: nonfm (17).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (10).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (9).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (13).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (16).java, Actual Label: 1, Predicted Label: 0
File: nonfm (49).java, Actual Label: 0, Predicted Label: 1
File: abstractfactory (4).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (5).java, Actual Label: 1, Predicted Label: 0
File: abstractfactory (1).java, Actual Label: 1, Predicted Label: 0
File: abstractfactory (12).java, Actual Label: 1, Predicted Label: 1
File: nonfm (7).java, Actual L

In [None]:
#abstract factory with different settings

In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "abstractfactory"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "abstractfactory" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nonab (22).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (2).java, Actual Label: 1, Predicted Label: 1
File: nonab (19).java, Actual Label: 0, Predicted Label: 1
File: nonab (30).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (10).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (9).java, Actual Label: 1, Predicted Label: 1
File: nonab (70).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (13).java, Actual Label: 1, Predicted Label: 1
File: nonab (2).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (16).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (4).java, Actual Label: 1, Predicted Label: 0
File: abstractfactory (5).java, Actual Label: 1, Predicted Label: 1
File: nonab (3).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (1).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (12).java, Actual Label: 1, Predicted Label: 1
File: nonab (14).java, Actual La

In [None]:
#abstract factory with different settings

In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "abstractfactory"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "abstractfactory" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: abstractfactory (2).java, Actual Label: 1, Predicted Label: 1
File: nonab (5).java, Actual Label: 0, Predicted Label: 0
File: nonab (24).java, Actual Label: 0, Predicted Label: 0
File: nonab (65).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (10).java, Actual Label: 1, Predicted Label: 1
File: nonab (10).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (9).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (13).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (16).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (4).java, Actual Label: 1, Predicted Label: 0
File: abstractfactory (5).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (1).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (12).java, Actual Label: 1, Predicted Label: 1
File: nonab (33).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (3).java, Actual Label: 1, Predicted Label: 1
File: nonab (86).java,

In [None]:
#abstract factory with different settings

In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "abstractfactory"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "abstractfactory" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nonab (11).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (2).java, Actual Label: 1, Predicted Label: 1
File: nonab (5).java, Actual Label: 0, Predicted Label: 0
File: nonab (7).java, Actual Label: 0, Predicted Label: 0
File: nonab (15).java, Actual Label: 0, Predicted Label: 0
File: nonab (1).java, Actual Label: 0, Predicted Label: 1
File: abstractfactory (10).java, Actual Label: 1, Predicted Label: 1
File: nonab (10).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (9).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (13).java, Actual Label: 1, Predicted Label: 1
File: nonab (2).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (16).java, Actual Label: 1, Predicted Label: 1
File: nonab (8).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (4).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (5).java, Actual Label: 1, Predicted Label: 1
File: nonab (3).java, Actual Label: 0, Predicted Label

In [None]:
#abstract factory with different settings

In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "abstractfactory"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "abstractfactory" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nonab (11).java, Actual Label: 0, Predicted Label: 0
File: nonab (22).java, Actual Label: 0, Predicted Label: 1
File: abstractfactory (2).java, Actual Label: 1, Predicted Label: 1
File: nonab (28).java, Actual Label: 0, Predicted Label: 0
File: nonab (65).java, Actual Label: 0, Predicted Label: 1
File: nonab (15).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (10).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (9).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (13).java, Actual Label: 1, Predicted Label: 0
File: abstractfactory (16).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (4).java, Actual Label: 1, Predicted Label: 0
File: abstractfactory (5).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (1).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (12).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (3).java, Actual Label: 1, Predicted Label: 1
File: nonab (32).java

In [None]:
#Prototype with different settings

In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "prototype"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "prototype" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: prototype (27).java, Actual Label: 1, Predicted Label: 1
File: nonp (43).java, Actual Label: 0, Predicted Label: 0
File: nonp (35).java, Actual Label: 0, Predicted Label: 0
File: nonp (51).java, Actual Label: 0, Predicted Label: 0
File: nonp (5).java, Actual Label: 0, Predicted Label: 0
File: prototype (13).java, Actual Label: 1, Predicted Label: 1
File: prototype (22).java, Actual Label: 1, Predicted Label: 0
File: nonp (7).java, Actual Label: 0, Predicted Label: 0
File: nonp (13).java, Actual Label: 0, Predicted Label: 0
File: nonp (29).java, Actual Label: 0, Predicted Label: 0
File: prototype (15).java, Actual Label: 1, Predicted Label: 1
File: nonp (37).java, Actual Label: 0, Predicted Label: 0
File: nonp (21).java, Actual Label: 0, Predicted Label: 0
File: prototype (14).java, Actual Label: 1, Predicted Label: 0
File: prototype (16).java, Actual Label: 1, Predicted Label: 1
File: prototype (18).java, Actual Label: 1, Predicted Label: 0
File: nonp (22).java, Actual Label: 0, 

In [None]:
#Prototype with different settings

In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "prototype"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "prototype" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: prototype (27).java, Actual Label: 1, Predicted Label: 1
File: nonp (59).java, Actual Label: 0, Predicted Label: 0
File: nonp (5).java, Actual Label: 0, Predicted Label: 0
File: prototype (13).java, Actual Label: 1, Predicted Label: 1
File: prototype (22).java, Actual Label: 1, Predicted Label: 0
File: nonp (7).java, Actual Label: 0, Predicted Label: 0
File: nonp (13).java, Actual Label: 0, Predicted Label: 0
File: prototype (15).java, Actual Label: 1, Predicted Label: 1
File: nonp (8).java, Actual Label: 0, Predicted Label: 0
File: nonp (21).java, Actual Label: 0, Predicted Label: 1
File: nonp (58).java, Actual Label: 0, Predicted Label: 1
File: prototype (14).java, Actual Label: 1, Predicted Label: 0
File: prototype (16).java, Actual Label: 1, Predicted Label: 1
File: prototype (18).java, Actual Label: 1, Predicted Label: 0
File: nonp (22).java, Actual Label: 0, Predicted Label: 1
File: prototype (32).java, Actual Label: 1, Predicted Label: 1
File: prototype (6).java, Actual La

In [None]:
#Prototype with different settings

In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "prototype"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "prototype" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: prototype (27).java, Actual Label: 1, Predicted Label: 1
File: nonp (59).java, Actual Label: 0, Predicted Label: 0
File: nonp (5).java, Actual Label: 0, Predicted Label: 0
File: prototype (13).java, Actual Label: 1, Predicted Label: 1
File: prototype (22).java, Actual Label: 1, Predicted Label: 0
File: nonp (7).java, Actual Label: 0, Predicted Label: 0
File: nonp (13).java, Actual Label: 0, Predicted Label: 0
File: prototype (15).java, Actual Label: 1, Predicted Label: 1
File: nonp (8).java, Actual Label: 0, Predicted Label: 0
File: nonp (21).java, Actual Label: 0, Predicted Label: 1
File: nonp (58).java, Actual Label: 0, Predicted Label: 1
File: prototype (14).java, Actual Label: 1, Predicted Label: 0
File: prototype (16).java, Actual Label: 1, Predicted Label: 1
File: prototype (18).java, Actual Label: 1, Predicted Label: 0
File: nonp (22).java, Actual Label: 0, Predicted Label: 1
File: prototype (32).java, Actual Label: 1, Predicted Label: 1
File: prototype (6).java, Actual La

In [None]:
#Prototype with different settings

In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "prototype"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "prototype" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: prototype (27).java, Actual Label: 1, Predicted Label: 1
File: nonp (43).java, Actual Label: 0, Predicted Label: 0
File: nonp (59).java, Actual Label: 0, Predicted Label: 0
File: nonp (31).java, Actual Label: 0, Predicted Label: 0
File: prototype (13).java, Actual Label: 1, Predicted Label: 1
File: prototype (22).java, Actual Label: 1, Predicted Label: 0
File: prototype (15).java, Actual Label: 1, Predicted Label: 1
File: nonp (44).java, Actual Label: 0, Predicted Label: 0
File: nonp (37).java, Actual Label: 0, Predicted Label: 0
File: nonp (21).java, Actual Label: 0, Predicted Label: 0
File: nonp (36).java, Actual Label: 0, Predicted Label: 0
File: nonp (58).java, Actual Label: 0, Predicted Label: 0
File: prototype (14).java, Actual Label: 1, Predicted Label: 0
File: nonp (28).java, Actual Label: 0, Predicted Label: 0
File: prototype (16).java, Actual Label: 1, Predicted Label: 1
File: prototype (18).java, Actual Label: 1, Predicted Label: 0
File: prototype (32).java, Actual Lab

In [None]:
#Prototype with different settings

In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "prototype"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "prototype" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: prototype (27).java, Actual Label: 1, Predicted Label: 1
File: nonp (23).java, Actual Label: 0, Predicted Label: 0
File: nonp (43).java, Actual Label: 0, Predicted Label: 0
File: nonp (35).java, Actual Label: 0, Predicted Label: 0
File: nonp (51).java, Actual Label: 0, Predicted Label: 0
File: nonp (5).java, Actual Label: 0, Predicted Label: 0
File: prototype (13).java, Actual Label: 1, Predicted Label: 1
File: prototype (22).java, Actual Label: 1, Predicted Label: 0
File: nonp (7).java, Actual Label: 0, Predicted Label: 0
File: nonp (13).java, Actual Label: 0, Predicted Label: 0
File: prototype (15).java, Actual Label: 1, Predicted Label: 1
File: nonp (44).java, Actual Label: 0, Predicted Label: 0
File: nonp (8).java, Actual Label: 0, Predicted Label: 0
File: nonp (21).java, Actual Label: 0, Predicted Label: 0
File: nonp (36).java, Actual Label: 0, Predicted Label: 0
File: nonp (58).java, Actual Label: 0, Predicted Label: 0
File: prototype (14).java, Actual Label: 1, Predicted L

In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "prototype"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "prototype" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: prototype (27).java, Actual Label: 1, Predicted Label: 1
File: nonp (23).java, Actual Label: 0, Predicted Label: 0
File: nonp (43).java, Actual Label: 0, Predicted Label: 0
File: nonp (35).java, Actual Label: 0, Predicted Label: 0
File: nonp (51).java, Actual Label: 0, Predicted Label: 0
File: nonp (5).java, Actual Label: 0, Predicted Label: 0
File: prototype (13).java, Actual Label: 1, Predicted Label: 1
File: prototype (22).java, Actual Label: 1, Predicted Label: 0
File: nonp (7).java, Actual Label: 0, Predicted Label: 0
File: nonp (13).java, Actual Label: 0, Predicted Label: 0
File: prototype (15).java, Actual Label: 1, Predicted Label: 1
File: nonp (44).java, Actual Label: 0, Predicted Label: 0
File: nonp (8).java, Actual Label: 0, Predicted Label: 0
File: nonp (21).java, Actual Label: 0, Predicted Label: 0
File: nonp (36).java, Actual Label: 0, Predicted Label: 0
File: nonp (58).java, Actual Label: 0, Predicted Label: 0
File: prototype (14).java, Actual Label: 1, Predicted L

In [None]:
#Factory Method with different settings

In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "factorymethod"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "factorymethod" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nonfm (13).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (1).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (9).java, Actual Label: 1, Predicted Label: 0
File: nonfm (12).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (5).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (2).java, Actual Label: 1, Predicted Label: 1
File: nonfm (18).java, Actual Label: 0, Predicted Label: 0
File: nonfm (8).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (10).java, Actual Label: 1, Predicted Label: 1
File: nonfm (11).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (7).java, Actual Label: 1, Predicted Label: 0
File: nonfm (14).java, Actual Label: 0, Predicted Label: 0
File: nonfm (17).java, Actual Label: 0, Predicted Label: 1
File: factorymethod (4).java, Actual Label: 1, Predicted Label: 0
File: factorymethod (3).java, Actual Label: 1, Predicted Label: 0
File: nonfm (9).java, Actual Label: 0, Predicted Label: 1
F

In [None]:
#Factory method with different settings

In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "factorymethod"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "factorymethod" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nonfm (77).java, Actual Label: 0, Predicted Label: 0
File: nonfm (80).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (1).java, Actual Label: 1, Predicted Label: 0
File: factorymethod (9).java, Actual Label: 1, Predicted Label: 1
File: nonfm (79).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (5).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (2).java, Actual Label: 1, Predicted Label: 1
File: nonfm (75).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (10).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (7).java, Actual Label: 1, Predicted Label: 0
File: nonfm (78).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (4).java, Actual Label: 1, Predicted Label: 1
File: nonfm (82).java, Actual Label: 0, Predicted Label: 1
File: nonfm (74).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (3).java, Actual Label: 1, Predicted Label: 1
File: nonfm (81).java, Actual Label: 0, Predicted Label: 1

In [None]:
#Factory Method with different settings

In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "factorymethod"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "factorymethod" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nonfm (6).java, Actual Label: 0, Predicted Label: 0
File: nonfm (5).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (1).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (9).java, Actual Label: 1, Predicted Label: 1
File: nonfm (4).java, Actual Label: 0, Predicted Label: 1
File: factorymethod (5).java, Actual Label: 1, Predicted Label: 1
File: nonfm (2).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (2).java, Actual Label: 1, Predicted Label: 1
File: nonfm (1).java, Actual Label: 0, Predicted Label: 0
File: nonfm (8).java, Actual Label: 0, Predicted Label: 1
File: factorymethod (10).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (7).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (4).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (3).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (11).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (8).java, Actual Label: 1, Predicted

In [None]:
#Factory method with different settingsactual_labels

In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "factorymethod"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "factorymethod" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nonfm (77).java, Actual Label: 0, Predicted Label: 0
File: nonfm (80).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (1).java, Actual Label: 1, Predicted Label: 0
File: factorymethod (9).java, Actual Label: 1, Predicted Label: 1
File: nonfm (79).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (5).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (2).java, Actual Label: 1, Predicted Label: 1
File: nonfm (75).java, Actual Label: 0, Predicted Label: 0
File: nonfm (72).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (10).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (7).java, Actual Label: 1, Predicted Label: 0
File: nonfm (78).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (4).java, Actual Label: 1, Predicted Label: 1
File: nonfm (74).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (3).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (11).java, Actual Label: 1, Predicted 

In [None]:
#Factory method with different settings

In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "factorymethod"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "factorymethod" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nonfm (77).java, Actual Label: 0, Predicted Label: 0
File: nonfm (68).java, Actual Label: 0, Predicted Label: 1
File: factorymethod (1).java, Actual Label: 1, Predicted Label: 0
File: factorymethod (9).java, Actual Label: 1, Predicted Label: 1
File: nonfm (79).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (5).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (2).java, Actual Label: 1, Predicted Label: 1
File: nonfm (75).java, Actual Label: 0, Predicted Label: 0
File: nonfm (72).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (10).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (7).java, Actual Label: 1, Predicted Label: 0
File: nonfm (70).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (4).java, Actual Label: 1, Predicted Label: 1
File: nonfm (82).java, Actual Label: 0, Predicted Label: 1
File: factorymethod (3).java, Actual Label: 1, Predicted Label: 1
File: nonfm (81).java, Actual Label: 0, Predicted Label: 1

In [None]:
#Factory Method with different settings

In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "factorymethod"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "factorymethod" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nonfm (77).java, Actual Label: 0, Predicted Label: 0
File: nonfm (80).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (1).java, Actual Label: 1, Predicted Label: 0
File: factorymethod (9).java, Actual Label: 1, Predicted Label: 1
File: nonfm (79).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (5).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (2).java, Actual Label: 1, Predicted Label: 1
File: nonfm (75).java, Actual Label: 0, Predicted Label: 0
File: nonfm (72).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (10).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (7).java, Actual Label: 1, Predicted Label: 0
File: nonfm (78).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (4).java, Actual Label: 1, Predicted Label: 1
File: nonfm (82).java, Actual Label: 0, Predicted Label: 1
File: nonfm (74).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (3).java, Actual Label: 1, Predicted Label: 1

In [None]:
#To plot t-SNE

In [None]:
!pip install seaborn

Defaulting to user installation because normal site-packages is not writeable
Collecting seaborn
  Downloading seaborn-0.13.0-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.6/294.6 KB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: seaborn
Successfully installed seaborn-0.13.0
You should consider upgrading via the '/apps/Arch/software/Python/3.10.4-GCCcore-11.3.0/bin/python3.10 -m pip install --upgrade pip' command.[0m[33m
[0m

In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

# Function to get embeddings for a given design pattern
def get_embeddings_for_pattern(pattern, model, tokenizer):
    directory = os.path.join("all_design_patterns", pattern.lower())
    files = [file for file in os.listdir(directory) if os.path.isfile(os.path.join(directory, file))]

    embeddings = []
    true_labels = []

    for file in files:
        with open(os.path.join(directory, file), "r", encoding="ISO-8859-1") as f:
            code = f.read()

        # Tokenize and encode the Java program
        inputs = tokenizer(code, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        program_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

        embeddings.append(program_embedding)
        true_labels.append(pattern)

    return np.array(embeddings), np.array(true_labels)

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Get embeddings for each design pattern
patterns = ["Singleton", "Prototype", "AbstractFactory", "Builder", "FactoryMethod"]
all_embeddings = []
all_labels = []

# Custom color palette for each design pattern with higher contrast
color_palette = ["red", "green", "orange", "blue", "purple"]

# Custom markers for each design pattern
markers = ["o", "s", "D", "^", "P"]

for i, pattern in enumerate(patterns):
    pattern_embeddings, pattern_labels = get_embeddings_for_pattern(pattern, model, tokenizer)
    all_embeddings.append(pattern_embeddings)
    all_labels.append(pattern_labels)

# Concatenate the embeddings and labels
all_embeddings = np.concatenate(all_embeddings, axis=0)
all_labels = np.concatenate(all_labels)

# Perform t-SNE
tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(all_embeddings)

# Create a scatter plot for t-SNE visualization with custom symbols
plt.figure(figsize=(20, 16))

for i, pattern in enumerate(patterns):
    indices = all_labels == pattern
    sns.scatterplot(x=tsne_results[indices, 0], y=tsne_results[indices, 1], marker=markers[i], color=color_palette[i], s=200, label=pattern)

# Increase font sizes for better visibility
plt.title('t-SNE Visualization for CodeBERT on Different Design Patterns', fontsize=30)
plt.xlabel('t-SNE Dimension 1', fontsize=25)
plt.ylabel('t-SNE Dimension 2', fontsize=25)
plt.legend(title='Design Pattern', loc='upper right', fontsize=22)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.grid(True)

# Save the t-SNE plot as a PDF file
plt.savefig('tsne_plot_updated_symbols_colors.pdf', format='pdf')
plt.show()


testing the time module

In [None]:
import os
import torch
import numpy as np
import time  # Import time for measuring execution time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "factorymethod"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Measure the training time (embedding generation)
start_time = time.time()

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "factorymethod" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Measure the end of training time
end_training_time = time.time()
training_time_ms = (end_training_time - start_time) * 1000  # Convert to milliseconds
print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

# Measure the prediction time
prediction_start_time = time.time()

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Measure the end of prediction time
prediction_end_time = time.time()
prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Measure overall execution time
overall_end_time = time.time()
execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
print(f"Total Execution Time: {execution_time_ms:.2f} ms")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]



Training (Embedding Generation) Time: 1280426.65 ms
Prediction Time: 187.51 ms
File: nonfm (15).java, Actual Label: 0, Predicted Label: 0
File: nonfm (9).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (10).java, Actual Label: 1, Predicted Label: 1
File: nonfm (6).java, Actual Label: 0, Predicted Label: 0
File: nonfm (4).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (5).java, Actual Label: 1, Predicted Label: 1
File: nonfm (13).java, Actual Label: 0, Predicted Label: 0
File: nonfm (16).java, Actual Label: 0, Predicted Label: 0
File: nonfm (14).java, Actual Label: 0, Predicted Label: 0
File: nonfm (3).java, Actual Label: 0, Predicted Label: 0
File: nonfm (5).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (4).java, Actual Label: 1, Predicted Label: 0
File: factorymethod (7).java, Actual Label: 1, Predicted Label: 0
File: nonfm (11).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (11).java, Actual Label: 1, Predicted Label: 1
File

**Factory method time calcution with different settings**

In [None]:
import os
import torch
import numpy as np
import time  # Import time for measuring execution time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "/content/factorymethod"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Measure the training time (embedding generation)
start_time = time.time()

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "factorymethod" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Measure the end of training time
end_training_time = time.time()
training_time_ms = (end_training_time - start_time) * 1000  # Convert to milliseconds
print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

# Measure the prediction time
prediction_start_time = time.time()

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Measure the end of prediction time
prediction_end_time = time.time()
prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Measure overall execution time
overall_end_time = time.time()
execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
print(f"Total Execution Time: {execution_time_ms:.2f} ms")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]



Training (Embedding Generation) Time: 364704.67 ms
Prediction Time: 101.21 ms
File: factorymethod (4).java, Actual Label: 1, Predicted Label: 1
File: nonfm (77).java, Actual Label: 0, Predicted Label: 0
File: nonfm (82).java, Actual Label: 0, Predicted Label: 1
File: nonfm (81).java, Actual Label: 0, Predicted Label: 1
File: factorymethod (1).java, Actual Label: 1, Predicted Label: 0
File: nonfm (72).java, Actual Label: 0, Predicted Label: 0
File: nonfm (74).java, Actual Label: 0, Predicted Label: 0
File: nonfm (73).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (5).java, Actual Label: 1, Predicted Label: 1
File: nonfm (79).java, Actual Label: 0, Predicted Label: 0
File: nonfm (76).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (10).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (11).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (9).java, Actual Label: 1, Predicted Label: 1
File: nonfm (80).java, Actual Label: 0, Predicted La

**Time for singleton**

In [None]:
import os
import torch
import numpy as np
import time  # Import time for measuring execution time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "/content/factorymethod"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Measure the training time (embedding generation)
start_time = time.time()

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "singleton" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Measure the end of training time
end_training_time = time.time()
training_time_ms = (end_training_time - start_time) * 1000  # Convert to milliseconds
print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

# Measure the prediction time
prediction_start_time = time.time()

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Measure the end of prediction time
prediction_end_time = time.time()
prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Measure overall execution time
overall_end_time = time.time()
execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
print(f"Total Execution Time: {execution_time_ms:.2f} ms")




Training (Embedding Generation) Time: 376635.22 ms
Prediction Time: 3.27 ms
File: factorymethod (4).java, Actual Label: 0, Predicted Label: 0
File: nonfm (77).java, Actual Label: 0, Predicted Label: 0
File: nonfm (82).java, Actual Label: 0, Predicted Label: 0
File: nonfm (81).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (1).java, Actual Label: 0, Predicted Label: 0
File: nonfm (72).java, Actual Label: 0, Predicted Label: 0
File: nonfm (74).java, Actual Label: 0, Predicted Label: 0
File: nonfm (73).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (5).java, Actual Label: 0, Predicted Label: 0
File: nonfm (79).java, Actual Label: 0, Predicted Label: 0
File: nonfm (76).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (10).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (11).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (9).java, Actual Label: 0, Predicted Label: 0
File: nonfm (80).java, Actual Label: 0, Predicted Labe

**Singleton time calculation with different settings**

**Singleton with different settings**

In [None]:
import os
import torch
import numpy as np
import time  # Import time for measuring execution time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Check if GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your Java programs from a directory
java_code_dir = "/content/singleton"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to GPU
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move result back to CPU
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Measure the total execution time
start_time = time.time()

# Measure the training time (embedding generation)
start_training_time = time.time()

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "singleton" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Measure the end of training time
end_training_time = time.time()
training_time_ms = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

# Measure the prediction time
prediction_start_time = time.time()

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Measure the end of prediction time
prediction_end_time = time.time()
prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Measure total execution time (from the start of the script to the end of prediction)
overall_end_time = time.time()
execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
print(f"Total Execution Time (Training + Prediction + Misc.): {execution_time_ms:.2f} ms")


Using device: cuda




Training (Embedding Generation) Time: 122461.56 ms
Prediction Time: 8.67 ms
File: singleton (15).java, Actual Label: 1, Predicted Label: 1
File: singleton (20).java, Actual Label: 1, Predicted Label: 1
File: nons (45).java, Actual Label: 0, Predicted Label: 0
File: singleton (1).java, Actual Label: 1, Predicted Label: 0
File: nons (24).java, Actual Label: 0, Predicted Label: 1
File: nons (12).java, Actual Label: 0, Predicted Label: 0
File: nons (64).java, Actual Label: 0, Predicted Label: 0
File: singleton (14).java, Actual Label: 1, Predicted Label: 1
File: singleton (16).java, Actual Label: 1, Predicted Label: 0
File: singleton (18).java, Actual Label: 1, Predicted Label: 0
File: nons (37).java, Actual Label: 0, Predicted Label: 1
File: nons (6).java, Actual Label: 0, Predicted Label: 0
File: nons (62).java, Actual Label: 0, Predicted Label: 0
File: singleton (6).java, Actual Label: 1, Predicted Label: 1
File: nons (16).java, Actual Label: 0, Predicted Label: 0
File: nons (33).java, 

**Singleton time calculation with different settings**

In [None]:
import os
import torch
import numpy as np
import time  # Import time for measuring execution time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Check if GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your Java programs from a directory
java_code_dir = "/content/singleton"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to GPU
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move result back to CPU
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Measure the total execution time
start_time = time.time()

# Measure the training time (embedding generation)
start_training_time = time.time()

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "singleton" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Measure the end of training time
end_training_time = time.time()
training_time_ms = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

# Measure the prediction time
prediction_start_time = time.time()

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Measure the end of prediction time
prediction_end_time = time.time()
prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Measure total execution time (from the start of the script to the end of prediction)
overall_end_time = time.time()
execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
print(f"Total Execution Time (Training + Prediction + Misc.): {execution_time_ms:.2f} ms")


Using device: cuda




Training (Embedding Generation) Time: 97906.88 ms
Prediction Time: 3.52 ms
File: singleton (15).java, Actual Label: 1, Predicted Label: 1
File: singleton (20).java, Actual Label: 1, Predicted Label: 1
File: nons (45).java, Actual Label: 0, Predicted Label: 0
File: singleton (1).java, Actual Label: 1, Predicted Label: 0
File: nons (53).java, Actual Label: 0, Predicted Label: 0
File: singleton (14).java, Actual Label: 1, Predicted Label: 1
File: singleton (16).java, Actual Label: 1, Predicted Label: 0
File: singleton (18).java, Actual Label: 1, Predicted Label: 1
File: nons (37).java, Actual Label: 0, Predicted Label: 1
File: nons (54).java, Actual Label: 0, Predicted Label: 0
File: singleton (6).java, Actual Label: 1, Predicted Label: 1
File: nons (48).java, Actual Label: 0, Predicted Label: 0
File: nons (34).java, Actual Label: 0, Predicted Label: 1
File: nons (33).java, Actual Label: 0, Predicted Label: 0
File: singleton (19).java, Actual Label: 1, Predicted Label: 0
File: singleton (

**Singleton time calculation with different settings**

In [None]:
import os
import torch
import numpy as np
import time  # Import time for measuring execution time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Check if GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your Java programs from a directory
java_code_dir = "/content/singleton"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to GPU
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move result back to CPU
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Measure the total execution time
start_time = time.time()

# Measure the training time (embedding generation)
start_training_time = time.time()

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "singleton" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Measure the end of training time
end_training_time = time.time()
training_time_ms = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

# Measure the prediction time
prediction_start_time = time.time()

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Measure the end of prediction time
prediction_end_time = time.time()
prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Measure total execution time (from the start of the script to the end of prediction)
overall_end_time = time.time()
execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
print(f"Total Execution Time (Training + Prediction + Misc.): {execution_time_ms:.2f} ms")


Using device: cuda




Training (Embedding Generation) Time: 107471.01 ms
Prediction Time: 3.39 ms
File: singleton (15).java, Actual Label: 1, Predicted Label: 1
File: singleton (20).java, Actual Label: 1, Predicted Label: 1
File: singleton (1).java, Actual Label: 1, Predicted Label: 0
File: nons (53).java, Actual Label: 0, Predicted Label: 0
File: nons (12).java, Actual Label: 0, Predicted Label: 0
File: nons (60).java, Actual Label: 0, Predicted Label: 0
File: singleton (14).java, Actual Label: 1, Predicted Label: 1
File: singleton (16).java, Actual Label: 1, Predicted Label: 0
File: nons (14).java, Actual Label: 0, Predicted Label: 0
File: singleton (18).java, Actual Label: 1, Predicted Label: 1
File: nons (63).java, Actual Label: 0, Predicted Label: 0
File: nons (55).java, Actual Label: 0, Predicted Label: 0
File: nons (37).java, Actual Label: 0, Predicted Label: 1
File: nons (62).java, Actual Label: 0, Predicted Label: 0
File: singleton (6).java, Actual Label: 1, Predicted Label: 1
File: singleton (19).

**Factory method time calculation using different settings**

In [None]:
import os
import torch
import numpy as np
import time  # Import time for measuring execution time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Check if GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your Java programs from a directory
java_code_dir = "/content/factorymethod"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to GPU
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move result back to CPU
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Measure the total execution time
start_time = time.time()

# Measure the training time (embedding generation)
start_training_time = time.time()

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "factorymethod" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Measure the end of training time
end_training_time = time.time()
training_time_ms = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

# Measure the prediction time
prediction_start_time = time.time()

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Measure the end of prediction time
prediction_end_time = time.time()
prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Measure total execution time (from the start of the script to the end of prediction)
overall_end_time = time.time()
execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
print(f"Total Execution Time (Training + Prediction + Misc.): {execution_time_ms:.2f} ms")


Using device: cuda
Training (Embedding Generation) Time: 84570.35 ms
Prediction Time: 2.57 ms
File: factorymethod (4).java, Actual Label: 1, Predicted Label: 1
File: nonfm (3).java, Actual Label: 0, Predicted Label: 0
File: nonfm (20).java, Actual Label: 0, Predicted Label: 1
File: factorymethod (1).java, Actual Label: 1, Predicted Label: 0
File: factorymethod (5).java, Actual Label: 1, Predicted Label: 1
File: nonfm (18).java, Actual Label: 0, Predicted Label: 0
File: nonfm (23).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (10).java, Actual Label: 1, Predicted Label: 1
File: nonfm (15).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (11).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (9).java, Actual Label: 1, Predicted Label: 1
File: nonfm (16).java, Actual Label: 0, Predicted Label: 1
File: nonfm (24).java, Actual Label: 0, Predicted Label: 0
File: nonfm (4).java, Actual Label: 0, Predicted Label: 0
File: nonfm (25).java, Actual Label: 0

**Factory method with different settings **

In [None]:
import os
import torch
import numpy as np
import time  # Import time for measuring execution time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Check if GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your Java programs from a directory
java_code_dir = "/content/factorymethod"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to GPU
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move result back to CPU
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Measure the total execution time
start_time = time.time()

# Measure the training time (embedding generation)
start_training_time = time.time()

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "factorymethod" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Measure the end of training time
end_training_time = time.time()
training_time_ms = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

# Measure the prediction time
prediction_start_time = time.time()

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Measure the end of prediction time
prediction_end_time = time.time()
prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Measure total execution time (from the start of the script to the end of prediction)
overall_end_time = time.time()
execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
print(f"Total Execution Time (Training + Prediction + Misc.): {execution_time_ms:.2f} ms")


Using device: cuda




Training (Embedding Generation) Time: 46214.70 ms
Prediction Time: 3.54 ms
File: factorymethod (4).java, Actual Label: 1, Predicted Label: 1
File: nonfm (77).java, Actual Label: 0, Predicted Label: 0
File: nonfm (68).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (1).java, Actual Label: 1, Predicted Label: 0
File: nonfm (42).java, Actual Label: 0, Predicted Label: 0
File: nonfm (49).java, Actual Label: 0, Predicted Label: 1
File: factorymethod (5).java, Actual Label: 1, Predicted Label: 1
File: nonfm (76).java, Actual Label: 0, Predicted Label: 0
File: nonfm (32).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (10).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (11).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (9).java, Actual Label: 1, Predicted Label: 1
File: nonfm (24).java, Actual Label: 0, Predicted Label: 0
File: nonfm (67).java, Actual Label: 0, Predicted Label: 0
File: nonfm (70).java, Actual Label: 0, Predicted Label

**Factory method with different settings**

In [None]:
import os
import torch
import numpy as np
import time  # Import time for measuring execution time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Check if GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your Java programs from a directory
java_code_dir = "/content/factorymethod"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to GPU
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move result back to CPU
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Measure the total execution time
start_time = time.time()

# Measure the training time (embedding generation)
start_training_time = time.time()

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "factorymethod" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Measure the end of training time
end_training_time = time.time()
training_time_ms = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

# Measure the prediction time
prediction_start_time = time.time()

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Measure the end of prediction time
prediction_end_time = time.time()
prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Measure total execution time (from the start of the script to the end of prediction)
overall_end_time = time.time()
execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
print(f"Total Execution Time (Training + Prediction + Misc.): {execution_time_ms:.2f} ms")


Using device: cuda




Training (Embedding Generation) Time: 78577.67 ms
Prediction Time: 2.81 ms
File: factorymethod (4).java, Actual Label: 1, Predicted Label: 1
File: nonfm (35).java, Actual Label: 0, Predicted Label: 1
File: nonfm (34).java, Actual Label: 0, Predicted Label: 1
File: factorymethod (1).java, Actual Label: 1, Predicted Label: 0
File: nonfm (27).java, Actual Label: 0, Predicted Label: 0
File: nonfm (29).java, Actual Label: 0, Predicted Label: 1
File: factorymethod (5).java, Actual Label: 1, Predicted Label: 1
File: nonfm (23).java, Actual Label: 0, Predicted Label: 0
File: nonfm (32).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (10).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (11).java, Actual Label: 1, Predicted Label: 0
File: factorymethod (9).java, Actual Label: 1, Predicted Label: 1
File: nonfm (30).java, Actual Label: 0, Predicted Label: 0
File: nonfm (24).java, Actual Label: 0, Predicted Label: 0
File: nonfm (33).java, Actual Label: 0, Predicted Label

**Factory method with different settings**

In [None]:
import os
import torch
import numpy as np
import time  # Import time for measuring execution time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Check if GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your Java programs from a directory
java_code_dir = "/content/factorymethod"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to GPU
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move result back to CPU
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Measure the total execution time
start_time = time.time()

# Measure the training time (embedding generation)
start_training_time = time.time()

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "factorymethod" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Measure the end of training time
end_training_time = time.time()
training_time_ms = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

# Measure the prediction time
prediction_start_time = time.time()

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Measure the end of prediction time
prediction_end_time = time.time()
prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Measure total execution time (from the start of the script to the end of prediction)
overall_end_time = time.time()
execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
print(f"Total Execution Time (Training + Prediction + Misc.): {execution_time_ms:.2f} ms")


Using device: cuda




Training (Embedding Generation) Time: 79601.84 ms
Prediction Time: 3.21 ms
File: factorymethod (4).java, Actual Label: 1, Predicted Label: 1
File: nonfm (20).java, Actual Label: 0, Predicted Label: 1
File: factorymethod (1).java, Actual Label: 1, Predicted Label: 0
File: nonfm (22).java, Actual Label: 0, Predicted Label: 0
File: nonfm (27).java, Actual Label: 0, Predicted Label: 0
File: nonfm (29).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (5).java, Actual Label: 1, Predicted Label: 1
File: nonfm (18).java, Actual Label: 0, Predicted Label: 0
File: nonfm (23).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (10).java, Actual Label: 1, Predicted Label: 1
File: nonfm (15).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (11).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (9).java, Actual Label: 1, Predicted Label: 1
File: nonfm (16).java, Actual Label: 0, Predicted Label: 1
File: nonfm (24).java, Actual Label: 0, Predicted Label

**Singleton with different settings**

In [None]:
import os
import torch
import numpy as np
import time  # Import time for measuring execution time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Check if GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your Java programs from a directory
java_code_dir = "/content/singleton"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to GPU
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move result back to CPU
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Measure the total execution time
start_time = time.time()

# Measure the training time (embedding generation)
start_training_time = time.time()

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "singleton" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Measure the end of training time
end_training_time = time.time()
training_time_ms = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

# Measure the prediction time
prediction_start_time = time.time()

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Measure the end of prediction time
prediction_end_time = time.time()
prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Measure total execution time (from the start of the script to the end of prediction)
overall_end_time = time.time()
execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
print(f"Total Execution Time (Training + Prediction + Misc.): {execution_time_ms:.2f} ms")


Using device: cuda




Training (Embedding Generation) Time: 195491.85 ms
Prediction Time: 4.71 ms
File: singleton (15).java, Actual Label: 1, Predicted Label: 1
File: singleton (20).java, Actual Label: 1, Predicted Label: 1
File: nons (45).java, Actual Label: 0, Predicted Label: 0
File: singleton (1).java, Actual Label: 1, Predicted Label: 0
File: nons (21).java, Actual Label: 0, Predicted Label: 0
File: nons (24).java, Actual Label: 0, Predicted Label: 0
File: nons (64).java, Actual Label: 0, Predicted Label: 0
File: singleton (14).java, Actual Label: 1, Predicted Label: 1
File: singleton (16).java, Actual Label: 1, Predicted Label: 0
File: singleton (18).java, Actual Label: 1, Predicted Label: 1
File: nons (63).java, Actual Label: 0, Predicted Label: 0
File: nons (62).java, Actual Label: 0, Predicted Label: 0
File: singleton (6).java, Actual Label: 1, Predicted Label: 1
File: nons (34).java, Actual Label: 0, Predicted Label: 0
File: nons (33).java, Actual Label: 0, Predicted Label: 0
File: singleton (19).

**Singleton time calcultion with different settings**

In [None]:
import os
import torch
import numpy as np
import time  # Import time for measuring execution time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Check if GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your Java programs from a directory
java_code_dir = "/content/factorymethod"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to GPU
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move result back to CPU
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Measure the total execution time
start_time = time.time()

# Measure the training time (embedding generation)
start_training_time = time.time()

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "factorymethod" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Measure the end of training time
end_training_time = time.time()
training_time_ms = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

# Measure the prediction time
prediction_start_time = time.time()

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Measure the end of prediction time
prediction_end_time = time.time()
prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Measure total execution time (from the start of the script to the end of prediction)
overall_end_time = time.time()
execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
print(f"Total Execution Time (Training + Prediction + Misc.): {execution_time_ms:.2f} ms")


Using device: cuda




Training (Embedding Generation) Time: 98509.27 ms
Prediction Time: 5.50 ms
File: factorymethod (4).java, Actual Label: 1, Predicted Label: 1
File: nonfm (20).java, Actual Label: 0, Predicted Label: 1
File: factorymethod (1).java, Actual Label: 1, Predicted Label: 0
File: nonfm (22).java, Actual Label: 0, Predicted Label: 0
File: nonfm (27).java, Actual Label: 0, Predicted Label: 0
File: nonfm (29).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (5).java, Actual Label: 1, Predicted Label: 1
File: nonfm (18).java, Actual Label: 0, Predicted Label: 0
File: nonfm (23).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (10).java, Actual Label: 1, Predicted Label: 1
File: nonfm (15).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (11).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (9).java, Actual Label: 1, Predicted Label: 1
File: nonfm (16).java, Actual Label: 0, Predicted Label: 1
File: nonfm (24).java, Actual Label: 0, Predicted Label

**Factory method time calculation with different settings**

In [None]:
import os
import torch
import numpy as np
import time  # Import time for measuring execution time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Check if GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your Java programs from a directory
java_code_dir = "/content/factorymethod"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to GPU
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move result back to CPU
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Measure the total execution time
start_time = time.time()

# Measure the training time (embedding generation)
start_training_time = time.time()

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "factorymethod" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Measure the end of training time
end_training_time = time.time()
training_time_ms = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

# Measure the prediction time
prediction_start_time = time.time()

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Measure the end of prediction time
prediction_end_time = time.time()
prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Measure total execution time (from the start of the script to the end of prediction)
overall_end_time = time.time()
execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
print(f"Total Execution Time (Training + Prediction + Misc.): {execution_time_ms:.2f} ms")


Using device: cuda




Training (Embedding Generation) Time: 95059.75 ms
Prediction Time: 2.87 ms
File: factorymethod (4).java, Actual Label: 1, Predicted Label: 1
File: nonfm (20).java, Actual Label: 0, Predicted Label: 1
File: factorymethod (1).java, Actual Label: 1, Predicted Label: 0
File: nonfm (22).java, Actual Label: 0, Predicted Label: 0
File: nonfm (27).java, Actual Label: 0, Predicted Label: 0
File: nonfm (29).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (5).java, Actual Label: 1, Predicted Label: 1
File: nonfm (18).java, Actual Label: 0, Predicted Label: 0
File: nonfm (23).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (10).java, Actual Label: 1, Predicted Label: 1
File: nonfm (15).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (11).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (9).java, Actual Label: 1, Predicted Label: 1
File: nonfm (16).java, Actual Label: 0, Predicted Label: 1
File: nonfm (24).java, Actual Label: 0, Predicted Label

**Singleton with different settings**





In [None]:
import os
import torch
import numpy as np
import time  # Import time for measuring execution time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Check if GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your Java programs from a directory
java_code_dir = "/content/singleton"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to GPU
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move result back to CPU
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Measure the total execution time
start_time = time.time()

# Measure the training time (embedding generation)
start_training_time = time.time()

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "singleton" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Measure the end of training time
end_training_time = time.time()
training_time_ms = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

# Measure the prediction time
prediction_start_time = time.time()

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Measure the end of prediction time
prediction_end_time = time.time()
prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Measure total execution time (from the start of the script to the end of prediction)
overall_end_time = time.time()
execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
print(f"Total Execution Time (Training + Prediction + Misc.): {execution_time_ms:.2f} ms")


Using device: cuda




Training (Embedding Generation) Time: 133244.40 ms
Prediction Time: 3.41 ms
File: singleton (15).java, Actual Label: 1, Predicted Label: 1
File: singleton (20).java, Actual Label: 1, Predicted Label: 1
File: nons (68).java, Actual Label: 0, Predicted Label: 0
File: singleton (1).java, Actual Label: 1, Predicted Label: 0
File: nons (24).java, Actual Label: 0, Predicted Label: 1
File: nons (64).java, Actual Label: 0, Predicted Label: 0
File: singleton (14).java, Actual Label: 1, Predicted Label: 1
File: singleton (16).java, Actual Label: 1, Predicted Label: 0
File: singleton (18).java, Actual Label: 1, Predicted Label: 1
File: nons (18).java, Actual Label: 0, Predicted Label: 0
File: nons (26).java, Actual Label: 0, Predicted Label: 0
File: singleton (6).java, Actual Label: 1, Predicted Label: 1
File: nons (48).java, Actual Label: 0, Predicted Label: 1
File: singleton (19).java, Actual Label: 1, Predicted Label: 0
File: singleton (12).java, Actual Label: 1, Predicted Label: 0
File: nons 

**Builder with settings**

In [None]:
import os
import torch
import numpy as np
import time  # Import time for measuring execution time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Check if GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your Java programs from a directory
java_code_dir = "/content/builder"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to GPU
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move result back to CPU
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Measure the total execution time
start_time = time.time()

# Measure the training time (embedding generation)
start_training_time = time.time()

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "builder" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Measure the end of training time
end_training_time = time.time()
training_time_ms = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

# Measure the prediction time
prediction_start_time = time.time()

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Measure the end of prediction time
prediction_end_time = time.time()
prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Measure total execution time (from the start of the script to the end of prediction)
overall_end_time = time.time()
execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
print(f"Total Execution Time (Training + Prediction + Misc.): {execution_time_ms:.2f} ms")


Using device: cuda




Training (Embedding Generation) Time: 75283.62 ms
Prediction Time: 2.94 ms
File: nonb (13).java, Actual Label: 0, Predicted Label: 0
File: builder (5).java, Actual Label: 1, Predicted Label: 1
File: nonb (11).java, Actual Label: 0, Predicted Label: 0
File: nonb (10).java, Actual Label: 0, Predicted Label: 0
File: nonb (15).java, Actual Label: 0, Predicted Label: 0
File: nonb (16).java, Actual Label: 0, Predicted Label: 1
File: builder (9).java, Actual Label: 1, Predicted Label: 1
File: builder (6).java, Actual Label: 1, Predicted Label: 1
File: nonb (9).java, Actual Label: 0, Predicted Label: 0
File: builder (7).java, Actual Label: 1, Predicted Label: 1
File: builder (1).java, Actual Label: 1, Predicted Label: 0
File: builder (3).java, Actual Label: 1, Predicted Label: 1
File: nonb (17).java, Actual Label: 0, Predicted Label: 0
File: nonb (14).java, Actual Label: 0, Predicted Label: 0
File: builder (2).java, Actual Label: 1, Predicted Label: 1
File: builder (8).java, Actual Label: 1, P

**Builder time calculation with different settings**

In [None]:
import os
import torch
import numpy as np
import time  # Import time for measuring execution time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Check if GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your Java programs from a directory
java_code_dir = "/content/builder"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to GPU
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move result back to CPU
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Measure the total execution time
start_time = time.time()

# Measure the training time (embedding generation)
start_training_time = time.time()

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "builder" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Measure the end of training time
end_training_time = time.time()
training_time_ms = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

# Measure the prediction time
prediction_start_time = time.time()

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Measure the end of prediction time
prediction_end_time = time.time()
prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Measure total execution time (from the start of the script to the end of prediction)
overall_end_time = time.time()
execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
print(f"Total Execution Time (Training + Prediction + Misc.): {execution_time_ms:.2f} ms")


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]



Training (Embedding Generation) Time: 45103.20 ms
Prediction Time: 49.39 ms
File: builder (9).java, Actual Label: 1, Predicted Label: 1
File: nonb (1).java, Actual Label: 0, Predicted Label: 1
File: nonb (2).java, Actual Label: 0, Predicted Label: 0
File: nonb (5).java, Actual Label: 0, Predicted Label: 0
File: builder (8).java, Actual Label: 1, Predicted Label: 1
File: builder (3).java, Actual Label: 1, Predicted Label: 1
File: builder (5).java, Actual Label: 1, Predicted Label: 1
File: nonb (9).java, Actual Label: 0, Predicted Label: 0
File: builder (1).java, Actual Label: 1, Predicted Label: 0
File: nonb (3).java, Actual Label: 0, Predicted Label: 0
File: nonb (7).java, Actual Label: 0, Predicted Label: 0
File: nonb (4).java, Actual Label: 0, Predicted Label: 0
File: builder (4).java, Actual Label: 1, Predicted Label: 0
File: builder (2).java, Actual Label: 1, Predicted Label: 1
File: nonb (6).java, Actual Label: 0, Predicted Label: 0
File: builder (6).java, Actual Label: 1, Predict

**Builder with different seetings**

In [None]:
import os
import torch
import numpy as np
import time  # Import time for measuring execution time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Check if GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your Java programs from a directory
java_code_dir = "/content/builder"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to GPU
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move result back to CPU
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Measure the total execution time
start_time = time.time()

# Measure the training time (embedding generation)
start_training_time = time.time()

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "builder" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Measure the end of training time
end_training_time = time.time()
training_time_ms = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

# Measure the prediction time
prediction_start_time = time.time()

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Measure the end of prediction time
prediction_end_time = time.time()
prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Measure total execution time (from the start of the script to the end of prediction)
overall_end_time = time.time()
execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
print(f"Total Execution Time (Training + Prediction + Misc.): {execution_time_ms:.2f} ms")


Using device: cuda




Training (Embedding Generation) Time: 80378.07 ms
Prediction Time: 3.09 ms
File: builder (9).java, Actual Label: 1, Predicted Label: 1
File: nonb (1).java, Actual Label: 0, Predicted Label: 1
File: nonb (2).java, Actual Label: 0, Predicted Label: 0
File: nonb (5).java, Actual Label: 0, Predicted Label: 0
File: builder (8).java, Actual Label: 1, Predicted Label: 1
File: builder (3).java, Actual Label: 1, Predicted Label: 1
File: builder (5).java, Actual Label: 1, Predicted Label: 1
File: nonb (9).java, Actual Label: 0, Predicted Label: 0
File: builder (1).java, Actual Label: 1, Predicted Label: 0
File: nonb (3).java, Actual Label: 0, Predicted Label: 0
File: nonb (7).java, Actual Label: 0, Predicted Label: 0
File: nonb (4).java, Actual Label: 0, Predicted Label: 0
File: builder (4).java, Actual Label: 1, Predicted Label: 0
File: builder (2).java, Actual Label: 1, Predicted Label: 1
File: nonb (6).java, Actual Label: 0, Predicted Label: 0
File: builder (6).java, Actual Label: 1, Predicte

**Builder with different settings**

In [None]:
import os
import torch
import numpy as np
import time  # Import time for measuring execution time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Check if GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your Java programs from a directory
java_code_dir = "/content/builder"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to GPU
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move result back to CPU
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Measure the total execution time
start_time = time.time()

# Measure the training time (embedding generation)
start_training_time = time.time()

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "builder" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Measure the end of training time
end_training_time = time.time()
training_time_ms = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

# Measure the prediction time
prediction_start_time = time.time()

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Measure the end of prediction time
prediction_end_time = time.time()
prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Measure total execution time (from the start of the script to the end of prediction)
overall_end_time = time.time()
execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
print(f"Total Execution Time (Training + Prediction + Misc.): {execution_time_ms:.2f} ms")


Using device: cuda




Training (Embedding Generation) Time: 69154.35 ms
Prediction Time: 2.89 ms
File: nonb (28).java, Actual Label: 0, Predicted Label: 0
File: builder (9).java, Actual Label: 1, Predicted Label: 0
File: nonb (27).java, Actual Label: 0, Predicted Label: 1
File: nonb (34).java, Actual Label: 0, Predicted Label: 1
File: nonb (30).java, Actual Label: 0, Predicted Label: 0
File: nonb (26).java, Actual Label: 0, Predicted Label: 0
File: nonb (35).java, Actual Label: 0, Predicted Label: 0
File: builder (8).java, Actual Label: 1, Predicted Label: 0
File: builder (3).java, Actual Label: 1, Predicted Label: 0
File: builder (5).java, Actual Label: 1, Predicted Label: 0
File: nonb (32).java, Actual Label: 0, Predicted Label: 0
File: builder (1).java, Actual Label: 1, Predicted Label: 0
File: builder (4).java, Actual Label: 1, Predicted Label: 0
File: nonb (29).java, Actual Label: 0, Predicted Label: 0
File: builder (2).java, Actual Label: 1, Predicted Label: 0
File: builder (6).java, Actual Label: 1, 

**Builder with different settings**

In [None]:
import os
import torch
import numpy as np
import time  # Import time for measuring execution time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Check if GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your Java programs from a directory
java_code_dir = "/content/builder"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to GPU
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move result back to CPU
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Measure the total execution time
start_time = time.time()

# Measure the training time (embedding generation)
start_training_time = time.time()

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "builder" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Measure the end of training time
end_training_time = time.time()
training_time_ms = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

# Measure the prediction time
prediction_start_time = time.time()

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Measure the end of prediction time
prediction_end_time = time.time()
prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Measure total execution time (from the start of the script to the end of prediction)
overall_end_time = time.time()
execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
print(f"Total Execution Time (Training + Prediction + Misc.): {execution_time_ms:.2f} ms")


Using device: cuda




Training (Embedding Generation) Time: 67862.53 ms
Prediction Time: 3.15 ms
File: nonb (128).java, Actual Label: 0, Predicted Label: 0
File: builder (9).java, Actual Label: 1, Predicted Label: 1
File: nonb (129).java, Actual Label: 0, Predicted Label: 0
File: nonb (131).java, Actual Label: 0, Predicted Label: 0
File: builder (8).java, Actual Label: 1, Predicted Label: 1
File: builder (3).java, Actual Label: 1, Predicted Label: 0
File: builder (5).java, Actual Label: 1, Predicted Label: 1
File: nonb (130).java, Actual Label: 0, Predicted Label: 0
File: builder (1).java, Actual Label: 1, Predicted Label: 0
File: nonb (125).java, Actual Label: 0, Predicted Label: 0
File: nonb (132).java, Actual Label: 0, Predicted Label: 0
File: builder (4).java, Actual Label: 1, Predicted Label: 0
File: builder (2).java, Actual Label: 1, Predicted Label: 1
File: builder (6).java, Actual Label: 1, Predicted Label: 1
File: nonb (133).java, Actual Label: 0, Predicted Label: 0
File: builder (7).java, Actual L

**Builder with different settings**

In [None]:
import os
import torch
import numpy as np
import time  # Import time for measuring execution time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Check if GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your Java programs from a directory
java_code_dir = "/content/builder"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to GPU
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move result back to CPU
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Measure the total execution time
start_time = time.time()

# Measure the training time (embedding generation)
start_training_time = time.time()

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "builder" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Measure the end of training time
end_training_time = time.time()
training_time_ms = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

# Measure the prediction time
prediction_start_time = time.time()

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Measure the end of prediction time
prediction_end_time = time.time()
prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Measure total execution time (from the start of the script to the end of prediction)
overall_end_time = time.time()
execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
print(f"Total Execution Time (Training + Prediction + Misc.): {execution_time_ms:.2f} ms")


Using device: cuda




Training (Embedding Generation) Time: 127342.93 ms
Prediction Time: 3.13 ms
File: nonb (92).java, Actual Label: 0, Predicted Label: 0
File: builder (9).java, Actual Label: 1, Predicted Label: 1
File: nonb (93).java, Actual Label: 0, Predicted Label: 0
File: nonb (108).java, Actual Label: 0, Predicted Label: 1
File: nonb (131).java, Actual Label: 0, Predicted Label: 1
File: builder (8).java, Actual Label: 1, Predicted Label: 0
File: builder (3).java, Actual Label: 1, Predicted Label: 0
File: builder (5).java, Actual Label: 1, Predicted Label: 0
File: builder (1).java, Actual Label: 1, Predicted Label: 0
File: builder (4).java, Actual Label: 1, Predicted Label: 0
File: builder (2).java, Actual Label: 1, Predicted Label: 1
File: nonb (116).java, Actual Label: 0, Predicted Label: 1
File: builder (6).java, Actual Label: 1, Predicted Label: 1
File: nonb (119).java, Actual Label: 0, Predicted Label: 1
File: nonb (20).java, Actual Label: 0, Predicted Label: 0
File: nonb (17).java, Actual Label

**Prototyope with different settings**

In [None]:
import os
import torch
import numpy as np
import time  # Import time for measuring execution time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Check if GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your Java programs from a directory
java_code_dir = "/content/prototype"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to GPU
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move result back to CPU
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Measure the total execution time
start_time = time.time()

# Measure the training time (embedding generation)
start_training_time = time.time()

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "prototype" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Measure the end of training time
end_training_time = time.time()
training_time_ms = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

# Measure the prediction time
prediction_start_time = time.time()

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Measure the end of prediction time
prediction_end_time = time.time()
prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Measure total execution time (from the start of the script to the end of prediction)
overall_end_time = time.time()
execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
print(f"Total Execution Time (Training + Prediction + Misc.): {execution_time_ms:.2f} ms")


Using device: cuda




Training (Embedding Generation) Time: 152617.35 ms
Prediction Time: 10.73 ms
File: prototype (8).java, Actual Label: 1, Predicted Label: 1
File: nonp (28).java, Actual Label: 0, Predicted Label: 0
File: prototype (22).java, Actual Label: 1, Predicted Label: 0
File: prototype (27).java, Actual Label: 1, Predicted Label: 1
File: prototype (31).java, Actual Label: 1, Predicted Label: 1
File: prototype (19).java, Actual Label: 1, Predicted Label: 1
File: nonp (29).java, Actual Label: 0, Predicted Label: 0
File: prototype (20).java, Actual Label: 1, Predicted Label: 1
File: nonp (16).java, Actual Label: 0, Predicted Label: 0
File: nonp (4).java, Actual Label: 0, Predicted Label: 0
File: prototype (17).java, Actual Label: 1, Predicted Label: 1
File: prototype (12).java, Actual Label: 1, Predicted Label: 1
File: prototype (9).java, Actual Label: 1, Predicted Label: 1
File: nonp (18).java, Actual Label: 0, Predicted Label: 0
File: nonp (14).java, Actual Label: 0, Predicted Label: 0
File: proto

**Prototype with different settings**

In [None]:
import os
import torch
import numpy as np
import time  # Import time for measuring execution time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Check if GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your Java programs from a directory
java_code_dir = "/content/prototype"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to GPU
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move result back to CPU
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Measure the total execution time
start_time = time.time()

# Measure the training time (embedding generation)
start_training_time = time.time()

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "prototype" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Measure the end of training time
end_training_time = time.time()
training_time_ms = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

# Measure the prediction time
prediction_start_time = time.time()

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Measure the end of prediction time
prediction_end_time = time.time()
prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Measure total execution time (from the start of the script to the end of prediction)
overall_end_time = time.time()
execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
print(f"Total Execution Time (Training + Prediction + Misc.): {execution_time_ms:.2f} ms")


Using device: cuda




Training (Embedding Generation) Time: 153786.84 ms
Prediction Time: 3.82 ms
File: prototype (8).java, Actual Label: 1, Predicted Label: 1
File: prototype (22).java, Actual Label: 1, Predicted Label: 0
File: prototype (27).java, Actual Label: 1, Predicted Label: 1
File: prototype (31).java, Actual Label: 1, Predicted Label: 1
File: prototype (19).java, Actual Label: 1, Predicted Label: 1
File: nonp (42).java, Actual Label: 0, Predicted Label: 0
File: nonp (29).java, Actual Label: 0, Predicted Label: 0
File: prototype (20).java, Actual Label: 1, Predicted Label: 1
File: nonp (48).java, Actual Label: 0, Predicted Label: 0
File: nonp (35).java, Actual Label: 0, Predicted Label: 0
File: prototype (17).java, Actual Label: 1, Predicted Label: 1
File: prototype (12).java, Actual Label: 1, Predicted Label: 0
File: prototype (9).java, Actual Label: 1, Predicted Label: 1
File: nonp (36).java, Actual Label: 0, Predicted Label: 0
File: prototype (29).java, Actual Label: 1, Predicted Label: 0
File: 

**Prototype with different settings**

In [None]:
import os
import torch
import numpy as np
import time  # Import time for measuring execution time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Check if GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your Java programs from a directory
java_code_dir = "/content/prototype"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to GPU
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move result back to CPU
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Measure the total execution time
start_time = time.time()

# Measure the training time (embedding generation)
start_training_time = time.time()

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "prototype" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Measure the end of training time
end_training_time = time.time()
training_time_ms = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

# Measure the prediction time
prediction_start_time = time.time()

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Measure the end of prediction time
prediction_end_time = time.time()
prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Measure total execution time (from the start of the script to the end of prediction)
overall_end_time = time.time()
execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
print(f"Total Execution Time (Training + Prediction + Misc.): {execution_time_ms:.2f} ms")


Using device: cuda




Training (Embedding Generation) Time: 164954.88 ms
Prediction Time: 3.67 ms
File: prototype (8).java, Actual Label: 1, Predicted Label: 1
File: nonp (28).java, Actual Label: 0, Predicted Label: 0
File: prototype (22).java, Actual Label: 1, Predicted Label: 0
File: prototype (27).java, Actual Label: 1, Predicted Label: 1
File: prototype (31).java, Actual Label: 1, Predicted Label: 1
File: prototype (19).java, Actual Label: 1, Predicted Label: 1
File: nonp (29).java, Actual Label: 0, Predicted Label: 0
File: prototype (20).java, Actual Label: 1, Predicted Label: 1
File: nonp (16).java, Actual Label: 0, Predicted Label: 0
File: nonp (35).java, Actual Label: 0, Predicted Label: 0
File: prototype (17).java, Actual Label: 1, Predicted Label: 1
File: prototype (12).java, Actual Label: 1, Predicted Label: 1
File: prototype (9).java, Actual Label: 1, Predicted Label: 1
File: nonp (18).java, Actual Label: 0, Predicted Label: 0
File: nonp (14).java, Actual Label: 0, Predicted Label: 0
File: nonp 

**Prototype with different settings**

In [None]:
import os
import torch
import numpy as np
import time  # Import time for measuring execution time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Check if GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your Java programs from a directory
java_code_dir = "/content/prototype"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to GPU
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move result back to CPU
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Measure the total execution time
start_time = time.time()

# Measure the training time (embedding generation)
start_training_time = time.time()

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "prototype" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Measure the end of training time
end_training_time = time.time()
training_time_ms = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

# Measure the prediction time
prediction_start_time = time.time()

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Measure the end of prediction time
prediction_end_time = time.time()
prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Measure total execution time (from the start of the script to the end of prediction)
overall_end_time = time.time()
execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
print(f"Total Execution Time (Training + Prediction + Misc.): {execution_time_ms:.2f} ms")


Using device: cuda




Training (Embedding Generation) Time: 175072.66 ms
Prediction Time: 4.65 ms
File: prototype (8).java, Actual Label: 1, Predicted Label: 1
File: nonp (28).java, Actual Label: 0, Predicted Label: 0
File: prototype (22).java, Actual Label: 1, Predicted Label: 0
File: prototype (27).java, Actual Label: 1, Predicted Label: 1
File: prototype (31).java, Actual Label: 1, Predicted Label: 1
File: prototype (19).java, Actual Label: 1, Predicted Label: 1
File: nonp (29).java, Actual Label: 0, Predicted Label: 0
File: prototype (20).java, Actual Label: 1, Predicted Label: 1
File: nonp (16).java, Actual Label: 0, Predicted Label: 0
File: nonp (35).java, Actual Label: 0, Predicted Label: 0
File: prototype (17).java, Actual Label: 1, Predicted Label: 1
File: prototype (12).java, Actual Label: 1, Predicted Label: 1
File: prototype (9).java, Actual Label: 1, Predicted Label: 1
File: nonp (18).java, Actual Label: 0, Predicted Label: 0
File: nonp (14).java, Actual Label: 0, Predicted Label: 0
File: nonp 

**Prototype with different settings**

In [None]:
import os
import torch
import numpy as np
import time  # Import time for measuring execution time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Check if GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your Java programs from a directory
java_code_dir = "/content/prototype"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to GPU
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move result back to CPU
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Measure the total execution time
start_time = time.time()

# Measure the training time (embedding generation)
start_training_time = time.time()

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "prototype" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Measure the end of training time
end_training_time = time.time()
training_time_ms = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

# Measure the prediction time
prediction_start_time = time.time()

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Measure the end of prediction time
prediction_end_time = time.time()
prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Measure total execution time (from the start of the script to the end of prediction)
overall_end_time = time.time()
execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
print(f"Total Execution Time (Training + Prediction + Misc.): {execution_time_ms:.2f} ms")


Using device: cuda




Training (Embedding Generation) Time: 134111.86 ms
Prediction Time: 3.49 ms
File: prototype (8).java, Actual Label: 1, Predicted Label: 1
File: nonp (28).java, Actual Label: 0, Predicted Label: 0
File: prototype (22).java, Actual Label: 1, Predicted Label: 0
File: prototype (27).java, Actual Label: 1, Predicted Label: 1
File: prototype (31).java, Actual Label: 1, Predicted Label: 1
File: prototype (19).java, Actual Label: 1, Predicted Label: 1
File: nonp (42).java, Actual Label: 0, Predicted Label: 0
File: nonp (29).java, Actual Label: 0, Predicted Label: 0
File: prototype (20).java, Actual Label: 1, Predicted Label: 1
File: prototype (17).java, Actual Label: 1, Predicted Label: 1
File: prototype (12).java, Actual Label: 1, Predicted Label: 0
File: prototype (9).java, Actual Label: 1, Predicted Label: 1
File: prototype (29).java, Actual Label: 1, Predicted Label: 0
File: nonp (45).java, Actual Label: 0, Predicted Label: 0
File: nonp (52).java, Actual Label: 0, Predicted Label: 1
File: 

**Abstract Factory with different settings**

In [None]:
import os
import torch
import numpy as np
import time  # Import time for measuring execution time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Check if GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your Java programs from a directory
java_code_dir = "/content/abstractfactory"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to GPU
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move result back to CPU
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Measure the total execution time
start_time = time.time()

# Measure the training time (embedding generation)
start_training_time = time.time()

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "abstractfactory" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Measure the end of training time
end_training_time = time.time()
training_time_ms = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

# Measure the prediction time
prediction_start_time = time.time()

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Measure the end of prediction time
prediction_end_time = time.time()
prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Measure total execution time (from the start of the script to the end of prediction)
overall_end_time = time.time()
execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
print(f"Total Execution Time (Training + Prediction + Misc.): {execution_time_ms:.2f} ms")


Using device: cuda




Training (Embedding Generation) Time: 133009.30 ms
Prediction Time: 3.20 ms
File: nonab (10).java, Actual Label: 0, Predicted Label: 0
File: nonab (14).java, Actual Label: 0, Predicted Label: 0
File: nonab (15).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (8).java, Actual Label: 1, Predicted Label: 1
File: nonab (4).java, Actual Label: 0, Predicted Label: 1
File: abstractfactory (1).java, Actual Label: 1, Predicted Label: 1
File: nonab (9).java, Actual Label: 0, Predicted Label: 0
File: nonab (6).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (13).java, Actual Label: 1, Predicted Label: 0
File: abstractfactory (16).java, Actual Label: 1, Predicted Label: 1
File: nonab (17).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (9).java, Actual Label: 1, Predicted Label: 1
File: nonab (16).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (3).java, Actual Label: 1, Predicted Label: 1
File: nonab (13).java, Actual Label: 0, Predi

**Abstract Factory with different settings**

In [None]:
import os
import torch
import numpy as np
import time  # Import time for measuring execution time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Check if GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your Java programs from a directory
java_code_dir = "/content/abstractfactory"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to GPU
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move result back to CPU
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Measure the total execution time
start_time = time.time()

# Measure the training time (embedding generation)
start_training_time = time.time()

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "abstractfactory" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Measure the end of training time
end_training_time = time.time()
training_time_ms = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

# Measure the prediction time
prediction_start_time = time.time()

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Measure the end of prediction time
prediction_end_time = time.time()
prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Measure total execution time (from the start of the script to the end of prediction)
overall_end_time = time.time()
execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
print(f"Total Execution Time (Training + Prediction + Misc.): {execution_time_ms:.2f} ms")


Using device: cuda




Training (Embedding Generation) Time: 85384.84 ms
Prediction Time: 3.83 ms
File: nonab (76).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (8).java, Actual Label: 1, Predicted Label: 1
File: nonab (73).java, Actual Label: 0, Predicted Label: 0
File: nonab (71).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (1).java, Actual Label: 1, Predicted Label: 1
File: nonab (70).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (13).java, Actual Label: 1, Predicted Label: 1
File: nonab (74).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (16).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (9).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (3).java, Actual Label: 1, Predicted Label: 1
File: nonab (69).java, Actual Label: 0, Predicted Label: 0
File: nonab (67).java, Actual Label: 0, Predicted Label: 1
File: abstractfactory (11).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (2).java, A

**Abstract factory with different settings**

In [None]:
import os
import torch
import numpy as np
import time  # Import time for measuring execution time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Check if GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your Java programs from a directory
java_code_dir = "/content/abstractfactory"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to GPU
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move result back to CPU
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Measure the total execution time
start_time = time.time()

# Measure the training time (embedding generation)
start_training_time = time.time()

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "abstractfactory" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Measure the end of training time
end_training_time = time.time()
training_time_ms = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

# Measure the prediction time
prediction_start_time = time.time()

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Measure the end of prediction time
prediction_end_time = time.time()
prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Measure total execution time (from the start of the script to the end of prediction)
overall_end_time = time.time()
execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
print(f"Total Execution Time (Training + Prediction + Misc.): {execution_time_ms:.2f} ms")


Using device: cuda




Training (Embedding Generation) Time: 95415.50 ms
Prediction Time: 3.61 ms
File: nonab (83).java, Actual Label: 0, Predicted Label: 0
File: nonab (76).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (8).java, Actual Label: 1, Predicted Label: 1
File: nonab (80).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (1).java, Actual Label: 1, Predicted Label: 1
File: nonab (81).java, Actual Label: 0, Predicted Label: 1
File: nonab (86).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (13).java, Actual Label: 1, Predicted Label: 1
File: nonab (74).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (16).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (9).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (3).java, Actual Label: 1, Predicted Label: 1
File: nonab (87).java, Actual Label: 0, Predicted Label: 1
File: abstractfactory (11).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (2).java, A

**Abstract Factory using different settings**

In [None]:
import os
import torch
import numpy as np
import time  # Import time for measuring execution time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Check if GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your Java programs from a directory
java_code_dir = "/content/abstractfactory"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to GPU
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move result back to CPU
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Measure the total execution time
start_time = time.time()

# Measure the training time (embedding generation)
start_training_time = time.time()

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "abstractfactory" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Measure the end of training time
end_training_time = time.time()
training_time_ms = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

# Measure the prediction time
prediction_start_time = time.time()

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Measure the end of prediction time
prediction_end_time = time.time()
prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Measure total execution time (from the start of the script to the end of prediction)
overall_end_time = time.time()
execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
print(f"Total Execution Time (Training + Prediction + Misc.): {execution_time_ms:.2f} ms")


Using device: cuda




Training (Embedding Generation) Time: 98124.97 ms
Prediction Time: 2.91 ms
File: nonab (76).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (8).java, Actual Label: 1, Predicted Label: 1
File: nonab (63).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (1).java, Actual Label: 1, Predicted Label: 1
File: nonab (81).java, Actual Label: 0, Predicted Label: 1
File: abstractfactory (13).java, Actual Label: 1, Predicted Label: 1
File: nonab (34).java, Actual Label: 0, Predicted Label: 0
File: nonab (74).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (16).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (9).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (3).java, Actual Label: 1, Predicted Label: 1
File: nonab (69).java, Actual Label: 0, Predicted Label: 0
File: nonab (52).java, Actual Label: 0, Predicted Label: 1
File: abstractfactory (11).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (2).java, A

**abstract factory with different settings**

In [None]:
import os
import torch
import numpy as np
import time  # Import time for measuring execution time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Check if GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your Java programs from a directory
java_code_dir = "/content/abstractfactory"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to GPU
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move result back to CPU
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Measure the total execution time
start_time = time.time()

# Measure the training time (embedding generation)
start_training_time = time.time()

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "abstractfactory" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Measure the end of training time
end_training_time = time.time()
training_time_ms = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

# Measure the prediction time
prediction_start_time = time.time()

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Measure the end of prediction time
prediction_end_time = time.time()
prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Measure total execution time (from the start of the script to the end of prediction)
overall_end_time = time.time()
execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
print(f"Total Execution Time (Training + Prediction + Misc.): {execution_time_ms:.2f} ms")


Using device: cuda




Training (Embedding Generation) Time: 50682.77 ms
Prediction Time: 3.28 ms
File: nonab (50).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (8).java, Actual Label: 1, Predicted Label: 1
File: nonab (33).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (1).java, Actual Label: 1, Predicted Label: 1
File: nonab (51).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (13).java, Actual Label: 1, Predicted Label: 1
File: nonab (34).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (16).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (9).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (3).java, Actual Label: 1, Predicted Label: 1
File: nonab (52).java, Actual Label: 0, Predicted Label: 1
File: nonab (49).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (11).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (2).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (1

**Abstract Factory with different settiings**

In [None]:
import os
import torch
import numpy as np
import time  # Import time for measuring execution time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Check if GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your Java programs from a directory
java_code_dir = "/content/abstractfactory"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to GPU
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move result back to CPU
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Measure the total execution time
start_time = time.time()

# Measure the training time (embedding generation)
start_training_time = time.time()

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "abstractfactory" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Measure the end of training time
end_training_time = time.time()
training_time_ms = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

# Measure the prediction time
prediction_start_time = time.time()

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Measure the end of prediction time
prediction_end_time = time.time()
prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Measure total execution time (from the start of the script to the end of prediction)
overall_end_time = time.time()
execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
print(f"Total Execution Time (Training + Prediction + Misc.): {execution_time_ms:.2f} ms")


Using device: cuda




Training (Embedding Generation) Time: 96412.41 ms
Prediction Time: 3.07 ms
File: nonab (83).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (8).java, Actual Label: 1, Predicted Label: 1
File: nonab (80).java, Actual Label: 0, Predicted Label: 0
File: nonab (73).java, Actual Label: 0, Predicted Label: 0
File: nonab (71).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (1).java, Actual Label: 1, Predicted Label: 1
File: nonab (70).java, Actual Label: 0, Predicted Label: 0
File: nonab (81).java, Actual Label: 0, Predicted Label: 1
File: nonab (86).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (13).java, Actual Label: 1, Predicted Label: 1
File: nonab (74).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (16).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (9).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (3).java, Actual Label: 1, Predicted Label: 1
File: nonab (69).java, Actual Label: 0, Pre

In [None]:
import os
import torch
import numpy as np
import time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Function to calculate standard deviation
def calculate_std(values):
    return np.std(values)

# Check if GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your Java programs from a directory
java_code_dir = "abstractfactory"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to GPU
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move result back to CPU
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Running the experiment multiple times to calculate mean and standard deviation
n_runs = 10  # Change this value to 10-30 based on reviewer comment
precision_values = []
recall_values = []
f1_values = []

for run in range(n_runs):
    print(f"Run {run + 1} of {n_runs}")

    program_embeddings = []
    true_labels = []

    # Measure training time for each run
    start_training_time = time.time()

    for file in java_files:
        with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
            java_code = f.read()

            # Get the embedding of the program by taking the mean of line embeddings
            program_embedding = get_line_embeddings(java_code)
            program_embeddings.append(program_embedding)

            # Define true labels based on the file names
            true_labels.append(1 if "abstractfactory" in file else 0)

    # Convert program_embeddings to a NumPy array
    program_embeddings = np.array(program_embeddings)

    # Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
    k = 5  # Adjust this value as needed
    neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
    neighbors.fit(program_embeddings)
    _, indices = neighbors.kneighbors(program_embeddings)

    # Initialize arrays to store actual and predicted labels
    actual_labels = np.array(true_labels)
    predicted_labels = np.zeros_like(actual_labels)

    # Predict labels for each program
    for i in range(len(java_files)):
        # Get the indices of the k-nearest neighbors (excluding the program itself)
        neighbor_indices = indices[i, 1:]

        # Get the labels of the neighbors
        neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

        # Assign the majority label to the program
        predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
        predicted_labels[i] = predicted_label

    # Calculate precision, recall, and F1 score for this run
    precision = precision_score(actual_labels, predicted_labels, average='weighted')
    recall = recall_score(actual_labels, predicted_labels, average='weighted')
    f1 = f1_score(actual_labels, predicted_labels, average='weighted')

    precision_values.append(precision)
    recall_values.append(recall)
    f1_values.append(f1)

# Calculate mean and standard deviation for precision, recall, and F1 score
precision_mean = np.mean(precision_values)
recall_mean = np.mean(recall_values)
f1_mean = np.mean(f1_values)

precision_std = calculate_std(precision_values)
recall_std = calculate_std(recall_values)
f1_std = calculate_std(f1_values)

# Print results
print(f"\nMean Precision: {precision_mean:.2f} (±{precision_std:.2f})")
print(f"Mean Recall: {recall_mean:.2f} (±{recall_std:.2f})")
print(f"Mean F1 Score: {f1_mean:.2f} (±{f1_std:.2f})")


Using device: cuda




Run 1 of 10
Run 2 of 10
Run 3 of 10
Run 4 of 10
Run 5 of 10
Run 6 of 10
Run 7 of 10
Run 8 of 10
Run 9 of 10
Run 10 of 10

Mean Precision: 0.87 (±0.00)
Mean Recall: 0.85 (±0.00)
Mean F1 Score: 0.85 (±0.00)


In [None]:
import os
import torch
import numpy as np
import time  # Import time for measuring execution time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors
import random

# Function to calculate standard deviation
def calculate_std(values):
    return np.std(values)

# Check if GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your Java programs from a directory
java_code_dir = "/content/builder"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to GPU
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move result back to CPU
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Separate positive and negative class files
positive_files = [file for file in java_files if "builder" in file.lower()]  # Positive class contains 'builder'
negative_files = [file for file in java_files if "builder" not in file.lower()]  # Negative class

# Running the experiment multiple times to calculate mean and standard deviation
n_runs = 10  # Number of iterations (adjust as needed)
precision_values = []
recall_values = []
f1_values = []

for run in range(n_runs):
    print(f"Run {run + 1} of {n_runs}")

    # Randomly sample the same number of positive and negative examples
    num_positive = len(positive_files)
    sampled_negative_files = random.sample(negative_files, num_positive)

    sampled_files = positive_files + sampled_negative_files
    random.shuffle(sampled_files)  # Shuffle to avoid any ordering bias

    program_embeddings = []
    true_labels = []

    # Measure the total execution time
    start_time = time.time()

    # Measure the training time (embedding generation)
    start_training_time = time.time()

    for file in sampled_files:
        with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
            java_code = f.read()

            # Get the embedding of the program by taking the mean of line embeddings
            program_embedding = get_line_embeddings(java_code)
            program_embeddings.append(program_embedding)

            # Define true labels (1 for positive, 0 for negative)
            true_labels.append(1 if "builder" in file.lower() else 0)

    # Convert program_embeddings to a NumPy array
    program_embeddings = np.array(program_embeddings)

    # Measure the end of training time
    end_training_time = time.time()
    training_time_ms = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
    print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

    # Measure the prediction time
    prediction_start_time = time.time()

    # Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
    k = 5  # Adjust this value as needed
    neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
    neighbors.fit(program_embeddings)
    _, indices = neighbors.kneighbors(program_embeddings)

    # Initialize arrays to store actual and predicted labels
    actual_labels = np.array(true_labels)
    predicted_labels = np.zeros_like(actual_labels)

    # Predict labels for each program
    for i in range(len(sampled_files)):
        # Get the indices of the k-nearest neighbors (excluding the program itself)
        neighbor_indices = indices[i, 1:]

        # Get the labels of the neighbors
        neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

        # Assign the majority label to the program
        predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
        predicted_labels[i] = predicted_label

    # Measure the end of prediction time
    prediction_end_time = time.time()
    prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
    print(f"Prediction Time: {prediction_time_ms:.2f} ms")

    # Calculate precision, recall, and F1 score for this run
    precision = precision_score(actual_labels, predicted_labels, average='weighted')
    recall = recall_score(actual_labels, predicted_labels, average='weighted')
    f1 = f1_score(actual_labels, predicted_labels, average='weighted')

    precision_values.append(precision)
    recall_values.append(recall)
    f1_values.append(f1)

    # Measure total execution time (from the start of the script to the end of prediction)
    overall_end_time = time.time()
    execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
    print(f"Total Execution Time (Training + Prediction + Misc.): {execution_time_ms:.2f} ms")

# Calculate mean and standard deviation for precision, recall, and F1 score
precision_mean = np.mean(precision_values)
recall_mean = np.mean(recall_values)
f1_mean = np.mean(f1_values)

precision_std = calculate_std(precision_values)
recall_std = calculate_std(recall_values)
f1_std = calculate_std(f1_values)

# Print results
print(f"\nMean Precision: {precision_mean:.2f} (±{precision_std:.2f})")
print(f"Mean Recall: {recall_mean:.2f} (±{recall_std:.2f})")
print(f"Mean F1 Score: {f1_mean:.2f} (±{f1_std:.2f})")


Using device: cuda




Run 1 of 10
Training (Embedding Generation) Time: 44903.97 ms
Prediction Time: 1.49 ms
Total Execution Time (Training + Prediction + Misc.): 44913.16 ms
Run 2 of 10
Training (Embedding Generation) Time: 40218.23 ms
Prediction Time: 2.11 ms
Total Execution Time (Training + Prediction + Misc.): 40226.85 ms
Run 3 of 10
Training (Embedding Generation) Time: 42333.74 ms
Prediction Time: 2.63 ms
Total Execution Time (Training + Prediction + Misc.): 42342.60 ms
Run 4 of 10
Training (Embedding Generation) Time: 39591.39 ms
Prediction Time: 1.60 ms
Total Execution Time (Training + Prediction + Misc.): 39601.08 ms
Run 5 of 10
Training (Embedding Generation) Time: 40380.98 ms
Prediction Time: 2.48 ms
Total Execution Time (Training + Prediction + Misc.): 40390.18 ms
Run 6 of 10
Training (Embedding Generation) Time: 39603.42 ms
Prediction Time: 2.06 ms
Total Execution Time (Training + Prediction + Misc.): 39614.12 ms
Run 7 of 10
Training (Embedding Generation) Time: 38297.19 ms
Prediction Time: 2.6

In [None]:
import shutil

# Replace 'your_directory_path' with the path to the directory you want to delete
shutil.rmtree('/content/builder')


In [None]:
import os
import torch
import numpy as np
import time
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors
import random
from sklearn.decomposition import PCA  # Optional for dimensionality reduction

# Function to calculate standard deviation
def calculate_std(values):
    return np.std(values)

# Check if GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load your Java programs from a directory
java_code_dir = "/content/builder"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to GPU
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move result back to CPU
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

# Separate positive and negative class files
positive_files = [file for file in java_files if "builder" in file.lower()]  # Positive class contains 'builder'
negative_files = [file for file in java_files if "builder" not in file.lower()]  # Negative class

# Running the experiment multiple times to calculate mean and standard deviation
n_runs = 50  # Increase the number of iterations to reduce variance
precision_values = []
recall_values = []
f1_values = []

for run in range(n_runs):
    print(f"Run {run + 1} of {n_runs}")

    # Randomly sample the same number of positive and negative examples
    num_positive = len(positive_files)
    sampled_negative_files = random.sample(negative_files, num_positive)

    sampled_files = positive_files + sampled_negative_files
    random.shuffle(sampled_files)  # Shuffle to avoid any ordering bias

    program_embeddings = []
    true_labels = []

    # Measure the total execution time
    start_time = time.time()

    # Measure the training time (embedding generation)
    start_training_time = time.time()

    for file in sampled_files:
        with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
            java_code = f.read()

            # Get the embedding of the program by taking the mean of line embeddings
            program_embedding = get_line_embeddings(java_code)
            program_embeddings.append(program_embedding)

            # Define true labels (1 for positive, 0 for negative)
            true_labels.append(1 if "builder" in file.lower() else 0)

    # Convert program_embeddings to a NumPy array
    program_embeddings = np.array(program_embeddings)

    # Optional: Apply PCA to reduce the dimensionality of the embeddings
    pca = PCA(n_components=50)  # Reduce to 50 dimensions (adjust based on experiments)
    program_embeddings = pca.fit_transform(program_embeddings)

    # Measure the end of training time
    end_training_time = time.time()
    training_time_ms = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
    print(f"Training (Embedding Generation) Time: {training_time_ms:.2f} ms")

    # Measure the prediction time
    prediction_start_time = time.time()

    # Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
    k = 5  # Adjust this value as needed
    neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
    neighbors.fit(program_embeddings)
    _, indices = neighbors.kneighbors(program_embeddings)

    # Initialize arrays to store actual and predicted labels
    actual_labels = np.array(true_labels)
    predicted_labels = np.zeros_like(actual_labels)

    # Predict labels for each program
    for i in range(len(sampled_files)):
        # Get the indices of the k-nearest neighbors (excluding the program itself)
        neighbor_indices = indices[i, 1:]

        # Get the labels of the neighbors
        neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

        # Assign the majority label to the program
        predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
        predicted_labels[i] = predicted_label

    # Measure the end of prediction time
    prediction_end_time = time.time()
    prediction_time_ms = (prediction_end_time - prediction_start_time) * 1000  # Convert to milliseconds
    print(f"Prediction Time: {prediction_time_ms:.2f} ms")

    # Calculate precision, recall, and F1 score for this run
    precision = precision_score(actual_labels, predicted_labels, average='weighted')
    recall = recall_score(actual_labels, predicted_labels, average='weighted')
    f1 = f1_score(actual_labels, predicted_labels, average='weighted')

    precision_values.append(precision)
    recall_values.append(recall)
    f1_values.append(f1)

    # Measure total execution time (from the start of the script to the end of prediction)
    overall_end_time = time.time()
    execution_time_ms = (overall_end_time - start_time) * 1000  # Convert to milliseconds
    print(f"Total Execution Time (Training + Prediction + Misc.): {execution_time_ms:.2f} ms")

# Calculate mean and standard deviation for precision, recall, and F1 score
precision_mean = np.mean(precision_values)
recall_mean = np.mean(recall_values)
f1_mean = np.mean(f1_values)

precision_std = calculate_std(precision_values)
recall_std = calculate_std(recall_values)
f1_std = calculate_std(f1_values)

# Print results
print(f"\nMean Precision: {precision_mean:.2f} (±{precision_std:.2f})")
print(f"Mean Recall: {recall_mean:.2f} (±{recall_std:.2f})")
print(f"Mean F1 Score: {f1_mean:.2f} (±{f1_std:.2f})")


Using device: cpu


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]



Run 1 of 50


ValueError: n_components=50 must be between 0 and min(n_samples, n_features)=18 with svd_solver='full'