In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "singleton"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "singleton" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and F1 score
precision = precision_score(true_labels, predicted_labels, average='weighted')
recall = recall_score(true_labels, predicted_labels, average='weighted')
f1 = f1_score(true_labels, predicted_labels, average='weighted')

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


In [1]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "singleton"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "singleton" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nons (54).java, Actual Label: 0, Predicted Label: 0
File: nons (25).java, Actual Label: 0, Predicted Label: 1
File: nons (18).java, Actual Label: 0, Predicted Label: 0
File: singleton (25).java, Actual Label: 1, Predicted Label: 0
File: nons (27).java, Actual Label: 0, Predicted Label: 0
File: singleton (18).java, Actual Label: 1, Predicted Label: 1
File: nons (19).java, Actual Label: 0, Predicted Label: 1
File: nons (55).java, Actual Label: 0, Predicted Label: 0
File: nons (34).java, Actual Label: 0, Predicted Label: 0
File: singleton (24).java, Actual Label: 1, Predicted Label: 0
File: nons (68).java, Actual Label: 0, Predicted Label: 0
File: singleton (9).java, Actual Label: 1, Predicted Label: 0
File: singleton (10).java, Actual Label: 1, Predicted Label: 1
File: singleton (14).java, Actual Label: 1, Predicted Label: 1
File: nons (33).java, Actual Label: 0, Predicted Label: 0
File: nons (3).java, Actual Label: 0, Predicted Label: 0
File: nons (30).java, Actual Label: 0, Predi

In [None]:
Singleton with different settings

In [1]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "singleton"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "singleton" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nons (54).java, Actual Label: 0, Predicted Label: 0
File: nons (25).java, Actual Label: 0, Predicted Label: 1
File: nons (18).java, Actual Label: 0, Predicted Label: 0
File: singleton (25).java, Actual Label: 1, Predicted Label: 0
File: nons (27).java, Actual Label: 0, Predicted Label: 0
File: singleton (18).java, Actual Label: 1, Predicted Label: 1
File: nons (19).java, Actual Label: 0, Predicted Label: 1
File: nons (55).java, Actual Label: 0, Predicted Label: 0
File: nons (34).java, Actual Label: 0, Predicted Label: 0
File: singleton (24).java, Actual Label: 1, Predicted Label: 0
File: nons (68).java, Actual Label: 0, Predicted Label: 0
File: singleton (9).java, Actual Label: 1, Predicted Label: 0
File: singleton (10).java, Actual Label: 1, Predicted Label: 1
File: singleton (14).java, Actual Label: 1, Predicted Label: 1
File: nons (33).java, Actual Label: 0, Predicted Label: 0
File: nons (3).java, Actual Label: 0, Predicted Label: 0
File: nons (30).java, Actual Label: 0, Predi

In [None]:
Singleton with different settings

In [2]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "singleton"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "singleton" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nons (50).java, Actual Label: 0, Predicted Label: 1
File: singleton (25).java, Actual Label: 1, Predicted Label: 0
File: singleton (18).java, Actual Label: 1, Predicted Label: 1
File: singleton (24).java, Actual Label: 1, Predicted Label: 1
File: nons (4).java, Actual Label: 0, Predicted Label: 0
File: singleton (9).java, Actual Label: 1, Predicted Label: 1
File: nons (44).java, Actual Label: 0, Predicted Label: 0
File: nons (38).java, Actual Label: 0, Predicted Label: 0
File: singleton (10).java, Actual Label: 1, Predicted Label: 1
File: singleton (14).java, Actual Label: 1, Predicted Label: 1
File: nons (41).java, Actual Label: 0, Predicted Label: 0
File: singleton (1).java, Actual Label: 1, Predicted Label: 0
File: singleton (3).java, Actual Label: 1, Predicted Label: 1
File: singleton (23).java, Actual Label: 1, Predicted Label: 1
File: singleton (16).java, Actual Label: 1, Predicted Label: 1
File: singleton (13).java, Actual Label: 1, Predicted Label: 1
File: singleton (11).

In [None]:
Singleton with different settings

In [3]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "singleton"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "singleton" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nons (54).java, Actual Label: 0, Predicted Label: 0
File: singleton (25).java, Actual Label: 1, Predicted Label: 0
File: nons (27).java, Actual Label: 0, Predicted Label: 0
File: singleton (18).java, Actual Label: 1, Predicted Label: 1
File: nons (19).java, Actual Label: 0, Predicted Label: 1
File: nons (34).java, Actual Label: 0, Predicted Label: 0
File: singleton (24).java, Actual Label: 1, Predicted Label: 1
File: singleton (9).java, Actual Label: 1, Predicted Label: 0
File: singleton (10).java, Actual Label: 1, Predicted Label: 1
File: singleton (14).java, Actual Label: 1, Predicted Label: 1
File: nons (33).java, Actual Label: 0, Predicted Label: 0
File: nons (3).java, Actual Label: 0, Predicted Label: 0
File: nons (30).java, Actual Label: 0, Predicted Label: 0
File: singleton (1).java, Actual Label: 1, Predicted Label: 0
File: singleton (3).java, Actual Label: 1, Predicted Label: 0
File: nons (53).java, Actual Label: 0, Predicted Label: 0
File: singleton (23).java, Actual La

In [1]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "singleton"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "singleton" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nons (54).java, Actual Label: 0, Predicted Label: 1
File: singleton (25).java, Actual Label: 1, Predicted Label: 0
File: nons (27).java, Actual Label: 0, Predicted Label: 1
File: singleton (18).java, Actual Label: 1, Predicted Label: 1
File: nons (19).java, Actual Label: 0, Predicted Label: 1
File: nons (34).java, Actual Label: 0, Predicted Label: 0
File: singleton (24).java, Actual Label: 1, Predicted Label: 1
File: singleton (9).java, Actual Label: 1, Predicted Label: 0
File: singleton (10).java, Actual Label: 1, Predicted Label: 1
File: singleton (14).java, Actual Label: 1, Predicted Label: 1
File: nons (3).java, Actual Label: 0, Predicted Label: 0
File: singleton (1).java, Actual Label: 1, Predicted Label: 0
File: singleton (3).java, Actual Label: 1, Predicted Label: 0
File: nons (53).java, Actual Label: 0, Predicted Label: 0
File: singleton (23).java, Actual Label: 1, Predicted Label: 0
File: nons (17).java, Actual Label: 0, Predicted Label: 0
File: singleton (16).java, Actu

In [None]:
Builder with different settings

In [4]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "builder"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "builder" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nonb (14).java, Actual Label: 0, Predicted Label: 0
File: builder (2).java, Actual Label: 1, Predicted Label: 1
File: nonb (41).java, Actual Label: 0, Predicted Label: 0
File: nonb (5).java, Actual Label: 0, Predicted Label: 0
File: builder (5).java, Actual Label: 1, Predicted Label: 1
File: builder (6).java, Actual Label: 1, Predicted Label: 1
File: nonb (110).java, Actual Label: 0, Predicted Label: 0
File: nonb (18).java, Actual Label: 0, Predicted Label: 0
File: nonb (17).java, Actual Label: 0, Predicted Label: 0
File: builder (3).java, Actual Label: 1, Predicted Label: 1
File: nonb (85).java, Actual Label: 0, Predicted Label: 1
File: builder (7).java, Actual Label: 1, Predicted Label: 1
File: nonb (92).java, Actual Label: 0, Predicted Label: 1
File: builder (9).java, Actual Label: 1, Predicted Label: 1
File: nonb (13).java, Actual Label: 0, Predicted Label: 0
File: builder (8).java, Actual Label: 1, Predicted Label: 1
File: nonb (23).java, Actual Label: 0, Predicted Label: 0


In [None]:
#Builder with different settings

In [5]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "builder"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "builder" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nonb (14).java, Actual Label: 0, Predicted Label: 0
File: builder (2).java, Actual Label: 1, Predicted Label: 1
File: builder (5).java, Actual Label: 1, Predicted Label: 0
File: builder (6).java, Actual Label: 1, Predicted Label: 1
File: nonb (110).java, Actual Label: 0, Predicted Label: 0
File: nonb (18).java, Actual Label: 0, Predicted Label: 0
File: nonb (17).java, Actual Label: 0, Predicted Label: 0
File: builder (3).java, Actual Label: 1, Predicted Label: 1
File: builder (7).java, Actual Label: 1, Predicted Label: 0
File: builder (9).java, Actual Label: 1, Predicted Label: 1
File: nonb (13).java, Actual Label: 0, Predicted Label: 0
File: nonb (119).java, Actual Label: 0, Predicted Label: 1
File: builder (8).java, Actual Label: 1, Predicted Label: 0
File: nonb (23).java, Actual Label: 0, Predicted Label: 0
File: builder (1).java, Actual Label: 1, Predicted Label: 0
File: builder (4).java, Actual Label: 1, Predicted Label: 0
File: nonb (93).java, Actual Label: 0, Predicted Lab

In [None]:
#Builder with different settings

In [6]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "builder"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "builder" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: builder (2).java, Actual Label: 1, Predicted Label: 1
File: nonb (89).java, Actual Label: 0, Predicted Label: 0
File: nonb (49).java, Actual Label: 0, Predicted Label: 0
File: nonb (28).java, Actual Label: 0, Predicted Label: 0
File: nonb (36).java, Actual Label: 0, Predicted Label: 0
File: builder (5).java, Actual Label: 1, Predicted Label: 1
File: nonb (126).java, Actual Label: 0, Predicted Label: 0
File: builder (6).java, Actual Label: 1, Predicted Label: 1
File: nonb (44).java, Actual Label: 0, Predicted Label: 0
File: builder (3).java, Actual Label: 1, Predicted Label: 1
File: nonb (120).java, Actual Label: 0, Predicted Label: 0
File: nonb (69).java, Actual Label: 0, Predicted Label: 0
File: builder (7).java, Actual Label: 1, Predicted Label: 1
File: builder (9).java, Actual Label: 1, Predicted Label: 1
File: nonb (129).java, Actual Label: 0, Predicted Label: 0
File: nonb (114).java, Actual Label: 0, Predicted Label: 1
File: builder (8).java, Actual Label: 1, Predicted Label

In [None]:
#Builder with different settings

In [2]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "builder"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "builder" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nonb (14).java, Actual Label: 0, Predicted Label: 0
File: builder (2).java, Actual Label: 1, Predicted Label: 1
File: builder (5).java, Actual Label: 1, Predicted Label: 0
File: builder (6).java, Actual Label: 1, Predicted Label: 1
File: nonb (110).java, Actual Label: 0, Predicted Label: 0
File: nonb (18).java, Actual Label: 0, Predicted Label: 0
File: nonb (17).java, Actual Label: 0, Predicted Label: 0
File: builder (3).java, Actual Label: 1, Predicted Label: 1
File: builder (7).java, Actual Label: 1, Predicted Label: 0
File: nonb (92).java, Actual Label: 0, Predicted Label: 1
File: builder (9).java, Actual Label: 1, Predicted Label: 1
File: nonb (13).java, Actual Label: 0, Predicted Label: 0
File: nonb (119).java, Actual Label: 0, Predicted Label: 1
File: builder (8).java, Actual Label: 1, Predicted Label: 0
File: nonb (23).java, Actual Label: 0, Predicted Label: 0
File: builder (1).java, Actual Label: 1, Predicted Label: 0
File: builder (4).java, Actual Label: 1, Predicted Lab

In [None]:
#Builder with different settings

In [4]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "builder"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "builder" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: builder (2).java, Actual Label: 1, Predicted Label: 1
File: nonb (50).java, Actual Label: 0, Predicted Label: 0
File: nonb (36).java, Actual Label: 0, Predicted Label: 1
File: builder (5).java, Actual Label: 1, Predicted Label: 1
File: builder (6).java, Actual Label: 1, Predicted Label: 1
File: nonb (9).java, Actual Label: 0, Predicted Label: 0
File: builder (3).java, Actual Label: 1, Predicted Label: 1
File: nonb (87).java, Actual Label: 0, Predicted Label: 0
File: nonb (29).java, Actual Label: 0, Predicted Label: 0
File: builder (7).java, Actual Label: 1, Predicted Label: 1
File: builder (9).java, Actual Label: 1, Predicted Label: 1
File: nonb (123).java, Actual Label: 0, Predicted Label: 0
File: nonb (129).java, Actual Label: 0, Predicted Label: 0
File: nonb (114).java, Actual Label: 0, Predicted Label: 1
File: builder (8).java, Actual Label: 1, Predicted Label: 1
File: builder (1).java, Actual Label: 1, Predicted Label: 0
File: builder (4).java, Actual Label: 1, Predicted Lab

In [None]:
#abstract factory with different settings

In [4]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "abstractfactory"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "abstractfactory" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nonfm (34).java, Actual Label: 0, Predicted Label: 1
File: abstractfactory (2).java, Actual Label: 1, Predicted Label: 1
File: nonfm (10).java, Actual Label: 0, Predicted Label: 0
File: nonfm (37).java, Actual Label: 0, Predicted Label: 0
File: nonfm (13).java, Actual Label: 0, Predicted Label: 1
File: nonfm (17).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (10).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (9).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (13).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (16).java, Actual Label: 1, Predicted Label: 0
File: nonfm (49).java, Actual Label: 0, Predicted Label: 1
File: abstractfactory (4).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (5).java, Actual Label: 1, Predicted Label: 0
File: abstractfactory (1).java, Actual Label: 1, Predicted Label: 0
File: abstractfactory (12).java, Actual Label: 1, Predicted Label: 1
File: nonfm (7).java, Actual L

In [None]:
#abstract factory with different settings

In [5]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "abstractfactory"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "abstractfactory" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nonab (22).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (2).java, Actual Label: 1, Predicted Label: 1
File: nonab (19).java, Actual Label: 0, Predicted Label: 1
File: nonab (30).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (10).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (9).java, Actual Label: 1, Predicted Label: 1
File: nonab (70).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (13).java, Actual Label: 1, Predicted Label: 1
File: nonab (2).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (16).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (4).java, Actual Label: 1, Predicted Label: 0
File: abstractfactory (5).java, Actual Label: 1, Predicted Label: 1
File: nonab (3).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (1).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (12).java, Actual Label: 1, Predicted Label: 1
File: nonab (14).java, Actual La

In [None]:
#abstract factory with different settings

In [6]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "abstractfactory"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "abstractfactory" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: abstractfactory (2).java, Actual Label: 1, Predicted Label: 1
File: nonab (5).java, Actual Label: 0, Predicted Label: 0
File: nonab (24).java, Actual Label: 0, Predicted Label: 0
File: nonab (65).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (10).java, Actual Label: 1, Predicted Label: 1
File: nonab (10).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (9).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (13).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (16).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (4).java, Actual Label: 1, Predicted Label: 0
File: abstractfactory (5).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (1).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (12).java, Actual Label: 1, Predicted Label: 1
File: nonab (33).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (3).java, Actual Label: 1, Predicted Label: 1
File: nonab (86).java,

In [None]:
#abstract factory with different settings

In [1]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "abstractfactory"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "abstractfactory" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nonab (11).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (2).java, Actual Label: 1, Predicted Label: 1
File: nonab (5).java, Actual Label: 0, Predicted Label: 0
File: nonab (7).java, Actual Label: 0, Predicted Label: 0
File: nonab (15).java, Actual Label: 0, Predicted Label: 0
File: nonab (1).java, Actual Label: 0, Predicted Label: 1
File: abstractfactory (10).java, Actual Label: 1, Predicted Label: 1
File: nonab (10).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (9).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (13).java, Actual Label: 1, Predicted Label: 1
File: nonab (2).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (16).java, Actual Label: 1, Predicted Label: 1
File: nonab (8).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (4).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (5).java, Actual Label: 1, Predicted Label: 1
File: nonab (3).java, Actual Label: 0, Predicted Label

In [None]:
#abstract factory with different settings

In [2]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "abstractfactory"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "abstractfactory" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nonab (11).java, Actual Label: 0, Predicted Label: 0
File: nonab (22).java, Actual Label: 0, Predicted Label: 1
File: abstractfactory (2).java, Actual Label: 1, Predicted Label: 1
File: nonab (28).java, Actual Label: 0, Predicted Label: 0
File: nonab (65).java, Actual Label: 0, Predicted Label: 1
File: nonab (15).java, Actual Label: 0, Predicted Label: 0
File: abstractfactory (10).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (9).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (13).java, Actual Label: 1, Predicted Label: 0
File: abstractfactory (16).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (4).java, Actual Label: 1, Predicted Label: 0
File: abstractfactory (5).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (1).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (12).java, Actual Label: 1, Predicted Label: 1
File: abstractfactory (3).java, Actual Label: 1, Predicted Label: 1
File: nonab (32).java

In [None]:
#Prototype with different settings

In [3]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "prototype"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "prototype" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: prototype (27).java, Actual Label: 1, Predicted Label: 1
File: nonp (43).java, Actual Label: 0, Predicted Label: 0
File: nonp (35).java, Actual Label: 0, Predicted Label: 0
File: nonp (51).java, Actual Label: 0, Predicted Label: 0
File: nonp (5).java, Actual Label: 0, Predicted Label: 0
File: prototype (13).java, Actual Label: 1, Predicted Label: 1
File: prototype (22).java, Actual Label: 1, Predicted Label: 0
File: nonp (7).java, Actual Label: 0, Predicted Label: 0
File: nonp (13).java, Actual Label: 0, Predicted Label: 0
File: nonp (29).java, Actual Label: 0, Predicted Label: 0
File: prototype (15).java, Actual Label: 1, Predicted Label: 1
File: nonp (37).java, Actual Label: 0, Predicted Label: 0
File: nonp (21).java, Actual Label: 0, Predicted Label: 0
File: prototype (14).java, Actual Label: 1, Predicted Label: 0
File: prototype (16).java, Actual Label: 1, Predicted Label: 1
File: prototype (18).java, Actual Label: 1, Predicted Label: 0
File: nonp (22).java, Actual Label: 0, 

In [None]:
#Prototype with different settings

In [4]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "prototype"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "prototype" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: prototype (27).java, Actual Label: 1, Predicted Label: 1
File: nonp (59).java, Actual Label: 0, Predicted Label: 0
File: nonp (5).java, Actual Label: 0, Predicted Label: 0
File: prototype (13).java, Actual Label: 1, Predicted Label: 1
File: prototype (22).java, Actual Label: 1, Predicted Label: 0
File: nonp (7).java, Actual Label: 0, Predicted Label: 0
File: nonp (13).java, Actual Label: 0, Predicted Label: 0
File: prototype (15).java, Actual Label: 1, Predicted Label: 1
File: nonp (8).java, Actual Label: 0, Predicted Label: 0
File: nonp (21).java, Actual Label: 0, Predicted Label: 1
File: nonp (58).java, Actual Label: 0, Predicted Label: 1
File: prototype (14).java, Actual Label: 1, Predicted Label: 0
File: prototype (16).java, Actual Label: 1, Predicted Label: 1
File: prototype (18).java, Actual Label: 1, Predicted Label: 0
File: nonp (22).java, Actual Label: 0, Predicted Label: 1
File: prototype (32).java, Actual Label: 1, Predicted Label: 1
File: prototype (6).java, Actual La

In [None]:
#Prototype with different settings

In [5]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "prototype"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "prototype" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: prototype (27).java, Actual Label: 1, Predicted Label: 1
File: nonp (59).java, Actual Label: 0, Predicted Label: 0
File: nonp (5).java, Actual Label: 0, Predicted Label: 0
File: prototype (13).java, Actual Label: 1, Predicted Label: 1
File: prototype (22).java, Actual Label: 1, Predicted Label: 0
File: nonp (7).java, Actual Label: 0, Predicted Label: 0
File: nonp (13).java, Actual Label: 0, Predicted Label: 0
File: prototype (15).java, Actual Label: 1, Predicted Label: 1
File: nonp (8).java, Actual Label: 0, Predicted Label: 0
File: nonp (21).java, Actual Label: 0, Predicted Label: 1
File: nonp (58).java, Actual Label: 0, Predicted Label: 1
File: prototype (14).java, Actual Label: 1, Predicted Label: 0
File: prototype (16).java, Actual Label: 1, Predicted Label: 1
File: prototype (18).java, Actual Label: 1, Predicted Label: 0
File: nonp (22).java, Actual Label: 0, Predicted Label: 1
File: prototype (32).java, Actual Label: 1, Predicted Label: 1
File: prototype (6).java, Actual La

In [1]:
#Prototype with different settings

In [2]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "prototype"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "prototype" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: prototype (27).java, Actual Label: 1, Predicted Label: 1
File: nonp (43).java, Actual Label: 0, Predicted Label: 0
File: nonp (59).java, Actual Label: 0, Predicted Label: 0
File: nonp (31).java, Actual Label: 0, Predicted Label: 0
File: prototype (13).java, Actual Label: 1, Predicted Label: 1
File: prototype (22).java, Actual Label: 1, Predicted Label: 0
File: prototype (15).java, Actual Label: 1, Predicted Label: 1
File: nonp (44).java, Actual Label: 0, Predicted Label: 0
File: nonp (37).java, Actual Label: 0, Predicted Label: 0
File: nonp (21).java, Actual Label: 0, Predicted Label: 0
File: nonp (36).java, Actual Label: 0, Predicted Label: 0
File: nonp (58).java, Actual Label: 0, Predicted Label: 0
File: prototype (14).java, Actual Label: 1, Predicted Label: 0
File: nonp (28).java, Actual Label: 0, Predicted Label: 0
File: prototype (16).java, Actual Label: 1, Predicted Label: 1
File: prototype (18).java, Actual Label: 1, Predicted Label: 0
File: prototype (32).java, Actual Lab

In [3]:
#Prototype with different settings

In [5]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "prototype"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "prototype" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: prototype (27).java, Actual Label: 1, Predicted Label: 1
File: nonp (23).java, Actual Label: 0, Predicted Label: 0
File: nonp (43).java, Actual Label: 0, Predicted Label: 0
File: nonp (35).java, Actual Label: 0, Predicted Label: 0
File: nonp (51).java, Actual Label: 0, Predicted Label: 0
File: nonp (5).java, Actual Label: 0, Predicted Label: 0
File: prototype (13).java, Actual Label: 1, Predicted Label: 1
File: prototype (22).java, Actual Label: 1, Predicted Label: 0
File: nonp (7).java, Actual Label: 0, Predicted Label: 0
File: nonp (13).java, Actual Label: 0, Predicted Label: 0
File: prototype (15).java, Actual Label: 1, Predicted Label: 1
File: nonp (44).java, Actual Label: 0, Predicted Label: 0
File: nonp (8).java, Actual Label: 0, Predicted Label: 0
File: nonp (21).java, Actual Label: 0, Predicted Label: 0
File: nonp (36).java, Actual Label: 0, Predicted Label: 0
File: nonp (58).java, Actual Label: 0, Predicted Label: 0
File: prototype (14).java, Actual Label: 1, Predicted L

In [4]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "prototype"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "prototype" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: prototype (27).java, Actual Label: 1, Predicted Label: 1
File: nonp (23).java, Actual Label: 0, Predicted Label: 0
File: nonp (43).java, Actual Label: 0, Predicted Label: 0
File: nonp (35).java, Actual Label: 0, Predicted Label: 0
File: nonp (51).java, Actual Label: 0, Predicted Label: 0
File: nonp (5).java, Actual Label: 0, Predicted Label: 0
File: prototype (13).java, Actual Label: 1, Predicted Label: 1
File: prototype (22).java, Actual Label: 1, Predicted Label: 0
File: nonp (7).java, Actual Label: 0, Predicted Label: 0
File: nonp (13).java, Actual Label: 0, Predicted Label: 0
File: prototype (15).java, Actual Label: 1, Predicted Label: 1
File: nonp (44).java, Actual Label: 0, Predicted Label: 0
File: nonp (8).java, Actual Label: 0, Predicted Label: 0
File: nonp (21).java, Actual Label: 0, Predicted Label: 0
File: nonp (36).java, Actual Label: 0, Predicted Label: 0
File: nonp (58).java, Actual Label: 0, Predicted Label: 0
File: prototype (14).java, Actual Label: 1, Predicted L

In [None]:
#Factory Method with different settings

In [1]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "factorymethod"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "factorymethod" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nonfm (13).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (1).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (9).java, Actual Label: 1, Predicted Label: 0
File: nonfm (12).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (5).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (2).java, Actual Label: 1, Predicted Label: 1
File: nonfm (18).java, Actual Label: 0, Predicted Label: 0
File: nonfm (8).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (10).java, Actual Label: 1, Predicted Label: 1
File: nonfm (11).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (7).java, Actual Label: 1, Predicted Label: 0
File: nonfm (14).java, Actual Label: 0, Predicted Label: 0
File: nonfm (17).java, Actual Label: 0, Predicted Label: 1
File: factorymethod (4).java, Actual Label: 1, Predicted Label: 0
File: factorymethod (3).java, Actual Label: 1, Predicted Label: 0
File: nonfm (9).java, Actual Label: 0, Predicted Label: 1
F

In [None]:
#Factory method with different settings

In [2]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "factorymethod"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "factorymethod" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nonfm (77).java, Actual Label: 0, Predicted Label: 0
File: nonfm (80).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (1).java, Actual Label: 1, Predicted Label: 0
File: factorymethod (9).java, Actual Label: 1, Predicted Label: 1
File: nonfm (79).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (5).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (2).java, Actual Label: 1, Predicted Label: 1
File: nonfm (75).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (10).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (7).java, Actual Label: 1, Predicted Label: 0
File: nonfm (78).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (4).java, Actual Label: 1, Predicted Label: 1
File: nonfm (82).java, Actual Label: 0, Predicted Label: 1
File: nonfm (74).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (3).java, Actual Label: 1, Predicted Label: 1
File: nonfm (81).java, Actual Label: 0, Predicted Label: 1

In [3]:
#Factory Method with different settings

In [4]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "factorymethod"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "factorymethod" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nonfm (6).java, Actual Label: 0, Predicted Label: 0
File: nonfm (5).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (1).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (9).java, Actual Label: 1, Predicted Label: 1
File: nonfm (4).java, Actual Label: 0, Predicted Label: 1
File: factorymethod (5).java, Actual Label: 1, Predicted Label: 1
File: nonfm (2).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (2).java, Actual Label: 1, Predicted Label: 1
File: nonfm (1).java, Actual Label: 0, Predicted Label: 0
File: nonfm (8).java, Actual Label: 0, Predicted Label: 1
File: factorymethod (10).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (7).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (4).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (3).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (11).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (8).java, Actual Label: 1, Predicted

In [5]:
#Factory method with different settingsactual_labels

In [6]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "factorymethod"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "factorymethod" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nonfm (77).java, Actual Label: 0, Predicted Label: 0
File: nonfm (80).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (1).java, Actual Label: 1, Predicted Label: 0
File: factorymethod (9).java, Actual Label: 1, Predicted Label: 1
File: nonfm (79).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (5).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (2).java, Actual Label: 1, Predicted Label: 1
File: nonfm (75).java, Actual Label: 0, Predicted Label: 0
File: nonfm (72).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (10).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (7).java, Actual Label: 1, Predicted Label: 0
File: nonfm (78).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (4).java, Actual Label: 1, Predicted Label: 1
File: nonfm (74).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (3).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (11).java, Actual Label: 1, Predicted 

In [6]:
#Factory method with different settings

In [7]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "factorymethod"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "factorymethod" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nonfm (77).java, Actual Label: 0, Predicted Label: 0
File: nonfm (68).java, Actual Label: 0, Predicted Label: 1
File: factorymethod (1).java, Actual Label: 1, Predicted Label: 0
File: factorymethod (9).java, Actual Label: 1, Predicted Label: 1
File: nonfm (79).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (5).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (2).java, Actual Label: 1, Predicted Label: 1
File: nonfm (75).java, Actual Label: 0, Predicted Label: 0
File: nonfm (72).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (10).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (7).java, Actual Label: 1, Predicted Label: 0
File: nonfm (70).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (4).java, Actual Label: 1, Predicted Label: 1
File: nonfm (82).java, Actual Label: 0, Predicted Label: 1
File: factorymethod (3).java, Actual Label: 1, Predicted Label: 1
File: nonfm (81).java, Actual Label: 0, Predicted Label: 1

In [8]:
#Factory Method with different settings

In [9]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import NearestNeighbors

# Load your Java programs from a directory
java_code_dir = "factorymethod"
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []
true_labels = []

def get_line_embeddings(code):
    lines = code.split('\n')
    line_embeddings = []
    for line in lines:
        if line.strip():  # Check if the line is not empty
            inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
            line_embeddings.append(embeddings)
    return np.mean(line_embeddings, axis=0)

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="ISO-8859-1") as f:
        java_code = f.read()

        # Get the embedding of the program by taking the mean of line embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

        # Define true labels based on the file names
        true_labels.append(1 if "factorymethod" in file else 0)

# Convert program_embeddings to a NumPy array
program_embeddings = np.array(program_embeddings)

# Use Nearest Neighbors to find k-nearest neighbors with Euclidean distance
k = 5  # Adjust this value as needed
neighbors = NearestNeighbors(n_neighbors=k, metric='euclidean')
neighbors.fit(program_embeddings)
_, indices = neighbors.kneighbors(program_embeddings)

# Initialize arrays to store actual and predicted labels
actual_labels = np.array(true_labels)
predicted_labels = np.zeros_like(actual_labels)

# Predict labels for each program
for i in range(len(java_files)):
    # Get the indices of the k-nearest neighbors (excluding the program itself)
    neighbor_indices = indices[i, 1:]

    # Get the labels of the neighbors
    neighbor_labels = [true_labels[idx] for idx in neighbor_indices]

    # Assign the majority label to the program
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels[i] = predicted_label

# Calculate precision, recall, and F1 score
precision = precision_score(actual_labels, predicted_labels, average='weighted')
recall = recall_score(actual_labels, predicted_labels, average='weighted')
f1 = f1_score(actual_labels, predicted_labels, average='weighted')

# Print actual and predicted labels for each program
for i, file in enumerate(java_files):
    print(f"File: {file}, Actual Label: {actual_labels[i]}, Predicted Label: {predicted_labels[i]}")

print(f"\nPrecision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


File: nonfm (77).java, Actual Label: 0, Predicted Label: 0
File: nonfm (80).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (1).java, Actual Label: 1, Predicted Label: 0
File: factorymethod (9).java, Actual Label: 1, Predicted Label: 1
File: nonfm (79).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (5).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (2).java, Actual Label: 1, Predicted Label: 1
File: nonfm (75).java, Actual Label: 0, Predicted Label: 0
File: nonfm (72).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (10).java, Actual Label: 1, Predicted Label: 1
File: factorymethod (7).java, Actual Label: 1, Predicted Label: 0
File: nonfm (78).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (4).java, Actual Label: 1, Predicted Label: 1
File: nonfm (82).java, Actual Label: 0, Predicted Label: 1
File: nonfm (74).java, Actual Label: 0, Predicted Label: 0
File: factorymethod (3).java, Actual Label: 1, Predicted Label: 1

In [3]:
#To plot t-SNE

In [4]:
!pip install seaborn

Defaulting to user installation because normal site-packages is not writeable
Collecting seaborn
  Downloading seaborn-0.13.0-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.6/294.6 KB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: seaborn
Successfully installed seaborn-0.13.0
You should consider upgrading via the '/apps/Arch/software/Python/3.10.4-GCCcore-11.3.0/bin/python3.10 -m pip install --upgrade pip' command.[0m[33m
[0m

In [28]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

# Function to get embeddings for a given design pattern
def get_embeddings_for_pattern(pattern, model, tokenizer):
    directory = os.path.join("all_design_patterns", pattern.lower())
    files = [file for file in os.listdir(directory) if os.path.isfile(os.path.join(directory, file))]

    embeddings = []
    true_labels = []

    for file in files:
        with open(os.path.join(directory, file), "r", encoding="ISO-8859-1") as f:
            code = f.read()

        # Tokenize and encode the Java program
        inputs = tokenizer(code, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        program_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

        embeddings.append(program_embedding)
        true_labels.append(pattern)

    return np.array(embeddings), np.array(true_labels)

# Load the CodeBERT model and tokenizer
model_name = "microsoft/codebert-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Get embeddings for each design pattern
patterns = ["Singleton", "Prototype", "AbstractFactory", "Builder", "FactoryMethod"]
all_embeddings = []
all_labels = []

# Custom color palette for each design pattern with higher contrast
color_palette = ["red", "green", "orange", "blue", "purple"]

# Custom markers for each design pattern
markers = ["o", "s", "D", "^", "P"]

for i, pattern in enumerate(patterns):
    pattern_embeddings, pattern_labels = get_embeddings_for_pattern(pattern, model, tokenizer)
    all_embeddings.append(pattern_embeddings)
    all_labels.append(pattern_labels)

# Concatenate the embeddings and labels
all_embeddings = np.concatenate(all_embeddings, axis=0)
all_labels = np.concatenate(all_labels)

# Perform t-SNE
tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(all_embeddings)

# Create a scatter plot for t-SNE visualization with custom symbols
plt.figure(figsize=(20, 16))

for i, pattern in enumerate(patterns):
    indices = all_labels == pattern
    sns.scatterplot(x=tsne_results[indices, 0], y=tsne_results[indices, 1], marker=markers[i], color=color_palette[i], s=200, label=pattern)

# Increase font sizes for better visibility
plt.title('t-SNE Visualization for CodeBERT on Different Design Patterns', fontsize=30)
plt.xlabel('t-SNE Dimension 1', fontsize=25)
plt.ylabel('t-SNE Dimension 2', fontsize=25)
plt.legend(title='Design Pattern', loc='upper right', fontsize=22)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.grid(True)

# Save the t-SNE plot as a PDF file
plt.savefig('tsne_plot_updated_symbols_colors.pdf', format='pdf')
plt.show()
