In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "singleton"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="utf-8") as f:
        java_code = f.read()

        # Determine if it's a positive class ("prototype") or negative class
        if "singleton" in file:
            label = 1
        else:
            label = 0

        true_labels.append(label)

        # Get mean embedding for each line and store in program_embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf8 in position 2945: invalid start byte

In [None]:
Singleton with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "singleton"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "singleton" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nons (12).java, Predicted Label: 0, True Label: 0
File: nons (50).java, Predicted Label: 1, True Label: 0
File: nons (54).java, Predicted Label: 0, True Label: 0
File: nons (29).java, Predicted Label: 0, True Label: 0
File: nons (25).java, Predicted Label: 0, True Label: 0
File: nons (18).java, Predicted Label: 0, True Label: 0
File: singleton (25).java, Predicted Label: 0, True Label: 1
File: nons (42).java, Predicted Label: 0, True Label: 0
File: nons (27).java, Predicted Label: 0, True Label: 0
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (19).java, Predicted Label: 1, True Label: 0
File: nons (55).java, Predicted Label: 0, True Label: 0
File: nons (34).java, Predicted Label: 0, True Label: 0
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (49).java, Predicted Label: 0, True Label: 0
File: nons (61).java, Predicted Label: 0, True Label: 0
File: nons (68).java, Predicted Label: 

In [None]:
Singleton with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "singleton"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "singleton" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nons (12).java, Predicted Label: 0, True Label: 0
File: nons (50).java, Predicted Label: 1, True Label: 0
File: nons (54).java, Predicted Label: 0, True Label: 0
File: nons (29).java, Predicted Label: 0, True Label: 0
File: nons (25).java, Predicted Label: 0, True Label: 0
File: nons (18).java, Predicted Label: 0, True Label: 0
File: singleton (25).java, Predicted Label: 0, True Label: 1
File: nons (42).java, Predicted Label: 0, True Label: 0
File: nons (27).java, Predicted Label: 0, True Label: 0
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (19).java, Predicted Label: 1, True Label: 0
File: nons (55).java, Predicted Label: 0, True Label: 0
File: nons (34).java, Predicted Label: 0, True Label: 0
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (49).java, Predicted Label: 0, True Label: 0
File: nons (61).java, Predicted Label: 0, True Label: 0
File: nons (68).java, Predicted Label: 

In [None]:
Singleton with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "singleton"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "singleton" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nons (12).java, Predicted Label: 0, True Label: 0
File: nons (50).java, Predicted Label: 1, True Label: 0
File: nons (54).java, Predicted Label: 0, True Label: 0
File: nons (29).java, Predicted Label: 0, True Label: 0
File: nons (25).java, Predicted Label: 0, True Label: 0
File: nons (18).java, Predicted Label: 0, True Label: 0
File: singleton (25).java, Predicted Label: 0, True Label: 1
File: nons (42).java, Predicted Label: 0, True Label: 0
File: nons (27).java, Predicted Label: 0, True Label: 0
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (19).java, Predicted Label: 1, True Label: 0
File: nons (55).java, Predicted Label: 0, True Label: 0
File: nons (34).java, Predicted Label: 0, True Label: 0
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (49).java, Predicted Label: 0, True Label: 0
File: nons (61).java, Predicted Label: 0, True Label: 0
File: nons (68).java, Predicted Label: 

In [None]:
Singleton with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "singleton"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "singleton" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nons (54).java, Predicted Label: 0, True Label: 0
File: nons (25).java, Predicted Label: 1, True Label: 0
File: nons (18).java, Predicted Label: 0, True Label: 0
File: singleton (25).java, Predicted Label: 0, True Label: 1
File: nons (27).java, Predicted Label: 0, True Label: 0
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (19).java, Predicted Label: 1, True Label: 0
File: nons (55).java, Predicted Label: 0, True Label: 0
File: nons (34).java, Predicted Label: 0, True Label: 0
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (68).java, Predicted Label: 0, True Label: 0
File: singleton (9).java, Predicted Label: 1, True Label: 1
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: singleton (14).java, Predicted Label: 1, True Label: 1
File: nons (33).java, Predicted Label: 0, True Label: 0
File: nons (3).java, Predicted Label: 0, True Label: 0
File: nons (30).java, Pred

In [None]:
#Singleton with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "singleton"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "singleton" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nons (12).java, Predicted Label: 0, True Label: 0
File: nons (50).java, Predicted Label: 1, True Label: 0
File: singleton (25).java, Predicted Label: 0, True Label: 1
File: nons (27).java, Predicted Label: 0, True Label: 0
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (19).java, Predicted Label: 1, True Label: 0
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (4).java, Predicted Label: 0, True Label: 0
File: singleton (9).java, Predicted Label: 1, True Label: 1
File: nons (44).java, Predicted Label: 0, True Label: 0
File: nons (38).java, Predicted Label: 0, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: singleton (14).java, Predicted Label: 1, True Label: 1
File: nons (33).java, Predicted Label: 0, True Label: 0
File: nons (41).java, Predicted Label: 0, True Label: 0
File: nons (30).java, Predicted Label: 1, True Label: 0
File: singleton (1).java, 

In [None]:
#Builder with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "builder"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "builder" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: builder (2).java, Predicted Label: 1, True Label: 1
File: nonb (41).java, Predicted Label: 0, True Label: 0
File: nonb (115).java, Predicted Label: 0, True Label: 0
File: nonb (49).java, Predicted Label: 0, True Label: 0
File: nonb (50).java, Predicted Label: 0, True Label: 0
File: nonb (109).java, Predicted Label: 0, True Label: 0
File: builder (5).java, Predicted Label: 1, True Label: 1
File: builder (6).java, Predicted Label: 1, True Label: 1
File: nonb (61).java, Predicted Label: 0, True Label: 0
File: builder (3).java, Predicted Label: 0, True Label: 1
File: nonb (53).java, Predicted Label: 1, True Label: 0
File: nonb (51).java, Predicted Label: 0, True Label: 0
File: nonb (46).java, Predicted Label: 0, True Label: 0
File: nonb (58).java, Predicted Label: 0, True Label: 0
File: nonb (82).java, Predicted Label: 0, True Label: 0
File: builder (7).java, Predicted Label: 1, True Label: 1
File: nonb (73).java, Predicted Label: 0, 

In [None]:
#Builder with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "builder"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "builder" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: builder (2).java, Predicted Label: 1, True Label: 1
File: nonb (115).java, Predicted Label: 0, True Label: 0
File: nonb (89).java, Predicted Label: 0, True Label: 0
File: nonb (3).java, Predicted Label: 0, True Label: 0
File: nonb (49).java, Predicted Label: 0, True Label: 0
File: nonb (28).java, Predicted Label: 0, True Label: 0
File: nonb (50).java, Predicted Label: 0, True Label: 0
File: nonb (103).java, Predicted Label: 0, True Label: 0
File: nonb (36).java, Predicted Label: 1, True Label: 0
File: builder (5).java, Predicted Label: 1, True Label: 1
File: nonb (126).java, Predicted Label: 0, True Label: 0
File: builder (6).java, Predicted Label: 1, True Label: 1
File: nonb (44).java, Predicted Label: 0, True Label: 0
File: builder (3).java, Predicted Label: 1, True Label: 1
File: nonb (87).java, Predicted Label: 0, True Label: 0
File: nonb (120).java, Predicted Label: 0, True Label: 0
File: nonb (69).java, Predicted Label: 0, T

In [None]:
#Builder with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "builder"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "builder" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nonb (14).java, Predicted Label: 0, True Label: 0
File: builder (2).java, Predicted Label: 1, True Label: 1
File: nonb (98).java, Predicted Label: 0, True Label: 0
File: nonb (26).java, Predicted Label: 0, True Label: 0
File: builder (5).java, Predicted Label: 1, True Label: 1
File: builder (6).java, Predicted Label: 1, True Label: 1
File: nonb (18).java, Predicted Label: 0, True Label: 0
File: nonb (17).java, Predicted Label: 0, True Label: 0
File: nonb (100).java, Predicted Label: 0, True Label: 0
File: nonb (128).java, Predicted Label: 0, True Label: 0
File: builder (3).java, Predicted Label: 0, True Label: 1
File: nonb (19).java, Predicted Label: 0, True Label: 0
File: nonb (86).java, Predicted Label: 0, True Label: 0
File: nonb (85).java, Predicted Label: 0, True Label: 0
File: nonb (81).java, Predicted Label: 0, True Label: 0
File: nonb (82).java, Predicted Label: 0, True Label: 0
File: nonb (57).java, Predicted Label: 0, Tr

In [None]:
#Builder with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "builder"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "builder" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nonb (14).java, Predicted Label: 0, True Label: 0
File: nonb (77).java, Predicted Label: 0, True Label: 0
File: builder (2).java, Predicted Label: 1, True Label: 1
File: nonb (26).java, Predicted Label: 0, True Label: 0
File: builder (5).java, Predicted Label: 1, True Label: 1
File: builder (6).java, Predicted Label: 1, True Label: 1
File: nonb (110).java, Predicted Label: 0, True Label: 0
File: nonb (18).java, Predicted Label: 0, True Label: 0
File: nonb (61).java, Predicted Label: 0, True Label: 0
File: nonb (17).java, Predicted Label: 0, True Label: 0
File: nonb (128).java, Predicted Label: 1, True Label: 0
File: builder (3).java, Predicted Label: 0, True Label: 1
File: nonb (53).java, Predicted Label: 0, True Label: 0
File: nonb (19).java, Predicted Label: 0, True Label: 0
File: nonb (46).java, Predicted Label: 1, True Label: 0
File: builder (7).java, Predicted Label: 1, True Label: 1
File: builder (9).java, Predicted Label: 1

In [None]:
#Builder with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "builder"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "builder" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nonb (14).java, Predicted Label: 0, True Label: 0
File: builder (2).java, Predicted Label: 1, True Label: 1
File: nonb (41).java, Predicted Label: 1, True Label: 0
File: nonb (5).java, Predicted Label: 0, True Label: 0
File: builder (5).java, Predicted Label: 1, True Label: 1
File: builder (6).java, Predicted Label: 1, True Label: 1
File: nonb (110).java, Predicted Label: 0, True Label: 0
File: nonb (18).java, Predicted Label: 0, True Label: 0
File: nonb (17).java, Predicted Label: 0, True Label: 0
File: builder (3).java, Predicted Label: 1, True Label: 1
File: nonb (85).java, Predicted Label: 0, True Label: 0
File: builder (7).java, Predicted Label: 1, True Label: 1
File: nonb (92).java, Predicted Label: 0, True Label: 0
File: builder (9).java, Predicted Label: 1, True Label: 1
File: nonb (13).java, Predicted Label: 0, True Label: 0
File: builder (8).java, Predicted Label: 1, True Label: 1
File: nonb (23).java, Predicted Label: 0

In [None]:
#Factory Method with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "factorymethod"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "factorymethod" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nonfm (52).java, Predicted Label: 0, True Label: 0
File: nonfm (13).java, Predicted Label: 0, True Label: 0
File: nonfm (68).java, Predicted Label: 0, True Label: 0
File: nonfm (37).java, Predicted Label: 0, True Label: 0
File: nonfm (29).java, Predicted Label: 0, True Label: 0
File: factorymethod (1).java, Predicted Label: 1, True Label: 1
File: factorymethod (9).java, Predicted Label: 0, True Label: 1
File: nonfm (4).java, Predicted Label: 0, True Label: 0
File: factorymethod (5).java, Predicted Label: 1, True Label: 1
File: factorymethod (2).java, Predicted Label: 1, True Label: 1
File: nonfm (72).java, Predicted Label: 0, True Label: 0
File: nonfm (49).java, Predicted Label: 0, True Label: 0
File: nonfm (8).java, Predicted Label: 0, True Label: 0
File: factorymethod (10).java, Predicted Label: 1, True Label: 1
File: nonfm (11).java, Predicted Label: 0, True Label: 0
File: factorymethod (7).java, Predicted Label: 1, True Label:

In [None]:
#Factory Method with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "factorymethod"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "factorymethod" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nonfm (6).java, Predicted Label: 0, True Label: 0
File: nonfm (5).java, Predicted Label: 0, True Label: 0
File: nonfm (13).java, Predicted Label: 0, True Label: 0
File: factorymethod (1).java, Predicted Label: 1, True Label: 1
File: factorymethod (9).java, Predicted Label: 1, True Label: 1
File: nonfm (12).java, Predicted Label: 0, True Label: 0
File: nonfm (4).java, Predicted Label: 0, True Label: 0
File: factorymethod (5).java, Predicted Label: 1, True Label: 1
File: nonfm (2).java, Predicted Label: 0, True Label: 0
File: factorymethod (2).java, Predicted Label: 1, True Label: 1
File: nonfm (18).java, Predicted Label: 0, True Label: 0
File: nonfm (1).java, Predicted Label: 0, True Label: 0
File: nonfm (8).java, Predicted Label: 0, True Label: 0
File: factorymethod (10).java, Predicted Label: 1, True Label: 1
File: nonfm (11).java, Predicted Label: 0, True Label: 0
File: factorymethod (7).java, Predicted Label: 0, True Label: 1
F

In [None]:
#Factory method with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "factorymethod"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "factorymethod" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nonfm (77).java, Predicted Label: 0, True Label: 0
File: nonfm (80).java, Predicted Label: 0, True Label: 0
File: factorymethod (1).java, Predicted Label: 0, True Label: 1
File: factorymethod (9).java, Predicted Label: 1, True Label: 1
File: nonfm (79).java, Predicted Label: 0, True Label: 0
File: factorymethod (5).java, Predicted Label: 1, True Label: 1
File: factorymethod (2).java, Predicted Label: 1, True Label: 1
File: nonfm (75).java, Predicted Label: 0, True Label: 0
File: nonfm (72).java, Predicted Label: 0, True Label: 0
File: factorymethod (10).java, Predicted Label: 1, True Label: 1
File: factorymethod (7).java, Predicted Label: 0, True Label: 1
File: nonfm (78).java, Predicted Label: 0, True Label: 0
File: factorymethod (4).java, Predicted Label: 1, True Label: 1
File: nonfm (82).java, Predicted Label: 0, True Label: 0
File: nonfm (74).java, Predicted Label: 0, True Label: 0
File: factorymethod (3).java, Predicted Label

In [None]:
#Factory method with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "factorymethod"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "factorymethod" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nonfm (68).java, Predicted Label: 0, True Label: 0
File: factorymethod (1).java, Predicted Label: 0, True Label: 1
File: factorymethod (9).java, Predicted Label: 1, True Label: 1
File: nonfm (79).java, Predicted Label: 0, True Label: 0
File: factorymethod (5).java, Predicted Label: 1, True Label: 1
File: factorymethod (2).java, Predicted Label: 1, True Label: 1
File: nonfm (65).java, Predicted Label: 0, True Label: 0
File: nonfm (63).java, Predicted Label: 0, True Label: 0
File: factorymethod (10).java, Predicted Label: 1, True Label: 1
File: nonfm (40).java, Predicted Label: 1, True Label: 0
File: nonfm (47).java, Predicted Label: 0, True Label: 0
File: factorymethod (7).java, Predicted Label: 0, True Label: 1
File: factorymethod (4).java, Predicted Label: 0, True Label: 1
File: nonfm (43).java, Predicted Label: 0, True Label: 0
File: factorymethod (3).java, Predicted Label: 1, True Label: 1
File: nonfm (53).java, Predicted Label

In [None]:
#Factory Method with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "factorymethod"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "factorymethod" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nonfm (77).java, Predicted Label: 0, True Label: 0
File: nonfm (80).java, Predicted Label: 0, True Label: 0
File: factorymethod (1).java, Predicted Label: 0, True Label: 1
File: factorymethod (9).java, Predicted Label: 1, True Label: 1
File: nonfm (79).java, Predicted Label: 0, True Label: 0
File: factorymethod (5).java, Predicted Label: 1, True Label: 1
File: factorymethod (2).java, Predicted Label: 1, True Label: 1
File: nonfm (75).java, Predicted Label: 0, True Label: 0
File: nonfm (72).java, Predicted Label: 0, True Label: 0
File: factorymethod (10).java, Predicted Label: 1, True Label: 1
File: factorymethod (7).java, Predicted Label: 0, True Label: 1
File: nonfm (70).java, Predicted Label: 0, True Label: 0
File: nonfm (78).java, Predicted Label: 0, True Label: 0
File: factorymethod (4).java, Predicted Label: 1, True Label: 1
File: nonfm (82).java, Predicted Label: 0, True Label: 0
File: nonfm (74).java, Predicted Label: 0, Tr

In [None]:
#Abstract Factory with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "abstractfactory"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "abstractfactory" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nondp (7).java, Predicted Label: 0, True Label: 0
File: nondp (2).java, Predicted Label: 0, True Label: 0
File: nondp (18).java, Predicted Label: 0, True Label: 0
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: nondp (33).java, Predicted Label: 0, True Label: 0
File: nondp (29).java, Predicted Label: 0, True Label: 0
File: nondp (31).java, Predicted Label: 0, True Label: 0
File: nondp (11).java, Predicted Label: 0, True Label: 0
File: nondp (39).java, Predicted Label: 0, True Label: 0
File: nondp (45).java, Predicted Label: 0, True Label: 0
File: nondp (19).java, Predicted Label: 0, True Label: 0
File: nondp (23).java, Predicted Label: 0, True Label: 0
File: nondp (38).java, Predicted Label: 0, True Label: 0
File: nondp (30).java, Predicted Label: 1, True Label: 0
File: nondp (26).java, Predicted Label: 0, True Label: 0
File: nondp (6).java, Predicted Label: 0, True Label: 0
File: abstractfactory (10).java,

In [None]:
#Abstract Factory with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "abstractfactory"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "abstractfactory" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nonfm (31).java, Predicted Label: 0, True Label: 0
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: nonfm (2).java, Predicted Label: 0, True Label: 0
File: nonfm (30).java, Predicted Label: 0, True Label: 0
File: abstractfactory (10).java, Predicted Label: 1, True Label: 1
File: abstractfactory (9).java, Predicted Label: 1, True Label: 1
File: abstractfactory (13).java, Predicted Label: 1, True Label: 1
File: abstractfactory (16).java, Predicted Label: 1, True Label: 1
File: nonfm (20).java, Predicted Label: 1, True Label: 0
File: abstractfactory (4).java, Predicted Label: 1, True Label: 1
File: abstractfactory (5).java, Predicted Label: 0, True Label: 1
File: abstractfactory (1).java, Predicted Label: 1, True Label: 1
File: abstractfactory (12).java, Predicted Label: 1, True Label: 1
File: nonfm (29).java, Predicted Label: 1, True Label: 0
File: abstractfactory (3).java, Predicted Label: 1, True Label: 1
Fi

In [None]:
#Abstract Factory with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "abstractfactory"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "abstractfactory" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nonab (11).java, Predicted Label: 0, True Label: 0
File: nonab (48).java, Predicted Label: 1, True Label: 0
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: nonab (5).java, Predicted Label: 0, True Label: 0
File: nonab (65).java, Predicted Label: 0, True Label: 0
File: nonab (7).java, Predicted Label: 0, True Label: 0
File: abstractfactory (10).java, Predicted Label: 1, True Label: 1
File: abstractfactory (9).java, Predicted Label: 1, True Label: 1
File: abstractfactory (13).java, Predicted Label: 1, True Label: 1
File: abstractfactory (16).java, Predicted Label: 1, True Label: 1
File: nonab (8).java, Predicted Label: 0, True Label: 0
File: abstractfactory (4).java, Predicted Label: 1, True Label: 1
File: abstractfactory (5).java, Predicted Label: 1, True Label: 1
File: nonab (3).java, Predicted Label: 0, True Label: 0
File: abstractfactory (1).java, Predicted Label: 1, True Label: 1
File: abstractfactory (1

In [None]:
#Abstract Factory with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "abstractfactory"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "abstractfactory" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nonab (11).java, Predicted Label: 0, True Label: 0
File: nonab (48).java, Predicted Label: 1, True Label: 0
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: nonab (5).java, Predicted Label: 0, True Label: 0
File: nonab (65).java, Predicted Label: 0, True Label: 0
File: nonab (7).java, Predicted Label: 0, True Label: 0
File: abstractfactory (10).java, Predicted Label: 1, True Label: 1
File: abstractfactory (9).java, Predicted Label: 1, True Label: 1
File: abstractfactory (13).java, Predicted Label: 1, True Label: 1
File: abstractfactory (16).java, Predicted Label: 1, True Label: 1
File: nonab (8).java, Predicted Label: 0, True Label: 0
File: abstractfactory (4).java, Predicted Label: 1, True Label: 1
File: abstractfactory (5).java, Predicted Label: 1, True Label: 1
File: nonab (3).java, Predicted Label: 0, True Label: 0
File: abstractfactory (1).java, Predicted Label: 1, True Label: 1
File: abstractfactory (1

In [None]:
#Abstract Factory with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "abstractfactory"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "abstractfactory" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: nonab (19).java, Predicted Label: 0, True Label: 0
File: nonab (5).java, Predicted Label: 0, True Label: 0
File: nonab (30).java, Predicted Label: 0, True Label: 0
File: abstractfactory (10).java, Predicted Label: 1, True Label: 1
File: abstractfactory (9).java, Predicted Label: 1, True Label: 1
File: abstractfactory (13).java, Predicted Label: 1, True Label: 1
File: abstractfactory (16).java, Predicted Label: 1, True Label: 1
File: abstractfactory (4).java, Predicted Label: 1, True Label: 1
File: abstractfactory (5).java, Predicted Label: 1, True Label: 1
File: abstractfactory (1).java, Predicted Label: 1, True Label: 1
File: abstractfactory (12).java, Predicted Label: 1, True Label: 1
File: nonab (14).java, Predicted Label: 0, True Label: 0
File: nonab (80).java, Predicted Label: 0, True Label: 0
File: abstractfactory (3).java, Predicted Label: 1, True Label: 1
Fi

In [None]:
#Abstract Factory with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "abstractfactory"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "abstractfactory" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nonab (11).java, Predicted Label: 0, True Label: 0
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: nondp (33).java, Predicted Label: 1, True Label: 0
File: nondp (31).java, Predicted Label: 0, True Label: 0
File: nondp (39).java, Predicted Label: 0, True Label: 0
File: nonab (5).java, Predicted Label: 0, True Label: 0
File: nondp (38).java, Predicted Label: 0, True Label: 0
File: nondp (30).java, Predicted Label: 1, True Label: 0
File: nonab (7).java, Predicted Label: 0, True Label: 0
File: abstractfactory (10).java, Predicted Label: 1, True Label: 1
File: abstractfactory (9).java, Predicted Label: 1, True Label: 1
File: abstractfactory (13).java, Predicted Label: 1, True Label: 1
File: abstractfactory (16).java, Predicted Label: 1, True Label: 1
File: nonab (8).java, Predicted Label: 0, True Label: 0
File: abstractfactory (4).java, Predicted Label: 1, True Label: 1
File: abstractfactory (5).java, Predicted

In [None]:
#Abstract Factory with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "abstractfactory"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "abstractfactory" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: nonab (19).java, Predicted Label: 1, True Label: 0
File: nonab (5).java, Predicted Label: 0, True Label: 0
File: nonab (65).java, Predicted Label: 1, True Label: 0
File: nonab (56).java, Predicted Label: 0, True Label: 0
File: nonab (15).java, Predicted Label: 0, True Label: 0
File: abstractfactory (10).java, Predicted Label: 1, True Label: 1
File: abstractfactory (9).java, Predicted Label: 1, True Label: 1
File: nonab (51).java, Predicted Label: 0, True Label: 0
File: abstractfactory (13).java, Predicted Label: 1, True Label: 1
File: nonab (2).java, Predicted Label: 0, True Label: 0
File: abstractfactory (16).java, Predicted Label: 1, True Label: 1
File: nonab (8).java, Predicted Label: 0, True Label: 0
File: abstractfactory (4).java, Predicted Label: 1, True Label: 1
File: abstractfactory (5).java, Predicted Label: 1, True Label: 1
File: nonab (26).java, Predicted

In [None]:
#Abstract Factory with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "abstractfactory"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "abstractfactory" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nonab (72).java, Predicted Label: 0, True Label: 0
File: nonab (11).java, Predicted Label: 0, True Label: 0
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: nonab (19).java, Predicted Label: 0, True Label: 0
File: nonab (5).java, Predicted Label: 0, True Label: 0
File: nonab (7).java, Predicted Label: 0, True Label: 0
File: nonab (1).java, Predicted Label: 1, True Label: 0
File: abstractfactory (10).java, Predicted Label: 1, True Label: 1
File: abstractfactory (9).java, Predicted Label: 1, True Label: 1
File: abstractfactory (13).java, Predicted Label: 1, True Label: 1
File: nonab (2).java, Predicted Label: 0, True Label: 0
File: abstractfactory (16).java, Predicted Label: 1, True Label: 1
File: nonab (8).java, Predicted Label: 0, True Label: 0
File: abstractfactory (4).java, Predicted Label: 1, True Label: 1
File: abstractfactory (5).java, Predicted Label: 1, True Label: 1
File: abstractfactory (1).java, Pr

In [None]:
#Prototype with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "prototype"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "prototype" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: prototype (27).java, Predicted Label: 1, True Label: 1
File: nonp (53).java, Predicted Label: 0, True Label: 0
File: nonp (23).java, Predicted Label: 0, True Label: 0
File: nonp (43).java, Predicted Label: 0, True Label: 0
File: nonp (35).java, Predicted Label: 0, True Label: 0
File: nonp (51).java, Predicted Label: 0, True Label: 0
File: nonp (59).java, Predicted Label: 0, True Label: 0
File: nonp (5).java, Predicted Label: 0, True Label: 0
File: nonp (31).java, Predicted Label: 0, True Label: 0
File: prototype (13).java, Predicted Label: 1, True Label: 1
File: prototype (22).java, Predicted Label: 0, True Label: 1
File: nonp (7).java, Predicted Label: 0, True Label: 0
File: nonp (13).java, Predicted Label: 0, True Label: 0
File: nonp (29).java, Predicted Label: 1, True Label: 0
File: prototype (15).java, Predicted Label: 1, True Label: 1
File: nonp (44).java, Predicted Label: 0, True Label: 0
File: nonp (8).java, Predicted Label

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "prototype"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "prototype" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: prototype (27).java, Predicted Label: 1, True Label: 1
File: non-DP (44).java, Predicted Label: 0, True Label: 0
File: non-DP (34).java, Predicted Label: 0, True Label: 0
File: non-DP (29).java, Predicted Label: 0, True Label: 0
File: non-DP (35).java, Predicted Label: 0, True Label: 0
File: prototype (13).java, Predicted Label: 1, True Label: 1
File: prototype (22).java, Predicted Label: 0, True Label: 1
File: non-DP (48).java, Predicted Label: 0, True Label: 0
File: prototype (15).java, Predicted Label: 1, True Label: 1
File: non-DP (25).java, Predicted Label: 0, True Label: 0
File: non-DP (45).java, Predicted Label: 0, True Label: 0
File: non-DP (39).java, Predicted Label: 0, True Label: 0
File: non-DP (2).java, Predicted Label: 0, True Label: 0
File: non-DP (33).java, Predicted Label: 0, True Label: 0
File: prototype (14).java, Predicted Label: 0, True Label: 1
File: prototype (16).java, Predicted Label: 1, True Label: 1
File:

In [None]:
#Prototype with different settings

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "prototype"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "prototype" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: prototype (27).java, Predicted Label: 1, True Label: 1
File: nonp (53).java, Predicted Label: 0, True Label: 0
File: nonp (23).java, Predicted Label: 0, True Label: 0
File: nonp (43).java, Predicted Label: 0, True Label: 0
File: nonp (35).java, Predicted Label: 0, True Label: 0
File: nonp (51).java, Predicted Label: 0, True Label: 0
File: nonp (59).java, Predicted Label: 0, True Label: 0
File: nonp (5).java, Predicted Label: 0, True Label: 0
File: nonp (31).java, Predicted Label: 0, True Label: 0
File: prototype (13).java, Predicted Label: 1, True Label: 1
File: prototype (22).java, Predicted Label: 0, True Label: 1
File: nonp (7).java, Predicted Label: 0, True Label: 0
File: nonp (13).java, Predicted Label: 0, True Label: 0
File: nonp (29).java, Predicted Label: 1, True Label: 0
File: prototype (15).java, Predicted Label: 1, True Label: 1
File: nonp (44).java, Predicted Label: 0, True Label: 0
File: nonp (8).java, Predicted Label

In [None]:
#tNSE plot for design patterns

In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code, model, tokenizer):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Function to perform t-SNE visualization
def perform_tsne(embeddings, labels):
    tsne = TSNE(n_components=2, random_state=42)
    tsne_results = tsne.fit_transform(embeddings)

    # Create a scatter plot for t-SNE visualization
    plt.figure(figsize=(10, 8))

    sns.scatterplot(x=tsne_results[:, 0], y=tsne_results[:, 1], hue=labels, palette="dark", s=50, alpha=0.7)

    plt.title('t-SNE Visualization for CodeT5 on Different Classes', fontsize=16)
    plt.xlabel('t-SNE Dimension 1', fontsize=14)
    plt.ylabel('t-SNE Dimension 2', fontsize=14)
    plt.legend(title='Class', loc='upper right', fontsize=12)
    plt.grid(True)

    plt.show()

# Load your Java programs from a directory
java_code_dir = "prototype"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "prototype" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code, model, tokenizer)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for t-SNE
flattened_embeddings = np.vstack(program_embeddings)

# Perform t-SNE visualization
perform_tsne(flattened_embeddings, true_labels)


In [None]:
#Prototype with different settings

In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code, model, tokenizer):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Function to perform t-SNE visualization
def perform_tsne(embeddings, labels):
    tsne = TSNE(n_components=2, random_state=42)
    tsne_results = tsne.fit_transform(embeddings)

    # Create a scatter plot for t-SNE visualization
    plt.figure(figsize=(10, 8))

    sns.scatterplot(x=tsne_results[:, 0], y=tsne_results[:, 1], hue=labels, palette="dark", s=50, alpha=0.7)

    plt.title('t-SNE Visualization for CodeT5 on Different Classes', fontsize=16)
    plt.xlabel('t-SNE Dimension 1', fontsize=14)
    plt.ylabel('t-SNE Dimension 2', fontsize=14)
    plt.legend(title='Class', loc='upper right', fontsize=12)
    plt.grid(True)

    plt.show()

# Load your Java programs from a directory
java_code_dir = "prototype"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "prototype" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code, model, tokenizer)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for t-SNE
flattened_embeddings = np.vstack(program_embeddings)

# Perform t-SNE visualization
perform_tsne(flattened_embeddings, true_labels)


**Time calcultation for Singleton**

In [None]:
import os
import torch
import time
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
import random

# Check if GPU is available and use it; otherwise, use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load your Java programs from a directory
java_code_dir = "/content/singleton"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)  # Move model to the appropriate device
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        if line.strip() == "":  # Skip empty lines
            continue

        # Encode the input using the tokenizer
        inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to device

        # Forward pass through the encoder model
        with torch.no_grad():
            encoder_outputs = model.encoder(**inputs)  # Use only the encoder part

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = encoder_outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Move output to CPU for NumPy conversion
        line_embeddings.append(line_embedding)

    if len(line_embeddings) == 0:  # To handle files with no valid lines
        return np.zeros((1, model.config.hidden_size))

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Define the number of iterations for the experiment
n_runs = 10
k = 3  # Number of neighbors to consider
training_times = []
prediction_times = []

for run in range(n_runs):
    print(f"Run {run + 1}/{n_runs}")

    # Separate positive and negative examples
    positive_files = [file for file in java_files if "singleton" in file]
    negative_files = [file for file in java_files if "singleton" not in file]

    # Select an approximately equal number of positive and negative examples (40%-60% range)
    num_positive = len(positive_files)
    num_negative = len(negative_files)

    lower_bound = int(0.4 * min(num_positive, num_negative))
    upper_bound = int(0.6 * min(num_positive, num_negative))

    num_samples = random.randint(lower_bound, upper_bound)

    positive_sample = random.sample(positive_files, num_samples)
    negative_sample = random.sample(negative_files, num_samples)

    selected_files = positive_sample + negative_sample
    random.shuffle(selected_files)

    true_labels = [1] * len(positive_sample) + [0] * len(negative_sample)
    program_embeddings = []

    # Start training time
    start_training_time = time.time()

    # Extract embeddings for each file
    for file in selected_files:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

    # Convert to NumPy array
    program_embeddings = np.vstack(program_embeddings)

    # Calculate the distance matrix
    distance_metric = 'euclidean'  # Can switch between 'cosine' or 'euclidean'
    distance_matrix = calculate_distance_matrix(program_embeddings, metric=distance_metric)

    # Get k-nearest neighbors indices for each program
    knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

    # End training time
    end_training_time = time.time()
    training_time = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
    training_times.append(training_time)

    # Start prediction time
    start_prediction_time = time.time()

    # Predict labels based on neighbors
    predicted_labels = []
    for indices in knn_indices:
        neighbor_labels = [true_labels[i] for i in indices]
        predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
        predicted_labels.append(predicted_label)

    # End prediction time
    end_prediction_time = time.time()
    prediction_time = (end_prediction_time - start_prediction_time) * 1000  # Convert to milliseconds
    prediction_times.append(prediction_time)

    # Calculate precision, recall, and f1-score
    predicted_labels = np.array(predicted_labels)
    precision = precision_score(true_labels, predicted_labels, zero_division=1)
    recall = recall_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)

    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")

# Calculate the mean training and prediction times
mean_training_time = np.mean(training_times)
mean_prediction_time = np.mean(prediction_times)

print(f"\nMean Training Time: {mean_training_time:.2f} ms")
print(f"Mean Prediction Time: {mean_prediction_time:.2f} ms")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]



Run 1/10
Precision: 0.33, Recall: 0.33, F1 Score: 0.33
Run 2/10
Precision: 0.55, Recall: 0.60, F1 Score: 0.57
Run 3/10
Precision: 0.38, Recall: 0.45, F1 Score: 0.42
Run 4/10
Precision: 0.33, Recall: 0.30, F1 Score: 0.32
Run 5/10
Precision: 0.47, Recall: 0.80, F1 Score: 0.59
Run 6/10
Precision: 0.33, Recall: 0.45, F1 Score: 0.38
Run 7/10
Precision: 0.67, Recall: 0.55, F1 Score: 0.60
Run 8/10
Precision: 0.33, Recall: 0.29, F1 Score: 0.31
Run 9/10
Precision: 0.20, Recall: 0.13, F1 Score: 0.16
Run 10/10
Precision: 0.27, Recall: 0.36, F1 Score: 0.31

Mean Training Time: 55256.77 ms
Mean Prediction Time: 0.09 ms


In [None]:
import os
import torch
import time
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
import random

# Check if GPU is available and use it; otherwise, use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load your Java programs from a directory
java_code_dir = "/content/singleton"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)  # Move model to the appropriate device
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        if line.strip() == "":  # Skip empty lines
            continue

        # Encode the input using the tokenizer
        inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to device

        # Forward pass through the encoder model
        with torch.no_grad():
            encoder_outputs = model.encoder(**inputs)  # Use only the encoder part

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = encoder_outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Move output to CPU for NumPy conversion
        line_embeddings.append(line_embedding)

    if len(line_embeddings) == 0:  # To handle files with no valid lines
        return np.zeros((1, model.config.hidden_size))

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Define the number of iterations for the experiment
n_runs = 10
k = 3  # Number of neighbors to consider
training_times = []
prediction_times = []

for run in range(n_runs):
    print(f"Run {run + 1}/{n_runs}")

    # Separate positive and negative examples
    positive_files = [file for file in java_files if "singleton" in file]
    negative_files = [file for file in java_files if "singleton" not in file]

    # Select an approximately equal number of positive and negative examples (40%-60% range)
    num_positive = len(positive_files)
    num_negative = len(negative_files)

    lower_bound = int(0.4 * min(num_positive, num_negative))
    upper_bound = int(0.6 * min(num_positive, num_negative))

    num_samples = random.randint(lower_bound, upper_bound)

    positive_sample = random.sample(positive_files, num_samples)
    negative_sample = random.sample(negative_files, num_samples)

    selected_files = positive_sample + negative_sample
    random.shuffle(selected_files)

    true_labels = [1] * len(positive_sample) + [0] * len(negative_sample)
    program_embeddings = []

    # Start training time
    start_training_time = time.time()

    # Extract embeddings for each file
    for file in selected_files:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

    # Convert to NumPy array
    program_embeddings = np.vstack(program_embeddings)

    # Calculate the distance matrix
    distance_metric = 'euclidean'  # Can switch between 'cosine' or 'euclidean'
    distance_matrix = calculate_distance_matrix(program_embeddings, metric=distance_metric)

    # Get k-nearest neighbors indices for each program
    knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

    # End training time
    end_training_time = time.time()
    training_time = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
    training_times.append(training_time)

    # Start prediction time
    start_prediction_time = time.time()

    # Predict labels based on neighbors
    predicted_labels = []
    for indices in knn_indices:
        neighbor_labels = [true_labels[i] for i in indices]
        predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
        predicted_labels.append(predicted_label)

    # End prediction time
    end_prediction_time = time.time()
    prediction_time = (end_prediction_time - start_prediction_time) * 1000  # Convert to milliseconds
    prediction_times.append(prediction_time)

    # Calculate precision, recall, and f1-score
    predicted_labels = np.array(predicted_labels)
    precision = precision_score(true_labels, predicted_labels, zero_division=1)
    recall = recall_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)

    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")

# Calculate the mean training and prediction times
mean_training_time = np.mean(training_times)
mean_prediction_time = np.mean(prediction_times)

print(f"\nMean Training Time: {mean_training_time:.2f} ms")
print(f"Mean Prediction Time: {mean_prediction_time:.2f} ms")




Run 1/10
Precision: 0.55, Recall: 0.40, F1 Score: 0.46
Run 2/10
Precision: 0.17, Recall: 0.10, F1 Score: 0.12
Run 3/10
Precision: 0.60, Recall: 0.60, F1 Score: 0.60
Run 4/10
Precision: 0.59, Recall: 0.77, F1 Score: 0.67
Run 5/10
Precision: 0.55, Recall: 0.46, F1 Score: 0.50
Run 6/10
Precision: 0.50, Recall: 0.60, F1 Score: 0.55
Run 7/10
Precision: 0.42, Recall: 0.50, F1 Score: 0.45
Run 8/10
Precision: 0.31, Recall: 0.38, F1 Score: 0.34
Run 9/10
Precision: 0.64, Recall: 0.50, F1 Score: 0.56
Run 10/10
Precision: 0.37, Recall: 0.50, F1 Score: 0.42

Mean Training Time: 52697.69 ms
Mean Prediction Time: 0.10 ms


In [None]:
import os
import torch
import time
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
import random

# Check if GPU is available and use it; otherwise, use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load your Java programs from a directory
java_code_dir = "/content/singleton"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)  # Move model to the appropriate device
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        if line.strip() == "":  # Skip empty lines
            continue

        # Encode the input using the tokenizer
        inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to device

        # Forward pass through the encoder model
        with torch.no_grad():
            encoder_outputs = model.encoder(**inputs)  # Use only the encoder part

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = encoder_outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Move output to CPU for NumPy conversion
        line_embeddings.append(line_embedding)

    if len(line_embeddings) == 0:  # To handle files with no valid lines
        return np.zeros((1, model.config.hidden_size))

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Define the number of iterations for the experiment
n_runs = 10
k = 3  # Number of neighbors to consider
training_times = []
prediction_times = []

for run in range(n_runs):
    print(f"Run {run + 1}/{n_runs}")

    # Separate positive and negative examples
    positive_files = [file for file in java_files if "singleton" in file]
    negative_files = [file for file in java_files if "singleton" not in file]

    # Select an approximately equal number of positive and negative examples (40%-60% range)
    num_positive = len(positive_files)
    num_negative = len(negative_files)

    lower_bound = int(0.4 * min(num_positive, num_negative))
    upper_bound = int(0.6 * min(num_positive, num_negative))

    num_samples = random.randint(lower_bound, upper_bound)

    positive_sample = random.sample(positive_files, num_samples)
    negative_sample = random.sample(negative_files, num_samples)

    selected_files = positive_sample + negative_sample
    random.shuffle(selected_files)

    true_labels = [1] * len(positive_sample) + [0] * len(negative_sample)
    program_embeddings = []

    # Start training time
    start_training_time = time.time()

    # Extract embeddings for each file
    for file in selected_files:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

    # Convert to NumPy array
    program_embeddings = np.vstack(program_embeddings)

    # Calculate the distance matrix
    distance_metric = 'euclidean'  # Can switch between 'cosine' or 'euclidean'
    distance_matrix = calculate_distance_matrix(program_embeddings, metric=distance_metric)

    # Get k-nearest neighbors indices for each program
    knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

    # End training time
    end_training_time = time.time()
    training_time = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
    training_times.append(training_time)

    # Start prediction time
    start_prediction_time = time.time()

    # Predict labels based on neighbors
    predicted_labels = []
    for indices in knn_indices:
        neighbor_labels = [true_labels[i] for i in indices]
        predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
        predicted_labels.append(predicted_label)

    # End prediction time
    end_prediction_time = time.time()
    prediction_time = (end_prediction_time - start_prediction_time) * 1000  # Convert to milliseconds
    prediction_times.append(prediction_time)

    # Calculate precision, recall, and f1-score
    predicted_labels = np.array(predicted_labels)
    precision = precision_score(true_labels, predicted_labels, zero_division=1)
    recall = recall_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)

    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")

# Calculate the mean training and prediction times
mean_training_time = np.mean(training_times)
mean_prediction_time = np.mean(prediction_times)

print(f"\nMean Training Time: {mean_training_time:.2f} ms")
print(f"Mean Prediction Time: {mean_prediction_time:.2f} ms")




Run 1/10
Precision: 0.35, Recall: 0.46, F1 Score: 0.40
Run 2/10
Precision: 0.60, Recall: 0.69, F1 Score: 0.64
Run 3/10
Precision: 0.53, Recall: 0.67, F1 Score: 0.59
Run 4/10
Precision: 0.43, Recall: 0.46, F1 Score: 0.44
Run 5/10
Precision: 0.56, Recall: 0.50, F1 Score: 0.53
Run 6/10
Precision: 0.37, Recall: 0.47, F1 Score: 0.41
Run 7/10
Precision: 0.46, Recall: 0.50, F1 Score: 0.48
Run 8/10
Precision: 0.45, Recall: 0.42, F1 Score: 0.43
Run 9/10
Precision: 0.27, Recall: 0.25, F1 Score: 0.26
Run 10/10
Precision: 0.43, Recall: 0.60, F1 Score: 0.50

Mean Training Time: 56524.08 ms
Mean Prediction Time: 0.11 ms


**Calculation for prototype**

In [None]:
import os
import torch
import time
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
import random

# Check if GPU is available and use it; otherwise, use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load your Java programs from a directory
java_code_dir = "/content/singleton"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)  # Move model to the appropriate device
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        if line.strip() == "":  # Skip empty lines
            continue

        # Encode the input using the tokenizer
        inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to device

        # Forward pass through the encoder model
        with torch.no_grad():
            encoder_outputs = model.encoder(**inputs)  # Use only the encoder part

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = encoder_outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Move output to CPU for NumPy conversion
        line_embeddings.append(line_embedding)

    if len(line_embeddings) == 0:  # To handle files with no valid lines
        return np.zeros((1, model.config.hidden_size))

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Define the number of iterations for the experiment
n_runs = 10
k = 3  # Number of neighbors to consider
training_times = []
prediction_times = []

for run in range(n_runs):
    print(f"Run {run + 1}/{n_runs}")

    # Separate positive and negative examples
    positive_files = [file for file in java_files if "singleton" in file]
    negative_files = [file for file in java_files if "singleton" not in file]

    # Select an approximately equal number of positive and negative examples (40%-60% range)
    num_positive = len(positive_files)
    num_negative = len(negative_files)

    lower_bound = int(0.4 * min(num_positive, num_negative))
    upper_bound = int(0.6 * min(num_positive, num_negative))

    num_samples = random.randint(lower_bound, upper_bound)

    positive_sample = random.sample(positive_files, num_samples)
    negative_sample = random.sample(negative_files, num_samples)

    selected_files = positive_sample + negative_sample
    random.shuffle(selected_files)

    true_labels = [1] * len(positive_sample) + [0] * len(negative_sample)
    program_embeddings = []

    # Start training time
    start_training_time = time.time()

    # Extract embeddings for each file
    for file in selected_files:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

    # Convert to NumPy array
    program_embeddings = np.vstack(program_embeddings)

    # Calculate the distance matrix
    distance_metric = 'euclidean'  # Can switch between 'cosine' or 'euclidean'
    distance_matrix = calculate_distance_matrix(program_embeddings, metric=distance_metric)

    # Get k-nearest neighbors indices for each program
    knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

    # End training time
    end_training_time = time.time()
    training_time = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
    training_times.append(training_time)

    # Start prediction time
    start_prediction_time = time.time()

    # Predict labels based on neighbors
    predicted_labels = []
    for indices in knn_indices:
        neighbor_labels = [true_labels[i] for i in indices]
        predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
        predicted_labels.append(predicted_label)

    # End prediction time
    end_prediction_time = time.time()
    prediction_time = (end_prediction_time - start_prediction_time) * 1000  # Convert to milliseconds
    prediction_times.append(prediction_time)

    # Calculate precision, recall, and f1-score
    predicted_labels = np.array(predicted_labels)
    precision = precision_score(true_labels, predicted_labels, zero_division=1)
    recall = recall_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)

    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")

# Calculate the mean training and prediction times
mean_training_time = np.mean(training_times)
mean_prediction_time = np.mean(prediction_times)

print(f"\nMean Training Time: {mean_training_time:.2f} ms")
print(f"Mean Prediction Time: {mean_prediction_time:.2f} ms")




Run 1/10
Precision: 0.70, Recall: 0.47, F1 Score: 0.56
Run 2/10
Precision: 0.40, Recall: 0.27, F1 Score: 0.32
Run 3/10
Precision: 1.00, Recall: 0.60, F1 Score: 0.75
Run 4/10
Precision: 0.53, Recall: 0.67, F1 Score: 0.59
Run 5/10
Precision: 0.36, Recall: 0.50, F1 Score: 0.42
Run 6/10
Precision: 0.44, Recall: 0.50, F1 Score: 0.47
Run 7/10
Precision: 0.60, Recall: 0.60, F1 Score: 0.60
Run 8/10
Precision: 0.46, Recall: 0.60, F1 Score: 0.52
Run 9/10
Precision: 0.50, Recall: 0.58, F1 Score: 0.54
Run 10/10
Precision: 0.67, Recall: 0.67, F1 Score: 0.67

Mean Training Time: 62751.02 ms
Mean Prediction Time: 0.12 ms


**Builder time calculation**

In [None]:
import os
import torch
import time
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
import random

# Check if GPU is available and use it; otherwise, use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load your Java programs from a directory
java_code_dir = "/content/builder"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)  # Move model to the appropriate device
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        if line.strip() == "":  # Skip empty lines
            continue

        # Encode the input using the tokenizer
        inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to device

        # Forward pass through the encoder model
        with torch.no_grad():
            encoder_outputs = model.encoder(**inputs)  # Use only the encoder part

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = encoder_outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Move output to CPU for NumPy conversion
        line_embeddings.append(line_embedding)

    if len(line_embeddings) == 0:  # To handle files with no valid lines
        return np.zeros((1, model.config.hidden_size))

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Define the number of iterations for the experiment
n_runs = 10
k = 3  # Number of neighbors to consider
training_times = []
prediction_times = []

for run in range(n_runs):
    print(f"Run {run + 1}/{n_runs}")

    # Separate positive and negative examples
    positive_files = [file for file in java_files if "builder" in file]
    negative_files = [file for file in java_files if "builder" not in file]

    # Select an approximately equal number of positive and negative examples (40%-60% range)
    num_positive = len(positive_files)
    num_negative = len(negative_files)

    lower_bound = int(0.4 * min(num_positive, num_negative))
    upper_bound = int(0.6 * min(num_positive, num_negative))

    num_samples = random.randint(lower_bound, upper_bound)

    positive_sample = random.sample(positive_files, num_samples)
    negative_sample = random.sample(negative_files, num_samples)

    selected_files = positive_sample + negative_sample
    random.shuffle(selected_files)

    true_labels = [1] * len(positive_sample) + [0] * len(negative_sample)
    program_embeddings = []

    # Start training time
    start_training_time = time.time()

    # Extract embeddings for each file
    for file in selected_files:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

    # Convert to NumPy array
    program_embeddings = np.vstack(program_embeddings)

    # Calculate the distance matrix
    distance_metric = 'euclidean'  # Can switch between 'cosine' or 'euclidean'
    distance_matrix = calculate_distance_matrix(program_embeddings, metric=distance_metric)

    # Get k-nearest neighbors indices for each program
    knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

    # End training time
    end_training_time = time.time()
    training_time = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
    training_times.append(training_time)

    # Start prediction time
    start_prediction_time = time.time()

    # Predict labels based on neighbors
    predicted_labels = []
    for indices in knn_indices:
        neighbor_labels = [true_labels[i] for i in indices]
        predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
        predicted_labels.append(predicted_label)

    # End prediction time
    end_prediction_time = time.time()
    prediction_time = (end_prediction_time - start_prediction_time) * 1000  # Convert to milliseconds
    prediction_times.append(prediction_time)

    # Calculate precision, recall, and f1-score
    predicted_labels = np.array(predicted_labels)
    precision = precision_score(true_labels, predicted_labels, zero_division=1)
    recall = recall_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)

    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")

# Calculate the mean training and prediction times
mean_training_time = np.mean(training_times)
mean_prediction_time = np.mean(prediction_times)

print(f"\nMean Training Time: {mean_training_time:.2f} ms")
print(f"Mean Prediction Time: {mean_prediction_time:.2f} ms")




Run 1/10
Precision: 0.00, Recall: 0.00, F1 Score: 0.00
Run 2/10
Precision: 0.50, Recall: 0.50, F1 Score: 0.50
Run 3/10
Precision: 0.20, Recall: 0.25, F1 Score: 0.22
Run 4/10
Precision: 0.25, Recall: 0.33, F1 Score: 0.29
Run 5/10
Precision: 0.17, Recall: 0.20, F1 Score: 0.18
Run 6/10
Precision: 0.20, Recall: 0.25, F1 Score: 0.22
Run 7/10
Precision: 0.29, Recall: 0.40, F1 Score: 0.33
Run 8/10
Precision: 0.75, Recall: 1.00, F1 Score: 0.86
Run 9/10
Precision: 0.00, Recall: 0.00, F1 Score: 0.00
Run 10/10
Precision: 0.00, Recall: 0.00, F1 Score: 0.00

Mean Training Time: 27341.18 ms
Mean Prediction Time: 0.04 ms


In [None]:
import shutil

# Specify the directory you want to delete
directory_to_delete = '/content/builder'  # Change this to your directory name

# Remove the directory
shutil.rmtree(directory_to_delete)


**Builder time calculation**

In [None]:
import os
import torch
import time
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
import random

# Check if GPU is available and use it; otherwise, use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load your Java programs from a directory
java_code_dir = "/content/builder"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)  # Move model to the appropriate device
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        if line.strip() == "":  # Skip empty lines
            continue

        # Encode the input using the tokenizer
        inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to device

        # Forward pass through the encoder model
        with torch.no_grad():
            encoder_outputs = model.encoder(**inputs)  # Use only the encoder part

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = encoder_outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Move output to CPU for NumPy conversion
        line_embeddings.append(line_embedding)

    if len(line_embeddings) == 0:  # To handle files with no valid lines
        return np.zeros((1, model.config.hidden_size))

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Define the number of iterations for the experiment
n_runs = 10
k = 3  # Number of neighbors to consider
training_times = []
prediction_times = []

for run in range(n_runs):
    print(f"Run {run + 1}/{n_runs}")

    # Separate positive and negative examples
    positive_files = [file for file in java_files if "builder" in file]
    negative_files = [file for file in java_files if "builder" not in file]

    # Select an approximately equal number of positive and negative examples (40%-60% range)
    num_positive = len(positive_files)
    num_negative = len(negative_files)

    lower_bound = int(0.4 * min(num_positive, num_negative))
    upper_bound = int(0.6 * min(num_positive, num_negative))

    num_samples = random.randint(lower_bound, upper_bound)

    positive_sample = random.sample(positive_files, num_samples)
    negative_sample = random.sample(negative_files, num_samples)

    selected_files = positive_sample + negative_sample
    random.shuffle(selected_files)

    true_labels = [1] * len(positive_sample) + [0] * len(negative_sample)
    program_embeddings = []

    # Start training time
    start_training_time = time.time()

    # Extract embeddings for each file
    for file in selected_files:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

    # Convert to NumPy array
    program_embeddings = np.vstack(program_embeddings)

    # Calculate the distance matrix
    distance_metric = 'euclidean'  # Can switch between 'cosine' or 'euclidean'
    distance_matrix = calculate_distance_matrix(program_embeddings, metric=distance_metric)

    # Get k-nearest neighbors indices for each program
    knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

    # End training time
    end_training_time = time.time()
    training_time = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
    training_times.append(training_time)

    # Start prediction time
    start_prediction_time = time.time()

    # Predict labels based on neighbors
    predicted_labels = []
    for indices in knn_indices:
        neighbor_labels = [true_labels[i] for i in indices]
        predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
        predicted_labels.append(predicted_label)

    # End prediction time
    end_prediction_time = time.time()
    prediction_time = (end_prediction_time - start_prediction_time) * 1000  # Convert to milliseconds
    prediction_times.append(prediction_time)

    # Calculate precision, recall, and f1-score
    predicted_labels = np.array(predicted_labels)
    precision = precision_score(true_labels, predicted_labels, zero_division=1)
    recall = recall_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)

    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")

# Calculate the mean training and prediction times
mean_training_time = np.mean(training_times)
mean_prediction_time = np.mean(prediction_times)

print(f"\nMean Training Time: {mean_training_time:.2f} ms")
print(f"Mean Prediction Time: {mean_prediction_time:.2f} ms")




Run 1/10
Precision: 0.33, Recall: 0.20, F1 Score: 0.25
Run 2/10
Precision: 1.00, Recall: 0.67, F1 Score: 0.80
Run 3/10
Precision: 0.00, Recall: 0.00, F1 Score: 0.00
Run 4/10
Precision: 0.33, Recall: 0.25, F1 Score: 0.29
Run 5/10
Precision: 0.67, Recall: 0.50, F1 Score: 0.57
Run 6/10
Precision: 0.67, Recall: 0.67, F1 Score: 0.67
Run 7/10
Precision: 0.50, Recall: 0.20, F1 Score: 0.29
Run 8/10
Precision: 0.00, Recall: 0.00, F1 Score: 0.00
Run 9/10
Precision: 0.00, Recall: 0.00, F1 Score: 0.00
Run 10/10
Precision: 0.25, Recall: 0.33, F1 Score: 0.29

Mean Training Time: 24367.04 ms
Mean Prediction Time: 0.04 ms


**Abstract Factory time calculation**

In [None]:
import os
import torch
import time
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
import random

# Check if GPU is available and use it; otherwise, use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load your Java programs from a directory
java_code_dir = "/content/abstractfactory"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)  # Move model to the appropriate device
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        if line.strip() == "":  # Skip empty lines
            continue

        # Encode the input using the tokenizer
        inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to device

        # Forward pass through the encoder model
        with torch.no_grad():
            encoder_outputs = model.encoder(**inputs)  # Use only the encoder part

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = encoder_outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Move output to CPU for NumPy conversion
        line_embeddings.append(line_embedding)

    if len(line_embeddings) == 0:  # To handle files with no valid lines
        return np.zeros((1, model.config.hidden_size))

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Define the number of iterations for the experiment
n_runs = 10
k = 3  # Number of neighbors to consider
training_times = []
prediction_times = []

for run in range(n_runs):
    print(f"Run {run + 1}/{n_runs}")

    # Separate positive and negative examples
    positive_files = [file for file in java_files if "abstractfactory" in file]
    negative_files = [file for file in java_files if "abstractfactory" not in file]

    # Select an approximately equal number of positive and negative examples (40%-60% range)
    num_positive = len(positive_files)
    num_negative = len(negative_files)

    lower_bound = int(0.4 * min(num_positive, num_negative))
    upper_bound = int(0.6 * min(num_positive, num_negative))

    num_samples = random.randint(lower_bound, upper_bound)

    positive_sample = random.sample(positive_files, num_samples)
    negative_sample = random.sample(negative_files, num_samples)

    selected_files = positive_sample + negative_sample
    random.shuffle(selected_files)

    true_labels = [1] * len(positive_sample) + [0] * len(negative_sample)
    program_embeddings = []

    # Start training time
    start_training_time = time.time()

    # Extract embeddings for each file
    for file in selected_files:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

    # Convert to NumPy array
    program_embeddings = np.vstack(program_embeddings)

    # Calculate the distance matrix
    distance_metric = 'euclidean'  # Can switch between 'cosine' or 'euclidean'
    distance_matrix = calculate_distance_matrix(program_embeddings, metric=distance_metric)

    # Get k-nearest neighbors indices for each program
    knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

    # End training time
    end_training_time = time.time()
    training_time = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
    training_times.append(training_time)

    # Start prediction time
    start_prediction_time = time.time()

    # Predict labels based on neighbors
    predicted_labels = []
    for indices in knn_indices:
        neighbor_labels = [true_labels[i] for i in indices]
        predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
        predicted_labels.append(predicted_label)

    # End prediction time
    end_prediction_time = time.time()
    prediction_time = (end_prediction_time - start_prediction_time) * 1000  # Convert to milliseconds
    prediction_times.append(prediction_time)

    # Calculate precision, recall, and f1-score
    predicted_labels = np.array(predicted_labels)
    precision = precision_score(true_labels, predicted_labels, zero_division=1)
    recall = recall_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)

    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")

# Calculate the mean training and prediction times
mean_training_time = np.mean(training_times)
mean_prediction_time = np.mean(prediction_times)

print(f"\nMean Training Time: {mean_training_time:.2f} ms")
print(f"Mean Prediction Time: {mean_prediction_time:.2f} ms")




Run 1/10
Precision: 0.50, Recall: 0.50, F1 Score: 0.50
Run 2/10
Precision: 0.44, Recall: 0.44, F1 Score: 0.44
Run 3/10
Precision: 0.71, Recall: 0.83, F1 Score: 0.77
Run 4/10
Precision: 0.67, Recall: 0.86, F1 Score: 0.75
Run 5/10
Precision: 0.78, Recall: 0.88, F1 Score: 0.82
Run 6/10
Precision: 0.50, Recall: 0.43, F1 Score: 0.46
Run 7/10
Precision: 0.60, Recall: 0.50, F1 Score: 0.55
Run 8/10
Precision: 0.43, Recall: 0.50, F1 Score: 0.46
Run 9/10
Precision: 0.00, Recall: 0.00, F1 Score: 0.00
Run 10/10
Precision: 0.17, Recall: 0.14, F1 Score: 0.15

Mean Training Time: 23333.28 ms
Mean Prediction Time: 0.06 ms


**Abstract Factory time calculation**

In [None]:
import os
import torch
import time
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
import random

# Check if GPU is available and use it; otherwise, use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load your Java programs from a directory
java_code_dir = "/content/abstractfactory"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)  # Move model to the appropriate device
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        if line.strip() == "":  # Skip empty lines
            continue

        # Encode the input using the tokenizer
        inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to device

        # Forward pass through the encoder model
        with torch.no_grad():
            encoder_outputs = model.encoder(**inputs)  # Use only the encoder part

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = encoder_outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Move output to CPU for NumPy conversion
        line_embeddings.append(line_embedding)

    if len(line_embeddings) == 0:  # To handle files with no valid lines
        return np.zeros((1, model.config.hidden_size))

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Define the number of iterations for the experiment
n_runs = 10
k = 3  # Number of neighbors to consider
training_times = []
prediction_times = []

for run in range(n_runs):
    print(f"Run {run + 1}/{n_runs}")

    # Separate positive and negative examples
    positive_files = [file for file in java_files if "abstractfactory" in file]
    negative_files = [file for file in java_files if "abstractfactory" not in file]

    # Select an approximately equal number of positive and negative examples (40%-60% range)
    num_positive = len(positive_files)
    num_negative = len(negative_files)

    lower_bound = int(0.4 * min(num_positive, num_negative))
    upper_bound = int(0.6 * min(num_positive, num_negative))

    num_samples = random.randint(lower_bound, upper_bound)

    positive_sample = random.sample(positive_files, num_samples)
    negative_sample = random.sample(negative_files, num_samples)

    selected_files = positive_sample + negative_sample
    random.shuffle(selected_files)

    true_labels = [1] * len(positive_sample) + [0] * len(negative_sample)
    program_embeddings = []

    # Start training time
    start_training_time = time.time()

    # Extract embeddings for each file
    for file in selected_files:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

    # Convert to NumPy array
    program_embeddings = np.vstack(program_embeddings)

    # Calculate the distance matrix
    distance_metric = 'euclidean'  # Can switch between 'cosine' or 'euclidean'
    distance_matrix = calculate_distance_matrix(program_embeddings, metric=distance_metric)

    # Get k-nearest neighbors indices for each program
    knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

    # End training time
    end_training_time = time.time()
    training_time = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
    training_times.append(training_time)

    # Start prediction time
    start_prediction_time = time.time()

    # Predict labels based on neighbors
    predicted_labels = []
    for indices in knn_indices:
        neighbor_labels = [true_labels[i] for i in indices]
        predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
        predicted_labels.append(predicted_label)

    # End prediction time
    end_prediction_time = time.time()
    prediction_time = (end_prediction_time - start_prediction_time) * 1000  # Convert to milliseconds
    prediction_times.append(prediction_time)

    # Calculate precision, recall, and f1-score
    predicted_labels = np.array(predicted_labels)
    precision = precision_score(true_labels, predicted_labels, zero_division=1)
    recall = recall_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)

    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")

# Calculate the mean training and prediction times
mean_training_time = np.mean(training_times)
mean_prediction_time = np.mean(prediction_times)

print(f"\nMean Training Time: {mean_training_time:.2f} ms")
print(f"Mean Prediction Time: {mean_prediction_time:.2f} ms")




Run 1/10
Precision: 1.00, Recall: 0.71, F1 Score: 0.83
Run 2/10
Precision: 0.50, Recall: 0.57, F1 Score: 0.53
Run 3/10
Precision: 0.60, Recall: 0.50, F1 Score: 0.55
Run 4/10
Precision: 0.45, Recall: 0.56, F1 Score: 0.50
Run 5/10
Precision: 0.71, Recall: 0.62, F1 Score: 0.67
Run 6/10
Precision: 0.62, Recall: 0.89, F1 Score: 0.73
Run 7/10
Precision: 0.50, Recall: 0.71, F1 Score: 0.59
Run 8/10
Precision: 0.67, Recall: 1.00, F1 Score: 0.80
Run 9/10
Precision: 0.71, Recall: 0.62, F1 Score: 0.67
Run 10/10
Precision: 0.29, Recall: 0.22, F1 Score: 0.25

Mean Training Time: 25789.83 ms
Mean Prediction Time: 0.07 ms


**Factory method time calculation**

In [None]:
import os
import torch
import time
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
import random

# Check if GPU is available and use it; otherwise, use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load your Java programs from a directory
java_code_dir = "/content/factorymethod"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)  # Move model to the appropriate device
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        if line.strip() == "":  # Skip empty lines
            continue

        # Encode the input using the tokenizer
        inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to device

        # Forward pass through the encoder model
        with torch.no_grad():
            encoder_outputs = model.encoder(**inputs)  # Use only the encoder part

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = encoder_outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Move output to CPU for NumPy conversion
        line_embeddings.append(line_embedding)

    if len(line_embeddings) == 0:  # To handle files with no valid lines
        return np.zeros((1, model.config.hidden_size))

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Define the number of iterations for the experiment
n_runs = 10
k = 3  # Number of neighbors to consider
training_times = []
prediction_times = []

for run in range(n_runs):
    print(f"Run {run + 1}/{n_runs}")

    # Separate positive and negative examples
    positive_files = [file for file in java_files if "factorymethod" in file]
    negative_files = [file for file in java_files if "factorymethod" not in file]

    # Select an approximately equal number of positive and negative examples (40%-60% range)
    num_positive = len(positive_files)
    num_negative = len(negative_files)

    lower_bound = int(0.4 * min(num_positive, num_negative))
    upper_bound = int(0.6 * min(num_positive, num_negative))

    num_samples = random.randint(lower_bound, upper_bound)

    positive_sample = random.sample(positive_files, num_samples)
    negative_sample = random.sample(negative_files, num_samples)

    selected_files = positive_sample + negative_sample
    random.shuffle(selected_files)

    true_labels = [1] * len(positive_sample) + [0] * len(negative_sample)
    program_embeddings = []

    # Start training time
    start_training_time = time.time()

    # Extract embeddings for each file
    for file in selected_files:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

    # Convert to NumPy array
    program_embeddings = np.vstack(program_embeddings)

    # Calculate the distance matrix
    distance_metric = 'euclidean'  # Can switch between 'cosine' or 'euclidean'
    distance_matrix = calculate_distance_matrix(program_embeddings, metric=distance_metric)

    # Get k-nearest neighbors indices for each program
    knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

    # End training time
    end_training_time = time.time()
    training_time = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
    training_times.append(training_time)

    # Start prediction time
    start_prediction_time = time.time()

    # Predict labels based on neighbors
    predicted_labels = []
    for indices in knn_indices:
        neighbor_labels = [true_labels[i] for i in indices]
        predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
        predicted_labels.append(predicted_label)

    # End prediction time
    end_prediction_time = time.time()
    prediction_time = (end_prediction_time - start_prediction_time) * 1000  # Convert to milliseconds
    prediction_times.append(prediction_time)

    # Calculate precision, recall, and f1-score
    predicted_labels = np.array(predicted_labels)
    precision = precision_score(true_labels, predicted_labels, zero_division=1)
    recall = recall_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)

    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")

# Calculate the mean training and prediction times
mean_training_time = np.mean(training_times)
mean_prediction_time = np.mean(prediction_times)

print(f"\nMean Training Time: {mean_training_time:.2f} ms")
print(f"Mean Prediction Time: {mean_prediction_time:.2f} ms")




Run 1/10
Precision: 0.29, Recall: 0.40, F1 Score: 0.33
Run 2/10
Precision: 0.20, Recall: 0.20, F1 Score: 0.20
Run 3/10
Precision: 0.00, Recall: 0.00, F1 Score: 0.00
Run 4/10
Precision: 0.00, Recall: 0.00, F1 Score: 0.00
Run 5/10
Precision: 1.00, Recall: 0.25, F1 Score: 0.40
Run 6/10
Precision: 0.50, Recall: 0.25, F1 Score: 0.33
Run 7/10
Precision: 0.29, Recall: 0.33, F1 Score: 0.31
Run 8/10
Precision: 0.75, Recall: 0.60, F1 Score: 0.67
Run 9/10
Precision: 0.33, Recall: 0.20, F1 Score: 0.25
Run 10/10
Precision: 0.33, Recall: 0.25, F1 Score: 0.29

Mean Training Time: 24440.02 ms
Mean Prediction Time: 0.05 ms


**Factory method time calculation**

In [None]:
import os
import torch
import time
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
import random

# Check if GPU is available and use it; otherwise, use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load your Java programs from a directory
java_code_dir = "/content/factorymethod"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)  # Move model to the appropriate device
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        if line.strip() == "":  # Skip empty lines
            continue

        # Encode the input using the tokenizer
        inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to device

        # Forward pass through the encoder model
        with torch.no_grad():
            encoder_outputs = model.encoder(**inputs)  # Use only the encoder part

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = encoder_outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Move output to CPU for NumPy conversion
        line_embeddings.append(line_embedding)

    if len(line_embeddings) == 0:  # To handle files with no valid lines
        return np.zeros((1, model.config.hidden_size))

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Define the number of iterations for the experiment
n_runs = 10
k = 3  # Number of neighbors to consider
training_times = []
prediction_times = []

for run in range(n_runs):
    print(f"Run {run + 1}/{n_runs}")

    # Separate positive and negative examples
    positive_files = [file for file in java_files if "factorymethod" in file]
    negative_files = [file for file in java_files if "factorymethod" not in file]

    # Select an approximately equal number of positive and negative examples (40%-60% range)
    num_positive = len(positive_files)
    num_negative = len(negative_files)

    lower_bound = int(0.4 * min(num_positive, num_negative))
    upper_bound = int(0.6 * min(num_positive, num_negative))

    num_samples = random.randint(lower_bound, upper_bound)

    positive_sample = random.sample(positive_files, num_samples)
    negative_sample = random.sample(negative_files, num_samples)

    selected_files = positive_sample + negative_sample
    random.shuffle(selected_files)

    true_labels = [1] * len(positive_sample) + [0] * len(negative_sample)
    program_embeddings = []

    # Start training time
    start_training_time = time.time()

    # Extract embeddings for each file
    for file in selected_files:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

    # Convert to NumPy array
    program_embeddings = np.vstack(program_embeddings)

    # Calculate the distance matrix
    distance_metric = 'euclidean'  # Can switch between 'cosine' or 'euclidean'
    distance_matrix = calculate_distance_matrix(program_embeddings, metric=distance_metric)

    # Get k-nearest neighbors indices for each program
    knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

    # End training time
    end_training_time = time.time()
    training_time = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
    training_times.append(training_time)

    # Start prediction time
    start_prediction_time = time.time()

    # Predict labels based on neighbors
    predicted_labels = []
    for indices in knn_indices:
        neighbor_labels = [true_labels[i] for i in indices]
        predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
        predicted_labels.append(predicted_label)

    # End prediction time
    end_prediction_time = time.time()
    prediction_time = (end_prediction_time - start_prediction_time) * 1000  # Convert to milliseconds
    prediction_times.append(prediction_time)

    # Calculate precision, recall, and f1-score
    predicted_labels = np.array(predicted_labels)
    precision = precision_score(true_labels, predicted_labels, zero_division=1)
    recall = recall_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)

    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")

# Calculate the mean training and prediction times
mean_training_time = np.mean(training_times)
mean_prediction_time = np.mean(prediction_times)

print(f"\nMean Training Time: {mean_training_time:.2f} ms")
print(f"Mean Prediction Time: {mean_prediction_time:.2f} ms")




Run 1/10
Precision: 0.00, Recall: 0.00, F1 Score: 0.00
Run 2/10
Precision: 0.50, Recall: 0.50, F1 Score: 0.50
Run 3/10
Precision: 0.00, Recall: 0.00, F1 Score: 0.00
Run 4/10
Precision: 0.00, Recall: 0.00, F1 Score: 0.00
Run 5/10
Precision: 0.43, Recall: 0.75, F1 Score: 0.55
Run 6/10
Precision: 0.40, Recall: 0.40, F1 Score: 0.40
Run 7/10
Precision: 0.40, Recall: 0.50, F1 Score: 0.44
Run 8/10
Precision: 0.86, Recall: 1.00, F1 Score: 0.92
Run 9/10
Precision: 0.25, Recall: 0.25, F1 Score: 0.25
Run 10/10
Precision: 0.33, Recall: 0.40, F1 Score: 0.36

Mean Training Time: 21695.20 ms
Mean Prediction Time: 0.05 ms


**Silhouette Score Davies-Bouldin Index calculation**

In [None]:
import os
import torch
import time
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
import random

# Check if GPU is available and use it; otherwise, use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the directory containing Java programs of different design patterns
java_code_dir = "/content/design_patterns"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)  # Move model to the appropriate device
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        if line.strip() == "":  # Skip empty lines
            continue

        # Encode the input using the tokenizer
        inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to device

        # Forward pass through the encoder model
        with torch.no_grad():
            encoder_outputs = model.encoder(**inputs)  # Use only the encoder part

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = encoder_outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Move output to CPU for NumPy conversion
        line_embeddings.append(line_embedding)

    if len(line_embeddings) == 0:  # To handle files with no valid lines
        return np.zeros((1, model.config.hidden_size))

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Function to get design pattern label based on the file name
def get_design_pattern_label(file_name):
    if file_name.startswith("singleton"):
        return 0  # Singleton
    elif file_name.startswith("builder"):
        return 1  # Builder
    elif file_name.startswith("abstractfactory"):
        return 2  # Abstract Factory
    elif file_name.startswith("prototype"):
        return 3  # Prototype
    elif file_name.startswith("factorymethod"):
        return 4  # Factory Method
    else:
        return -1  # Unknown, but ideally should not happen

# Define the number of iterations for the experiment
n_runs = 10
training_times = []
prediction_times = []
silhouette_scores = []
db_scores = []

for run in range(n_runs):
    print(f"Run {run + 1}/{n_runs}")

    # Select files for each design pattern
    singleton_files = [file for file in java_files if file.startswith("singleton")]
    builder_files = [file for file in java_files if file.startswith("builder")]
    abstractfactory_files = [file for file in java_files if file.startswith("abstractfactory")]
    prototype_files = [file for file in java_files if file.startswith("prototype")]
    factorymethod_files = [file for file in java_files if file.startswith("factorymethod")]

    # Collect all files
    selected_files = singleton_files + builder_files + abstractfactory_files + prototype_files + factorymethod_files
    random.shuffle(selected_files)

    # Get true labels (design pattern labels) for the files
    true_labels = [get_design_pattern_label(file) for file in selected_files]
    program_embeddings = []

    # Start training time
    start_training_time = time.time()

    # Extract embeddings for each file
    for file in selected_files:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

    # Convert to NumPy array
    program_embeddings = np.vstack(program_embeddings)

    # End training time
    end_training_time = time.time()
    training_time = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
    training_times.append(training_time)

    # Calculate clustering metrics: Silhouette Score and Davies-Bouldin Index
    distance_metric = 'euclidean'  # Can switch between 'cosine' or 'euclidean'

    # Check if all labels are identical, which would make silhouette score invalid
    if len(set(true_labels)) > 1:
        silhouette = silhouette_score(program_embeddings, true_labels, metric=distance_metric)
        db_index = davies_bouldin_score(program_embeddings, true_labels)
    else:
        silhouette = np.nan  # Not applicable if all labels are the same
        db_index = np.nan  # Not applicable if all labels are the same

    silhouette_scores.append(silhouette)
    db_scores.append(db_index)

    print(f"Silhouette Score: {silhouette:.2f}, Davies-Bouldin Index: {db_index:.2f}")

# Calculate the mean training and prediction times
mean_training_time = np.mean(training_times)
mean_silhouette_score = np.nanmean(silhouette_scores)  # Handle NaN values
mean_db_score = np.nanmean(db_scores)  # Handle NaN values

print(f"\nMean Training Time: {mean_training_time:.2f} ms")
print(f"Mean Silhouette Score: {mean_silhouette_score:.2f}")
print(f"Mean Davies-Bouldin Index: {mean_db_score:.2f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]



Run 1/10
Silhouette Score: 0.05, Davies-Bouldin Index: 2.66
Run 2/10
Silhouette Score: 0.05, Davies-Bouldin Index: 2.66
Run 3/10
Silhouette Score: 0.05, Davies-Bouldin Index: 2.66
Run 4/10
Silhouette Score: 0.05, Davies-Bouldin Index: 2.66
Run 5/10
Silhouette Score: 0.05, Davies-Bouldin Index: 2.66
Run 6/10
Silhouette Score: 0.05, Davies-Bouldin Index: 2.66
Run 7/10
Silhouette Score: 0.05, Davies-Bouldin Index: 2.66
Run 8/10
Silhouette Score: 0.05, Davies-Bouldin Index: 2.66
Run 9/10
Silhouette Score: 0.05, Davies-Bouldin Index: 2.66
Run 10/10
Silhouette Score: 0.05, Davies-Bouldin Index: 2.66

Mean Training Time: 208390.44 ms
Mean Silhouette Score: 0.05
Mean Davies-Bouldin Index: 2.66


In [None]:
import os
import torch
import time
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
import random

# Check if GPU is available and use it; otherwise, use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load your Java programs from a directory
java_code_dir = "/content/abstractfactory"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)  # Move model to the appropriate device
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        if line.strip() == "":  # Skip empty lines
            continue

        # Encode the input using the tokenizer
        inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to device

        # Forward pass through the encoder model
        with torch.no_grad():
            encoder_outputs = model.encoder(**inputs)  # Use only the encoder part

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = encoder_outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Move output to CPU for NumPy conversion
        line_embeddings.append(line_embedding)

    if len(line_embeddings) == 0:  # To handle files with no valid lines
        return None  # Return None if there are no valid embeddings

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Define the number of iterations for the experiment
n_runs = 10
k = 3  # Number of neighbors to consider
training_times = []
prediction_times = []
precision_scores = []
recall_scores = []
f1_scores = []

for run in range(n_runs):
    print(f"Run {run + 1}/{n_runs}")

    # Separate positive and negative examples
    positive_files = [file for file in java_files if "abstractfactory" in file]
    negative_files = [file for file in java_files if "abstractfactory" not in file]

    # Select an approximately equal number of positive and negative examples (40%-60% range)
    num_positive = len(positive_files)
    num_negative = len(negative_files)

    lower_bound = int(0.4 * min(num_positive, num_negative))
    upper_bound = int(0.6 * min(num_positive, num_negative))

    num_samples = random.randint(lower_bound, upper_bound)

    positive_sample = random.sample(positive_files, num_samples)
    negative_sample = random.sample(negative_files, num_samples)

    selected_files = positive_sample + negative_sample
    random.shuffle(selected_files)

    true_labels = [1] * len(positive_sample) + [0] * len(negative_sample)
    program_embeddings = []

    # Start training time
    start_training_time = time.time()

    # Extract embeddings for each file
    for file in selected_files:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
        program_embedding = get_line_embeddings(java_code)

        if program_embedding is not None:
            program_embeddings.append(program_embedding)

    # Check if we have valid embeddings
    if len(program_embeddings) == 0:
        print(f"No valid embeddings for run {run + 1}. Skipping this run.")
        continue

    # Convert to NumPy array
    program_embeddings = np.vstack(program_embeddings)

    # Calculate the distance matrix
    distance_metric = 'euclidean'  # Can switch between 'cosine' or 'euclidean'
    distance_matrix = calculate_distance_matrix(program_embeddings, metric=distance_metric)

    # Get k-nearest neighbors indices for each program
    knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

    # End training time
    end_training_time = time.time()
    training_time = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
    training_times.append(training_time)

    # Start prediction time
    start_prediction_time = time.time()

    # Predict labels based on neighbors
    predicted_labels = []
    for indices in knn_indices:
        neighbor_labels = [true_labels[i] for i in indices]
        predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
        predicted_labels.append(predicted_label)

    # End prediction time
    end_prediction_time = time.time()
    prediction_time = (end_prediction_time - start_prediction_time) * 1000  # Convert to milliseconds
    prediction_times.append(prediction_time)

    # Calculate precision, recall, and f1-score
    predicted_labels = np.array(predicted_labels)
    precision = precision_score(true_labels, predicted_labels, zero_division=1)
    recall = recall_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)

    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")

# Calculate the mean and standard deviation for training and prediction times
mean_training_time = np.mean(training_times)
std_training_time = np.std(training_times)
mean_prediction_time = np.mean(prediction_times)
std_prediction_time = np.std(prediction_times)

# Calculate the mean and standard deviation for performance metrics
mean_precision = np.mean(precision_scores)
std_precision = np.std(precision_scores)
mean_recall = np.mean(recall_scores)
std_recall = np.std(recall_scores)
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)

print(f"\nMean Training Time: {mean_training_time:.2f} ms (± {std_training_time:.2f})")
print(f"Mean Prediction Time: {mean_prediction_time:.2f} ms (± {std_prediction_time:.2f})")
print(f"Mean Precision: {mean_precision:.2f} (± {std_precision:.2f})")
print(f"Mean Recall: {mean_recall:.2f} (± {std_recall:.2f})")
print(f"Mean F1 Score: {mean_f1:.2f} (± {std_f1:.2f})")




Run 1/10
Precision: 0.50, Recall: 0.50, F1 Score: 0.50
Run 2/10
Precision: 0.70, Recall: 0.78, F1 Score: 0.74
Run 3/10
Precision: 0.62, Recall: 0.89, F1 Score: 0.73
Run 4/10
Precision: 0.44, Recall: 0.67, F1 Score: 0.53
Run 5/10
Precision: 0.57, Recall: 0.57, F1 Score: 0.57
Run 6/10
Precision: 0.50, Recall: 0.56, F1 Score: 0.53
Run 7/10
Precision: 0.57, Recall: 0.67, F1 Score: 0.62
Run 8/10
Precision: 0.17, Recall: 0.17, F1 Score: 0.17
Run 9/10
Precision: 0.50, Recall: 0.56, F1 Score: 0.53
Run 10/10
Precision: 0.38, Recall: 0.43, F1 Score: 0.40

Mean Training Time: 18424.65 ms (± 3127.17)
Mean Prediction Time: 0.05 ms (± 0.01)
Mean Precision: 0.49 (± 0.14)
Mean Recall: 0.58 (± 0.19)
Mean F1 Score: 0.53 (± 0.16)


In [None]:
import shutil

# Specify the directory path you want to delete
dir_path = '/content/abstractfactory'

# Remove the directory and its contents
shutil.rmtree(dir_path)

print(f"Directory '{dir_path}' has been deleted.")


Directory '/content/abstractfactory' has been deleted.


In [None]:
import os
import torch
import time
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
import random

# Check if GPU is available and use it; otherwise, use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load your Java programs from a directory
java_code_dir = "/content/singleton"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)  # Move model to the appropriate device
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        if line.strip() == "":  # Skip empty lines
            continue

        # Encode the input using the tokenizer
        inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to device

        # Forward pass through the encoder model
        with torch.no_grad():
            encoder_outputs = model.encoder(**inputs)  # Use only the encoder part

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = encoder_outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Move output to CPU for NumPy conversion
        line_embeddings.append(line_embedding)

    if len(line_embeddings) == 0:  # To handle files with no valid lines
        return None  # Return None if there are no valid embeddings

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Define the number of iterations for the experiment
n_runs = 10
k = 3  # Number of neighbors to consider
training_times = []
prediction_times = []
precision_scores = []
recall_scores = []
f1_scores = []

for run in range(n_runs):
    print(f"Run {run + 1}/{n_runs}")

    # Separate positive and negative examples
    positive_files = [file for file in java_files if "singleton" in file]
    negative_files = [file for file in java_files if "singleton" not in file]

    # Select an approximately equal number of positive and negative examples (40%-60% range)
    num_positive = len(positive_files)
    num_negative = len(negative_files)

    lower_bound = int(0.4 * min(num_positive, num_negative))
    upper_bound = int(0.6 * min(num_positive, num_negative))

    num_samples = random.randint(lower_bound, upper_bound)

    positive_sample = random.sample(positive_files, num_samples)
    negative_sample = random.sample(negative_files, num_samples)

    selected_files = positive_sample + negative_sample
    random.shuffle(selected_files)

    true_labels = [1] * len(positive_sample) + [0] * len(negative_sample)
    program_embeddings = []

    # Start training time
    start_training_time = time.time()

    # Extract embeddings for each file
    for file in selected_files:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
        program_embedding = get_line_embeddings(java_code)

        if program_embedding is not None:
            program_embeddings.append(program_embedding)

    # Check if we have valid embeddings
    if len(program_embeddings) == 0:
        print(f"No valid embeddings for run {run + 1}. Skipping this run.")
        continue

    # Convert to NumPy array
    program_embeddings = np.vstack(program_embeddings)

    # Calculate the distance matrix
    distance_metric = 'euclidean'  # Can switch between 'cosine' or 'euclidean'
    distance_matrix = calculate_distance_matrix(program_embeddings, metric=distance_metric)

    # Get k-nearest neighbors indices for each program
    knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

    # End training time
    end_training_time = time.time()
    training_time = (end_training_time - start_training_time) * 1000  # Convert to milliseconds
    training_times.append(training_time)

    # Start prediction time
    start_prediction_time = time.time()

    # Predict labels based on neighbors
    predicted_labels = []
    for indices in knn_indices:
        neighbor_labels = [true_labels[i] for i in indices]
        predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
        predicted_labels.append(predicted_label)

    # End prediction time
    end_prediction_time = time.time()
    prediction_time = (end_prediction_time - start_prediction_time) * 1000  # Convert to milliseconds
    prediction_times.append(prediction_time)

    # Calculate precision, recall, and f1-score
    predicted_labels = np.array(predicted_labels)
    precision = precision_score(true_labels, predicted_labels, zero_division=1)
    recall = recall_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)

    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")

# Calculate the mean and standard deviation for training and prediction times
mean_training_time = np.mean(training_times)
std_training_time = np.std(training_times)
mean_prediction_time = np.mean(prediction_times)
std_prediction_time = np.std(prediction_times)

# Calculate the mean and standard deviation for performance metrics
mean_precision = np.mean(precision_scores)
std_precision = np.std(precision_scores)
mean_recall = np.mean(recall_scores)
std_recall = np.std(recall_scores)
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)

print(f"\nMean Training Time: {mean_training_time:.2f} ms (± {std_training_time:.2f})")
print(f"Mean Prediction Time: {mean_prediction_time:.2f} ms (± {std_prediction_time:.2f})")
print(f"Mean Precision: {mean_precision:.2f} (± {std_precision:.2f})")
print(f"Mean Recall: {mean_recall:.2f} (± {std_recall:.2f})")
print(f"Mean F1 Score: {mean_f1:.2f} (± {std_f1:.2f})")




Run 1/10
Precision: 0.42, Recall: 0.36, F1 Score: 0.38
Run 2/10
Precision: 0.50, Recall: 0.47, F1 Score: 0.48
Run 3/10
Precision: 0.50, Recall: 0.64, F1 Score: 0.56
Run 4/10
Precision: 0.27, Recall: 0.27, F1 Score: 0.27
Run 5/10
Precision: 0.33, Recall: 0.27, F1 Score: 0.30
Run 6/10
Precision: 0.64, Recall: 0.69, F1 Score: 0.67
Run 7/10
Precision: 0.39, Recall: 0.54, F1 Score: 0.45
Run 8/10
Precision: 0.46, Recall: 0.40, F1 Score: 0.43
Run 9/10
Precision: 0.75, Recall: 0.46, F1 Score: 0.57
Run 10/10
Precision: 0.46, Recall: 0.46, F1 Score: 0.46

Mean Training Time: 44841.20 ms (± 15131.03)
Mean Prediction Time: 0.11 ms (± 0.03)
Mean Precision: 0.47 (± 0.13)
Mean Recall: 0.46 (± 0.13)
Mean F1 Score: 0.46 (± 0.12)


In [None]:
import os
import torch
import time
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import random

# Check if GPU is available and use it; otherwise, use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load your Java programs from a directory
java_code_dir = "/content/singleton"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)  # Move model to the appropriate device
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Unseen examples directory
unseen_examples_dir = "/content/unseen_examples"  # Modify the path to unseen examples
unseen_files = [file for file in os.listdir(unseen_examples_dir) if os.path.isfile(os.path.join(unseen_examples_dir, file))]

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        if line.strip() == "":  # Skip empty lines
            continue

        # Encode the input using the tokenizer
        inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to device

        # Forward pass through the encoder model
        with torch.no_grad():
            encoder_outputs = model.encoder(**inputs)  # Use only the encoder part

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = encoder_outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Move output to CPU for NumPy conversion
        line_embeddings.append(line_embedding)

    if len(line_embeddings) == 0:  # To handle files with no valid lines
        return np.zeros((1, model.config.hidden_size))

    return np.mean(line_embeddings, axis=0)

# Function to calculate precision, recall, and F1
def calculate_metrics(true_labels, predicted_labels):
    precision = precision_score(true_labels, predicted_labels, zero_division=1)
    recall = recall_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)
    return precision, recall, f1

# Define the number of iterations for the experiment
n_runs = 3
k = 3  # Number of neighbors to consider
train_times = []
prediction_times = []
precisions, recalls, f1_scores = [], [], []

for run in range(n_runs):
    print(f"Run {run + 1}/{n_runs}")

    # Separate positive and negative examples
    positive_files = [file for file in java_files if "singleton" in file]
    negative_files = [file for file in java_files if "singleton" not in file]

    # Select an approximately equal number of positive and negative examples
    num_samples = min(len(positive_files), len(negative_files))

    positive_sample = random.sample(positive_files, num_samples)
    negative_sample = random.sample(negative_files, num_samples)

    selected_files = positive_sample + negative_sample
    true_labels = [1] * len(positive_sample) + [0] * len(negative_sample)

    program_embeddings = []
    for file in selected_files:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

    program_embeddings = np.vstack(program_embeddings)

    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(program_embeddings, true_labels, test_size=0.3, random_state=42)

    # Start training time
    start_train_time = time.time()

    # Train kNN
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)

    # End training time
    end_train_time = time.time()
    train_time = (end_train_time - start_train_time) * 1e6  # Convert to microseconds
    train_times.append(train_time)

    # Start prediction time
    start_prediction_time = time.time()

    # Predict on test set
    y_pred = knn.predict(X_test)

    # End prediction time
    end_prediction_time = time.time()
    pred_time = (end_prediction_time - start_prediction_time) * 1e6  # Convert to microseconds
    prediction_times.append(pred_time)

    # Calculate metrics
    precision, recall, f1 = calculate_metrics(y_test, y_pred)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")

# Calculate mean and standard deviation for precision, recall, f1-score
mean_precision = np.mean(precisions)
std_precision = np.std(precisions)
mean_recall = np.mean(recalls)
std_recall = np.std(recalls)
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)

# Calculate the mean training and prediction times
mean_train_time = np.mean(train_times)
mean_prediction_time = np.mean(prediction_times)

print(f"\nMean Precision: {mean_precision:.2f} (±{std_precision:.2f})")
print(f"Mean Recall: {mean_recall:.2f} (±{std_recall:.2f})")
print(f"Mean F1 Score: {mean_f1:.2f} (±{std_f1:.2f})")
print(f"\nMean Training Time: {mean_train_time:.2f} µs")
print(f"Mean Prediction Time: {mean_prediction_time:.2f} µs")

# Unseen example prediction
unseen_embeddings = []
for file in unseen_files:
    with open(os.path.join(unseen_examples_dir, file), "r", encoding="utf-8", errors="ignore") as f:
        java_code = f.read()
    embedding = get_line_embeddings(java_code)
    unseen_embeddings.append(embedding)

unseen_embeddings = np.vstack(unseen_embeddings)

# Start unseen prediction time
start_unseen_pred_time = time.time()

# Predict on unseen examples
unseen_predictions = knn.predict(unseen_embeddings)

# End unseen prediction time
end_unseen_pred_time = time.time()
unseen_pred_time = (end_unseen_pred_time - start_unseen_pred_time) * 1e6  # Convert to microseconds

print(f"\nUnseen Prediction Time: {unseen_pred_time:.2f} µs")


Run 1/3
Precision: 0.88, Recall: 1.00, F1 Score: 0.93
Run 2/3
Precision: 0.64, Recall: 1.00, F1 Score: 0.78
Run 3/3
Precision: 0.71, Recall: 0.71, F1 Score: 0.71

Mean Precision: 0.74 (±0.10)
Mean Recall: 0.90 (±0.13)
Mean F1 Score: 0.81 (±0.09)

Mean Training Time: 795.36 µs
Mean Prediction Time: 26303.37 µs

Unseen Prediction Time: 1308.92 µs


**Singleton Standard deviation and time calculation**

In [None]:
import os
import torch
import time
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import random
import joblib  # To save and load models

# Check if GPU is available and use it; otherwise, use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load your Java programs from a directory
java_code_dir = "/content/singleton"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)  # Move model to the appropriate device
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Unseen examples directory
unseen_examples_dir = "/content/unseen_examples"  # Modify the path to unseen examples
unseen_files = [file for file in os.listdir(unseen_examples_dir) if os.path.isfile(os.path.join(unseen_examples_dir, file))]

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        if line.strip() == "":  # Skip empty lines
            continue

        # Encode the input using the tokenizer
        inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to device

        # Forward pass through the encoder model
        with torch.no_grad():
            encoder_outputs = model.encoder(**inputs)  # Use only the encoder part

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = encoder_outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Move output to CPU for NumPy conversion
        line_embeddings.append(line_embedding)

    if len(line_embeddings) == 0:  # To handle files with no valid lines
        return np.zeros((1, model.config.hidden_size))

    return np.mean(line_embeddings, axis=0)

# Function to calculate precision, recall, and F1
def calculate_metrics(true_labels, predicted_labels):
    precision = precision_score(true_labels, predicted_labels, zero_division=1)
    recall = recall_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)
    return precision, recall, f1

# Define the number of iterations for the experiment
n_runs = 3
k = 3  # Number of neighbors to consider
train_times = []
prediction_times = []
precisions, recalls, f1_scores = [], [], []

for run in range(n_runs):
    print(f"Run {run + 1}/{n_runs}")

    # Separate positive and negative examples
    positive_files = [file for file in java_files if "singleton" in file]
    negative_files = [file for file in java_files if "singleton" not in file]

    # Select an approximately equal number of positive and negative examples
    num_samples = min(len(positive_files), len(negative_files))

    positive_sample = random.sample(positive_files, num_samples)
    negative_sample = random.sample(negative_files, num_samples)

    selected_files = positive_sample + negative_sample
    true_labels = [1] * len(positive_sample) + [0] * len(negative_sample)

    # Start training time, including tokenization, embedding extraction, and k-NN
    start_train_time = time.time()

    program_embeddings = []
    for file in selected_files:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

    program_embeddings = np.vstack(program_embeddings)

    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(program_embeddings, true_labels, test_size=0.3, random_state=42)

    # Train kNN
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)

    # Save the trained kNN model (Optional)
    joblib.dump(knn, f"knn_model_run_{run}.joblib")

    # End training time after training and model saving
    end_train_time = time.time()
    train_time = (end_train_time - start_train_time) * 1e6  # Convert to microseconds
    train_times.append(train_time)

    # Start prediction time
    start_prediction_time = time.time()

    # Predict on test set
    y_pred = knn.predict(X_test)

    # End prediction time after prediction
    end_prediction_time = time.time()
    pred_time = (end_prediction_time - start_prediction_time) * 1e6  # Convert to microseconds
    prediction_times.append(pred_time)

    # Calculate metrics
    precision, recall, f1 = calculate_metrics(y_test, y_pred)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")

# Calculate mean and standard deviation for precision, recall, f1-score
mean_precision = np.mean(precisions)
std_precision = np.std(precisions)
mean_recall = np.mean(recalls)
std_recall = np.std(recalls)
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)

# Calculate the mean training and prediction times
mean_train_time = np.mean(train_times)
mean_prediction_time = np.mean(prediction_times)

print(f"\nMean Precision: {mean_precision:.2f} (±{std_precision:.2f})")
print(f"Mean Recall: {mean_recall:.2f} (±{std_recall:.2f})")
print(f"Mean F1 Score: {mean_f1:.2f} (±{std_f1:.2f})")
print(f"\nMean Training Time: {mean_train_time:.2f} µs")
print(f"Mean Prediction Time: {mean_prediction_time:.2f} µs")

# Unseen example prediction
unseen_embeddings = []
for file in unseen_files:
    with open(os.path.join(unseen_examples_dir, file), "r", encoding="utf-8", errors="ignore") as f:
        java_code = f.read()
    embedding = get_line_embeddings(java_code)
    unseen_embeddings.append(embedding)

unseen_embeddings = np.vstack(unseen_embeddings)

# Start unseen prediction time
start_unseen_pred_time = time.time()

# Predict on unseen examples
unseen_predictions = knn.predict(unseen_embeddings)

# End unseen prediction time
end_unseen_pred_time = time.time()
unseen_pred_time = (end_unseen_pred_time - start_unseen_pred_time) * 1e6  # Convert to microseconds

print(f"\nUnseen Prediction Time: {unseen_pred_time:.2f} µs")




Run 1/3
Precision: 0.83, Recall: 0.71, F1 Score: 0.77
Run 2/3
Precision: 1.00, Recall: 0.86, F1 Score: 0.92
Run 3/3
Precision: 0.78, Recall: 1.00, F1 Score: 0.88

Mean Precision: 0.87 (±0.09)
Mean Recall: 0.86 (±0.12)
Mean F1 Score: 0.86 (±0.06)

Mean Training Time: 104153313.16 µs
Mean Prediction Time: 1282.77 µs

Unseen Prediction Time: 1743.79 µs


In [None]:
import os
import torch
import time
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import random
import joblib  # To save and load models

# Check if GPU is available and use it; otherwise, use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load your Java programs from a directory
java_code_dir = "/content/singleton"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)  # Move model to the appropriate device
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Unseen examples directory
unseen_examples_dir = "/content/unseen_examples"  # Modify the path to unseen examples
unseen_files = [file for file in os.listdir(unseen_examples_dir) if os.path.isfile(os.path.join(unseen_examples_dir, file))]

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        if line.strip() == "":  # Skip empty lines
            continue

        # Encode the input using the tokenizer
        inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to device

        # Forward pass through the encoder model
        with torch.no_grad():
            encoder_outputs = model.encoder(**inputs)  # Use only the encoder part

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = encoder_outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Move output to CPU for NumPy conversion
        line_embeddings.append(line_embedding)

    if len(line_embeddings) == 0:  # To handle files with no valid lines
        return np.zeros((1, model.config.hidden_size))

    return np.mean(line_embeddings, axis=0)

# Function to calculate precision, recall, and F1
def calculate_metrics(true_labels, predicted_labels):
    precision = precision_score(true_labels, predicted_labels, zero_division=1)
    recall = recall_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)
    return precision, recall, f1

# Define the number of iterations for the experiment
n_runs = 10
k = 3  # Number of neighbors to consider
train_times = []
prediction_times = []
precisions, recalls, f1_scores = [], [], []

for run in range(n_runs):
    print(f"Run {run + 1}/{n_runs}")

    # Separate positive and negative examples
    positive_files = [file for file in java_files if "singleton" in file]
    negative_files = [file for file in java_files if "singleton" not in file]

    # Select an approximately equal number of positive and negative examples
    num_samples = min(len(positive_files), len(negative_files))

    positive_sample = random.sample(positive_files, num_samples)
    negative_sample = random.sample(negative_files, num_samples)

    selected_files = positive_sample + negative_sample
    true_labels = [1] * len(positive_sample) + [0] * len(negative_sample)

    # Start training time, including tokenization, embedding extraction, and k-NN
    start_train_time = time.time()

    program_embeddings = []
    for file in selected_files:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

    program_embeddings = np.vstack(program_embeddings)

    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(program_embeddings, true_labels, test_size=0.3, random_state=42)

    # Train kNN
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)

    # Save the trained kNN model (Optional)
    joblib.dump(knn, f"knn_model_run_{run}.joblib")

    # End training time after training and model saving
    end_train_time = time.time()
    train_time = (end_train_time - start_train_time) * 1e6  # Convert to microseconds
    train_times.append(train_time)

    # Start prediction time
    start_prediction_time = time.time()

    # Predict on test set
    y_pred = knn.predict(X_test)

    # End prediction time after prediction
    end_prediction_time = time.time()
    pred_time = (end_prediction_time - start_prediction_time) * 1e6  # Convert to microseconds
    prediction_times.append(pred_time)

    # Calculate metrics
    precision, recall, f1 = calculate_metrics(y_test, y_pred)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")

# Calculate mean and standard deviation for precision, recall, f1-score
mean_precision = np.mean(precisions)
std_precision = np.std(precisions)
mean_recall = np.mean(recalls)
std_recall = np.std(recalls)
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)

# Calculate the mean training and prediction times
mean_train_time = np.mean(train_times)
mean_prediction_time = np.mean(prediction_times)

print(f"\nMean Precision: {mean_precision:.2f} (±{std_precision:.2f})")
print(f"Mean Recall: {mean_recall:.2f} (±{std_recall:.2f})")
print(f"Mean F1 Score: {mean_f1:.2f} (±{std_f1:.2f})")
print(f"\nMean Training Time: {mean_train_time:.2f} µs")
print(f"Mean Prediction Time: {mean_prediction_time:.2f} µs")

# Unseen example prediction
unseen_embeddings = []
for file in unseen_files:
    with open(os.path.join(unseen_examples_dir, file), "r", encoding="utf-8", errors="ignore") as f:
        java_code = f.read()
    embedding = get_line_embeddings(java_code)
    unseen_embeddings.append(embedding)

unseen_embeddings = np.vstack(unseen_embeddings)

# Start unseen prediction time
start_unseen_pred_time = time.time()

# Predict on unseen examples
unseen_predictions = knn.predict(unseen_embeddings)

# End unseen prediction time
end_unseen_pred_time = time.time()
unseen_pred_time = (end_unseen_pred_time - start_unseen_pred_time) * 1e6  # Convert to microseconds

print(f"\nUnseen Prediction Time: {unseen_pred_time:.2f} µs")




Run 1/10
Precision: 0.86, Recall: 0.86, F1 Score: 0.86
Run 2/10
Precision: 0.78, Recall: 1.00, F1 Score: 0.88
Run 3/10
Precision: 0.83, Recall: 0.71, F1 Score: 0.77
Run 4/10
Precision: 0.75, Recall: 0.86, F1 Score: 0.80
Run 5/10
Precision: 0.70, Recall: 1.00, F1 Score: 0.82
Run 6/10
Precision: 0.62, Recall: 0.71, F1 Score: 0.67
Run 7/10
Precision: 0.88, Recall: 1.00, F1 Score: 0.93
Run 8/10
Precision: 0.62, Recall: 0.71, F1 Score: 0.67
Run 9/10
Precision: 0.57, Recall: 0.57, F1 Score: 0.57
Run 10/10
Precision: 0.75, Recall: 0.86, F1 Score: 0.80

Mean Precision: 0.74 (±0.10)
Mean Recall: 0.83 (±0.14)
Mean F1 Score: 0.78 (±0.11)

Mean Training Time: 93027561.69 µs
Mean Prediction Time: 1483.44 µs

Unseen Prediction Time: 1807.93 µs


**Builder Standard deviation and time calculation**

In [None]:
import os
import torch
import time
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import random
import joblib  # To save and load models

# Check if GPU is available and use it; otherwise, use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load your Java programs from a directory
java_code_dir = "/content/builder"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)  # Move model to the appropriate device
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Unseen examples directory
unseen_examples_dir = "/content/unseen_examples"  # Modify the path to unseen examples
unseen_files = [file for file in os.listdir(unseen_examples_dir) if os.path.isfile(os.path.join(unseen_examples_dir, file))]

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        if line.strip() == "":  # Skip empty lines
            continue

        # Encode the input using the tokenizer
        inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to device

        # Forward pass through the encoder model
        with torch.no_grad():
            encoder_outputs = model.encoder(**inputs)  # Use only the encoder part

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = encoder_outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Move output to CPU for NumPy conversion
        line_embeddings.append(line_embedding)

    if len(line_embeddings) == 0:  # To handle files with no valid lines
        return np.zeros((1, model.config.hidden_size))

    return np.mean(line_embeddings, axis=0)

# Function to calculate precision, recall, and F1
def calculate_metrics(true_labels, predicted_labels):
    precision = precision_score(true_labels, predicted_labels, zero_division=1)
    recall = recall_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)
    return precision, recall, f1

# Define the number of iterations for the experiment
n_runs = 10
k = 3  # Number of neighbors to consider
train_times = []
prediction_times = []
precisions, recalls, f1_scores = [], [], []

for run in range(n_runs):
    print(f"Run {run + 1}/{n_runs}")

    # Separate positive and negative examples
    positive_files = [file for file in java_files if "builder" in file]
    negative_files = [file for file in java_files if "builder" not in file]

    # Select an approximately equal number of positive and negative examples
    num_samples = min(len(positive_files), len(negative_files))

    positive_sample = random.sample(positive_files, num_samples)
    negative_sample = random.sample(negative_files, num_samples)

    selected_files = positive_sample + negative_sample
    true_labels = [1] * len(positive_sample) + [0] * len(negative_sample)

    # Start training time, including tokenization, embedding extraction, and k-NN
    start_train_time = time.time()

    program_embeddings = []
    for file in selected_files:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

    program_embeddings = np.vstack(program_embeddings)

    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(program_embeddings, true_labels, test_size=0.3, random_state=42)

    # Train kNN
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)

    # Save the trained kNN model (Optional)
    joblib.dump(knn, f"knn_model_run_{run}.joblib")

    # End training time after training and model saving
    end_train_time = time.time()
    train_time = (end_train_time - start_train_time) * 1e6  # Convert to microseconds
    train_times.append(train_time)

    # Start prediction time
    start_prediction_time = time.time()

    # Predict on test set
    y_pred = knn.predict(X_test)

    # End prediction time after prediction
    end_prediction_time = time.time()
    pred_time = (end_prediction_time - start_prediction_time) * 1e6  # Convert to microseconds
    prediction_times.append(pred_time)

    # Calculate metrics
    precision, recall, f1 = calculate_metrics(y_test, y_pred)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")

# Calculate mean and standard deviation for precision, recall, f1-score
mean_precision = np.mean(precisions)
std_precision = np.std(precisions)
mean_recall = np.mean(recalls)
std_recall = np.std(recalls)
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)

# Calculate the mean training and prediction times
mean_train_time = np.mean(train_times)
mean_prediction_time = np.mean(prediction_times)

print(f"\nMean Precision: {mean_precision:.2f} (±{std_precision:.2f})")
print(f"Mean Recall: {mean_recall:.2f} (±{std_recall:.2f})")
print(f"Mean F1 Score: {mean_f1:.2f} (±{std_f1:.2f})")
print(f"\nMean Training Time: {mean_train_time:.2f} µs")
print(f"Mean Prediction Time: {mean_prediction_time:.2f} µs")

# Unseen example prediction
unseen_embeddings = []
for file in unseen_files:
    with open(os.path.join(unseen_examples_dir, file), "r", encoding="utf-8", errors="ignore") as f:
        java_code = f.read()
    embedding = get_line_embeddings(java_code)
    unseen_embeddings.append(embedding)

unseen_embeddings = np.vstack(unseen_embeddings)

# Start unseen prediction time
start_unseen_pred_time = time.time()

# Predict on unseen examples
unseen_predictions = knn.predict(unseen_embeddings)

# End unseen prediction time
end_unseen_pred_time = time.time()
unseen_pred_time = (end_unseen_pred_time - start_unseen_pred_time) * 1e6  # Convert to microseconds

print(f"\nUnseen Prediction Time: {unseen_pred_time:.2f} µs")


Run 1/10
Precision: 1.00, Recall: 1.00, F1 Score: 1.00
Run 2/10
Precision: 1.00, Recall: 0.00, F1 Score: 0.00
Run 3/10
Precision: 1.00, Recall: 0.40, F1 Score: 0.57
Run 4/10
Precision: 1.00, Recall: 0.60, F1 Score: 0.75
Run 5/10
Precision: 0.75, Recall: 0.60, F1 Score: 0.67
Run 6/10
Precision: 1.00, Recall: 0.40, F1 Score: 0.57
Run 7/10
Precision: 1.00, Recall: 0.40, F1 Score: 0.57
Run 8/10
Precision: 1.00, Recall: 0.40, F1 Score: 0.57
Run 9/10
Precision: 1.00, Recall: 0.40, F1 Score: 0.57
Run 10/10
Precision: 1.00, Recall: 0.60, F1 Score: 0.75

Mean Precision: 0.97 (±0.07)
Mean Recall: 0.48 (±0.24)
Mean F1 Score: 0.60 (±0.24)

Mean Training Time: 44398709.18 µs
Mean Prediction Time: 5810.69 µs

Unseen Prediction Time: 1404.05 µs


In [None]:
import shutil

# Specify the directory path you want to delete
dir_path = '/content/builder'

# Remove the directory and its contents
shutil.rmtree(dir_path)

print(f"Directory '{dir_path}' has been deleted.")


Directory '/content/builder' has been deleted.


**Prototype, Standard devation, mean training and prediction time**

In [None]:
import os
import torch
import time
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import random
import joblib  # To save and load models

# Check if GPU is available and use it; otherwise, use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load your Java programs from a directory
java_code_dir = "/content/prototype"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)  # Move model to the appropriate device
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Unseen examples directory
unseen_examples_dir = "/content/unseen_examples"  # Modify the path to unseen examples
unseen_files = [file for file in os.listdir(unseen_examples_dir) if os.path.isfile(os.path.join(unseen_examples_dir, file))]

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        if line.strip() == "":  # Skip empty lines
            continue

        # Encode the input using the tokenizer
        inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to device

        # Forward pass through the encoder model
        with torch.no_grad():
            encoder_outputs = model.encoder(**inputs)  # Use only the encoder part

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = encoder_outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Move output to CPU for NumPy conversion
        line_embeddings.append(line_embedding)

    if len(line_embeddings) == 0:  # To handle files with no valid lines
        return np.zeros((1, model.config.hidden_size))

    return np.mean(line_embeddings, axis=0)

# Function to calculate precision, recall, and F1
def calculate_metrics(true_labels, predicted_labels):
    precision = precision_score(true_labels, predicted_labels, zero_division=1)
    recall = recall_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)
    return precision, recall, f1

# Define the number of iterations for the experiment
n_runs = 10
k = 3  # Number of neighbors to consider
train_times = []
prediction_times = []
precisions, recalls, f1_scores = [], [], []

for run in range(n_runs):
    print(f"Run {run + 1}/{n_runs}")

    # Separate positive and negative examples
    positive_files = [file for file in java_files if "prototype" in file]
    negative_files = [file for file in java_files if "prototype" not in file]

    # Select an approximately equal number of positive and negative examples
    num_samples = min(len(positive_files), len(negative_files))

    positive_sample = random.sample(positive_files, num_samples)
    negative_sample = random.sample(negative_files, num_samples)

    selected_files = positive_sample + negative_sample
    true_labels = [1] * len(positive_sample) + [0] * len(negative_sample)

    # Start training time, including tokenization, embedding extraction, and k-NN
    start_train_time = time.time()

    program_embeddings = []
    for file in selected_files:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

    program_embeddings = np.vstack(program_embeddings)

    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(program_embeddings, true_labels, test_size=0.3, random_state=42)

    # Train kNN
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)

    # Save the trained kNN model (Optional)
    joblib.dump(knn, f"knn_model_run_{run}.joblib")

    # End training time after training and model saving
    end_train_time = time.time()
    train_time = (end_train_time - start_train_time) * 1e6  # Convert to microseconds
    train_times.append(train_time)

    # Start prediction time
    start_prediction_time = time.time()

    # Predict on test set
    y_pred = knn.predict(X_test)

    # End prediction time after prediction
    end_prediction_time = time.time()
    pred_time = (end_prediction_time - start_prediction_time) * 1e6  # Convert to microseconds
    prediction_times.append(pred_time)

    # Calculate metrics
    precision, recall, f1 = calculate_metrics(y_test, y_pred)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")

# Calculate mean and standard deviation for precision, recall, f1-score
mean_precision = np.mean(precisions)
std_precision = np.std(precisions)
mean_recall = np.mean(recalls)
std_recall = np.std(recalls)
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)

# Calculate the mean training and prediction times
mean_train_time = np.mean(train_times)
mean_prediction_time = np.mean(prediction_times)

print(f"\nMean Precision: {mean_precision:.2f} (±{std_precision:.2f})")
print(f"Mean Recall: {mean_recall:.2f} (±{std_recall:.2f})")
print(f"Mean F1 Score: {mean_f1:.2f} (±{std_f1:.2f})")
print(f"\nMean Training Time: {mean_train_time:.2f} µs")
print(f"Mean Prediction Time: {mean_prediction_time:.2f} µs")

# Unseen example prediction
unseen_embeddings = []
for file in unseen_files:
    with open(os.path.join(unseen_examples_dir, file), "r", encoding="utf-8", errors="ignore") as f:
        java_code = f.read()
    embedding = get_line_embeddings(java_code)
    unseen_embeddings.append(embedding)

unseen_embeddings = np.vstack(unseen_embeddings)

# Start unseen prediction time
start_unseen_pred_time = time.time()

# Predict on unseen examples
unseen_predictions = knn.predict(unseen_embeddings)

# End unseen prediction time
end_unseen_pred_time = time.time()
unseen_pred_time = (end_unseen_pred_time - start_unseen_pred_time) * 1e6  # Convert to microseconds

print(f"\nUnseen Prediction Time: {unseen_pred_time:.2f} µs")




Run 1/10
Precision: 1.00, Recall: 0.90, F1 Score: 0.95
Run 2/10
Precision: 0.82, Recall: 0.90, F1 Score: 0.86
Run 3/10
Precision: 1.00, Recall: 0.90, F1 Score: 0.95
Run 4/10
Precision: 1.00, Recall: 0.70, F1 Score: 0.82
Run 5/10
Precision: 1.00, Recall: 1.00, F1 Score: 1.00
Run 6/10
Precision: 0.88, Recall: 0.70, F1 Score: 0.78
Run 7/10
Precision: 0.71, Recall: 1.00, F1 Score: 0.83
Run 8/10
Precision: 0.78, Recall: 0.70, F1 Score: 0.74
Run 9/10
Precision: 1.00, Recall: 0.80, F1 Score: 0.89
Run 10/10
Precision: 0.91, Recall: 1.00, F1 Score: 0.95

Mean Precision: 0.91 (±0.10)
Mean Recall: 0.86 (±0.12)
Mean F1 Score: 0.88 (±0.08)

Mean Training Time: 105866108.61 µs
Mean Prediction Time: 1460.65 µs

Unseen Prediction Time: 1691.10 µs


In [None]:
import shutil

# Specify the directory path you want to delete
dir_path = '/content/prototype'

# Remove the directory and its contents
shutil.rmtree(dir_path)

print(f"Directory '{dir_path}' has been deleted.")


Directory '/content/prototype' has been deleted.


**Abstract Factory, Standard deviation, training time, prediction time calculation**

In [None]:
import os
import torch
import time
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import random
import joblib  # To save and load models

# Check if GPU is available and use it; otherwise, use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load your Java programs from a directory
java_code_dir = "/content/abstractfactory"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)  # Move model to the appropriate device
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Unseen examples directory
unseen_examples_dir = "/content/unseen_examples"  # Modify the path to unseen examples
unseen_files = [file for file in os.listdir(unseen_examples_dir) if os.path.isfile(os.path.join(unseen_examples_dir, file))]

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        if line.strip() == "":  # Skip empty lines
            continue

        # Encode the input using the tokenizer
        inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to device

        # Forward pass through the encoder model
        with torch.no_grad():
            encoder_outputs = model.encoder(**inputs)  # Use only the encoder part

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = encoder_outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Move output to CPU for NumPy conversion
        line_embeddings.append(line_embedding)

    if len(line_embeddings) == 0:  # To handle files with no valid lines
        return np.zeros((1, model.config.hidden_size))

    return np.mean(line_embeddings, axis=0)

# Function to calculate precision, recall, and F1
def calculate_metrics(true_labels, predicted_labels):
    precision = precision_score(true_labels, predicted_labels, zero_division=1)
    recall = recall_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)
    return precision, recall, f1

# Define the number of iterations for the experiment
n_runs = 10
k = 3  # Number of neighbors to consider
train_times = []
prediction_times = []
precisions, recalls, f1_scores = [], [], []

for run in range(n_runs):
    print(f"Run {run + 1}/{n_runs}")

    # Separate positive and negative examples
    positive_files = [file for file in java_files if "abstractfactory" in file]
    negative_files = [file for file in java_files if "abstractfactory" not in file]

    # Select an approximately equal number of positive and negative examples
    num_samples = min(len(positive_files), len(negative_files))

    positive_sample = random.sample(positive_files, num_samples)
    negative_sample = random.sample(negative_files, num_samples)

    selected_files = positive_sample + negative_sample
    true_labels = [1] * len(positive_sample) + [0] * len(negative_sample)

    # Start training time, including tokenization, embedding extraction, and k-NN
    start_train_time = time.time()

    program_embeddings = []
    for file in selected_files:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

    program_embeddings = np.vstack(program_embeddings)

    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(program_embeddings, true_labels, test_size=0.3, random_state=42)

    # Train kNN
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)

    # Save the trained kNN model (Optional)
    joblib.dump(knn, f"knn_model_run_{run}.joblib")

    # End training time after training and model saving
    end_train_time = time.time()
    train_time = (end_train_time - start_train_time) * 1e6  # Convert to microseconds
    train_times.append(train_time)

    # Start prediction time
    start_prediction_time = time.time()

    # Predict on test set
    y_pred = knn.predict(X_test)

    # End prediction time after prediction
    end_prediction_time = time.time()
    pred_time = (end_prediction_time - start_prediction_time) * 1e6  # Convert to microseconds
    prediction_times.append(pred_time)

    # Calculate metrics
    precision, recall, f1 = calculate_metrics(y_test, y_pred)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")

# Calculate mean and standard deviation for precision, recall, f1-score
mean_precision = np.mean(precisions)
std_precision = np.std(precisions)
mean_recall = np.mean(recalls)
std_recall = np.std(recalls)
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)

# Calculate the mean training and prediction times
mean_train_time = np.mean(train_times)
mean_prediction_time = np.mean(prediction_times)

print(f"\nMean Precision: {mean_precision:.2f} (±{std_precision:.2f})")
print(f"Mean Recall: {mean_recall:.2f} (±{std_recall:.2f})")
print(f"Mean F1 Score: {mean_f1:.2f} (±{std_f1:.2f})")
print(f"\nMean Training Time: {mean_train_time:.2f} µs")
print(f"Mean Prediction Time: {mean_prediction_time:.2f} µs")

# Unseen example prediction
unseen_embeddings = []
for file in unseen_files:
    with open(os.path.join(unseen_examples_dir, file), "r", encoding="utf-8", errors="ignore") as f:
        java_code = f.read()
    embedding = get_line_embeddings(java_code)
    unseen_embeddings.append(embedding)

unseen_embeddings = np.vstack(unseen_embeddings)

# Start unseen prediction time
start_unseen_pred_time = time.time()

# Predict on unseen examples
unseen_predictions = knn.predict(unseen_embeddings)

# End unseen prediction time
end_unseen_pred_time = time.time()
unseen_pred_time = (end_unseen_pred_time - start_unseen_pred_time) * 1e6  # Convert to microseconds

print(f"\nUnseen Prediction Time: {unseen_pred_time:.2f} µs")




Run 1/10
Precision: 1.00, Recall: 1.00, F1 Score: 1.00
Run 2/10
Precision: 1.00, Recall: 1.00, F1 Score: 1.00
Run 3/10
Precision: 1.00, Recall: 1.00, F1 Score: 1.00
Run 4/10
Precision: 0.83, Recall: 1.00, F1 Score: 0.91
Run 5/10
Precision: 1.00, Recall: 1.00, F1 Score: 1.00
Run 6/10
Precision: 0.83, Recall: 1.00, F1 Score: 0.91
Run 7/10
Precision: 1.00, Recall: 1.00, F1 Score: 1.00
Run 8/10
Precision: 0.83, Recall: 1.00, F1 Score: 0.91
Run 9/10
Precision: 0.67, Recall: 0.80, F1 Score: 0.73
Run 10/10
Precision: 0.83, Recall: 1.00, F1 Score: 0.91

Mean Precision: 0.90 (±0.11)
Mean Recall: 0.98 (±0.06)
Mean F1 Score: 0.94 (±0.08)

Mean Training Time: 52127080.18 µs
Mean Prediction Time: 1366.95 µs

Unseen Prediction Time: 1811.50 µs


In [None]:
import shutil

# Specify the directory path you want to delete
dir_path = '/content/abstractfactory'

# Remove the directory and its contents
shutil.rmtree(dir_path)

print(f"Directory '{dir_path}' has been deleted.")


Directory '/content/abstractfactory' has been deleted.


**Factory method, Standard deviation, training time, prediction time calculation**

In [None]:
import os
import torch
import time
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import random
import joblib  # To save and load models

# Check if GPU is available and use it; otherwise, use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load your Java programs from a directory
java_code_dir = "/content/factorymethod"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)  # Move model to the appropriate device
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Unseen examples directory
unseen_examples_dir = "/content/unseen_examples"  # Modify the path to unseen examples
unseen_files = [file for file in os.listdir(unseen_examples_dir) if os.path.isfile(os.path.join(unseen_examples_dir, file))]

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        if line.strip() == "":  # Skip empty lines
            continue

        # Encode the input using the tokenizer
        inputs = tokenizer(line, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)  # Move inputs to device

        # Forward pass through the encoder model
        with torch.no_grad():
            encoder_outputs = model.encoder(**inputs)  # Use only the encoder part

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = encoder_outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Move output to CPU for NumPy conversion
        line_embeddings.append(line_embedding)

    if len(line_embeddings) == 0:  # To handle files with no valid lines
        return np.zeros((1, model.config.hidden_size))

    return np.mean(line_embeddings, axis=0)

# Function to calculate precision, recall, and F1
def calculate_metrics(true_labels, predicted_labels):
    precision = precision_score(true_labels, predicted_labels, zero_division=1)
    recall = recall_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels)
    return precision, recall, f1

# Define the number of iterations for the experiment
n_runs = 10
k = 3  # Number of neighbors to consider
train_times = []
prediction_times = []
precisions, recalls, f1_scores = [], [], []

for run in range(n_runs):
    print(f"Run {run + 1}/{n_runs}")

    # Separate positive and negative examples
    positive_files = [file for file in java_files if "factorymethod" in file]
    negative_files = [file for file in java_files if "factorymethod" not in file]

    # Select an approximately equal number of positive and negative examples
    num_samples = min(len(positive_files), len(negative_files))

    positive_sample = random.sample(positive_files, num_samples)
    negative_sample = random.sample(negative_files, num_samples)

    selected_files = positive_sample + negative_sample
    true_labels = [1] * len(positive_sample) + [0] * len(negative_sample)

    # Start training time, including tokenization, embedding extraction, and k-NN
    start_train_time = time.time()

    program_embeddings = []
    for file in selected_files:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

    program_embeddings = np.vstack(program_embeddings)

    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(program_embeddings, true_labels, test_size=0.3, random_state=42)

    # Train kNN
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)

    # Save the trained kNN model (Optional)
    joblib.dump(knn, f"knn_model_run_{run}.joblib")

    # End training time after training and model saving
    end_train_time = time.time()
    train_time = (end_train_time - start_train_time) * 1e6  # Convert to microseconds
    train_times.append(train_time)

    # Start prediction time
    start_prediction_time = time.time()

    # Predict on test set
    y_pred = knn.predict(X_test)

    # End prediction time after prediction
    end_prediction_time = time.time()
    pred_time = (end_prediction_time - start_prediction_time) * 1e6  # Convert to microseconds
    prediction_times.append(pred_time)

    # Calculate metrics
    precision, recall, f1 = calculate_metrics(y_test, y_pred)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")

# Calculate mean and standard deviation for precision, recall, f1-score
mean_precision = np.mean(precisions)
std_precision = np.std(precisions)
mean_recall = np.mean(recalls)
std_recall = np.std(recalls)
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)

# Calculate the mean training and prediction times
mean_train_time = np.mean(train_times)
mean_prediction_time = np.mean(prediction_times)

print(f"\nMean Precision: {mean_precision:.2f} (±{std_precision:.2f})")
print(f"Mean Recall: {mean_recall:.2f} (±{std_recall:.2f})")
print(f"Mean F1 Score: {mean_f1:.2f} (±{std_f1:.2f})")
print(f"\nMean Training Time: {mean_train_time:.2f} µs")
print(f"Mean Prediction Time: {mean_prediction_time:.2f} µs")

# Unseen example prediction
unseen_embeddings = []
for file in unseen_files:
    with open(os.path.join(unseen_examples_dir, file), "r", encoding="utf-8", errors="ignore") as f:
        java_code = f.read()
    embedding = get_line_embeddings(java_code)
    unseen_embeddings.append(embedding)

unseen_embeddings = np.vstack(unseen_embeddings)

# Start unseen prediction time
start_unseen_pred_time = time.time()

# Predict on unseen examples
unseen_predictions = knn.predict(unseen_embeddings)

# End unseen prediction time
end_unseen_pred_time = time.time()
unseen_pred_time = (end_unseen_pred_time - start_unseen_pred_time) * 1e6  # Convert to microseconds

print(f"\nUnseen Prediction Time: {unseen_pred_time:.2f} µs")




Run 1/10
Precision: 1.00, Recall: 0.75, F1 Score: 0.86
Run 2/10
Precision: 0.60, Recall: 0.75, F1 Score: 0.67
Run 3/10
Precision: 1.00, Recall: 0.50, F1 Score: 0.67
Run 4/10
Precision: 1.00, Recall: 0.50, F1 Score: 0.67
Run 5/10
Precision: 0.50, Recall: 0.75, F1 Score: 0.60
Run 6/10
Precision: 0.50, Recall: 0.50, F1 Score: 0.50
Run 7/10
Precision: 0.75, Recall: 0.75, F1 Score: 0.75
Run 8/10
Precision: 1.00, Recall: 0.75, F1 Score: 0.86
Run 9/10
Precision: 1.00, Recall: 0.75, F1 Score: 0.86
Run 10/10
Precision: 0.60, Recall: 0.75, F1 Score: 0.67

Mean Precision: 0.79 (±0.21)
Mean Recall: 0.68 (±0.11)
Mean F1 Score: 0.71 (±0.11)

Mean Training Time: 51717682.08 µs
Mean Prediction Time: 1280.50 µs

Unseen Prediction Time: 1487.49 µs
