In [2]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "singleton"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    with open(os.path.join(java_code_dir, file), "r", encoding="utf-8") as f:
        java_code = f.read()

        # Determine if it's a positive class ("prototype") or negative class
        if "singleton" in file:
            label = 1
        else:
            label = 0

        true_labels.append(label)

        # Get mean embedding for each line and store in program_embeddings
        program_embedding = get_line_embeddings(java_code)
        program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf8 in position 2945: invalid start byte

In [None]:
Singleton with different settings

In [4]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "singleton"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "singleton" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nons (12).java, Predicted Label: 0, True Label: 0
File: nons (50).java, Predicted Label: 1, True Label: 0
File: nons (54).java, Predicted Label: 0, True Label: 0
File: nons (29).java, Predicted Label: 0, True Label: 0
File: nons (25).java, Predicted Label: 0, True Label: 0
File: nons (18).java, Predicted Label: 0, True Label: 0
File: singleton (25).java, Predicted Label: 0, True Label: 1
File: nons (42).java, Predicted Label: 0, True Label: 0
File: nons (27).java, Predicted Label: 0, True Label: 0
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (19).java, Predicted Label: 1, True Label: 0
File: nons (55).java, Predicted Label: 0, True Label: 0
File: nons (34).java, Predicted Label: 0, True Label: 0
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (49).java, Predicted Label: 0, True Label: 0
File: nons (61).java, Predicted Label: 0, True Label: 0
File: nons (68).java, Predicted Label: 

In [None]:
Singleton with different settings

In [6]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "singleton"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "singleton" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nons (12).java, Predicted Label: 0, True Label: 0
File: nons (50).java, Predicted Label: 1, True Label: 0
File: nons (54).java, Predicted Label: 0, True Label: 0
File: nons (29).java, Predicted Label: 0, True Label: 0
File: nons (25).java, Predicted Label: 0, True Label: 0
File: nons (18).java, Predicted Label: 0, True Label: 0
File: singleton (25).java, Predicted Label: 0, True Label: 1
File: nons (42).java, Predicted Label: 0, True Label: 0
File: nons (27).java, Predicted Label: 0, True Label: 0
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (19).java, Predicted Label: 1, True Label: 0
File: nons (55).java, Predicted Label: 0, True Label: 0
File: nons (34).java, Predicted Label: 0, True Label: 0
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (49).java, Predicted Label: 0, True Label: 0
File: nons (61).java, Predicted Label: 0, True Label: 0
File: nons (68).java, Predicted Label: 

In [None]:
Singleton with different settings

In [1]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "singleton"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "singleton" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nons (12).java, Predicted Label: 0, True Label: 0
File: nons (50).java, Predicted Label: 1, True Label: 0
File: nons (54).java, Predicted Label: 0, True Label: 0
File: nons (29).java, Predicted Label: 0, True Label: 0
File: nons (25).java, Predicted Label: 0, True Label: 0
File: nons (18).java, Predicted Label: 0, True Label: 0
File: singleton (25).java, Predicted Label: 0, True Label: 1
File: nons (42).java, Predicted Label: 0, True Label: 0
File: nons (27).java, Predicted Label: 0, True Label: 0
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (19).java, Predicted Label: 1, True Label: 0
File: nons (55).java, Predicted Label: 0, True Label: 0
File: nons (34).java, Predicted Label: 0, True Label: 0
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (49).java, Predicted Label: 0, True Label: 0
File: nons (61).java, Predicted Label: 0, True Label: 0
File: nons (68).java, Predicted Label: 

In [None]:
Singleton with different settings

In [3]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "singleton"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "singleton" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nons (54).java, Predicted Label: 0, True Label: 0
File: nons (25).java, Predicted Label: 1, True Label: 0
File: nons (18).java, Predicted Label: 0, True Label: 0
File: singleton (25).java, Predicted Label: 0, True Label: 1
File: nons (27).java, Predicted Label: 0, True Label: 0
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (19).java, Predicted Label: 1, True Label: 0
File: nons (55).java, Predicted Label: 0, True Label: 0
File: nons (34).java, Predicted Label: 0, True Label: 0
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (68).java, Predicted Label: 0, True Label: 0
File: singleton (9).java, Predicted Label: 1, True Label: 1
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: singleton (14).java, Predicted Label: 1, True Label: 1
File: nons (33).java, Predicted Label: 0, True Label: 0
File: nons (3).java, Predicted Label: 0, True Label: 0
File: nons (30).java, Pred

In [None]:
#Singleton with different settings

In [2]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "singleton"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "singleton" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nons (12).java, Predicted Label: 0, True Label: 0
File: nons (50).java, Predicted Label: 1, True Label: 0
File: singleton (25).java, Predicted Label: 0, True Label: 1
File: nons (27).java, Predicted Label: 0, True Label: 0
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (19).java, Predicted Label: 1, True Label: 0
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (4).java, Predicted Label: 0, True Label: 0
File: singleton (9).java, Predicted Label: 1, True Label: 1
File: nons (44).java, Predicted Label: 0, True Label: 0
File: nons (38).java, Predicted Label: 0, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: singleton (14).java, Predicted Label: 1, True Label: 1
File: nons (33).java, Predicted Label: 0, True Label: 0
File: nons (41).java, Predicted Label: 0, True Label: 0
File: nons (30).java, Predicted Label: 1, True Label: 0
File: singleton (1).java, 

In [None]:
#Builder with different settings

In [4]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "builder"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "builder" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: builder (2).java, Predicted Label: 1, True Label: 1
File: nonb (41).java, Predicted Label: 0, True Label: 0
File: nonb (115).java, Predicted Label: 0, True Label: 0
File: nonb (49).java, Predicted Label: 0, True Label: 0
File: nonb (50).java, Predicted Label: 0, True Label: 0
File: nonb (109).java, Predicted Label: 0, True Label: 0
File: builder (5).java, Predicted Label: 1, True Label: 1
File: builder (6).java, Predicted Label: 1, True Label: 1
File: nonb (61).java, Predicted Label: 0, True Label: 0
File: builder (3).java, Predicted Label: 0, True Label: 1
File: nonb (53).java, Predicted Label: 1, True Label: 0
File: nonb (51).java, Predicted Label: 0, True Label: 0
File: nonb (46).java, Predicted Label: 0, True Label: 0
File: nonb (58).java, Predicted Label: 0, True Label: 0
File: nonb (82).java, Predicted Label: 0, True Label: 0
File: builder (7).java, Predicted Label: 1, True Label: 1
File: nonb (73).java, Predicted Label: 0, 

In [None]:
#Builder with different settings

In [5]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "builder"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "builder" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: builder (2).java, Predicted Label: 1, True Label: 1
File: nonb (115).java, Predicted Label: 0, True Label: 0
File: nonb (89).java, Predicted Label: 0, True Label: 0
File: nonb (3).java, Predicted Label: 0, True Label: 0
File: nonb (49).java, Predicted Label: 0, True Label: 0
File: nonb (28).java, Predicted Label: 0, True Label: 0
File: nonb (50).java, Predicted Label: 0, True Label: 0
File: nonb (103).java, Predicted Label: 0, True Label: 0
File: nonb (36).java, Predicted Label: 1, True Label: 0
File: builder (5).java, Predicted Label: 1, True Label: 1
File: nonb (126).java, Predicted Label: 0, True Label: 0
File: builder (6).java, Predicted Label: 1, True Label: 1
File: nonb (44).java, Predicted Label: 0, True Label: 0
File: builder (3).java, Predicted Label: 1, True Label: 1
File: nonb (87).java, Predicted Label: 0, True Label: 0
File: nonb (120).java, Predicted Label: 0, True Label: 0
File: nonb (69).java, Predicted Label: 0, T

In [None]:
#Builder with different settings

In [6]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "builder"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "builder" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nonb (14).java, Predicted Label: 0, True Label: 0
File: builder (2).java, Predicted Label: 1, True Label: 1
File: nonb (98).java, Predicted Label: 0, True Label: 0
File: nonb (26).java, Predicted Label: 0, True Label: 0
File: builder (5).java, Predicted Label: 1, True Label: 1
File: builder (6).java, Predicted Label: 1, True Label: 1
File: nonb (18).java, Predicted Label: 0, True Label: 0
File: nonb (17).java, Predicted Label: 0, True Label: 0
File: nonb (100).java, Predicted Label: 0, True Label: 0
File: nonb (128).java, Predicted Label: 0, True Label: 0
File: builder (3).java, Predicted Label: 0, True Label: 1
File: nonb (19).java, Predicted Label: 0, True Label: 0
File: nonb (86).java, Predicted Label: 0, True Label: 0
File: nonb (85).java, Predicted Label: 0, True Label: 0
File: nonb (81).java, Predicted Label: 0, True Label: 0
File: nonb (82).java, Predicted Label: 0, True Label: 0
File: nonb (57).java, Predicted Label: 0, Tr

In [None]:
#Builder with different settings

In [7]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "builder"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "builder" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nonb (14).java, Predicted Label: 0, True Label: 0
File: nonb (77).java, Predicted Label: 0, True Label: 0
File: builder (2).java, Predicted Label: 1, True Label: 1
File: nonb (26).java, Predicted Label: 0, True Label: 0
File: builder (5).java, Predicted Label: 1, True Label: 1
File: builder (6).java, Predicted Label: 1, True Label: 1
File: nonb (110).java, Predicted Label: 0, True Label: 0
File: nonb (18).java, Predicted Label: 0, True Label: 0
File: nonb (61).java, Predicted Label: 0, True Label: 0
File: nonb (17).java, Predicted Label: 0, True Label: 0
File: nonb (128).java, Predicted Label: 1, True Label: 0
File: builder (3).java, Predicted Label: 0, True Label: 1
File: nonb (53).java, Predicted Label: 0, True Label: 0
File: nonb (19).java, Predicted Label: 0, True Label: 0
File: nonb (46).java, Predicted Label: 1, True Label: 0
File: builder (7).java, Predicted Label: 1, True Label: 1
File: builder (9).java, Predicted Label: 1

In [None]:
#Builder with different settings

In [8]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "builder"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "builder" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nonb (14).java, Predicted Label: 0, True Label: 0
File: builder (2).java, Predicted Label: 1, True Label: 1
File: nonb (41).java, Predicted Label: 1, True Label: 0
File: nonb (5).java, Predicted Label: 0, True Label: 0
File: builder (5).java, Predicted Label: 1, True Label: 1
File: builder (6).java, Predicted Label: 1, True Label: 1
File: nonb (110).java, Predicted Label: 0, True Label: 0
File: nonb (18).java, Predicted Label: 0, True Label: 0
File: nonb (17).java, Predicted Label: 0, True Label: 0
File: builder (3).java, Predicted Label: 1, True Label: 1
File: nonb (85).java, Predicted Label: 0, True Label: 0
File: builder (7).java, Predicted Label: 1, True Label: 1
File: nonb (92).java, Predicted Label: 0, True Label: 0
File: builder (9).java, Predicted Label: 1, True Label: 1
File: nonb (13).java, Predicted Label: 0, True Label: 0
File: builder (8).java, Predicted Label: 1, True Label: 1
File: nonb (23).java, Predicted Label: 0

In [None]:
#Factory Method with different settings

In [11]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "factorymethod"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "factorymethod" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nonfm (52).java, Predicted Label: 0, True Label: 0
File: nonfm (13).java, Predicted Label: 0, True Label: 0
File: nonfm (68).java, Predicted Label: 0, True Label: 0
File: nonfm (37).java, Predicted Label: 0, True Label: 0
File: nonfm (29).java, Predicted Label: 0, True Label: 0
File: factorymethod (1).java, Predicted Label: 1, True Label: 1
File: factorymethod (9).java, Predicted Label: 0, True Label: 1
File: nonfm (4).java, Predicted Label: 0, True Label: 0
File: factorymethod (5).java, Predicted Label: 1, True Label: 1
File: factorymethod (2).java, Predicted Label: 1, True Label: 1
File: nonfm (72).java, Predicted Label: 0, True Label: 0
File: nonfm (49).java, Predicted Label: 0, True Label: 0
File: nonfm (8).java, Predicted Label: 0, True Label: 0
File: factorymethod (10).java, Predicted Label: 1, True Label: 1
File: nonfm (11).java, Predicted Label: 0, True Label: 0
File: factorymethod (7).java, Predicted Label: 1, True Label:

In [None]:
#Factory Method with different settings

In [13]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "factorymethod"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "factorymethod" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nonfm (6).java, Predicted Label: 0, True Label: 0
File: nonfm (5).java, Predicted Label: 0, True Label: 0
File: nonfm (13).java, Predicted Label: 0, True Label: 0
File: factorymethod (1).java, Predicted Label: 1, True Label: 1
File: factorymethod (9).java, Predicted Label: 1, True Label: 1
File: nonfm (12).java, Predicted Label: 0, True Label: 0
File: nonfm (4).java, Predicted Label: 0, True Label: 0
File: factorymethod (5).java, Predicted Label: 1, True Label: 1
File: nonfm (2).java, Predicted Label: 0, True Label: 0
File: factorymethod (2).java, Predicted Label: 1, True Label: 1
File: nonfm (18).java, Predicted Label: 0, True Label: 0
File: nonfm (1).java, Predicted Label: 0, True Label: 0
File: nonfm (8).java, Predicted Label: 0, True Label: 0
File: factorymethod (10).java, Predicted Label: 1, True Label: 1
File: nonfm (11).java, Predicted Label: 0, True Label: 0
File: factorymethod (7).java, Predicted Label: 0, True Label: 1
F

In [3]:
#Factory method with different settings

In [4]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "factorymethod"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "factorymethod" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nonfm (77).java, Predicted Label: 0, True Label: 0
File: nonfm (80).java, Predicted Label: 0, True Label: 0
File: factorymethod (1).java, Predicted Label: 0, True Label: 1
File: factorymethod (9).java, Predicted Label: 1, True Label: 1
File: nonfm (79).java, Predicted Label: 0, True Label: 0
File: factorymethod (5).java, Predicted Label: 1, True Label: 1
File: factorymethod (2).java, Predicted Label: 1, True Label: 1
File: nonfm (75).java, Predicted Label: 0, True Label: 0
File: nonfm (72).java, Predicted Label: 0, True Label: 0
File: factorymethod (10).java, Predicted Label: 1, True Label: 1
File: factorymethod (7).java, Predicted Label: 0, True Label: 1
File: nonfm (78).java, Predicted Label: 0, True Label: 0
File: factorymethod (4).java, Predicted Label: 1, True Label: 1
File: nonfm (82).java, Predicted Label: 0, True Label: 0
File: nonfm (74).java, Predicted Label: 0, True Label: 0
File: factorymethod (3).java, Predicted Label

In [None]:
#Factory method with different settings

In [5]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "factorymethod"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "factorymethod" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nonfm (68).java, Predicted Label: 0, True Label: 0
File: factorymethod (1).java, Predicted Label: 0, True Label: 1
File: factorymethod (9).java, Predicted Label: 1, True Label: 1
File: nonfm (79).java, Predicted Label: 0, True Label: 0
File: factorymethod (5).java, Predicted Label: 1, True Label: 1
File: factorymethod (2).java, Predicted Label: 1, True Label: 1
File: nonfm (65).java, Predicted Label: 0, True Label: 0
File: nonfm (63).java, Predicted Label: 0, True Label: 0
File: factorymethod (10).java, Predicted Label: 1, True Label: 1
File: nonfm (40).java, Predicted Label: 1, True Label: 0
File: nonfm (47).java, Predicted Label: 0, True Label: 0
File: factorymethod (7).java, Predicted Label: 0, True Label: 1
File: factorymethod (4).java, Predicted Label: 0, True Label: 1
File: nonfm (43).java, Predicted Label: 0, True Label: 0
File: factorymethod (3).java, Predicted Label: 1, True Label: 1
File: nonfm (53).java, Predicted Label

In [None]:
#Factory Method with different settings

In [6]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "factorymethod"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "factorymethod" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nonfm (77).java, Predicted Label: 0, True Label: 0
File: nonfm (80).java, Predicted Label: 0, True Label: 0
File: factorymethod (1).java, Predicted Label: 0, True Label: 1
File: factorymethod (9).java, Predicted Label: 1, True Label: 1
File: nonfm (79).java, Predicted Label: 0, True Label: 0
File: factorymethod (5).java, Predicted Label: 1, True Label: 1
File: factorymethod (2).java, Predicted Label: 1, True Label: 1
File: nonfm (75).java, Predicted Label: 0, True Label: 0
File: nonfm (72).java, Predicted Label: 0, True Label: 0
File: factorymethod (10).java, Predicted Label: 1, True Label: 1
File: factorymethod (7).java, Predicted Label: 0, True Label: 1
File: nonfm (70).java, Predicted Label: 0, True Label: 0
File: nonfm (78).java, Predicted Label: 0, True Label: 0
File: factorymethod (4).java, Predicted Label: 1, True Label: 1
File: nonfm (82).java, Predicted Label: 0, True Label: 0
File: nonfm (74).java, Predicted Label: 0, Tr

In [None]:
#Abstract Factory with different settings

In [14]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "abstractfactory"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "abstractfactory" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nondp (7).java, Predicted Label: 0, True Label: 0
File: nondp (2).java, Predicted Label: 0, True Label: 0
File: nondp (18).java, Predicted Label: 0, True Label: 0
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: nondp (33).java, Predicted Label: 0, True Label: 0
File: nondp (29).java, Predicted Label: 0, True Label: 0
File: nondp (31).java, Predicted Label: 0, True Label: 0
File: nondp (11).java, Predicted Label: 0, True Label: 0
File: nondp (39).java, Predicted Label: 0, True Label: 0
File: nondp (45).java, Predicted Label: 0, True Label: 0
File: nondp (19).java, Predicted Label: 0, True Label: 0
File: nondp (23).java, Predicted Label: 0, True Label: 0
File: nondp (38).java, Predicted Label: 0, True Label: 0
File: nondp (30).java, Predicted Label: 1, True Label: 0
File: nondp (26).java, Predicted Label: 0, True Label: 0
File: nondp (6).java, Predicted Label: 0, True Label: 0
File: abstractfactory (10).java,

In [None]:
#Abstract Factory with different settings

In [4]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "abstractfactory"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "abstractfactory" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nonfm (31).java, Predicted Label: 0, True Label: 0
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: nonfm (2).java, Predicted Label: 0, True Label: 0
File: nonfm (30).java, Predicted Label: 0, True Label: 0
File: abstractfactory (10).java, Predicted Label: 1, True Label: 1
File: abstractfactory (9).java, Predicted Label: 1, True Label: 1
File: abstractfactory (13).java, Predicted Label: 1, True Label: 1
File: abstractfactory (16).java, Predicted Label: 1, True Label: 1
File: nonfm (20).java, Predicted Label: 1, True Label: 0
File: abstractfactory (4).java, Predicted Label: 1, True Label: 1
File: abstractfactory (5).java, Predicted Label: 0, True Label: 1
File: abstractfactory (1).java, Predicted Label: 1, True Label: 1
File: abstractfactory (12).java, Predicted Label: 1, True Label: 1
File: nonfm (29).java, Predicted Label: 1, True Label: 0
File: abstractfactory (3).java, Predicted Label: 1, True Label: 1
Fi

In [None]:
#Abstract Factory with different settings

In [7]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "abstractfactory"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "abstractfactory" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nonab (11).java, Predicted Label: 0, True Label: 0
File: nonab (48).java, Predicted Label: 1, True Label: 0
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: nonab (5).java, Predicted Label: 0, True Label: 0
File: nonab (65).java, Predicted Label: 0, True Label: 0
File: nonab (7).java, Predicted Label: 0, True Label: 0
File: abstractfactory (10).java, Predicted Label: 1, True Label: 1
File: abstractfactory (9).java, Predicted Label: 1, True Label: 1
File: abstractfactory (13).java, Predicted Label: 1, True Label: 1
File: abstractfactory (16).java, Predicted Label: 1, True Label: 1
File: nonab (8).java, Predicted Label: 0, True Label: 0
File: abstractfactory (4).java, Predicted Label: 1, True Label: 1
File: abstractfactory (5).java, Predicted Label: 1, True Label: 1
File: nonab (3).java, Predicted Label: 0, True Label: 0
File: abstractfactory (1).java, Predicted Label: 1, True Label: 1
File: abstractfactory (1

In [None]:
#Abstract Factory with different settings

In [9]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "abstractfactory"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "abstractfactory" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nonab (11).java, Predicted Label: 0, True Label: 0
File: nonab (48).java, Predicted Label: 1, True Label: 0
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: nonab (5).java, Predicted Label: 0, True Label: 0
File: nonab (65).java, Predicted Label: 0, True Label: 0
File: nonab (7).java, Predicted Label: 0, True Label: 0
File: abstractfactory (10).java, Predicted Label: 1, True Label: 1
File: abstractfactory (9).java, Predicted Label: 1, True Label: 1
File: abstractfactory (13).java, Predicted Label: 1, True Label: 1
File: abstractfactory (16).java, Predicted Label: 1, True Label: 1
File: nonab (8).java, Predicted Label: 0, True Label: 0
File: abstractfactory (4).java, Predicted Label: 1, True Label: 1
File: abstractfactory (5).java, Predicted Label: 1, True Label: 1
File: nonab (3).java, Predicted Label: 0, True Label: 0
File: abstractfactory (1).java, Predicted Label: 1, True Label: 1
File: abstractfactory (1

In [None]:
#Abstract Factory with different settings

In [12]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "abstractfactory"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "abstractfactory" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: nonab (19).java, Predicted Label: 0, True Label: 0
File: nonab (5).java, Predicted Label: 0, True Label: 0
File: nonab (30).java, Predicted Label: 0, True Label: 0
File: abstractfactory (10).java, Predicted Label: 1, True Label: 1
File: abstractfactory (9).java, Predicted Label: 1, True Label: 1
File: abstractfactory (13).java, Predicted Label: 1, True Label: 1
File: abstractfactory (16).java, Predicted Label: 1, True Label: 1
File: abstractfactory (4).java, Predicted Label: 1, True Label: 1
File: abstractfactory (5).java, Predicted Label: 1, True Label: 1
File: abstractfactory (1).java, Predicted Label: 1, True Label: 1
File: abstractfactory (12).java, Predicted Label: 1, True Label: 1
File: nonab (14).java, Predicted Label: 0, True Label: 0
File: nonab (80).java, Predicted Label: 0, True Label: 0
File: abstractfactory (3).java, Predicted Label: 1, True Label: 1
Fi

In [None]:
#Abstract Factory with different settings

In [11]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "abstractfactory"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "abstractfactory" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nonab (11).java, Predicted Label: 0, True Label: 0
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: nondp (33).java, Predicted Label: 1, True Label: 0
File: nondp (31).java, Predicted Label: 0, True Label: 0
File: nondp (39).java, Predicted Label: 0, True Label: 0
File: nonab (5).java, Predicted Label: 0, True Label: 0
File: nondp (38).java, Predicted Label: 0, True Label: 0
File: nondp (30).java, Predicted Label: 1, True Label: 0
File: nonab (7).java, Predicted Label: 0, True Label: 0
File: abstractfactory (10).java, Predicted Label: 1, True Label: 1
File: abstractfactory (9).java, Predicted Label: 1, True Label: 1
File: abstractfactory (13).java, Predicted Label: 1, True Label: 1
File: abstractfactory (16).java, Predicted Label: 1, True Label: 1
File: nonab (8).java, Predicted Label: 0, True Label: 0
File: abstractfactory (4).java, Predicted Label: 1, True Label: 1
File: abstractfactory (5).java, Predicted

In [None]:
#Abstract Factory with different settings

In [13]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "abstractfactory"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "abstractfactory" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: nonab (19).java, Predicted Label: 1, True Label: 0
File: nonab (5).java, Predicted Label: 0, True Label: 0
File: nonab (65).java, Predicted Label: 1, True Label: 0
File: nonab (56).java, Predicted Label: 0, True Label: 0
File: nonab (15).java, Predicted Label: 0, True Label: 0
File: abstractfactory (10).java, Predicted Label: 1, True Label: 1
File: abstractfactory (9).java, Predicted Label: 1, True Label: 1
File: nonab (51).java, Predicted Label: 0, True Label: 0
File: abstractfactory (13).java, Predicted Label: 1, True Label: 1
File: nonab (2).java, Predicted Label: 0, True Label: 0
File: abstractfactory (16).java, Predicted Label: 1, True Label: 1
File: nonab (8).java, Predicted Label: 0, True Label: 0
File: abstractfactory (4).java, Predicted Label: 1, True Label: 1
File: abstractfactory (5).java, Predicted Label: 1, True Label: 1
File: nonab (26).java, Predicted

In [None]:
#Abstract Factory with different settings

In [14]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "abstractfactory"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "abstractfactory" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: nonab (72).java, Predicted Label: 0, True Label: 0
File: nonab (11).java, Predicted Label: 0, True Label: 0
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: nonab (19).java, Predicted Label: 0, True Label: 0
File: nonab (5).java, Predicted Label: 0, True Label: 0
File: nonab (7).java, Predicted Label: 0, True Label: 0
File: nonab (1).java, Predicted Label: 1, True Label: 0
File: abstractfactory (10).java, Predicted Label: 1, True Label: 1
File: abstractfactory (9).java, Predicted Label: 1, True Label: 1
File: abstractfactory (13).java, Predicted Label: 1, True Label: 1
File: nonab (2).java, Predicted Label: 0, True Label: 0
File: abstractfactory (16).java, Predicted Label: 1, True Label: 1
File: nonab (8).java, Predicted Label: 0, True Label: 0
File: abstractfactory (4).java, Predicted Label: 1, True Label: 1
File: abstractfactory (5).java, Predicted Label: 1, True Label: 1
File: abstractfactory (1).java, Pr

In [None]:
#Prototype with different settings

In [7]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "prototype"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "prototype" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: prototype (27).java, Predicted Label: 1, True Label: 1
File: nonp (53).java, Predicted Label: 0, True Label: 0
File: nonp (23).java, Predicted Label: 0, True Label: 0
File: nonp (43).java, Predicted Label: 0, True Label: 0
File: nonp (35).java, Predicted Label: 0, True Label: 0
File: nonp (51).java, Predicted Label: 0, True Label: 0
File: nonp (59).java, Predicted Label: 0, True Label: 0
File: nonp (5).java, Predicted Label: 0, True Label: 0
File: nonp (31).java, Predicted Label: 0, True Label: 0
File: prototype (13).java, Predicted Label: 1, True Label: 1
File: prototype (22).java, Predicted Label: 0, True Label: 1
File: nonp (7).java, Predicted Label: 0, True Label: 0
File: nonp (13).java, Predicted Label: 0, True Label: 0
File: nonp (29).java, Predicted Label: 1, True Label: 0
File: prototype (15).java, Predicted Label: 1, True Label: 1
File: nonp (44).java, Predicted Label: 0, True Label: 0
File: nonp (8).java, Predicted Label

In [8]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "prototype"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "prototype" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: prototype (27).java, Predicted Label: 1, True Label: 1
File: non-DP (44).java, Predicted Label: 0, True Label: 0
File: non-DP (34).java, Predicted Label: 0, True Label: 0
File: non-DP (29).java, Predicted Label: 0, True Label: 0
File: non-DP (35).java, Predicted Label: 0, True Label: 0
File: prototype (13).java, Predicted Label: 1, True Label: 1
File: prototype (22).java, Predicted Label: 0, True Label: 1
File: non-DP (48).java, Predicted Label: 0, True Label: 0
File: prototype (15).java, Predicted Label: 1, True Label: 1
File: non-DP (25).java, Predicted Label: 0, True Label: 0
File: non-DP (45).java, Predicted Label: 0, True Label: 0
File: non-DP (39).java, Predicted Label: 0, True Label: 0
File: non-DP (2).java, Predicted Label: 0, True Label: 0
File: non-DP (33).java, Predicted Label: 0, True Label: 0
File: prototype (14).java, Predicted Label: 0, True Label: 1
File: prototype (16).java, Predicted Label: 1, True Label: 1
File:

In [None]:
#Prototype with different settings

In [1]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Load your Java programs from a directory
java_code_dir = "prototype"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "prototype" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for KNN
flattened_embeddings = np.vstack(program_embeddings)

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(flattened_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 3  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [true_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Calculate precision, recall, and f-score
precision = precision_score(true_labels, predicted_labels, zero_division=1)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the KNN results and performance metrics
print("K-Nearest Neighbors (KNN) Classification Results:")
for i, java_file in enumerate(java_files):
    print(f"File: {java_file}, Predicted Label: {predicted_labels[i]}, True Label: {true_labels[i]}")

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")


K-Nearest Neighbors (KNN) Classification Results:
File: prototype (27).java, Predicted Label: 1, True Label: 1
File: nonp (53).java, Predicted Label: 0, True Label: 0
File: nonp (23).java, Predicted Label: 0, True Label: 0
File: nonp (43).java, Predicted Label: 0, True Label: 0
File: nonp (35).java, Predicted Label: 0, True Label: 0
File: nonp (51).java, Predicted Label: 0, True Label: 0
File: nonp (59).java, Predicted Label: 0, True Label: 0
File: nonp (5).java, Predicted Label: 0, True Label: 0
File: nonp (31).java, Predicted Label: 0, True Label: 0
File: prototype (13).java, Predicted Label: 1, True Label: 1
File: prototype (22).java, Predicted Label: 0, True Label: 1
File: nonp (7).java, Predicted Label: 0, True Label: 0
File: nonp (13).java, Predicted Label: 0, True Label: 0
File: nonp (29).java, Predicted Label: 1, True Label: 0
File: prototype (15).java, Predicted Label: 1, True Label: 1
File: nonp (44).java, Predicted Label: 0, True Label: 0
File: nonp (8).java, Predicted Label

In [None]:
#tNSE plot for design patterns

In [8]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code, model, tokenizer):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Function to perform t-SNE visualization
def perform_tsne(embeddings, labels):
    tsne = TSNE(n_components=2, random_state=42)
    tsne_results = tsne.fit_transform(embeddings)

    # Create a scatter plot for t-SNE visualization
    plt.figure(figsize=(10, 8))

    sns.scatterplot(x=tsne_results[:, 0], y=tsne_results[:, 1], hue=labels, palette="dark", s=50, alpha=0.7)

    plt.title('t-SNE Visualization for CodeT5 on Different Classes', fontsize=16)
    plt.xlabel('t-SNE Dimension 1', fontsize=14)
    plt.ylabel('t-SNE Dimension 2', fontsize=14)
    plt.legend(title='Class', loc='upper right', fontsize=12)
    plt.grid(True)

    plt.show()

# Load your Java programs from a directory
java_code_dir = "prototype"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "prototype" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code, model, tokenizer)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for t-SNE
flattened_embeddings = np.vstack(program_embeddings)

# Perform t-SNE visualization
perform_tsne(flattened_embeddings, true_labels)


In [1]:
#Prototype with different settings

In [4]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

# Function to tokenize and get embeddings for each line
def get_line_embeddings(java_code, model, tokenizer):
    lines = java_code.split('\n')
    line_embeddings = []

    for line in lines:
        # Encode the input using the T5 tokenizer
        inputs = tokenizer("translate English to Java: " + line, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Add a dummy decoder input
        inputs["decoder_input_ids"] = inputs["input_ids"]

        # Forward pass through the T5 model
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the 'last_hidden_state' attribute for embeddings
        line_embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        line_embeddings.append(line_embedding)

    return np.mean(line_embeddings, axis=0)

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Function to perform t-SNE visualization
def perform_tsne(embeddings, labels):
    tsne = TSNE(n_components=2, random_state=42)
    tsne_results = tsne.fit_transform(embeddings)

    # Create a scatter plot for t-SNE visualization
    plt.figure(figsize=(10, 8))

    sns.scatterplot(x=tsne_results[:, 0], y=tsne_results[:, 1], hue=labels, palette="dark", s=50, alpha=0.7)

    plt.title('t-SNE Visualization for CodeT5 on Different Classes', fontsize=16)
    plt.xlabel('t-SNE Dimension 1', fontsize=14)
    plt.ylabel('t-SNE Dimension 2', fontsize=14)
    plt.legend(title='Class', loc='upper right', fontsize=12)
    plt.grid(True)

    plt.show()

# Load your Java programs from a directory
java_code_dir = "prototype"  # Modify the directory path to your dataset
java_files = [file for file in os.listdir(java_code_dir) if os.path.isfile(os.path.join(java_code_dir, file))]

# Load the CodeT5 model and tokenizer
model_name = "Salesforce/codet5-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the Java programs
program_embeddings = []

# Define the true labels for each program
true_labels = []

for file in java_files:
    try:
        with open(os.path.join(java_code_dir, file), "r", encoding="utf-8", errors="ignore") as f:
            java_code = f.read()
    except UnicodeDecodeError as e:
        print(f"Error decoding file {file}: {e}")
        continue  # Skip to the next file if decoding fails

    # Determine if it's a positive class ("prototype") or negative class
    if "prototype" in file:
        label = 1
    else:
        label = 0

    true_labels.append(label)

    # Get mean embedding for each line and store in program_embeddings
    program_embedding = get_line_embeddings(java_code, model, tokenizer)
    program_embeddings.append(program_embedding)

# Flatten the embeddings for t-SNE
flattened_embeddings = np.vstack(program_embeddings)

# Perform t-SNE visualization
perform_tsne(flattened_embeddings, true_labels)
