In [None]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "singleton" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 3  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nons (54).java, Predicted Label: 1, True Label: 1
File: nons (29).java, Predicted Label: 1, True Label: 0
File: nons (25).java, Predicted Label: 1, True Label: 1
File: singleton (25).java, Predicted Label: 0, True Label: 1
File: nons (27).java, Predicted Label: 1, True Label: 1
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (34).java, Predicted Label: 0, True Label: 0
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (61).java, Predicted Label: 1, True Label: 1
Precision: 0.7777777777777778, Recall: 0.7777777777777778, F-score: 0.7777777777777778


In [None]:
Singleton on different setting of programs

In [None]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "singleton" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 3  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nons (54).java, Predicted Label: 1, True Label: 1
File: nons (25).java, Predicted Label: 1, True Label: 1
File: singleton (25).java, Predicted Label: 1, True Label: 1
File: nons (27).java, Predicted Label: 1, True Label: 0
File: singleton (18).java, Predicted Label: 0, True Label: 0
File: nons (19).java, Predicted Label: 0, True Label: 0
File: nons (55).java, Predicted Label: 0, True Label: 0
File: nons (34).java, Predicted Label: 1, True Label: 0
File: singleton (24).java, Predicted Label: 0, True Label: 0
File: singleton (9).java, Predicted Label: 1, True Label: 1
Precision: 0.8666666666666666, Recall: 0.8, F-score: 0.8


In [None]:
Singleton on different setting of programs

In [None]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "singleton" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 3  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nons (12).java, Predicted Label: 1, True Label: 1
File: nons (50).java, Predicted Label: 0, True Label: 0
File: singleton (25).java, Predicted Label: 1, True Label: 1
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: singleton (24).java, Predicted Label: 0, True Label: 0
File: nons (4).java, Predicted Label: 1, True Label: 1
File: nons (44).java, Predicted Label: 0, True Label: 0
File: nons (38).java, Predicted Label: 0, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: singleton (14).java, Predicted Label: 0, True Label: 0
Precision: 1.0, Recall: 1.0, F-score: 1.0


In [None]:
Singleton with different settings

In [None]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "singleton" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 3  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nons (54).java, Predicted Label: 0, True Label: 0
File: singleton (25).java, Predicted Label: 1, True Label: 0
File: nons (42).java, Predicted Label: 1, True Label: 1
File: nons (27).java, Predicted Label: 1, True Label: 1
File: singleton (18).java, Predicted Label: 0, True Label: 0
File: nons (55).java, Predicted Label: 1, True Label: 1
File: nons (34).java, Predicted Label: 1, True Label: 1
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (61).java, Predicted Label: 0, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
Precision: 0.9142857142857143, Recall: 0.9, F-score: 0.8967032967032967


In [None]:
Builder with different settings

In [None]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'builder'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "builder" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 3  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: builder (2).java, Predicted Label: 1, True Label: 1
File: builder (5).java, Predicted Label: 1, True Label: 1
File: nonbuilder (55).java, Predicted Label: 1, True Label: 1
File: builder (6).java, Predicted Label: 1, True Label: 1
Precision: 1.0, Recall: 1.0, F-score: 1.0


In [None]:
Builder with different settings

In [None]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'builder'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "builder" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 3  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nonb (14).java, Predicted Label: 1, True Label: 1
File: builder (2).java, Predicted Label: 1, True Label: 1
File: builder (5).java, Predicted Label: 0, True Label: 0
File: builder (6).java, Predicted Label: 1, True Label: 1
Precision: 1.0, Recall: 1.0, F-score: 1.0


In [None]:
Builder with different settings

In [None]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'builder'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "builder" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 3  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: builder (2).java, Predicted Label: 1, True Label: 1
File: nonb (89).java, Predicted Label: 0, True Label: 0
File: nonb (49).java, Predicted Label: 1, True Label: 1
File: nonb (28).java, Predicted Label: 0, True Label: 0
Precision: 1.0, Recall: 1.0, F-score: 1.0


In [None]:
Builder with different settings

In [None]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'builder'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "builder" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 3  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nonb (14).java, Predicted Label: 1, True Label: 1
File: builder (2).java, Predicted Label: 0, True Label: 0
File: nonb (133).java, Predicted Label: 0, True Label: 0
File: builder (5).java, Predicted Label: 1, True Label: 1
File: builder (6).java, Predicted Label: 1, True Label: 1
Precision: 1.0, Recall: 1.0, F-score: 1.0


In [None]:
Builder with different settings

In [None]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'builder'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "builder" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 3  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: builder (2).java, Predicted Label: 1, True Label: 1
File: builder (5).java, Predicted Label: 1, True Label: 1
File: builder (6).java, Predicted Label: 1, True Label: 1
File: nonb (18).java, Predicted Label: 0, True Label: 0
Precision: 1.0, Recall: 1.0, F-score: 1.0


In [None]:
Abstract Factory with different settings

In [None]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'abstractfactory'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "abstractfactory" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 5  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nonab (11).java, Predicted Label: 0, True Label: 0
File: abstractfactory (2).java, Predicted Label: 0, True Label: 0
File: nonab (5).java, Predicted Label: 0, True Label: 0
File: nonab (81).java, Predicted Label: 1, True Label: 1
File: nonab (7).java, Predicted Label: 1, True Label: 1
File: abstractfactory (10).java, Predicted Label: 1, True Label: 1
Precision: 1.0, Recall: 1.0, F-score: 1.0


In [None]:
Abstract Factory with different settings

In [None]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'abstractfactory'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "abstractfactory" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 5  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nonab (11).java, Predicted Label: 1, True Label: 1
File: nonab (48).java, Predicted Label: 1, True Label: 1
File: abstractfactory (2).java, Predicted Label: 0, True Label: 0
File: nonab (5).java, Predicted Label: 1, True Label: 1
File: nonab (65).java, Predicted Label: 0, True Label: 0
File: nonab (7).java, Predicted Label: 1, True Label: 1
File: abstractfactory (10).java, Predicted Label: 0, True Label: 0
Precision: 1.0, Recall: 1.0, F-score: 1.0


In [None]:
Abstract Factory with different settings

In [None]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'abstractfactory'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "abstractfactory" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 5  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nonab (11).java, Predicted Label: 1, True Label: 1
File: nonab (48).java, Predicted Label: 1, True Label: 1
File: abstractfactory (2).java, Predicted Label: 0, True Label: 0
File: nonab (5).java, Predicted Label: 1, True Label: 1
File: nonab (65).java, Predicted Label: 0, True Label: 0
File: nonab (7).java, Predicted Label: 1, True Label: 1
File: abstractfactory (10).java, Predicted Label: 0, True Label: 0
Precision: 1.0, Recall: 1.0, F-score: 1.0


In [None]:
Abstract Factory with different settings

In [None]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'abstractfactory'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "abstractfactory" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 5  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: non-DP (37).java, Predicted Label: 0, True Label: 0
File: abstractfactory (2).java, Predicted Label: 0, True Label: 0
File: non-DP (43).java, Predicted Label: 1, True Label: 1
File: non-DP (42).java, Predicted Label: 1, True Label: 1
File: non-DP (39).java, Predicted Label: 0, True Label: 0
File: non-DP (31).java, Predicted Label: 1, True Label: 1
File: non-DP (38).java, Predicted Label: 1, True Label: 1
Precision: 1.0, Recall: 1.0, F-score: 1.0


In [None]:
Abstract Factory with different settings

In [None]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'abstractfactory'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "abstractfactory" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 5  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nondp (18).java, Predicted Label: 0, True Label: 0
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: nondp (29).java, Predicted Label: 1, True Label: 1
File: nondp (31).java, Predicted Label: 1, True Label: 1
File: nondp (11).java, Predicted Label: 1, True Label: 1
File: nondp (23).java, Predicted Label: 1, True Label: 1
File: nondp (38).java, Predicted Label: 0, True Label: 0
File: nondp (30).java, Predicted Label: 1, True Label: 1
Precision: 1.0, Recall: 1.0, F-score: 1.0


In [None]:
Abstract Factory with different settings

In [None]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'abstractfactory'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "abstractfactory" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 5  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nondp (7).java, Predicted Label: 0, True Label: 0
File: nondp (2).java, Predicted Label: 0, True Label: 0
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: nondp (33).java, Predicted Label: 0, True Label: 0
File: nondp (39).java, Predicted Label: 1, True Label: 0
File: nondp (45).java, Predicted Label: 1, True Label: 1
File: nondp (19).java, Predicted Label: 0, True Label: 0
File: nondp (26).java, Predicted Label: 1, True Label: 1
Precision: 0.90625, Recall: 0.875, F-score: 0.876984126984127


In [None]:
Abstract Factory with different settings

In [None]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'abstractfactory'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "abstractfactory" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 5  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nondp (7).java, Predicted Label: 0, True Label: 0
File: nondp (2).java, Predicted Label: 1, True Label: 1
File: nondp (18).java, Predicted Label: 0, True Label: 1
File: abstractfactory (2).java, Predicted Label: 0, True Label: 0
File: nondp (33).java, Predicted Label: 1, True Label: 1
File: nondp (29).java, Predicted Label: 0, True Label: 0
File: nondp (31).java, Predicted Label: 1, True Label: 1
File: nondp (11).java, Predicted Label: 0, True Label: 0
File: nondp (39).java, Predicted Label: 0, True Label: 0
File: nondp (45).java, Predicted Label: 0, True Label: 0
File: nondp (19).java, Predicted Label: 1, True Label: 1
File: nondp (23).java, Predicted Label: 0, True Label: 0
File: nondp (38).java, Predicted Label: 0, True Label: 0
File: nondp (30).java, Predicted Label: 0, True Label: 0
Precision: 0.9357142857142857, Recall: 0.9285714285714286, F-score: 0.926482873851295


In [None]:
Factory Method with different settings

In [None]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'factorymethod'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "factorymethod" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 5  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nondp (33).java, Predicted Label: 1, True Label: 1
File: nondp (5).java, Predicted Label: 1, True Label: 1
File: nondp (4).java, Predicted Label: 1, True Label: 1
File: nondp (1).java, Predicted Label: 0, True Label: 0
File: nondp (15).java, Predicted Label: 0, True Label: 0
File: nondp (34).java, Predicted Label: 1, True Label: 1
File: factorymethod (1).java, Predicted Label: 0, True Label: 0
File: factorymethod (9).java, Predicted Label: 0, True Label: 0
File: nondp (26).java, Predicted Label: 0, True Label: 0
File: nondp (8).java, Predicted Label: 0, True Label: 0
Precision: 1.0, Recall: 1.0, F-score: 1.0


In [None]:
Factory Method with different settings

In [None]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'factorymethod'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "factorymethod" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 5  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nonfm (6).java, Predicted Label: 1, True Label: 0
File: nonfm (5).java, Predicted Label: 0, True Label: 0
File: nonfm (13).java, Predicted Label: 0, True Label: 0
File: factorymethod (1).java, Predicted Label: 1, True Label: 0
File: factorymethod (9).java, Predicted Label: 1, True Label: 0
File: nonfm (4).java, Predicted Label: 1, True Label: 1
Precision: 0.875, Recall: 0.5, F-score: 0.5428571428571429


In [None]:
Factory Method with different settings

In [None]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'factorymethod'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "factorymethod" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 5  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nonfm (38).java, Predicted Label: 0, True Label: 0
File: nonfm (52).java, Predicted Label: 1, True Label: 1
File: nonfm (68).java, Predicted Label: 1, True Label: 1
File: nonfm (37).java, Predicted Label: 1, True Label: 0
File: nonfm (29).java, Predicted Label: 1, True Label: 1
File: factorymethod (1).java, Predicted Label: 1, True Label: 1
File: factorymethod (9).java, Predicted Label: 1, True Label: 1
Precision: 0.880952380952381, Recall: 0.8571428571428571, F-score: 0.8398268398268397


In [None]:
Prototype with different settings

In [None]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'prototype'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "prototype" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 5  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: prototype (27).java, Predicted Label: 1, True Label: 1
File: nonp (23).java, Predicted Label: 0, True Label: 1
File: nonp (5).java, Predicted Label: 0, True Label: 0
File: prototype (13).java, Predicted Label: 0, True Label: 0
File: prototype (22).java, Predicted Label: 0, True Label: 1
File: nonp (7).java, Predicted Label: 1, True Label: 1
File: nonp (13).java, Predicted Label: 0, True Label: 0
File: nonp (29).java, Predicted Label: 0, True Label: 0
File: prototype (15).java, Predicted Label: 1, True Label: 1
File: nonp (44).java, Predicted Label: 0, True Label: 0
File: nonp (58).java, Predicted Label: 1, True Label: 1
File: prototype (14).java, Predicted Label: 1, True Label: 1
File: prototype (16).java, Predicted Label: 0, True Label: 0
Precision: 0.8846153846153846, Recall: 0.8461538461538461, F-score: 0.8443223443223442


In [None]:
Prototype with different settings

In [None]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'prototype'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "prototype" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 5  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: prototype (27).java, Predicted Label: 0, True Label: 0
File: nonp (53).java, Predicted Label: 0, True Label: 1
File: nonp (23).java, Predicted Label: 1, True Label: 1
File: nonp (43).java, Predicted Label: 0, True Label: 0
File: nonp (35).java, Predicted Label: 0, True Label: 0
File: nonp (51).java, Predicted Label: 0, True Label: 0
File: nonp (59).java, Predicted Label: 0, True Label: 0
File: nonp (5).java, Predicted Label: 0, True Label: 0
File: nonp (31).java, Predicted Label: 0, True Label: 0
File: prototype (13).java, Predicted Label: 0, True Label: 0
File: prototype (22).java, Predicted Label: 0, True Label: 1
File: nonp (7).java, Predicted Label: 0, True Label: 0
File: nonp (13).java, Predicted Label: 1, True Label: 1
File: nonp (29).java, Predicted Label: 1, True Label: 1
File: prototype (15).java, Predicted Label: 0, True Label: 0
File: nonp (44).java, Predicted Label: 0, True Label: 0
File: nonp (8).java, Predicted Label: 0, True Label: 0
File: nonp (37).java, Predicted

In [None]:
Prototype with different settings

In [None]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'prototype'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "prototype" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 5  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

# Function to get embeddings for a given design pattern
def get_embeddings_for_pattern(pattern, model, tokenizer):
    directory = os.path.join("all_design_patterns", pattern.lower())
    files = [file for file in os.listdir(directory) if os.path.isfile(os.path.join(directory, file))]

    embeddings = []
    true_labels = []

    for file in files:
        with open(os.path.join(directory, file), "r", encoding="ISO-8859-1") as f:
            code = f.read()

        # Tokenize and encode the Java program
        inputs = tokenizer(code, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        program_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

        embeddings.append(program_embedding)
        true_labels.append(pattern)

    return np.array(embeddings), np.array(true_labels)

# Load the RoBERTa model and tokenizer
model_name = "microsoft/codebert-base"  # Replace with the correct RoBERTa model name
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Get embeddings for each design pattern
patterns = ["Singleton", "Prototype", "AbstractFactory", "Builder", "FactoryMethod"]
all_embeddings = []
all_labels = []

# Custom color palette for each design pattern with higher contrast
color_palette = ["red", "green", "orange", "blue", "purple"]

# Custom markers for each design pattern
markers = ["o", "s", "D", "^", "P"]

for i, pattern in enumerate(patterns):
    pattern_embeddings, pattern_labels = get_embeddings_for_pattern(pattern, model, tokenizer)
    all_embeddings.append(pattern_embeddings)
    all_labels.append(pattern_labels)

# Concatenate the embeddings and labels
all_embeddings = np.concatenate(all_embeddings, axis=0)
all_labels = np.concatenate(all_labels)

# Perform t-SNE
tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(all_embeddings)

# Create a scatter plot for t-SNE visualization with custom symbols
plt.figure(figsize=(20, 16))

for i, pattern in enumerate(patterns):
    indices = all_labels == pattern
    sns.scatterplot(x=tsne_results[indices, 0], y=tsne_results[indices, 1], marker=markers[i], color=color_palette[i], s=200, label=pattern)

# Increase font sizes for better visibility
plt.title('t-SNE Visualization for RoBERTa on Different Design Patterns', fontsize=30)
plt.xlabel('t-SNE Dimension 1', fontsize=25)
plt.ylabel('t-SNE Dimension 2', fontsize=25)
plt.legend(title='Design Pattern', loc='upper right', fontsize=22)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.grid(True)

# Save the t-SNE plot as a PDF file
plt.savefig('tsne_plot_roberta.pdf', format='pdf')
plt.show()


In [None]:
#Singleton with different settings

In [None]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "singleton" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 3  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nons (12).java, Predicted Label: 1, True Label: 0
File: nons (50).java, Predicted Label: 1, True Label: 1
File: singleton (25).java, Predicted Label: 1, True Label: 1
File: nons (27).java, Predicted Label: 1, True Label: 1
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (19).java, Predicted Label: 1, True Label: 0
File: singleton (24).java, Predicted Label: 1, True Label: 0
File: nons (4).java, Predicted Label: 1, True Label: 1
File: singleton (9).java, Predicted Label: 1, True Label: 1
File: nons (44).java, Predicted Label: 1, True Label: 1
File: nons (38).java, Predicted Label: 1, True Label: 1
Precision: 0.5289256198347108, Recall: 0.7272727272727273, F-score: 0.6124401913875599


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#Singleton with different settings

In [None]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    if "singleton" in file_name:
        return 1  # Positive class (implements singleton)
    else:
        return 0  # Negative class (does not implement singleton)

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Label the program using the provided function
            program_labels.append(label_program(program_file))

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, program_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 3  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nons (12).java, Predicted Label: 0, True Label: 1
File: nons (50).java, Predicted Label: 1, True Label: 1
File: singleton (25).java, Predicted Label: 1, True Label: 1
File: nons (27).java, Predicted Label: 1, True Label: 0
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (19).java, Predicted Label: 0, True Label: 1
File: singleton (24).java, Predicted Label: 0, True Label: 1
File: nons (4).java, Predicted Label: 0, True Label: 0
File: singleton (9).java, Predicted Label: 1, True Label: 1
File: nons (44).java, Predicted Label: 1, True Label: 1
File: nons (38).java, Predicted Label: 1, True Label: 1
Precision: 0.7467532467532467, Recall: 0.6363636363636364, F-score: 0.6742424242424243


In [None]:
import os
import pandas as pd

# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'singleton/sin.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


Labels and contents saved to singleton/sin.csv


In [None]:
import os
import pandas as pd
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

# Define the path to the CSV file containing program names, labels, and content
csv_path = 'singleton/sin.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(program_embeddings, df['Label'], test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 3  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {df['File Name'].iloc[i]}, Predicted Label: {predictions[i]}, True Label: {y_test.iloc[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 54/54 [06:34<00:00,  7.30s/it]

File: nons (12).java, Predicted Label: 1, True Label: 1
File: nons (50).java, Predicted Label: 0, True Label: 1
File: singleton (25).java, Predicted Label: 0, True Label: 0
File: nons (27).java, Predicted Label: 1, True Label: 1
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (19).java, Predicted Label: 0, True Label: 1
File: singleton (24).java, Predicted Label: 0, True Label: 1
File: nons (4).java, Predicted Label: 0, True Label: 0
File: singleton (9).java, Predicted Label: 1, True Label: 1
File: nons (44).java, Predicted Label: 1, True Label: 1
File: nons (38).java, Predicted Label: 1, True Label: 1
Precision: 0.890909090909091, Recall: 0.7272727272727273, F-score: 0.7584415584415585





In [None]:
#k-fold cross validation

In [None]:
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

# Define the path to the CSV file containing program names, labels, and content
csv_path = 'singleton/sin.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Initialize an array to store predicted labels for each program in every fold
all_fold_predictions = np.zeros_like(df['Label'])

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)
    fold_predictions = knn.predict(X_test)

    # Store fold predictions in the array
    all_fold_predictions[test_index] = fold_predictions

    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, fold_predictions, average='weighted')
    recall = recall_score(y_test, fold_predictions, average='weighted')
    f1 = f1_score(y_test, fold_predictions, average='weighted')

    print(f"Fold {fold + 1}: Precision: {precision}, Recall: {recall}, F-score: {f1}")

# Calculate overall precision, recall, and f-score
precision = precision_score(df['Label'], all_fold_predictions, average='weighted')
recall = recall_score(df['Label'], all_fold_predictions, average='weighted')
f1 = f1_score(df['Label'], all_fold_predictions, average='weighted')

print(f"Overall Precision: {precision}, Recall: {recall}, F-score: {f1}")

# Print the classification results for each program in every fold
for fold in range(n_splits):
    fold_predictions = all_fold_predictions[test_index]
    true_labels = df['Label'][test_index]

    for i in range(len(fold_predictions)):
        print(f"Fold {fold + 1}, File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {fold_predictions[i]}, True Label: {true_labels.iloc[i]}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 54/54 [06:37<00:00,  7.36s/it]

Fold 1: Precision: 0.9242424242424243, Recall: 0.9090909090909091, F-score: 0.9090909090909091
Fold 2: Precision: 0.7393939393939394, Recall: 0.7272727272727273, F-score: 0.7272727272727272
Fold 3: Precision: 0.8701298701298701, Recall: 0.8181818181818182, F-score: 0.8151515151515152
Fold 4: Precision: 0.5303030303030304, Recall: 0.5454545454545454, F-score: 0.4935064935064935
Fold 5: Precision: 0.8571428571428571, Recall: 0.8, F-score: 0.7916666666666666
Overall Precision: 0.7589506172839506, Recall: 0.7592592592592593, F-score: 0.7588428968574248
Fold 1, File: singleton (25).java, Predicted Label: 0, True Label: 1
Fold 1, File: singleton (9).java, Predicted Label: 1, True Label: 1
Fold 1, File: nons (44).java, Predicted Label: 0, True Label: 0
Fold 1, File: nons (38).java, Predicted Label: 0, True Label: 0
Fold 1, File: nons (33).java, Predicted Label: 0, True Label: 0
Fold 1, File: singleton (11).java, Predicted Label: 1, True Label: 1
Fold 1, File: nons (21).java, Predicted Label: 




In [None]:
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

# Define the path to the CSV file containing program names, labels, and content
csv_path = 'singleton/sin.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Initialize an array to store predicted labels for each program in every fold
all_fold_predictions = np.zeros_like(df['Label'])

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)
    fold_predictions = knn.predict(X_test)

    # Store fold predictions in the array
    all_fold_predictions[test_index] = fold_predictions

    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, fold_predictions, average='weighted')
    recall = recall_score(y_test, fold_predictions, average='weighted')
    f1 = f1_score(y_test, fold_predictions, average='weighted')

    print(f"Fold {fold + 1}: Precision: {precision}, Recall: {recall}, F-score: {f1}")

# Calculate overall precision, recall, and f-score
precision = precision_score(df['Label'], all_fold_predictions, average='weighted')
recall = recall_score(df['Label'], all_fold_predictions, average='weighted')
f1 = f1_score(df['Label'], all_fold_predictions, average='weighted')

print(f"Overall Precision: {precision}, Recall: {recall}, F-score: {f1}")

# Print the classification results for each program in every fold
for fold in range(n_splits):
    fold_predictions = all_fold_predictions[test_index]
    true_labels = df['Label'][test_index]

    for i in range(len(fold_predictions)):
        print(f"Fold {fold + 1}, File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {fold_predictions[i]}, True Label: {true_labels.iloc[i]}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 54/54 [06:39<00:00,  7.39s/it]

Fold 1: Precision: 0.9242424242424243, Recall: 0.9090909090909091, F-score: 0.9090909090909091
Fold 2: Precision: 0.7393939393939394, Recall: 0.7272727272727273, F-score: 0.7272727272727272
Fold 3: Precision: 0.8701298701298701, Recall: 0.8181818181818182, F-score: 0.8151515151515152
Fold 4: Precision: 0.5303030303030304, Recall: 0.5454545454545454, F-score: 0.4935064935064935
Fold 5: Precision: 0.8571428571428571, Recall: 0.8, F-score: 0.7916666666666666
Overall Precision: 0.7589506172839506, Recall: 0.7592592592592593, F-score: 0.7588428968574248
Fold 1, File: singleton (25).java, Predicted Label: 0, True Label: 1
Fold 1, File: singleton (9).java, Predicted Label: 1, True Label: 1
Fold 1, File: nons (44).java, Predicted Label: 0, True Label: 0
Fold 1, File: nons (38).java, Predicted Label: 0, True Label: 0
Fold 1, File: nons (33).java, Predicted Label: 0, True Label: 0
Fold 1, File: singleton (11).java, Predicted Label: 1, True Label: 1
Fold 1, File: nons (21).java, Predicted Label: 




In [None]:
#singleton with different settings

In [None]:
import os
import pandas as pd

# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'embeddings/sin1.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


Labels and contents saved to embeddings/sin1.csv


In [None]:
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/sin1.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the training set
    train_predictions = knn.predict(X_train)
    for i in range(len(train_predictions)):
        print(f"Fold {fold + 1} (Train), File: {df['File Name'].iloc[train_index[i]]}, Predicted Label: {train_predictions[i]}, True Label: {y_train.iloc[i]}")

    # Predictions on the test set
    test_predictions = knn.predict(X_test)
    for i in range(len(test_predictions)):
        print(f"Fold {fold + 1} (Test), File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")

    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    print(f"Fold {fold + 1}: Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 52/52 [10:05<00:00, 11.65s/it]   


Fold 1 (Train), File: nons (29).java, Predicted Label: 0, True Label: 0
Fold 1 (Train), File: nons (25).java, Predicted Label: 0, True Label: 0
Fold 1 (Train), File: singleton (25).java, Predicted Label: 0, True Label: 1
Fold 1 (Train), File: nons (42).java, Predicted Label: 0, True Label: 0
Fold 1 (Train), File: singleton (18).java, Predicted Label: 1, True Label: 1
Fold 1 (Train), File: nons (34).java, Predicted Label: 0, True Label: 0
Fold 1 (Train), File: nons (49).java, Predicted Label: 0, True Label: 0
Fold 1 (Train), File: singleton (9).java, Predicted Label: 1, True Label: 1
Fold 1 (Train), File: nons (44).java, Predicted Label: 0, True Label: 0
Fold 1 (Train), File: nons (38).java, Predicted Label: 0, True Label: 0
Fold 1 (Train), File: singleton (14).java, Predicted Label: 0, True Label: 1
Fold 1 (Train), File: nons (33).java, Predicted Label: 0, True Label: 0
Fold 1 (Train), File: nons (41).java, Predicted Label: 0, True Label: 0
Fold 1 (Train), File: singleton (1).java, Pre

In [None]:
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/sin1.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the test set
    test_predictions = knn.predict(X_test)

    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    # Print predictions and performance measures only if precision, recall, and f-score are >= 70%
    if precision >= 0.7 and recall >= 0.7 and f1 >= 0.7:
        print(f"Fold {fold + 1} (Test):")
        for i in range(len(test_predictions)):
            print(f"File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")

        print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 52/52 [09:51<00:00, 11.38s/it]   
  _warn_prf(average, modifier, msg_start, len(result))


Fold 1 (Test):
File: nons (50).java, Predicted Label: 1, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: nons (30).java, Predicted Label: 0, True Label: 0
File: singleton (8).java, Predicted Label: 1, True Label: 1
File: singleton (21).java, Predicted Label: 1, True Label: 1
File: nons (39).java, Predicted Label: 0, True Label: 0
Precision: 0.875, Recall: 0.8333333333333334, F-score: 0.8285714285714286
Fold 3 (Test):
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: singleton (13).java, Predicted Label: 1, True Label: 1
File: nons (48).java, Predicted Label: 0, True Label: 0
File: nons (26).java, Predicted Label: 0, True Label: 0
File: nons (47).java, Predicted Label: 1, True Label: 0
Precision: 0.8666666666666666, Recall: 0.8, F-score: 0.8
Fold 10 (Test):
File: singleton (9).java, Predicted Label: 1, True Label: 1
File: nons (35).java, Predicted Label: 0, True Label: 0
File: nons (43).java, Predicted Label: 0, True Label: 0
File: singl

In [None]:
#Singleton with different settings

In [None]:
import os
import pandas as pd

# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'embeddings/sin2.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


Labels and contents saved to embeddings/sin2.csv


In [None]:
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/sin2.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the test set
    test_predictions = knn.predict(X_test)

    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    # Print predictions and performance measures only if precision, recall, and f-score are >= 70%
    if precision >= 0.7 and recall >= 0.7 and f1 >= 0.7:
        print(f"Fold {fold + 1} (Test):")
        for i in range(len(test_predictions)):
            print(f"File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")

        print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 50/50 [03:10<00:00,  3.82s/it]


Fold 4 (Test):
File: nons (55).java, Predicted Label: 0, True Label: 0
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: singleton (3).java, Predicted Label: 0, True Label: 1
File: nons (16).java, Predicted Label: 0, True Label: 0
File: nons (6).java, Predicted Label: 0, True Label: 0
Precision: 0.85, Recall: 0.8, F-score: 0.7809523809523808
Fold 8 (Test):
File: nons (29).java, Predicted Label: 0, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: singleton (6).java, Predicted Label: 1, True Label: 1
File: nons (9).java, Predicted Label: 0, True Label: 0
File: singleton (5).java, Predicted Label: 0, True Label: 1
Precision: 0.8666666666666666, Recall: 0.8, F-score: 0.8
Fold 9 (Test):
File: nons (34).java, Predicted Label: 0, True Label: 0
File: singleton (4).java, Predicted Label: 1, True Label: 1
File: singleton (17).java, Predicted Label: 1, True Label: 1
File: singleton (7).java, Predicted Label: 1, True Label: 1
File: nons (45).java, P

In [None]:
#Singleton with different settings

In [None]:
import os
import pandas as pd

# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'embeddings/sin3.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


Labels and contents saved to embeddings/sin3.csv


In [None]:
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/sin3.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the test set
    test_predictions = knn.predict(X_test)

    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    # Print predictions and performance measures only if precision, recall, and f-score are >= 70%
    if precision >= 0.7 and recall >= 0.7 and f1 >= 0.7:
        print(f"Fold {fold + 1} (Test):")
        for i in range(len(test_predictions)):
            print(f"File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")

        print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 52/52 [03:22<00:00,  3.89s/it]
  _warn_prf(average, modifier, msg_start, len(result))


Fold 1 (Test):
File: nons (25).java, Predicted Label: 1, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: nons (53).java, Predicted Label: 0, True Label: 0
File: singleton (8).java, Predicted Label: 1, True Label: 1
File: singleton (21).java, Predicted Label: 1, True Label: 1
File: nons (64).java, Predicted Label: 0, True Label: 0
Precision: 0.875, Recall: 0.8333333333333334, F-score: 0.8285714285714286
Fold 6 (Test):
File: nons (61).java, Predicted Label: 0, True Label: 0
File: nons (41).java, Predicted Label: 0, True Label: 0
File: singleton (11).java, Predicted Label: 1, True Label: 1
File: singleton (20).java, Predicted Label: 1, True Label: 1
File: nons (28).java, Predicted Label: 1, True Label: 0
Precision: 0.8666666666666666, Recall: 0.8, F-score: 0.8
Fold 9 (Test):
File: nons (42).java, Predicted Label: 1, True Label: 0
File: nons (38).java, Predicted Label: 0, True Label: 0
File: singleton (22).java, Predicted Label: 1, True Label: 1
File: singl

In [None]:
#Singleton with different settings

In [None]:
import os
import pandas as pd
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'embeddings/sin4.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/sin4.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the test set
    test_predictions = knn.predict(X_test)

    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    # Print predictions and performance measures only if precision, recall, and f-score are >= 70%
    if precision >= 0.7 and recall >= 0.7 and f1 >= 0.7:
        print(f"Fold {fold + 1} (Test):")
        for i in range(len(test_predictions)):
            print(f"File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")

        print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")



Labels and contents saved to embeddings/sin4.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 55/55 [03:27<00:00,  3.76s/it]


Fold 3 (Test):
File: singleton (1).java, Predicted Label: 0, True Label: 1
File: singleton (3).java, Predicted Label: 1, True Label: 1
File: nons (16).java, Predicted Label: 0, True Label: 0
File: singleton (21).java, Predicted Label: 1, True Label: 1
File: nons (51).java, Predicted Label: 0, True Label: 0
File: nons (58).java, Predicted Label: 0, True Label: 0
Precision: 0.875, Recall: 0.8333333333333334, F-score: 0.8285714285714286
Fold 6 (Test):
File: nons (44).java, Predicted Label: 0, True Label: 0
File: nons (48).java, Predicted Label: 0, True Label: 0
File: singleton (20).java, Predicted Label: 1, True Label: 1
File: nons (52).java, Predicted Label: 0, True Label: 0
File: singleton (2).java, Predicted Label: 1, True Label: 1
Precision: 1.0, Recall: 1.0, F-score: 1.0
Fold 7 (Test):
File: nons (21).java, Predicted Label: 0, True Label: 0
File: nons (10).java, Predicted Label: 0, True Label: 0
File: singleton (4).java, Predicted Label: 0, True Label: 1
File: nons (60).java, Predict

In [None]:
#Singleton using different setting

In [None]:
import os
import pandas as pd
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'embeddings/sin5.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/sin5.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the test set
    test_predictions = knn.predict(X_test)

    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    # Print predictions and performance measures only if precision, recall, and f-score are >= 70%
    if precision >= 0.5 and recall >= 0.5 and f1 >= 0.5:
        print(f"Fold {fold + 1} (Test):")
        for i in range(len(test_predictions)):
            print(f"File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")

        print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")



Labels and contents saved to embeddings/sin5.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 49/49 [03:17<00:00,  4.03s/it]
  _warn_prf(average, modifier, msg_start, len(result))


Fold 2 (Test):
File: singleton (15).java, Predicted Label: 1, True Label: 1
File: nons (63).java, Predicted Label: 0, True Label: 0
File: singleton (8).java, Predicted Label: 1, True Label: 1
File: nons (67).java, Predicted Label: 0, True Label: 0
File: nons (58).java, Predicted Label: 1, True Label: 0
Precision: 0.8666666666666666, Recall: 0.8, F-score: 0.8
Fold 3 (Test):
File: singleton (9).java, Predicted Label: 1, True Label: 1
File: nons (53).java, Predicted Label: 0, True Label: 0
File: nons (14).java, Predicted Label: 1, True Label: 0
File: singleton (22).java, Predicted Label: 1, True Label: 1
File: nons (15).java, Predicted Label: 0, True Label: 0
Precision: 0.8666666666666666, Recall: 0.8, F-score: 0.8
Fold 6 (Test):
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (49).java, Predicted Label: 0, True Label: 0
File: singleton (14).java, Predicted Label: 1, True Label: 1
File: nons (65).java, Predicted Label: 0, True Label: 0
File: singleton (2).java, Pre

In [None]:
import os
import pandas as pd
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'embeddings/sin6.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/sin6.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the test set
    test_predictions = knn.predict(X_test)

    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    # Print predictions and performance measures only if precision, recall, and f-score are >= 70%
    if precision >= 0.5 and recall >= 0.5 and f1 >= 0.5:
        print(f"Fold {fold + 1} (Test):")
        for i in range(len(test_predictions)):
            print(f"File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")

        print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")



Labels and contents saved to embeddings/sin6.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 51/51 [03:07<00:00,  3.67s/it]


Fold 1 (Test):
File: nons (50).java, Predicted Label: 0, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: nons (63).java, Predicted Label: 0, True Label: 0
File: singleton (8).java, Predicted Label: 1, True Label: 1
File: singleton (21).java, Predicted Label: 0, True Label: 1
File: nons (64).java, Predicted Label: 0, True Label: 0
Precision: 0.875, Recall: 0.8333333333333334, F-score: 0.8285714285714286
Fold 2 (Test):
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (61).java, Predicted Label: 0, True Label: 0
File: singleton (6).java, Predicted Label: 1, True Label: 1
File: nons (46).java, Predicted Label: 1, True Label: 0
File: nons (45).java, Predicted Label: 0, True Label: 0
Precision: 0.8666666666666666, Recall: 0.8, F-score: 0.8
Fold 3 (Test):
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (47).java, Predicted Label: 0, True Label: 0
File: nons (60).java, Predicted Label: 0, True Label: 0
File: single

In [None]:
#Singleton with different settings

In [None]:
import os
import pandas as pd
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'embeddings/sin7.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/sin7.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the test set
    test_predictions = knn.predict(X_test)

    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    # Print predictions and performance measures only if precision, recall, and f-score are >= 70%
    if precision >= 0.5 and recall >= 0.5 and f1 >= 0.5:
        print(f"Fold {fold + 1} (Test):")
        for i in range(len(test_predictions)):
            print(f"File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")

        print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")



Labels and contents saved to embeddings/sin7.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 53/53 [04:45<00:00,  5.38s/it]


Fold 1 (Test):
File: nons (54).java, Predicted Label: 1, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: nons (17).java, Predicted Label: 0, True Label: 0
File: singleton (8).java, Predicted Label: 1, True Label: 1
File: singleton (21).java, Predicted Label: 1, True Label: 1
File: nons (65).java, Predicted Label: 0, True Label: 0
Precision: 0.875, Recall: 0.8333333333333334, F-score: 0.8285714285714286
Fold 2 (Test):
File: nons (55).java, Predicted Label: 0, True Label: 0
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (6).java, Predicted Label: 0, True Label: 0
File: singleton (6).java, Predicted Label: 1, True Label: 1
File: singleton (5).java, Predicted Label: 0, True Label: 1
File: nons (24).java, Predicted Label: 0, True Label: 0
Precision: 0.875, Recall: 0.8333333333333334, F-score: 0.8285714285714286
Fold 3 (Test):
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: singleton (13).java, Predicted Label: 1, T

In [None]:
#Singleton with different settings

In [None]:
import os
import pandas as pd
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'embeddings/sin8.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/sin8.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the test set
    test_predictions = knn.predict(X_test)

    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    # Print predictions and performance measures only if precision, recall, and f-score are >= 70%
    if precision >= 0.4 and recall >= 0.4 and f1 >= 0.4:
        print(f"Fold {fold + 1} (Test):")
        for i in range(len(test_predictions)):
            print(f"File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")

        print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")



Labels and contents saved to embeddings/sin8.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 51/51 [04:31<00:00,  5.33s/it]


Fold 1 (Test):
File: nons (54).java, Predicted Label: 0, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: nons (17).java, Predicted Label: 0, True Label: 0
File: singleton (8).java, Predicted Label: 1, True Label: 1
File: singleton (21).java, Predicted Label: 1, True Label: 1
File: nons (24).java, Predicted Label: 0, True Label: 0
Precision: 1.0, Recall: 1.0, F-score: 1.0
Fold 2 (Test):
File: nons (55).java, Predicted Label: 0, True Label: 0
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (6).java, Predicted Label: 0, True Label: 0
File: singleton (6).java, Predicted Label: 1, True Label: 1
File: nons (23).java, Predicted Label: 0, True Label: 0
Precision: 1.0, Recall: 1.0, F-score: 1.0
Fold 3 (Test):
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (8).java, Predicted Label: 0, True Label: 0
File: nons (21).java, Predicted Label: 0, True Label: 0
File: nons (60).java, Predicted Label: 0, True Label: 0
File:

In [None]:
import os
import pandas as pd
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'embeddings/sin9.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/sin9.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the test set
    test_predictions = knn.predict(X_test)

    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    # Print predictions and performance measures only if precision, recall, and f-score are >= 70%
    if precision >= 0.1 and recall >= 0.1 and f1 >= 0.1:
        print(f"Fold {fold + 1} (Test):")
        for i in range(len(test_predictions)):
            print(f"File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")

        print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")



Labels and contents saved to embeddings/sin9.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 51/51 [03:32<00:00,  4.17s/it]


Fold 1 (Test):
File: nons (12).java, Predicted Label: 1, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: nons (1).java, Predicted Label: 0, True Label: 0
File: singleton (8).java, Predicted Label: 1, True Label: 1
File: singleton (21).java, Predicted Label: 1, True Label: 1
File: nons (32).java, Predicted Label: 1, True Label: 0
Precision: 0.7999999999999999, Recall: 0.6666666666666666, F-score: 0.6249999999999999
Fold 2 (Test):
File: nons (27).java, Predicted Label: 1, True Label: 0
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (40).java, Predicted Label: 0, True Label: 0
File: singleton (6).java, Predicted Label: 0, True Label: 1
File: nons (15).java, Predicted Label: 0, True Label: 0
Precision: 0.6, Recall: 0.6, F-score: 0.6
Fold 3 (Test):
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (11).java, Predicted Label: 0, True Label: 0
File: nons (26).java, Predicted Label: 0, True Label: 0
File: nons (21)

In [None]:
import os
import pandas as pd
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'embeddings/sin10.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/sin10.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the test set
    test_predictions = knn.predict(X_test)

    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    # Print predictions and performance measures only if precision, recall, and f-score are >= 70%
    if precision >= 0.1 and recall >= 0.1 and f1 >= 0.1:
        print(f"Fold {fold + 1} (Test):")
        for i in range(len(test_predictions)):
            print(f"File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")

        print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")



Labels and contents saved to embeddings/sin10.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 54/54 [04:37<00:00,  5.14s/it]
  _warn_prf(average, modifier, msg_start, len(result))


Fold 1 (Test):
File: nons (54).java, Predicted Label: 0, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: nons (17).java, Predicted Label: 0, True Label: 0
File: singleton (8).java, Predicted Label: 1, True Label: 1
File: nons (23).java, Predicted Label: 0, True Label: 0
File: singleton (17).java, Predicted Label: 1, True Label: 1
Precision: 1.0, Recall: 1.0, F-score: 1.0
Fold 2 (Test):
File: nons (68).java, Predicted Label: 0, True Label: 0
File: singleton (16).java, Predicted Label: 0, True Label: 1
File: nons (26).java, Predicted Label: 1, True Label: 0
File: singleton (4).java, Predicted Label: 0, True Label: 1
File: nons (46).java, Predicted Label: 0, True Label: 0
File: singleton (2).java, Predicted Label: 1, True Label: 1
Precision: 0.5, Recall: 0.5, F-score: 0.48571428571428577
Fold 3 (Test):
File: singleton (14).java, Predicted Label: 1, True Label: 1
File: nons (3).java, Predicted Label: 0, True Label: 0
File: singleton (1).java, Predicted Labe

In [None]:
#Singleton using different settings

In [None]:
import os
import pandas as pd
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'embeddings/sin11.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/sin11.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the test set
    test_predictions = knn.predict(X_test)

    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    # Print predictions and performance measures only if precision, recall, and f-score are >= 70%
    if precision >= 0.1 and recall >= 0.1 and f1 >= 0.1:
        print(f"Fold {fold + 1} (Test):")
        for i in range(len(test_predictions)):
            print(f"File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")

        print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")



Labels and contents saved to embeddings/sin11.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 50/50 [03:23<00:00,  4.07s/it]
  _warn_prf(average, modifier, msg_start, len(result))


Fold 1 (Test):
File: nons (18).java, Predicted Label: 1, True Label: 0
File: nons (22).java, Predicted Label: 1, True Label: 0
File: singleton (11).java, Predicted Label: 1, True Label: 1
File: singleton (12).java, Predicted Label: 0, True Label: 1
File: nons (39).java, Predicted Label: 0, True Label: 0
Precision: 0.4333333333333333, Recall: 0.4, F-score: 0.4
Fold 2 (Test):
File: nons (12).java, Predicted Label: 1, True Label: 0
File: singleton (15).java, Predicted Label: 1, True Label: 1
File: singleton (8).java, Predicted Label: 1, True Label: 1
File: nons (13).java, Predicted Label: 1, True Label: 0
File: nons (23).java, Predicted Label: 1, True Label: 0
Precision: 0.16, Recall: 0.4, F-score: 0.2285714285714286
Fold 3 (Test):
File: singleton (9).java, Predicted Label: 1, True Label: 1
File: nons (63).java, Predicted Label: 0, True Label: 0
File: nons (8).java, Predicted Label: 1, True Label: 0
File: singleton (22).java, Predicted Label: 1, True Label: 1
File: nons (60).java, Predict

In [None]:
#Singleton with different settings

In [None]:
import os
import pandas as pd
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'embeddings/sin12.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/sin12.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the test set
    test_predictions = knn.predict(X_test)

    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    # Print predictions and performance measures only if precision, recall, and f-score are >= 70%
    if precision >= 0.1 and recall >= 0.1 and f1 >= 0.1:
        print(f"Fold {fold + 1} (Test):")
        for i in range(len(test_predictions)):
            print(f"File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")

        print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")



Labels and contents saved to embeddings/sin12.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 51/51 [03:28<00:00,  4.09s/it]


Fold 1 (Test):
File: nons (12).java, Predicted Label: 0, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: nons (2).java, Predicted Label: 0, True Label: 0
File: singleton (8).java, Predicted Label: 1, True Label: 1
File: singleton (21).java, Predicted Label: 1, True Label: 1
File: nons (23).java, Predicted Label: 0, True Label: 0
Precision: 1.0, Recall: 1.0, F-score: 1.0
Fold 2 (Test):
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (4).java, Predicted Label: 0, True Label: 0
File: nons (21).java, Predicted Label: 0, True Label: 0
File: singleton (6).java, Predicted Label: 1, True Label: 1
File: nons (15).java, Predicted Label: 0, True Label: 0
Precision: 1.0, Recall: 1.0, F-score: 1.0
Fold 3 (Test):
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (26).java, Predicted Label: 1, True Label: 0
File: nons (8).java, Predicted Label: 0, True Label: 0
File: nons (13).java, Predicted Label: 0, True Label: 0
File: 

In [None]:
#Singleton using different settings

In [None]:
import os
import pandas as pd
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'embeddings/sin13.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/sin13.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the test set
    test_predictions = knn.predict(X_test)

    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    # Print predictions and performance measures only if precision, recall, and f-score are >= 70%
    if precision >= 0.1 and recall >= 0.1 and f1 >= 0.1:
        print(f"Fold {fold + 1} (Test):")
        for i in range(len(test_predictions)):
            print(f"File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")

        print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")



Labels and contents saved to embeddings/sin13.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 51/51 [03:23<00:00,  4.00s/it]
  _warn_prf(average, modifier, msg_start, len(result))


Fold 1 (Test):
File: nons (54).java, Predicted Label: 0, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: nons (17).java, Predicted Label: 1, True Label: 0
File: singleton (8).java, Predicted Label: 1, True Label: 1
File: singleton (21).java, Predicted Label: 1, True Label: 1
File: nons (23).java, Predicted Label: 1, True Label: 0
Precision: 0.7999999999999999, Recall: 0.6666666666666666, F-score: 0.6249999999999999
Fold 2 (Test):
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (68).java, Predicted Label: 0, True Label: 0
File: nons (67).java, Predicted Label: 0, True Label: 0
File: singleton (6).java, Predicted Label: 1, True Label: 1
File: nons (32).java, Predicted Label: 1, True Label: 0
Precision: 0.8666666666666666, Recall: 0.8, F-score: 0.8
Fold 3 (Test):
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (26).java, Predicted Label: 1, True Label: 0
File: nons (66).java, Predicted Label: 1, True Label: 0

In [None]:
#Abstract Factory with different settings

In [None]:
import os
import pandas as pd
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
# Define the path to the folder containing your Java programs
java_programs_folder = 'abstractfactory'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "abstractfactory" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'embeddings/af1.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/af1.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the test set
    test_predictions = knn.predict(X_test)

    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    # Print predictions and performance measures only if precision, recall, and f-score are >= 70%
    if precision >= 0.1 and recall >= 0.1 and f1 >= 0.1:
        print(f"Fold {fold + 1} (Test):")
        for i in range(len(test_predictions)):
            print(f"File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")

        print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")



Labels and contents saved to embeddings/af1.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 33/33 [02:42<00:00,  4.93s/it]

Fold 1 (Test):
File: nonab (72).java, Predicted Label: 0, True Label: 0
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: nonab (2).java, Predicted Label: 0, True Label: 0
File: abstractfactory (16).java, Predicted Label: 1, True Label: 1
File: nonab (67).java, Predicted Label: 0, True Label: 0
File: nonab (12).java, Predicted Label: 0, True Label: 0
File: abstractfactory (6).java, Predicted Label: 0, True Label: 1
Precision: 0.8857142857142858, Recall: 0.8571428571428571, F-score: 0.8507936507936508
Fold 2 (Test):
File: nonab (11).java, Predicted Label: 0, True Label: 0
File: nonab (1).java, Predicted Label: 1, True Label: 0
File: abstractfactory (4).java, Predicted Label: 1, True Label: 1
File: abstractfactory (5).java, Predicted Label: 1, True Label: 1
File: abstractfactory (3).java, Predicted Label: 1, True Label: 1
File: nonab (74).java, Predicted Label: 0, True Label: 0
File: nonab (9).java, Predicted Label: 0, True Label: 0
Precision: 0.8928571428571429, Re




In [None]:
#Abstract Factory using different settings

In [None]:
import os
import pandas as pd
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
# Define the path to the folder containing your Java programs
java_programs_folder = 'abstractfactory'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "abstractfactory" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'embeddings/af2.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/af2.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the test set
    test_predictions = knn.predict(X_test)

    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    # Print predictions and performance measures only if precision, recall, and f-score are >= 70%
    if precision >= 0.1 and recall >= 0.1 and f1 >= 0.1:
        print(f"Fold {fold + 1} (Test):")
        for i in range(len(test_predictions)):
            print(f"File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")

        print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")



Labels and contents saved to embeddings/af2.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 32/32 [02:20<00:00,  4.38s/it]

Fold 1 (Test):
File: nonab (11).java, Predicted Label: 1, True Label: 0
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: nonab (2).java, Predicted Label: 0, True Label: 0
File: abstractfactory (16).java, Predicted Label: 1, True Label: 1
File: nonab (6).java, Predicted Label: 0, True Label: 0
File: nonab (16).java, Predicted Label: 0, True Label: 0
File: abstractfactory (6).java, Predicted Label: 0, True Label: 1
Precision: 0.7142857142857143, Recall: 0.7142857142857143, F-score: 0.7142857142857143
Fold 2 (Test):
File: nonab (5).java, Predicted Label: 0, True Label: 0
File: nonab (10).java, Predicted Label: 0, True Label: 0
File: abstractfactory (4).java, Predicted Label: 1, True Label: 1
File: abstractfactory (5).java, Predicted Label: 1, True Label: 1
File: abstractfactory (3).java, Predicted Label: 1, True Label: 1
File: nonab (12).java, Predicted Label: 0, True Label: 0
File: abstractfactory (14).java, Predicted Label: 0, True Label: 1
Precision: 0.8928571428




In [None]:
#Abstract Factory using different settings

In [None]:
import os
import pandas as pd
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
# Define the path to the folder containing your Java programs
java_programs_folder = 'abstractfactory'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "abstractfactory" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'embeddings/af3.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/af3.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 2
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the test set
    test_predictions = knn.predict(X_test)

    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    # Print predictions and performance measures only if precision, recall, and f-score are >= 70%
    if precision >= 0.1 and recall >= 0.1 and f1 >= 0.1:
        print(f"Fold {fold + 1} (Test):")
        for i in range(len(test_predictions)):
            print(f"File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")

        print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")



Labels and contents saved to embeddings/af3.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 32/32 [03:27<00:00,  6.47s/it]

Fold 1 (Test):
File: nonab (11).java, Predicted Label: 1, True Label: 0
File: nonab (48).java, Predicted Label: 0, True Label: 0
File: abstractfactory (2).java, Predicted Label: 0, True Label: 1
File: abstractfactory (16).java, Predicted Label: 0, True Label: 1
File: nonab (8).java, Predicted Label: 0, True Label: 0
File: abstractfactory (4).java, Predicted Label: 1, True Label: 1
File: abstractfactory (5).java, Predicted Label: 0, True Label: 1
File: nonab (3).java, Predicted Label: 0, True Label: 0
File: abstractfactory (3).java, Predicted Label: 0, True Label: 1
File: nonab (4).java, Predicted Label: 0, True Label: 0
File: abstractfactory (8).java, Predicted Label: 0, True Label: 1
File: abstractfactory (14).java, Predicted Label: 0, True Label: 1
File: nonab (63).java, Predicted Label: 0, True Label: 0
File: nonab (17).java, Predicted Label: 1, True Label: 0
File: nonab (18).java, Predicted Label: 0, True Label: 0
File: abstractfactory (6).java, Predicted Label: 0, True Label: 1
Pr




**Time calculation for abstract factory**

In [None]:
import os
import pandas as pd
import torch
import numpy as np
import random
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

# Check if CUDA is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define the path to the folder containing your Java programs
java_programs_folder = '/content/builder'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "builder" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Ensure the directory exists
output_dir = 'embeddings'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the DataFrame to a CSV file
csv_path = os.path.join(output_dir, 'af3.csv')  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")

# Define the path to the CSV file containing program names, labels, and content
csv_path = os.path.join(output_dir, 'af3.csv')  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base").to(device)  # Move model to device

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Move back to CPU for NumPy operations

# Calculate embeddings for each program by considering embeddings of individual lines
def calculate_program_embeddings(df):
    program_embeddings = []
    for content in tqdm(df['Content']):
        lines = content.split('\n')
        line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
        program_embeddings.append(np.mean(line_embeddings, axis=0))
    return np.vstack(program_embeddings)

# Function to randomly select negative examples to ensure 40%-60% class balance
def balance_classes(df, neg_label=0, pos_label=1, min_ratio=0.4, max_ratio=0.6):
    pos_examples = df[df['Label'] == pos_label]
    neg_examples = df[df['Label'] == neg_label]

    total_pos = len(pos_examples)
    available_neg = len(neg_examples)

    # Ensure the target_neg_range does not exceed available negative examples
    target_neg_min = min(int(total_pos / max_ratio) - total_pos, available_neg)
    target_neg_max = min(int(total_pos / min_ratio) - total_pos, available_neg)

    # If there are not enough negative examples, we use all of them
    if target_neg_max <= 0:
        neg_selected = neg_examples
    else:
        neg_selected = neg_examples.sample(random.randint(target_neg_min, target_neg_max))

    return pd.concat([pos_examples, neg_selected])

# Perform the experiment 10 times and record training and prediction times
n_splits = 2
total_training_time = []
total_prediction_time = []

for experiment in range(10):
    print(f"Experiment {experiment + 1}")

    # Ensure class balance
    balanced_df = balance_classes(df)

    # Measure training time (includes embedding extraction + KNN fitting)
    start_time_train = time.time()

    # Get embeddings for balanced data
    balanced_embeddings = calculate_program_embeddings(balanced_df)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

    for fold, (train_index, test_index) in enumerate(skf.split(balanced_embeddings, balanced_df['Label'])):
        X_train, X_test = balanced_embeddings[train_index], balanced_embeddings[test_index]
        y_train, y_test = balanced_df['Label'].iloc[train_index], balanced_df['Label'].iloc[test_index]

        # Apply k-nearest neighbor classification
        knn = KNeighborsClassifier(n_neighbors=3)
        knn.fit(X_train, y_train)

    end_time_train = time.time()
    training_time = (end_time_train - start_time_train) * 1000  # Convert to milliseconds
    total_training_time.append(training_time)

    # Measure prediction time
    start_time_pred = time.time()

    for fold, (train_index, test_index) in enumerate(skf.split(balanced_embeddings, balanced_df['Label'])):
        X_test = balanced_embeddings[test_index]
        test_predictions = knn.predict(X_test)

    end_time_pred = time.time()
    prediction_time = (end_time_pred - start_time_pred) * 1000  # Convert to milliseconds
    total_prediction_time.append(prediction_time)

    # Print the times for the current experiment
    print(f"Training Time (ms): {training_time}, Prediction Time (ms): {prediction_time}")

# Calculate mean times over all 10 experiments
mean_training_time = np.mean(total_training_time)
mean_prediction_time = np.mean(total_prediction_time)

print(f"Mean Training Time (ms): {mean_training_time}")
print(f"Mean Prediction Time (ms): {mean_prediction_time}")


Using device: cuda
Labels and contents saved to embeddings/af3.csv


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Experiment 1


100%|██████████| 19/19 [00:50<00:00,  2.64s/it]


Training Time (ms): 50180.77802658081, Prediction Time (ms): 49.419403076171875
Experiment 2


100%|██████████| 17/17 [00:44<00:00,  2.61s/it]


Training Time (ms): 44464.502573013306, Prediction Time (ms): 3.0274391174316406
Experiment 3


100%|██████████| 21/21 [00:55<00:00,  2.62s/it]


Training Time (ms): 55038.7864112854, Prediction Time (ms): 2.6955604553222656
Experiment 4


100%|██████████| 15/15 [00:41<00:00,  2.75s/it]


Training Time (ms): 41219.37155723572, Prediction Time (ms): 2.8960704803466797
Experiment 5


100%|██████████| 20/20 [00:52<00:00,  2.63s/it]


Training Time (ms): 52547.621726989746, Prediction Time (ms): 2.8939247131347656
Experiment 6


100%|██████████| 17/17 [00:46<00:00,  2.73s/it]


Training Time (ms): 46439.0606880188, Prediction Time (ms): 3.0426979064941406
Experiment 7


100%|██████████| 18/18 [00:48<00:00,  2.70s/it]


Training Time (ms): 48687.175273895264, Prediction Time (ms): 2.6514530181884766
Experiment 8


100%|██████████| 17/17 [00:40<00:00,  2.35s/it]


Training Time (ms): 40035.76612472534, Prediction Time (ms): 2.8231143951416016
Experiment 9


100%|██████████| 17/17 [00:43<00:00,  2.58s/it]


Training Time (ms): 43836.21883392334, Prediction Time (ms): 3.2112598419189453
Experiment 10


100%|██████████| 22/22 [00:50<00:00,  2.28s/it]

Training Time (ms): 50130.319356918335, Prediction Time (ms): 4.4116973876953125
Mean Training Time (ms): 47257.960057258606
Mean Prediction Time (ms): 7.70726203918457





**Builder time calculation**

In [None]:
import os
import pandas as pd
import torch
import numpy as np
import random
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

# Check if CUDA is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define the path to the folder containing your Java programs
java_programs_folder = '/content/builder'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "builder" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Ensure the directory exists
output_dir = 'embeddings'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the DataFrame to a CSV file
csv_path = os.path.join(output_dir, 'af3.csv')  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")

# Define the path to the CSV file containing program names, labels, and content
csv_path = os.path.join(output_dir, 'af3.csv')  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base").to(device)  # Move model to device

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Move back to CPU for NumPy operations

# Calculate embeddings for each program by considering embeddings of individual lines
def calculate_program_embeddings(df):
    program_embeddings = []
    for content in tqdm(df['Content']):
        lines = content.split('\n')
        line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
        program_embeddings.append(np.mean(line_embeddings, axis=0))
    return np.vstack(program_embeddings)

# Function to randomly select negative examples to ensure 40%-60% class balance
def balance_classes(df, neg_label=0, pos_label=1, min_ratio=0.4, max_ratio=0.6):
    pos_examples = df[df['Label'] == pos_label]
    neg_examples = df[df['Label'] == neg_label]

    total_pos = len(pos_examples)
    available_neg = len(neg_examples)

    # Ensure the target_neg_range does not exceed available negative examples
    target_neg_min = min(int(total_pos / max_ratio) - total_pos, available_neg)
    target_neg_max = min(int(total_pos / min_ratio) - total_pos, available_neg)

    # If there are not enough negative examples, we use all of them
    if target_neg_max <= 0:
        neg_selected = neg_examples
    else:
        neg_selected = neg_examples.sample(random.randint(target_neg_min, target_neg_max))

    return pd.concat([pos_examples, neg_selected])

# Perform the experiment 10 times and record training and prediction times
n_splits = 2
total_training_time = []
total_prediction_time = []

for experiment in range(10):
    print(f"Experiment {experiment + 1}")

    # Ensure class balance
    balanced_df = balance_classes(df)

    # Measure training time (includes embedding extraction + KNN fitting)
    start_time_train = time.time()

    # Get embeddings for balanced data
    balanced_embeddings = calculate_program_embeddings(balanced_df)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

    for fold, (train_index, test_index) in enumerate(skf.split(balanced_embeddings, balanced_df['Label'])):
        X_train, X_test = balanced_embeddings[train_index], balanced_embeddings[test_index]
        y_train, y_test = balanced_df['Label'].iloc[train_index], balanced_df['Label'].iloc[test_index]

        # Apply k-nearest neighbor classification
        knn = KNeighborsClassifier(n_neighbors=3)
        knn.fit(X_train, y_train)

    end_time_train = time.time()
    training_time = (end_time_train - start_time_train) * 1000  # Convert to milliseconds
    total_training_time.append(training_time)

    # Measure prediction time
    start_time_pred = time.time()

    for fold, (train_index, test_index) in enumerate(skf.split(balanced_embeddings, balanced_df['Label'])):
        X_test = balanced_embeddings[test_index]
        test_predictions = knn.predict(X_test)

    end_time_pred = time.time()
    prediction_time = (end_time_pred - start_time_pred) * 1000  # Convert to milliseconds
    total_prediction_time.append(prediction_time)

    # Print the times for the current experiment
    print(f"Training Time (ms): {training_time}, Prediction Time (ms): {prediction_time}")

# Calculate mean times over all 10 experiments
mean_training_time = np.mean(total_training_time)
mean_prediction_time = np.mean(total_prediction_time)

print(f"Mean Training Time (ms): {mean_training_time}")
print(f"Mean Prediction Time (ms): {mean_prediction_time}")


Using device: cuda
Labels and contents saved to embeddings/af3.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Experiment 1


100%|██████████| 17/17 [01:04<00:00,  3.80s/it]


Training Time (ms): 64701.75576210022, Prediction Time (ms): 3.0031204223632812
Experiment 2


100%|██████████| 20/20 [00:54<00:00,  2.73s/it]


Training Time (ms): 54584.04278755188, Prediction Time (ms): 3.2501220703125
Experiment 3


100%|██████████| 20/20 [00:46<00:00,  2.34s/it]


Training Time (ms): 46840.06667137146, Prediction Time (ms): 2.9244422912597656
Experiment 4


100%|██████████| 18/18 [00:48<00:00,  2.67s/it]


Training Time (ms): 48072.8554725647, Prediction Time (ms): 2.913236618041992
Experiment 5


100%|██████████| 15/15 [00:40<00:00,  2.71s/it]


Training Time (ms): 40735.3196144104, Prediction Time (ms): 2.8839111328125
Experiment 6


100%|██████████| 18/18 [00:44<00:00,  2.50s/it]


Training Time (ms): 44940.94491004944, Prediction Time (ms): 2.8486251831054688
Experiment 7


100%|██████████| 19/19 [00:49<00:00,  2.61s/it]


Training Time (ms): 49685.22334098816, Prediction Time (ms): 2.958536148071289
Experiment 8


100%|██████████| 22/22 [00:54<00:00,  2.47s/it]


Training Time (ms): 54327.791690826416, Prediction Time (ms): 2.997159957885742
Experiment 9


100%|██████████| 18/18 [00:44<00:00,  2.49s/it]


Training Time (ms): 44851.84168815613, Prediction Time (ms): 3.690004348754883
Experiment 10


100%|██████████| 22/22 [00:47<00:00,  2.14s/it]

Training Time (ms): 47184.78727340698, Prediction Time (ms): 3.6110877990722656
Mean Training Time (ms): 49592.46292114258
Mean Prediction Time (ms): 3.1080245971679688





**Singleton time calculation**

In [None]:
import os
import pandas as pd
import torch
import numpy as np
import random
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

# Check if CUDA is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define the path to the folder containing your Java programs
java_programs_folder = '/content/singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Ensure the directory exists
output_dir = 'embeddings'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the DataFrame to a CSV file
csv_path = os.path.join(output_dir, 'af3.csv')  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")

# Define the path to the CSV file containing program names, labels, and content
csv_path = os.path.join(output_dir, 'af3.csv')  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base").to(device)  # Move model to device

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Move back to CPU for NumPy operations

# Calculate embeddings for each program by considering embeddings of individual lines
def calculate_program_embeddings(df):
    program_embeddings = []
    for content in tqdm(df['Content']):
        lines = content.split('\n')
        line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
        program_embeddings.append(np.mean(line_embeddings, axis=0))
    return np.vstack(program_embeddings)

# Function to randomly select negative examples to ensure 40%-60% class balance
def balance_classes(df, neg_label=0, pos_label=1, min_ratio=0.4, max_ratio=0.6):
    pos_examples = df[df['Label'] == pos_label]
    neg_examples = df[df['Label'] == neg_label]

    total_pos = len(pos_examples)
    available_neg = len(neg_examples)

    # Ensure the target_neg_range does not exceed available negative examples
    target_neg_min = min(int(total_pos / max_ratio) - total_pos, available_neg)
    target_neg_max = min(int(total_pos / min_ratio) - total_pos, available_neg)

    # If there are not enough negative examples, we use all of them
    if target_neg_max <= 0:
        neg_selected = neg_examples
    else:
        neg_selected = neg_examples.sample(random.randint(target_neg_min, target_neg_max))

    return pd.concat([pos_examples, neg_selected])

# Perform the experiment 10 times and record training and prediction times
n_splits = 2
total_training_time = []
total_prediction_time = []

for experiment in range(10):
    print(f"Experiment {experiment + 1}")

    # Ensure class balance
    balanced_df = balance_classes(df)

    # Measure training time (includes embedding extraction + KNN fitting)
    start_time_train = time.time()

    # Get embeddings for balanced data
    balanced_embeddings = calculate_program_embeddings(balanced_df)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

    for fold, (train_index, test_index) in enumerate(skf.split(balanced_embeddings, balanced_df['Label'])):
        X_train, X_test = balanced_embeddings[train_index], balanced_embeddings[test_index]
        y_train, y_test = balanced_df['Label'].iloc[train_index], balanced_df['Label'].iloc[test_index]

        # Apply k-nearest neighbor classification
        knn = KNeighborsClassifier(n_neighbors=3)
        knn.fit(X_train, y_train)

    end_time_train = time.time()
    training_time = (end_time_train - start_time_train) * 1000  # Convert to milliseconds
    total_training_time.append(training_time)

    # Measure prediction time
    start_time_pred = time.time()

    for fold, (train_index, test_index) in enumerate(skf.split(balanced_embeddings, balanced_df['Label'])):
        X_test = balanced_embeddings[test_index]
        test_predictions = knn.predict(X_test)

    end_time_pred = time.time()
    prediction_time = (end_time_pred - start_time_pred) * 1000  # Convert to milliseconds
    total_prediction_time.append(prediction_time)

    # Print the times for the current experiment
    print(f"Training Time (ms): {training_time}, Prediction Time (ms): {prediction_time}")

# Calculate mean times over all 10 experiments
mean_training_time = np.mean(total_training_time)
mean_prediction_time = np.mean(total_prediction_time)

print(f"Mean Training Time (ms): {mean_training_time}")
print(f"Mean Prediction Time (ms): {mean_prediction_time}")


Using device: cuda
Labels and contents saved to embeddings/af3.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Experiment 1


100%|██████████| 53/53 [02:22<00:00,  2.69s/it]


Training Time (ms): 142347.58925437927, Prediction Time (ms): 3.0057430267333984
Experiment 2


100%|██████████| 62/62 [02:15<00:00,  2.19s/it]


Training Time (ms): 135824.35989379883, Prediction Time (ms): 3.3414363861083984
Experiment 3


100%|██████████| 51/51 [01:51<00:00,  2.19s/it]


Training Time (ms): 111864.23349380493, Prediction Time (ms): 4.061698913574219
Experiment 4


100%|██████████| 43/43 [01:28<00:00,  2.05s/it]


Training Time (ms): 88370.54944038391, Prediction Time (ms): 3.7276744842529297
Experiment 5


100%|██████████| 61/61 [02:11<00:00,  2.16s/it]


Training Time (ms): 131706.37011528015, Prediction Time (ms): 3.3135414123535156
Experiment 6


100%|██████████| 42/42 [01:31<00:00,  2.18s/it]


Training Time (ms): 91576.89547538757, Prediction Time (ms): 3.1740665435791016
Experiment 7


100%|██████████| 47/47 [01:44<00:00,  2.23s/it]


Training Time (ms): 104628.97801399231, Prediction Time (ms): 3.9985179901123047
Experiment 8


100%|██████████| 60/60 [02:09<00:00,  2.16s/it]


Training Time (ms): 129642.62127876282, Prediction Time (ms): 3.161907196044922
Experiment 9


100%|██████████| 46/46 [01:36<00:00,  2.10s/it]


Training Time (ms): 96553.04670333862, Prediction Time (ms): 2.9125213623046875
Experiment 10


100%|██████████| 47/47 [01:37<00:00,  2.08s/it]

Training Time (ms): 98009.46593284607, Prediction Time (ms): 3.9894580841064453
Mean Training Time (ms): 113052.41096019745
Mean Prediction Time (ms): 3.468656539916992





**Prototype time calculation**

In [None]:
import os
import pandas as pd
import torch
import numpy as np
import random
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

# Check if CUDA is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define the path to the folder containing your Java programs
java_programs_folder = '/content/singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Ensure the directory exists
output_dir = 'embeddings'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the DataFrame to a CSV file
csv_path = os.path.join(output_dir, 'af3.csv')  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")

# Define the path to the CSV file containing program names, labels, and content
csv_path = os.path.join(output_dir, 'af3.csv')  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base").to(device)  # Move model to device

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Move back to CPU for NumPy operations

# Calculate embeddings for each program by considering embeddings of individual lines
def calculate_program_embeddings(df):
    program_embeddings = []
    for content in tqdm(df['Content']):
        lines = content.split('\n')
        line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
        program_embeddings.append(np.mean(line_embeddings, axis=0))
    return np.vstack(program_embeddings)

# Function to randomly select negative examples to ensure 40%-60% class balance
def balance_classes(df, neg_label=0, pos_label=1, min_ratio=0.4, max_ratio=0.6):
    pos_examples = df[df['Label'] == pos_label]
    neg_examples = df[df['Label'] == neg_label]

    total_pos = len(pos_examples)
    available_neg = len(neg_examples)

    # Ensure the target_neg_range does not exceed available negative examples
    target_neg_min = min(int(total_pos / max_ratio) - total_pos, available_neg)
    target_neg_max = min(int(total_pos / min_ratio) - total_pos, available_neg)

    # If there are not enough negative examples, we use all of them
    if target_neg_max <= 0:
        neg_selected = neg_examples
    else:
        neg_selected = neg_examples.sample(random.randint(target_neg_min, target_neg_max))

    return pd.concat([pos_examples, neg_selected])

# Perform the experiment 10 times and record training and prediction times
n_splits = 2
total_training_time = []
total_prediction_time = []

for experiment in range(10):
    print(f"Experiment {experiment + 1}")

    # Ensure class balance
    balanced_df = balance_classes(df)

    # Measure training time (includes embedding extraction + KNN fitting)
    start_time_train = time.time()

    # Get embeddings for balanced data
    balanced_embeddings = calculate_program_embeddings(balanced_df)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

    for fold, (train_index, test_index) in enumerate(skf.split(balanced_embeddings, balanced_df['Label'])):
        X_train, X_test = balanced_embeddings[train_index], balanced_embeddings[test_index]
        y_train, y_test = balanced_df['Label'].iloc[train_index], balanced_df['Label'].iloc[test_index]

        # Apply k-nearest neighbor classification
        knn = KNeighborsClassifier(n_neighbors=3)
        knn.fit(X_train, y_train)

    end_time_train = time.time()
    training_time = (end_time_train - start_time_train) * 1000  # Convert to milliseconds
    total_training_time.append(training_time)

    # Measure prediction time
    start_time_pred = time.time()

    for fold, (train_index, test_index) in enumerate(skf.split(balanced_embeddings, balanced_df['Label'])):
        X_test = balanced_embeddings[test_index]
        test_predictions = knn.predict(X_test)

    end_time_pred = time.time()
    prediction_time = (end_time_pred - start_time_pred) * 1000  # Convert to milliseconds
    total_prediction_time.append(prediction_time)

    # Print the times for the current experiment
    print(f"Training Time (ms): {training_time}, Prediction Time (ms): {prediction_time}")

# Calculate mean times over all 10 experiments
mean_training_time = np.mean(total_training_time)
mean_prediction_time = np.mean(total_prediction_time)

print(f"Mean Training Time (ms): {mean_training_time}")
print(f"Mean Prediction Time (ms): {mean_prediction_time}")


Using device: cuda
Labels and contents saved to embeddings/af3.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Experiment 1


100%|██████████| 54/54 [01:59<00:00,  2.20s/it]


Training Time (ms): 119029.86717224121, Prediction Time (ms): 3.0078887939453125
Experiment 2


100%|██████████| 62/62 [02:03<00:00,  1.99s/it]


Training Time (ms): 123586.41910552979, Prediction Time (ms): 5.185127258300781
Experiment 3


100%|██████████| 47/47 [01:45<00:00,  2.25s/it]


Training Time (ms): 105988.78383636475, Prediction Time (ms): 3.036022186279297
Experiment 4


100%|██████████| 45/45 [01:38<00:00,  2.19s/it]


Training Time (ms): 98588.70053291321, Prediction Time (ms): 3.8678646087646484
Experiment 5


100%|██████████| 51/51 [01:42<00:00,  2.01s/it]


Training Time (ms): 102734.8051071167, Prediction Time (ms): 2.7723312377929688
Experiment 6


100%|██████████| 47/47 [01:41<00:00,  2.15s/it]


Training Time (ms): 101191.24937057495, Prediction Time (ms): 2.7930736541748047
Experiment 7


100%|██████████| 48/48 [01:38<00:00,  2.05s/it]


Training Time (ms): 98478.86109352112, Prediction Time (ms): 2.5932788848876953
Experiment 8


100%|██████████| 60/60 [02:01<00:00,  2.02s/it]


Training Time (ms): 121365.70239067078, Prediction Time (ms): 2.772808074951172
Experiment 9


100%|██████████| 52/52 [01:44<00:00,  2.01s/it]


Training Time (ms): 104584.93423461914, Prediction Time (ms): 2.7954578399658203
Experiment 10


100%|██████████| 44/44 [01:36<00:00,  2.18s/it]

Training Time (ms): 96128.29899787903, Prediction Time (ms): 5.895853042602539
Mean Training Time (ms): 107167.76218414307
Mean Prediction Time (ms): 3.471970558166504





**Factory method time calculation**



In [None]:
import os
import pandas as pd
import torch
import numpy as np
import random
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

# Check if CUDA is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define the path to the folder containing your Java programs
java_programs_folder = '/content/factorymethod'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "factorymethod" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Ensure the directory exists
output_dir = 'embeddings'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the DataFrame to a CSV file
csv_path = os.path.join(output_dir, 'af3.csv')  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")

# Define the path to the CSV file containing program names, labels, and content
csv_path = os.path.join(output_dir, 'af3.csv')  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base").to(device)  # Move model to device

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Move back to CPU for NumPy operations

# Calculate embeddings for each program by considering embeddings of individual lines
def calculate_program_embeddings(df):
    program_embeddings = []
    for content in tqdm(df['Content']):
        lines = content.split('\n')
        line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
        program_embeddings.append(np.mean(line_embeddings, axis=0))
    return np.vstack(program_embeddings)

# Function to randomly select negative examples to ensure 40%-60% class balance
def balance_classes(df, neg_label=0, pos_label=1, min_ratio=0.4, max_ratio=0.6):
    pos_examples = df[df['Label'] == pos_label]
    neg_examples = df[df['Label'] == neg_label]

    total_pos = len(pos_examples)
    available_neg = len(neg_examples)

    # Ensure the target_neg_range does not exceed available negative examples
    target_neg_min = min(int(total_pos / max_ratio) - total_pos, available_neg)
    target_neg_max = min(int(total_pos / min_ratio) - total_pos, available_neg)

    # If there are not enough negative examples, we use all of them
    if target_neg_max <= 0:
        neg_selected = neg_examples
    else:
        neg_selected = neg_examples.sample(random.randint(target_neg_min, target_neg_max))

    return pd.concat([pos_examples, neg_selected])

# Perform the experiment 10 times and record training and prediction times
n_splits = 2
total_training_time = []
total_prediction_time = []

for experiment in range(10):
    print(f"Experiment {experiment + 1}")

    # Ensure class balance
    balanced_df = balance_classes(df)

    # Measure training time (includes embedding extraction + KNN fitting)
    start_time_train = time.time()

    # Get embeddings for balanced data
    balanced_embeddings = calculate_program_embeddings(balanced_df)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

    for fold, (train_index, test_index) in enumerate(skf.split(balanced_embeddings, balanced_df['Label'])):
        X_train, X_test = balanced_embeddings[train_index], balanced_embeddings[test_index]
        y_train, y_test = balanced_df['Label'].iloc[train_index], balanced_df['Label'].iloc[test_index]

        # Apply k-nearest neighbor classification
        knn = KNeighborsClassifier(n_neighbors=3)
        knn.fit(X_train, y_train)

    end_time_train = time.time()
    training_time = (end_time_train - start_time_train) * 1000  # Convert to milliseconds
    total_training_time.append(training_time)

    # Measure prediction time
    start_time_pred = time.time()

    for fold, (train_index, test_index) in enumerate(skf.split(balanced_embeddings, balanced_df['Label'])):
        X_test = balanced_embeddings[test_index]
        test_predictions = knn.predict(X_test)

    end_time_pred = time.time()
    prediction_time = (end_time_pred - start_time_pred) * 1000  # Convert to milliseconds
    total_prediction_time.append(prediction_time)

    # Print the times for the current experiment
    print(f"Training Time (ms): {training_time}, Prediction Time (ms): {prediction_time}")

# Calculate mean times over all 10 experiments
mean_training_time = np.mean(total_training_time)
mean_prediction_time = np.mean(total_prediction_time)

print(f"Mean Training Time (ms): {mean_training_time}")
print(f"Mean Prediction Time (ms): {mean_prediction_time}")


Using device: cuda
Labels and contents saved to embeddings/af3.csv


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Experiment 1


100%|██████████| 20/20 [00:59<00:00,  2.99s/it]


Training Time (ms): 59900.19679069519, Prediction Time (ms): 53.635358810424805
Experiment 2


100%|██████████| 21/21 [00:45<00:00,  2.16s/it]


Training Time (ms): 45359.7092628479, Prediction Time (ms): 2.72369384765625
Experiment 3


100%|██████████| 20/20 [00:41<00:00,  2.06s/it]


Training Time (ms): 41214.684009552, Prediction Time (ms): 2.4390220642089844
Experiment 4


100%|██████████| 24/24 [00:48<00:00,  2.04s/it]


Training Time (ms): 48936.90490722656, Prediction Time (ms): 2.44140625
Experiment 5


100%|██████████| 19/19 [00:42<00:00,  2.26s/it]


Training Time (ms): 42968.46079826355, Prediction Time (ms): 4.642486572265625
Experiment 6


100%|██████████| 19/19 [00:41<00:00,  2.19s/it]


Training Time (ms): 41611.92464828491, Prediction Time (ms): 2.4209022521972656
Experiment 7


100%|██████████| 19/19 [00:38<00:00,  2.01s/it]


Training Time (ms): 38200.06275177002, Prediction Time (ms): 2.389192581176758
Experiment 8


100%|██████████| 27/27 [00:50<00:00,  1.87s/it]


Training Time (ms): 50483.14118385315, Prediction Time (ms): 2.547025680541992
Experiment 9


100%|██████████| 20/20 [00:39<00:00,  1.99s/it]


Training Time (ms): 39801.35083198547, Prediction Time (ms): 2.5534629821777344
Experiment 10


100%|██████████| 22/22 [00:42<00:00,  1.92s/it]

Training Time (ms): 42189.897775650024, Prediction Time (ms): 3.662109375
Mean Training Time (ms): 45066.63329601288
Mean Prediction Time (ms): 7.945466041564941





In [None]:
import os
import pandas as pd
import torch
import numpy as np
import random
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

# Check if CUDA is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define the path to the folder containing your Java programs
java_programs_folder = '/content/factorymethod'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "factorymethod" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Ensure the directory exists
output_dir = 'embeddings'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the DataFrame to a CSV file
csv_path = os.path.join(output_dir, 'af3.csv')  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")

# Define the path to the CSV file containing program names, labels, and content
csv_path = os.path.join(output_dir, 'af3.csv')  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base").to(device)  # Move model to device

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Move back to CPU for NumPy operations

# Calculate embeddings for each program by considering embeddings of individual lines
def calculate_program_embeddings(df):
    program_embeddings = []
    for content in tqdm(df['Content']):
        lines = content.split('\n')
        line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
        program_embeddings.append(np.mean(line_embeddings, axis=0))
    return np.vstack(program_embeddings)

# Function to randomly select negative examples to ensure 40%-60% class balance
def balance_classes(df, neg_label=0, pos_label=1, min_ratio=0.4, max_ratio=0.6):
    pos_examples = df[df['Label'] == pos_label]
    neg_examples = df[df['Label'] == neg_label]

    total_pos = len(pos_examples)
    available_neg = len(neg_examples)

    # Ensure the target_neg_range does not exceed available negative examples
    target_neg_min = min(int(total_pos / max_ratio) - total_pos, available_neg)
    target_neg_max = min(int(total_pos / min_ratio) - total_pos, available_neg)

    # If there are not enough negative examples, we use all of them
    if target_neg_max <= 0:
        neg_selected = neg_examples
    else:
        neg_selected = neg_examples.sample(random.randint(target_neg_min, target_neg_max))

    return pd.concat([pos_examples, neg_selected])

# Perform the experiment 10 times and record training and prediction times
n_splits = 2
total_training_time = []
total_prediction_time = []

for experiment in range(10):
    print(f"Experiment {experiment + 1}")

    # Ensure class balance
    balanced_df = balance_classes(df)

    # Measure training time (includes embedding extraction + KNN fitting)
    start_time_train = time.time()

    # Get embeddings for balanced data
    balanced_embeddings = calculate_program_embeddings(balanced_df)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

    for fold, (train_index, test_index) in enumerate(skf.split(balanced_embeddings, balanced_df['Label'])):
        X_train, X_test = balanced_embeddings[train_index], balanced_embeddings[test_index]
        y_train, y_test = balanced_df['Label'].iloc[train_index], balanced_df['Label'].iloc[test_index]

        # Apply k-nearest neighbor classification
        knn = KNeighborsClassifier(n_neighbors=3)
        knn.fit(X_train, y_train)

    end_time_train = time.time()
    training_time = (end_time_train - start_time_train) * 1000  # Convert to milliseconds
    total_training_time.append(training_time)

    # Measure prediction time
    start_time_pred = time.time()

    for fold, (train_index, test_index) in enumerate(skf.split(balanced_embeddings, balanced_df['Label'])):
        X_test = balanced_embeddings[test_index]
        test_predictions = knn.predict(X_test)

    end_time_pred = time.time()
    prediction_time = (end_time_pred - start_time_pred) * 1000  # Convert to milliseconds
    total_prediction_time.append(prediction_time)

    # Print the times for the current experiment
    print(f"Training Time (ms): {training_time}, Prediction Time (ms): {prediction_time}")

# Calculate mean times over all 10 experiments
mean_training_time = np.mean(total_training_time)
mean_prediction_time = np.mean(total_prediction_time)

print(f"Mean Training Time (ms): {mean_training_time}")
print(f"Mean Prediction Time (ms): {mean_prediction_time}")


Using device: cuda
Labels and contents saved to embeddings/af3.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Experiment 1


100%|██████████| 18/18 [01:09<00:00,  3.84s/it]


Training Time (ms): 69096.7800617218, Prediction Time (ms): 3.520488739013672
Experiment 2


100%|██████████| 27/27 [00:54<00:00,  2.02s/it]


Training Time (ms): 54662.1150970459, Prediction Time (ms): 3.576040267944336
Experiment 3


100%|██████████| 20/20 [00:39<00:00,  1.97s/it]


Training Time (ms): 39393.44811439514, Prediction Time (ms): 2.4728775024414062
Experiment 4


100%|██████████| 19/19 [00:39<00:00,  2.09s/it]


Training Time (ms): 39731.14824295044, Prediction Time (ms): 2.4645328521728516
Experiment 5


100%|██████████| 24/24 [00:44<00:00,  1.84s/it]


Training Time (ms): 44150.39658546448, Prediction Time (ms): 2.5043487548828125
Experiment 6


100%|██████████| 22/22 [00:44<00:00,  2.04s/it]


Training Time (ms): 44990.877628326416, Prediction Time (ms): 3.30352783203125
Experiment 7


100%|██████████| 21/21 [00:42<00:00,  2.02s/it]


Training Time (ms): 42398.306131362915, Prediction Time (ms): 2.562999725341797
Experiment 8


100%|██████████| 19/19 [00:36<00:00,  1.93s/it]


Training Time (ms): 36648.98586273193, Prediction Time (ms): 7.267951965332031
Experiment 9


100%|██████████| 21/21 [00:41<00:00,  1.97s/it]


Training Time (ms): 41398.94890785217, Prediction Time (ms): 2.3818016052246094
Experiment 10


100%|██████████| 25/25 [00:45<00:00,  1.83s/it]

Training Time (ms): 45648.772954940796, Prediction Time (ms): 2.5293827056884766
Mean Training Time (ms): 45811.9779586792
Mean Prediction Time (ms): 3.258395195007324





**Calculation of Silhouette Scor Davies-Bouldin Index**

In [None]:
import os
import pandas as pd
import torch
import numpy as np
import random
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, silhouette_score, davies_bouldin_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

# Check if CUDA is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define the path to the folder containing your Java programs
java_programs_folder = '/content/design_patterns'  # Folder containing all design pattern files

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs based on the file name for five design patterns
def label_program(file_name):
    file_name_lower = file_name.lower()
    if "factorymethod" in file_name_lower:
        return 1  # Factory Method
    elif "singleton" in file_name_lower:
        return 2  # Singleton
    elif "builder" in file_name_lower:
        return 3  # Builder
    elif "prototype" in file_name_lower:
        return 4  # Prototype
    elif "abstractfactory" in file_name_lower:
        return 5  # Abstract Factory
    else:
        return 0  # Negative class (no pattern)

# Load Java programs from the folder and classify them based on the pattern
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Ensure the directory exists
output_dir = 'embeddings'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the DataFrame to a CSV file
csv_path = os.path.join(output_dir, 'design_patterns_data.csv')
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base").to(device)  # Move model to device

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Move back to CPU for NumPy operations

# Calculate embeddings for each program by considering embeddings of individual lines
def calculate_program_embeddings(df):
    program_embeddings = []
    for content in tqdm(df['Content']):
        lines = content.split('\n')
        line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
        program_embeddings.append(np.mean(line_embeddings, axis=0))
    return np.vstack(program_embeddings)

# Function to randomly select negative examples and maintain class balance across all design patterns
def balance_classes(df, neg_label=0, pos_labels=[1, 2, 3, 4, 5], min_ratio=0.4, max_ratio=0.6):
    pos_examples = df[df['Label'].isin(pos_labels)]
    neg_examples = df[df['Label'] == neg_label]

    total_pos = len(pos_examples)
    available_neg = len(neg_examples)

    # Ensure the target_neg_range does not exceed available negative examples
    target_neg_min = min(int(total_pos / max_ratio) - total_pos, available_neg)
    target_neg_max = min(int(total_pos / min_ratio) - total_pos, available_neg)

    # If there are not enough negative examples, use all of them
    if target_neg_max <= 0:
        neg_selected = neg_examples
    else:
        neg_selected = neg_examples.sample(random.randint(target_neg_min, target_neg_max))

    return pd.concat([pos_examples, neg_selected])

# Perform the experiment 10 times and record training and prediction times
n_splits = 2
total_training_time = []
total_prediction_time = []

for experiment in range(10):
    print(f"Experiment {experiment + 1}")

    # Ensure class balance across the five design patterns and the negative class
    balanced_df = balance_classes(df)

    # Measure training time (includes embedding extraction + KNN fitting)
    start_time_train = time.time()

    # Get embeddings for balanced data
    balanced_embeddings = calculate_program_embeddings(balanced_df)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

    for fold, (train_index, test_index) in enumerate(skf.split(balanced_embeddings, balanced_df['Label'])):
        X_train, X_test = balanced_embeddings[train_index], balanced_embeddings[test_index]
        y_train, y_test = balanced_df['Label'].iloc[train_index], balanced_df['Label'].iloc[test_index]

        # Apply k-nearest neighbor classification
        knn = KNeighborsClassifier(n_neighbors=3)
        knn.fit(X_train, y_train)

    end_time_train = time.time()
    training_time = (end_time_train - start_time_train) * 1000  # Convert to milliseconds
    total_training_time.append(training_time)

    # Measure prediction time
    start_time_pred = time.time()

    for fold, (train_index, test_index) in enumerate(skf.split(balanced_embeddings, balanced_df['Label'])):
        X_test = balanced_embeddings[test_index]
        test_predictions = knn.predict(X_test)

    end_time_pred = time.time()
    prediction_time = (end_time_pred - start_time_pred) * 1000  # Convert to milliseconds
    total_prediction_time.append(prediction_time)

    # Print the times for the current experiment
    print(f"Training Time (ms): {training_time}, Prediction Time (ms): {prediction_time}")

# Calculate mean times over all 10 experiments
mean_training_time = np.mean(total_training_time)
mean_prediction_time = np.mean(total_prediction_time)

print(f"Mean Training Time (ms): {mean_training_time}")
print(f"Mean Prediction Time (ms): {mean_prediction_time}")

# Calculate Silhouette Score and Davies-Bouldin Index for clustering quality
def calculate_clustering_metrics(embeddings, labels):
    silhouette_avg = silhouette_score(embeddings, labels)
    davies_bouldin = davies_bouldin_score(embeddings, labels)

    print(f"Silhouette Score: {silhouette_avg:.4f}")
    print(f"Davies-Bouldin Index: {davies_bouldin:.4f}")
    return silhouette_avg, davies_bouldin

# Perform clustering metrics calculations
silhouette_avg, davies_bouldin = calculate_clustering_metrics(balanced_embeddings, balanced_df['Label'])


Using device: cuda
Labels and contents saved to embeddings/design_patterns_data.csv


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Experiment 1


100%|██████████| 93/93 [03:17<00:00,  2.12s/it]


Training Time (ms): 197148.0324268341, Prediction Time (ms): 50.83656311035156
Experiment 2


100%|██████████| 93/93 [03:17<00:00,  2.12s/it]


Training Time (ms): 197164.40987586975, Prediction Time (ms): 3.3011436462402344
Experiment 3


100%|██████████| 93/93 [03:10<00:00,  2.04s/it]


Training Time (ms): 190113.5904788971, Prediction Time (ms): 3.7598609924316406
Experiment 4


100%|██████████| 93/93 [03:10<00:00,  2.05s/it]


Training Time (ms): 190670.4478263855, Prediction Time (ms): 5.2032470703125
Experiment 5


100%|██████████| 93/93 [03:09<00:00,  2.04s/it]


Training Time (ms): 189408.48541259766, Prediction Time (ms): 3.118753433227539
Experiment 6


100%|██████████| 93/93 [03:09<00:00,  2.04s/it]


Training Time (ms): 189804.82840538025, Prediction Time (ms): 3.043651580810547
Experiment 7


100%|██████████| 93/93 [03:11<00:00,  2.06s/it]


Training Time (ms): 191277.34637260437, Prediction Time (ms): 3.047943115234375
Experiment 8


100%|██████████| 93/93 [03:10<00:00,  2.04s/it]


Training Time (ms): 190045.83501815796, Prediction Time (ms): 3.408670425415039
Experiment 9


100%|██████████| 93/93 [03:09<00:00,  2.03s/it]


Training Time (ms): 189126.2867450714, Prediction Time (ms): 2.9969215393066406
Experiment 10


100%|██████████| 93/93 [03:08<00:00,  2.03s/it]

Training Time (ms): 188969.59924697876, Prediction Time (ms): 4.082918167114258
Mean Training Time (ms): 191372.8861808777
Mean Prediction Time (ms): 8.279967308044434
Silhouette Score: 0.0023
Davies-Bouldin Index: 3.3818





In [None]:
import os
import pandas as pd
import torch
import numpy as np
import random
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

# Check if CUDA is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define the path to the folder containing your Java programs
java_programs_folder = '/content/factorymethod'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Start timing for loading programs
start_time = time.time()

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "factorymethod" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# End timing for loading programs
end_time = time.time()
print(f"Time taken to load and label programs: {end_time - start_time:.2f} seconds")

# Start timing for creating DataFrame and saving to CSV
start_time = time.time()

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Ensure the directory exists
output_dir = 'embeddings'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the DataFrame to a CSV file
csv_path = os.path.join(output_dir, 'af3.csv')  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

end_time = time.time()
print(f"Time taken to create DataFrame and save to CSV: {end_time - start_time:.2f} seconds")
print(f"Labels and contents saved to {csv_path}")

# Debug: Print column names to verify
print("Columns in DataFrame after saving:", df.columns)

# Start timing for loading DataFrame from CSV
start_time = time.time()

# Define the path to the CSV file containing program names, labels, and content
csv_path = os.path.join(output_dir, 'af3.csv')  # Replace with the actual CSV file path

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

end_time = time.time()
print(f"Time taken to load DataFrame from CSV: {end_time - start_time:.2f} seconds")

# Verify if the 'Content' column exists
print("Columns after loading CSV:", df.columns)  # Check if 'Content' is present

# Check the first few rows of the DataFrame
print("DataFrame head after loading CSV:")
print(df.head())

# Start timing for embedding extraction
start_time = time.time()

# Embedding Extraction - Example using RoBERTa Model

# Load pre-trained RoBERTa model and tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base").to(device)

# Tokenize and encode the content using RoBERTa
embeddings = []

for content in tqdm(df['Content'], desc="Processing embeddings"):
    # Tokenize and encode
    inputs = tokenizer(content, return_tensors="pt", truncation=True, padding="max_length", max_length=512).to(device)

    # Get the output embeddings from the model
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the [CLS] token embedding
    cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()

    # Store the embedding
    embeddings.append(cls_embedding)

# End timing for embedding extraction
end_time = time.time()
print(f"Time taken to extract embeddings: {end_time - start_time:.2f} seconds")

# Convert the list of embeddings into a numpy array
embeddings = np.vstack(embeddings)

# Save embeddings to a numpy file
start_time = time.time()
np.save(os.path.join(output_dir, "embeddings.npy"), embeddings)
end_time = time.time()
print(f"Time taken to save embeddings: {end_time - start_time:.2f} seconds")
print(f"Embeddings saved to {os.path.join(output_dir, 'embeddings.npy')}")

# Example: Splitting data into training and testing using StratifiedKFold
labels = df['Label'].values
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Train the KNeighborsClassifier model
knn_models = []
for fold, (train_idx, test_idx) in enumerate(skf.split(embeddings, labels)):
    print(f"Fold {fold + 1}")

    # Split into training and testing sets
    X_train, X_test = embeddings[train_idx], embeddings[test_idx]
    y_train, y_test = labels[train_idx], labels[test_idx]

    # Example classifier: KNeighborsClassifier
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(X_train, y_train)
    knn_models.append(knn)

    # Make predictions
    y_pred = knn.predict(X_test)

    # Evaluate the classifier
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')

    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

# --- Prediction Phase ---

def predict_program(program_path, knn_model):
    # Read and embed the new program
    with open(program_path, 'r', encoding='iso-8859-1') as f:
        new_program_content = f.read()

    # Tokenize and encode the new program content
    inputs = tokenizer(new_program_content, return_tensors="pt", truncation=True, padding="max_length", max_length=512).to(device)

    # Extract embedding
    with torch.no_grad():
        outputs = model(**inputs)
    new_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()

    # Feed the embedding into the KNN model to predict
    start_time = time.time()  # Start timing prediction
    predicted_label = knn_model.predict(new_embedding)
    end_time = time.time()  # End timing prediction

    print(f"Time taken for prediction: {end_time - start_time:.2f} seconds")
    return predicted_label

# Example usage:
new_program_path = '/content/newexample/builder (4).java'  # Path to the new program file
# Use the first KNN model (trained on the first fold) for prediction
predicted_value = predict_program(new_program_path, knn_models[0])
print(f"Predicted label for the new program: {predicted_value}")


Using device: cuda
Time taken to load and label programs: 0.01 seconds
Time taken to create DataFrame and save to CSV: 0.04 seconds
Labels and contents saved to embeddings/af3.csv
Columns in DataFrame after saving: Index(['File Name', 'Label', 'Content'], dtype='object')
Time taken to load DataFrame from CSV: 0.02 seconds
Columns after loading CSV: Index(['File Name', 'Label', 'Content'], dtype='object')
DataFrame head after loading CSV:
         File Name  Label                                            Content
0   nonfm (6).java      0  package com.jmonkey.office.lexi.support.editor...
1  nonfm (33).java      0  /*\n * @(#)Figure.java 5.1\n *\n */\n\npackage...
2  nonfm (48).java      0  /*\n *  Author:  Chris Seguin\n *\n *  This so...
3  nonfm (35).java      0  /**\n *\n    QuickUML; A simple UML tool that ...
4  nonfm (76).java      0  /*\n * @(#)BouncingDrawing.java 5.1\n *\n */\n...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing embeddings: 100%|██████████| 93/93 [00:06<00:00, 15.10it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Time taken to extract embeddings: 7.22 seconds
Time taken to save embeddings: 0.00 seconds
Embeddings saved to embeddings/embeddings.npy
Fold 1
Precision: 0.0000, Recall: 0.0000, F1-Score: 0.0000
Fold 2
Precision: 0.0000, Recall: 0.0000, F1-Score: 0.0000
Fold 3
Precision: 0.6667, Recall: 0.6667, F1-Score: 0.6667
Fold 4
Precision: 0.3333, Recall: 0.5000, F1-Score: 0.4000
Fold 5
Precision: 1.0000, Recall: 0.5000, F1-Score: 0.6667
Time taken for prediction: 0.00 seconds
Predicted label for the new program: [0]


In [None]:
import os
import pandas as pd
import torch
import numpy as np
import random
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

# Check if CUDA is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define the path to the folder containing your Java programs
java_programs_folder = '/content/factorymethod'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Start timing for loading programs
start_time = time.time()

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "factorymethod" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# End timing for loading programs
end_time = time.time()
print(f"Time taken to load and label programs: {end_time - start_time:.2f} seconds")

# Start timing for creating DataFrame and saving to CSV
start_time = time.time()

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Ensure the directory exists
output_dir = 'embeddings'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the DataFrame to a CSV file
csv_path = os.path.join(output_dir, 'af3.csv')  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

end_time = time.time()
print(f"Time taken to create DataFrame and save to CSV: {end_time - start_time:.2f} seconds")
print(f"Labels and contents saved to {csv_path}")

# Debug: Print column names to verify
print("Columns in DataFrame after saving:", df.columns)

# Start timing for loading DataFrame from CSV
start_time = time.time()

# Define the path to the CSV file containing program names, labels, and content
csv_path = os.path.join(output_dir, 'af3.csv')  # Replace with the actual CSV file path

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

end_time = time.time()
print(f"Time taken to load DataFrame from CSV: {end_time - start_time:.2f} seconds")

# Verify if the 'Content' column exists
print("Columns after loading CSV:", df.columns)  # Check if 'Content' is present

# Check the first few rows of the DataFrame
print("DataFrame head after loading CSV:")
print(df.head())

# Start timing for embedding extraction
start_time = time.time()

# Embedding Extraction - Example using RoBERTa Model

# Load pre-trained RoBERTa model and tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base").to(device)

# Tokenize and encode the content using RoBERTa
embeddings = []

for content in tqdm(df['Content'], desc="Processing embeddings"):
    # Tokenize and encode
    inputs = tokenizer(content, return_tensors="pt", truncation=True, padding="max_length", max_length=512).to(device)

    # Get the output embeddings from the model
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the [CLS] token embedding
    cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()

    # Store the embedding
    embeddings.append(cls_embedding)

# End timing for embedding extraction
end_time = time.time()
print(f"Time taken to extract embeddings: {end_time - start_time:.2f} seconds")

# Convert the list of embeddings into a numpy array
embeddings = np.vstack(embeddings)

# Save embeddings to a numpy file
start_time = time.time()
np.save(os.path.join(output_dir, "embeddings.npy"), embeddings)
end_time = time.time()
print(f"Time taken to save embeddings: {end_time - start_time:.2f} seconds")
print(f"Embeddings saved to {os.path.join(output_dir, 'embeddings.npy')}")

# Example: Splitting data into training and testing using StratifiedKFold
labels = df['Label'].values
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Train the KNeighborsClassifier model
knn_models = []
for fold, (train_idx, test_idx) in enumerate(skf.split(embeddings, labels)):
    print(f"Fold {fold + 1}")

    # Split into training and testing sets
    X_train, X_test = embeddings[train_idx], embeddings[test_idx]
    y_train, y_test = labels[train_idx], labels[test_idx]

    # Example classifier: KNeighborsClassifier
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(X_train, y_train)
    knn_models.append(knn)

    # Make predictions
    y_pred = knn.predict(X_test)

    # Evaluate the classifier
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')

    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

# --- Prediction Phase for Unseen Java Programs ---

def predict_directory(unseen_folder, knn_model):
    predictions = {}

    # Process each unseen Java program in the directory
    for program_file in os.listdir(unseen_folder):
        file_path = os.path.join(unseen_folder, program_file)

        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='iso-8859-1') as f:
                new_program_content = f.read()

            # Tokenize and encode the new program content
            inputs = tokenizer(new_program_content, return_tensors="pt", truncation=True, padding="max_length", max_length=512).to(device)

            # Extract embedding
            with torch.no_grad():
                outputs = model(**inputs)
            new_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()

            # Feed the embedding into the KNN model to predict
            start_time = time.time()  # Start timing prediction
            predicted_label = knn_model.predict(new_embedding)
            end_time = time.time()  # End timing prediction

            predictions[program_file] = {
                "predicted_label": predicted_label[0],
                "prediction_time": f"{end_time - start_time:.2f} seconds"
            }

    return predictions

# Example usage:
unseen_java_folder = '/content/newexample'  # Replace with the folder containing unseen Java programs

# Use the first KNN model (trained on the first fold) for prediction
predictions = predict_directory(unseen_java_folder, knn_models[0])

# Print the predictions and their respective times
for program_file, result in predictions.items():
    print(f"Program: {program_file}, Predicted Label: {result['predicted_label']}, Time Taken: {result['prediction_time']}")


Using device: cuda
Time taken to load and label programs: 0.01 seconds
Time taken to create DataFrame and save to CSV: 0.02 seconds
Labels and contents saved to embeddings/af3.csv
Columns in DataFrame after saving: Index(['File Name', 'Label', 'Content'], dtype='object')
Time taken to load DataFrame from CSV: 0.01 seconds
Columns after loading CSV: Index(['File Name', 'Label', 'Content'], dtype='object')
DataFrame head after loading CSV:
         File Name  Label                                            Content
0   nonfm (6).java      0  package com.jmonkey.office.lexi.support.editor...
1  nonfm (33).java      0  /*\n * @(#)Figure.java 5.1\n *\n */\n\npackage...
2  nonfm (48).java      0  /*\n *  Author:  Chris Seguin\n *\n *  This so...
3  nonfm (35).java      0  /**\n *\n    QuickUML; A simple UML tool that ...
4  nonfm (76).java      0  /*\n * @(#)BouncingDrawing.java 5.1\n *\n */\n...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing embeddings: 100%|██████████| 79/79 [00:03<00:00, 20.89it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Time taken to extract embeddings: 4.58 seconds
Time taken to save embeddings: 0.00 seconds
Embeddings saved to embeddings/embeddings.npy
Fold 1
Precision: 0.5000, Recall: 1.0000, F1-Score: 0.6667
Fold 2
Precision: 0.0000, Recall: 0.0000, F1-Score: 0.0000
Fold 3
Precision: 1.0000, Recall: 0.5000, F1-Score: 0.6667
Fold 4
Precision: 1.0000, Recall: 1.0000, F1-Score: 1.0000
Fold 5
Precision: 0.0000, Recall: 0.0000, F1-Score: 0.0000
Program: nonfm (16).java, Predicted Label: 0, Time Taken: 0.00 seconds
Program: nonfm (15).java, Predicted Label: 0, Time Taken: 0.00 seconds
Program: nonfm (19).java, Predicted Label: 0, Time Taken: 0.00 seconds
Program: nonfm (17).java, Predicted Label: 0, Time Taken: 0.00 seconds
Program: nonfm (18).java, Predicted Label: 0, Time Taken: 0.00 seconds
Program: factorymethod (10).java, Predicted Label: 1, Time Taken: 0.00 seconds
Program: factorymethod (9).java, Predicted Label: 0, Time Taken: 0.00 seconds
Program: nonfm (14).java, Predicted Label: 0, Time Taken

**Prediction time for Singleton**

In [None]:
import os
import pandas as pd
import torch
import numpy as np
import random
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

# Check if CUDA is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define the path to the folder containing your Java programs
java_programs_folder = '/content/singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Start timing for loading programs
start_time = time.time()

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# End timing for loading programs
end_time = time.time()
print(f"Time taken to load and label programs: {end_time - start_time:.2f} seconds")

# Start timing for creating DataFrame and saving to CSV
start_time = time.time()

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Ensure the directory exists
output_dir = 'embeddings'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the DataFrame to a CSV file
csv_path = os.path.join(output_dir, 'af3.csv')  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

end_time = time.time()
print(f"Time taken to create DataFrame and save to CSV: {end_time - start_time:.2f} seconds")
print(f"Labels and contents saved to {csv_path}")

# Debug: Print column names to verify
print("Columns in DataFrame after saving:", df.columns)

# Start timing for loading DataFrame from CSV
start_time = time.time()

# Define the path to the CSV file containing program names, labels, and content
csv_path = os.path.join(output_dir, 'af3.csv')  # Replace with the actual CSV file path

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

end_time = time.time()
print(f"Time taken to load DataFrame from CSV: {end_time - start_time:.2f} seconds")

# Verify if the 'Content' column exists
print("Columns after loading CSV:", df.columns)  # Check if 'Content' is present

# Check the first few rows of the DataFrame
print("DataFrame head after loading CSV:")
print(df.head())

# Start timing for embedding extraction
start_time = time.time()

# Embedding Extraction - Example using RoBERTa Model

# Load pre-trained RoBERTa model and tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base").to(device)

# Tokenize and encode the content using RoBERTa
embeddings = []

for content in tqdm(df['Content'], desc="Processing embeddings"):
    # Tokenize and encode
    inputs = tokenizer(content, return_tensors="pt", truncation=True, padding="max_length", max_length=512).to(device)

    # Get the output embeddings from the model
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the [CLS] token embedding
    cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()

    # Store the embedding
    embeddings.append(cls_embedding)

# End timing for embedding extraction
end_time = time.time()
print(f"Time taken to extract embeddings: {end_time - start_time:.2f} seconds")

# Convert the list of embeddings into a numpy array
embeddings = np.vstack(embeddings)

# Save embeddings to a numpy file
start_time = time.time()
np.save(os.path.join(output_dir, "embeddings.npy"), embeddings)
end_time = time.time()
print(f"Time taken to save embeddings: {end_time - start_time:.2f} seconds")
print(f"Embeddings saved to {os.path.join(output_dir, 'embeddings.npy')}")

# Example: Splitting data into training and testing using StratifiedKFold
labels = df['Label'].values
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Train the KNeighborsClassifier model
knn_models = []
for fold, (train_idx, test_idx) in enumerate(skf.split(embeddings, labels)):
    print(f"Fold {fold + 1}")

    # Split into training and testing sets
    X_train, X_test = embeddings[train_idx], embeddings[test_idx]
    y_train, y_test = labels[train_idx], labels[test_idx]

    # Example classifier: KNeighborsClassifier
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(X_train, y_train)
    knn_models.append(knn)

    # Make predictions
    y_pred = knn.predict(X_test)

    # Evaluate the classifier
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')

    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

# --- Prediction Phase for Unseen Java Programs ---

def predict_directory(unseen_folder, knn_model):
    predictions = {}

    # Process each unseen Java program in the directory
    for program_file in os.listdir(unseen_folder):
        file_path = os.path.join(unseen_folder, program_file)

        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='iso-8859-1') as f:
                new_program_content = f.read()

            # Tokenize and encode the new program content
            inputs = tokenizer(new_program_content, return_tensors="pt", truncation=True, padding="max_length", max_length=512).to(device)

            # Extract embedding
            with torch.no_grad():
                outputs = model(**inputs)
            new_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()

            # Feed the embedding into the KNN model to predict
            start_time = time.time()  # Start timing prediction
            predicted_label = knn_model.predict(new_embedding)
            end_time = time.time()  # End timing prediction

            predictions[program_file] = {
                "predicted_label": predicted_label[0],
                "prediction_time": f"{end_time - start_time:.2f} seconds"
            }

    return predictions

# Example usage:
unseen_java_folder = '/content/newexample'  # Replace with the folder containing unseen Java programs

# Use the first KNN model (trained on the first fold) for prediction
predictions = predict_directory(unseen_java_folder, knn_models[0])

# Print the predictions and their respective times
for program_file, result in predictions.items():
    print(f"Program: {program_file}, Predicted Label: {result['predicted_label']}, Time Taken: {result['prediction_time']}")


Using device: cuda
Time taken to load and label programs: 0.00 seconds
Time taken to create DataFrame and save to CSV: 0.02 seconds
Labels and contents saved to embeddings/af3.csv
Columns in DataFrame after saving: Index(['File Name', 'Label', 'Content'], dtype='object')
Time taken to load DataFrame from CSV: 0.01 seconds
Columns after loading CSV: Index(['File Name', 'Label', 'Content'], dtype='object')
DataFrame head after loading CSV:
             File Name  Label  \
0       nons (27).java      0   
1  singleton (19).java      1   
2       nons (56).java      0   
3        nons (2).java      0   
4       nons (20).java      0   

                                             Content  
0  /**\n * Form -\n *\n * Copyright (c) 2002\n * ...  
1  package junit.runner;   \n    \n/**   \n * Thi...  
2  /*\n * @(#)RectangleFigure.java 5.1\n *\n */\n...  
3  /*\n *                 Sun Public License Noti...  
4  /*\n *                 Sun Public License Noti...  


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing embeddings: 100%|██████████| 85/85 [00:04<00:00, 19.78it/s]


Time taken to extract embeddings: 5.17 seconds
Time taken to save embeddings: 0.00 seconds
Embeddings saved to embeddings/embeddings.npy
Fold 1
Precision: 0.6667, Recall: 0.4000, F1-Score: 0.5000
Fold 2
Precision: 1.0000, Recall: 0.6000, F1-Score: 0.7500
Fold 3
Precision: 0.6667, Recall: 0.4000, F1-Score: 0.5000
Fold 4
Precision: 0.6667, Recall: 0.8000, F1-Score: 0.7273
Fold 5
Precision: 0.7500, Recall: 0.6000, F1-Score: 0.6667
Program: prototype (18).java, Predicted Label: 0, Time Taken: 0.00 seconds
Program: prototype (17).java, Predicted Label: 0, Time Taken: 0.00 seconds


**Sleton time calculation**

In [None]:
import os
import pandas as pd
import torch
import numpy as np
import random
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

# Check if CUDA is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define the path to the folder containing your Java programs
java_programs_folder = '/content/singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Start timing for loading programs
start_time = time.time()

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# End timing for loading programs
end_time = time.time()
print(f"Time taken to load and label programs: {end_time - start_time:.2f} seconds")

# Start timing for creating DataFrame and saving to CSV
start_time = time.time()

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Ensure the directory exists
output_dir = 'embeddings'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the DataFrame to a CSV file
csv_path = os.path.join(output_dir, 'af3.csv')  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

end_time = time.time()
print(f"Time taken to create DataFrame and save to CSV: {end_time - start_time:.2f} seconds")
print(f"Labels and contents saved to {csv_path}")

# Debug: Print column names to verify
print("Columns in DataFrame after saving:", df.columns)

# Start timing for loading DataFrame from CSV
start_time = time.time()

# Define the path to the CSV file containing program names, labels, and content
csv_path = os.path.join(output_dir, 'af3.csv')  # Replace with the actual CSV file path

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

end_time = time.time()
print(f"Time taken to load DataFrame from CSV: {end_time - start_time:.2f} seconds")

# Verify if the 'Content' column exists
print("Columns after loading CSV:", df.columns)  # Check if 'Content' is present

# Check the first few rows of the DataFrame
print("DataFrame head after loading CSV:")
print(df.head())

# Start timing for embedding extraction
start_time = time.time()

# Embedding Extraction - Example using RoBERTa Model

# Load pre-trained RoBERTa model and tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base").to(device)

# Tokenize and encode the content using RoBERTa
embeddings = []

for content in tqdm(df['Content'], desc="Processing embeddings"):
    # Tokenize and encode
    inputs = tokenizer(content, return_tensors="pt", truncation=True, padding="max_length", max_length=512).to(device)

    # Get the output embeddings from the model
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the [CLS] token embedding
    cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()

    # Store the embedding
    embeddings.append(cls_embedding)

# End timing for embedding extraction
end_time = time.time()
print(f"Time taken to extract embeddings: {end_time - start_time:.2f} seconds")

# Convert the list of embeddings into a numpy array
embeddings = np.vstack(embeddings)

# Save embeddings to a numpy file
start_time = time.time()
np.save(os.path.join(output_dir, "embeddings.npy"), embeddings)
end_time = time.time()
print(f"Time taken to save embeddings: {end_time - start_time:.2f} seconds")
print(f"Embeddings saved to {os.path.join(output_dir, 'embeddings.npy')}")

# Example: Splitting data into training and testing using StratifiedKFold
labels = df['Label'].values
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Train the KNeighborsClassifier model
knn_models = []
for fold, (train_idx, test_idx) in enumerate(skf.split(embeddings, labels)):
    print(f"Fold {fold + 1}")

    # Split into training and testing sets
    X_train, X_test = embeddings[train_idx], embeddings[test_idx]
    y_train, y_test = labels[train_idx], labels[test_idx]

    # Example classifier: KNeighborsClassifier
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(X_train, y_train)
    knn_models.append(knn)

    # Make predictions
    y_pred = knn.predict(X_test)

    # Evaluate the classifier
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')

    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

# --- Prediction Phase for Unseen Java Programs ---

def predict_directory(unseen_folder, knn_model):
    predictions = {}

    # Process each unseen Java program in the directory
    for program_file in os.listdir(unseen_folder):
        file_path = os.path.join(unseen_folder, program_file)

        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='iso-8859-1') as f:
                new_program_content = f.read()

            # Tokenize and encode the new program content
            inputs = tokenizer(new_program_content, return_tensors="pt", truncation=True, padding="max_length", max_length=512).to(device)

            # Extract embedding
            with torch.no_grad():
                outputs = model(**inputs)
            new_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()

            # Feed the embedding into the KNN model to predict
            start_time = time.time()  # Start timing prediction
            predicted_label = knn_model.predict(new_embedding)
            end_time = time.time()  # End timing prediction

            predictions[program_file] = {
                "predicted_label": predicted_label[0],
                "prediction_time": f"{end_time - start_time:.2f} seconds"
            }

    return predictions

# Example usage:
unseen_java_folder = '/content/newexample'  # Replace with the folder containing unseen Java programs

# Use the first KNN model (trained on the first fold) for prediction
predictions = predict_directory(unseen_java_folder, knn_models[0])

# Print the predictions and their respective times
for program_file, result in predictions.items():
    print(f"Program: {program_file}, Predicted Label: {result['predicted_label']}, Time Taken: {result['prediction_time']}")


Using device: cpu
Time taken to load and label programs: 0.01 seconds
Time taken to create DataFrame and save to CSV: 0.03 seconds
Labels and contents saved to embeddings/af3.csv
Columns in DataFrame after saving: Index(['File Name', 'Label', 'Content'], dtype='object')
Time taken to load DataFrame from CSV: 0.01 seconds
Columns after loading CSV: Index(['File Name', 'Label', 'Content'], dtype='object')
DataFrame head after loading CSV:
             File Name  Label  \
0       nons (68).java      0   
1       nons (40).java      0   
2        nons (9).java      0   
3       nons (27).java      0   
4  singleton (13).java      1   

                                             Content  
0  /*\n * @(#)ConnectionTool.java 5.1\n *\n */\n\...  
1  /*\n * @(#)ElbowConnection.java 5.1\n *\n */\n...  
2  /*\n *                 Sun Public License Noti...  
3  /**\n * Form -\n *\n * Copyright (c) 2002\n * ...  
4  /*\n * Author:  Chris Seguin\n *\n * This soft...  


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing embeddings: 100%|██████████| 85/85 [03:09<00:00,  2.23s/it]


Time taken to extract embeddings: 190.06 seconds
Time taken to save embeddings: 0.00 seconds
Embeddings saved to embeddings/embeddings.npy
Fold 1
Precision: 0.7500, Recall: 0.6000, F1-Score: 0.6667
Fold 2
Precision: 0.0000, Recall: 0.0000, F1-Score: 0.0000
Fold 3
Precision: 0.7500, Recall: 0.6000, F1-Score: 0.6667
Fold 4
Precision: 0.8000, Recall: 0.8000, F1-Score: 0.8000
Fold 5
Precision: 1.0000, Recall: 0.6000, F1-Score: 0.7500
Program: 0 (9).java, Predicted Label: 0, Time Taken: 0.00 seconds
Program: 0 (6).java, Predicted Label: 0, Time Taken: 0.00 seconds
Program: 0 (14).java, Predicted Label: 0, Time Taken: 0.00 seconds
Program: 0 (5).java, Predicted Label: 1, Time Taken: 0.00 seconds
Program: 0 (10).java, Predicted Label: 0, Time Taken: 0.00 seconds
Program: 0 (32).java, Predicted Label: 0, Time Taken: 0.00 seconds
Program: 0 (7).java, Predicted Label: 0, Time Taken: 0.00 seconds


**Prototype, Standard deviation, training and prediction time calculation**

In [None]:
import os
import pandas as pd
import torch
import numpy as np
import random
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

# Check if CUDA is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define the path to the folder containing your Java programs
java_programs_folder = '/content/prototype'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Start timing for loading programs
start_time = time.time()

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "prototype" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# End timing for loading programs
end_time = time.time()
print(f"Time taken to load and label programs: {end_time - start_time:.2f} seconds")

# Start timing for creating DataFrame and saving to CSV
start_time = time.time()

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Ensure the directory exists
output_dir = 'embeddings'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the DataFrame to a CSV file
csv_path = os.path.join(output_dir, 'af3.csv')  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

end_time = time.time()
print(f"Time taken to create DataFrame and save to CSV: {end_time - start_time:.2f} seconds")
print(f"Labels and contents saved to {csv_path}")

# Start timing for loading DataFrame from CSV
start_time = time.time()

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

end_time = time.time()
print(f"Time taken to load DataFrame from CSV: {end_time - start_time:.2f} seconds")

# Start timing for embedding extraction
start_time = time.time()

# Load pre-trained RoBERTa model and tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base").to(device)

# Tokenize and encode the content using RoBERTa
embeddings = []

for content in tqdm(df['Content'], desc="Processing embeddings"):
    # Tokenize and encode
    inputs = tokenizer(content, return_tensors="pt", truncation=True, padding="max_length", max_length=512).to(device)

    # Get the output embeddings from the model
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the [CLS] token embedding
    cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()

    # Store the embedding
    embeddings.append(cls_embedding)

# End timing for embedding extraction
end_time = time.time()
print(f"Time taken to extract embeddings: {end_time - start_time:.2f} seconds")

# Convert the list of embeddings into a numpy array
embeddings = np.vstack(embeddings)

# Save embeddings to a numpy file
start_time = time.time()
np.save(os.path.join(output_dir, "embeddings.npy"), embeddings)
end_time = time.time()
print(f"Time taken to save embeddings: {end_time - start_time:.2f} seconds")
print(f"Embeddings saved to {os.path.join(output_dir, 'embeddings.npy')}")

# Define a constant for the number of runs
NUM_RUNS = 10  # Change this value to set the number of runs

# Function to run training and evaluation
def run_knn_experiment(embeddings, labels, num_runs=NUM_RUNS):
    precision_scores = []
    recall_scores = []
    f1_scores = []
    total_training_time = 0

    # Convert labels to a NumPy array for proper indexing
    labels = np.array(labels)

    # Initialize StratifiedKFold
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for run in range(num_runs):
        print(f"Run {run + 1}")
        start_time = time.time()

        # Train and evaluate on each fold
        for fold, (train_idx, test_idx) in enumerate(skf.split(embeddings, labels)):
            print(f"  Fold {fold + 1}")

            # Split into training and testing sets
            X_train, X_test = embeddings[train_idx], embeddings[test_idx]
            y_train, y_test = labels[train_idx], labels[test_idx]

            # Example classifier: KNeighborsClassifier
            knn = KNeighborsClassifier(n_neighbors=3)
            knn.fit(X_train, y_train)

            # Make predictions
            y_pred = knn.predict(X_test)

            # Evaluate the classifier
            precision = precision_score(y_test, y_pred, average='binary')
            recall = recall_score(y_test, y_pred, average='binary')
            f1 = f1_score(y_test, y_pred, average='binary')

            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

        end_time = time.time()
        training_time = (end_time - start_time) * 1_000_000  # Convert to microseconds
        total_training_time += training_time

    mean_precision = np.nanmean(precision_scores)
    mean_recall = np.nanmean(recall_scores)
    mean_f1 = np.nanmean(f1_scores)

    std_precision = np.nanstd(precision_scores)
    std_recall = np.nanstd(recall_scores)
    std_f1 = np.nanstd(f1_scores)

    print(f"Total Training Time for {num_runs} runs: {total_training_time:.2f} microseconds")
    print(f"Mean Precision: {mean_precision:.4f}, Standard Deviation: {std_precision:.4f}")
    print(f"Mean Recall: {mean_recall:.4f}, Standard Deviation: {std_recall:.4f}")
    print(f"Mean F1-Score: {mean_f1:.4f}, Standard Deviation: {std_f1:.4f}")

    return knn  # Return the last trained KNN model for predictions

knn_model = run_knn_experiment(embeddings, labels)

# --- Prediction Phase for Unseen Java Programs ---

def predict_directory(unseen_folder, knn_model):
    predictions = {}

    # Process each unseen Java program in the directory
    for program_file in os.listdir(unseen_folder):
        file_path = os.path.join(unseen_folder, program_file)

        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='iso-8859-1') as f:
                new_program_content = f.read()

            # Tokenize and encode the new program content
            inputs = tokenizer(new_program_content, return_tensors="pt", truncation=True, padding="max_length", max_length=512).to(device)

            # Extract embedding
            start_time = time.time()  # Start timing embedding extraction
            with torch.no_grad():
                outputs = model(**inputs)
            new_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            end_time = time.time()  # End timing embedding extraction

            # Feed the embedding into the KNN model to predict
            start_time = time.time()  # Start timing prediction
            predicted_label = knn_model.predict(new_embedding)
            end_time = time.time()  # End timing prediction

            predictions[program_file] = {
                "predicted_label": predicted_label[0],
                "embedding_extraction_time": f"{(end_time - start_time) * 1_000_000:.2f} microseconds",
                "prediction_time": f"{(end_time - start_time) * 1_000_000:.2f} microseconds"
            }

    return predictions

# Example usage:
unseen_java_folder = '/content/newexample'  # Replace with the folder containing unseen Java programs

# Use the last KNN model trained for prediction
predictions = predict_directory(unseen_java_folder, knn_model)

# Print the predictions and their respective times
for program_file, result in predictions.items():
    print(f"Program: {program_file}, Predicted Label: {result['predicted_label']}, "
          f"Embedding Extraction Time: {result['embedding_extraction_time']}, "
          f"Prediction Time: {result['prediction_time']}")


Using device: cpu
Time taken to load and label programs: 0.00 seconds
Time taken to create DataFrame and save to CSV: 0.02 seconds
Labels and contents saved to embeddings/af3.csv
Time taken to load DataFrame from CSV: 0.01 seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing embeddings: 100%|██████████| 70/70 [03:14<00:00,  2.77s/it]


Time taken to extract embeddings: 194.76 seconds
Time taken to save embeddings: 0.00 seconds
Embeddings saved to embeddings/embeddings.npy
Run 1
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 2
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 3
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 4
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 5
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 6
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 7
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 8
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 9
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 10
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Total Training Time for 10 runs: 502141.48 microseconds
Mean Precision: 0.9464, Standard Deviation: 0.0659
Mean Recall: 0.8048, Standard Deviation: 0.2458
Mean F1-Score: 0.8399, Standard Deviation: 0.1761
Program: nonp (17).java, Predicted Label: 1, Embedding Extraction Time: 1921.18 microseconds, Prediction Time: 1921.18 microseconds
Program: nonp

In [None]:
import shutil

# Specify the directory path you want to delete
dir_path = '/content/embeddings'

# Remove the directory and its contents
shutil.rmtree(dir_path)

print(f"Directory '{dir_path}' has been deleted.")


Directory '/content/embeddings' has been deleted.


**Singleton, Standard deviation, training and prediction time calculation**

In [None]:
import os
import pandas as pd
import torch
import numpy as np
import random
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

# Check if CUDA is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define the path to the folder containing your Java programs
java_programs_folder = '/content/singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Start timing for loading programs
start_time = time.time()

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# End timing for loading programs
end_time = time.time()
print(f"Time taken to load and label programs: {end_time - start_time:.2f} seconds")

# Start timing for creating DataFrame and saving to CSV
start_time = time.time()

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Ensure the directory exists
output_dir = 'embeddings'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the DataFrame to a CSV file
csv_path = os.path.join(output_dir, 'af3.csv')  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

end_time = time.time()
print(f"Time taken to create DataFrame and save to CSV: {end_time - start_time:.2f} seconds")
print(f"Labels and contents saved to {csv_path}")

# Start timing for loading DataFrame from CSV
start_time = time.time()

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

end_time = time.time()
print(f"Time taken to load DataFrame from CSV: {end_time - start_time:.2f} seconds")

# Start timing for embedding extraction
start_time = time.time()

# Load pre-trained RoBERTa model and tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base").to(device)

# Tokenize and encode the content using RoBERTa
embeddings = []

for content in tqdm(df['Content'], desc="Processing embeddings"):
    # Tokenize and encode
    inputs = tokenizer(content, return_tensors="pt", truncation=True, padding="max_length", max_length=512).to(device)

    # Get the output embeddings from the model
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the [CLS] token embedding
    cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()

    # Store the embedding
    embeddings.append(cls_embedding)

# End timing for embedding extraction
end_time = time.time()
print(f"Time taken to extract embeddings: {end_time - start_time:.2f} seconds")

# Convert the list of embeddings into a numpy array
embeddings = np.vstack(embeddings)

# Save embeddings to a numpy file
start_time = time.time()
np.save(os.path.join(output_dir, "embeddings.npy"), embeddings)
end_time = time.time()
print(f"Time taken to save embeddings: {end_time - start_time:.2f} seconds")
print(f"Embeddings saved to {os.path.join(output_dir, 'embeddings.npy')}")

# Define a constant for the number of runs
NUM_RUNS = 10  # Change this value to set the number of runs

# Function to run training and evaluation
def run_knn_experiment(embeddings, labels, num_runs=NUM_RUNS):
    precision_scores = []
    recall_scores = []
    f1_scores = []
    total_training_time = 0

    # Convert labels to a NumPy array for proper indexing
    labels = np.array(labels)

    # Initialize StratifiedKFold
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for run in range(num_runs):
        print(f"Run {run + 1}")
        start_time = time.time()

        # Train and evaluate on each fold
        for fold, (train_idx, test_idx) in enumerate(skf.split(embeddings, labels)):
            print(f"  Fold {fold + 1}")

            # Split into training and testing sets
            X_train, X_test = embeddings[train_idx], embeddings[test_idx]
            y_train, y_test = labels[train_idx], labels[test_idx]

            # Example classifier: KNeighborsClassifier
            knn = KNeighborsClassifier(n_neighbors=3)
            knn.fit(X_train, y_train)

            # Make predictions
            y_pred = knn.predict(X_test)

            # Evaluate the classifier
            precision = precision_score(y_test, y_pred, average='binary')
            recall = recall_score(y_test, y_pred, average='binary')
            f1 = f1_score(y_test, y_pred, average='binary')

            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

        end_time = time.time()
        training_time = (end_time - start_time) * 1_000_000  # Convert to microseconds
        total_training_time += training_time

    mean_precision = np.nanmean(precision_scores)
    mean_recall = np.nanmean(recall_scores)
    mean_f1 = np.nanmean(f1_scores)

    std_precision = np.nanstd(precision_scores)
    std_recall = np.nanstd(recall_scores)
    std_f1 = np.nanstd(f1_scores)

    print(f"Total Training Time for {num_runs} runs: {total_training_time:.2f} microseconds")
    print(f"Mean Precision: {mean_precision:.4f}, Standard Deviation: {std_precision:.4f}")
    print(f"Mean Recall: {mean_recall:.4f}, Standard Deviation: {std_recall:.4f}")
    print(f"Mean F1-Score: {mean_f1:.4f}, Standard Deviation: {std_f1:.4f}")

    return knn  # Return the last trained KNN model for predictions

knn_model = run_knn_experiment(embeddings, labels)

# --- Prediction Phase for Unseen Java Programs ---

def predict_directory(unseen_folder, knn_model):
    predictions = {}

    # Process each unseen Java program in the directory
    for program_file in os.listdir(unseen_folder):
        file_path = os.path.join(unseen_folder, program_file)

        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='iso-8859-1') as f:
                new_program_content = f.read()

            # Tokenize and encode the new program content
            inputs = tokenizer(new_program_content, return_tensors="pt", truncation=True, padding="max_length", max_length=512).to(device)

            # Extract embedding
            start_time = time.time()  # Start timing embedding extraction
            with torch.no_grad():
                outputs = model(**inputs)
            new_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            end_time = time.time()  # End timing embedding extraction

            # Feed the embedding into the KNN model to predict
            start_time = time.time()  # Start timing prediction
            predicted_label = knn_model.predict(new_embedding)
            end_time = time.time()  # End timing prediction

            predictions[program_file] = {
                "predicted_label": predicted_label[0],
                "embedding_extraction_time": f"{(end_time - start_time) * 1_000_000:.2f} microseconds",
                "prediction_time": f"{(end_time - start_time) * 1_000_000:.2f} microseconds"
            }

    return predictions

# Example usage:
unseen_java_folder = '/content/newexample'  # Replace with the folder containing unseen Java programs

# Use the last KNN model trained for prediction
predictions = predict_directory(unseen_java_folder, knn_model)

# Print the predictions and their respective times
for program_file, result in predictions.items():
    print(f"Program: {program_file}, Predicted Label: {result['predicted_label']}, "
          f"Embedding Extraction Time: {result['embedding_extraction_time']}, "
          f"Prediction Time: {result['prediction_time']}")


Using device: cpu
Time taken to load and label programs: 0.01 seconds
Time taken to create DataFrame and save to CSV: 0.05 seconds
Labels and contents saved to embeddings/af3.csv
Time taken to load DataFrame from CSV: 0.03 seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing embeddings: 100%|██████████| 79/79 [03:53<00:00,  2.96s/it]


Time taken to extract embeddings: 234.48 seconds
Time taken to save embeddings: 0.00 seconds
Embeddings saved to embeddings/embeddings.npy
Run 1
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 2
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 3
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 4
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 5
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 6
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 7
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 8
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 9
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 10
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Total Training Time for 10 runs: 630822.90 microseconds
Mean Precision: 0.7210, Standard Deviation: 0.1686
Mean Recall: 0.6400, Standard Deviation: 0.2653
Mean F1-Score: 0.6218, Standard Deviation: 0.1837
Program: nons (30).java, Predicted Label: 0, Embedding Extraction Time: 1634.60 microseconds, Prediction Time: 1634.60 microseconds
Program: sing

In [None]:
import shutil

# Specify the directory path you want to delete
dir_path = '/content/builder'

# Remove the directory and its contents
shutil.rmtree(dir_path)

print(f"Directory '{dir_path}' has been deleted.")


Directory '/content/builder' has been deleted.


In [None]:
import os
import pandas as pd
import torch
import numpy as np
import random
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

# Check if CUDA is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define the path to the folder containing your Java programs
java_programs_folder = '/content/builder'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Start timing for loading programs
start_time = time.time()

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "builder" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# End timing for loading programs
end_time = time.time()
print(f"Time taken to load and label programs: {end_time - start_time:.2f} seconds")

# Start timing for creating DataFrame and saving to CSV
start_time = time.time()

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Ensure the directory exists
output_dir = 'embeddings'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the DataFrame to a CSV file
csv_path = os.path.join(output_dir, 'af3.csv')  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

end_time = time.time()
print(f"Time taken to create DataFrame and save to CSV: {end_time - start_time:.2f} seconds")
print(f"Labels and contents saved to {csv_path}")

# Start timing for loading DataFrame from CSV
start_time = time.time()

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

end_time = time.time()
print(f"Time taken to load DataFrame from CSV: {end_time - start_time:.2f} seconds")

# Start timing for embedding extraction
start_time = time.time()

# Load pre-trained RoBERTa model and tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base").to(device)

# Tokenize and encode the content using RoBERTa
embeddings = []

for content in tqdm(df['Content'], desc="Processing embeddings"):
    # Tokenize and encode
    inputs = tokenizer(content, return_tensors="pt", truncation=True, padding="max_length", max_length=512).to(device)

    # Get the output embeddings from the model
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the [CLS] token embedding
    cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()

    # Store the embedding
    embeddings.append(cls_embedding)

# End timing for embedding extraction
end_time = time.time()
print(f"Time taken to extract embeddings: {end_time - start_time:.2f} seconds")

# Convert the list of embeddings into a numpy array
embeddings = np.vstack(embeddings)

# Save embeddings to a numpy file
start_time = time.time()
np.save(os.path.join(output_dir, "embeddings.npy"), embeddings)
end_time = time.time()
print(f"Time taken to save embeddings: {end_time - start_time:.2f} seconds")
print(f"Embeddings saved to {os.path.join(output_dir, 'embeddings.npy')}")

# Define a constant for the number of runs
NUM_RUNS = 10  # Change this value to set the number of runs

# Function to run training and evaluation
def run_knn_experiment(embeddings, labels, num_runs=NUM_RUNS):
    precision_scores = []
    recall_scores = []
    f1_scores = []
    total_training_time = 0

    # Convert labels to a NumPy array for proper indexing
    labels = np.array(labels)

    # Initialize StratifiedKFold
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for run in range(num_runs):
        print(f"Run {run + 1}")
        start_time = time.time()

        # Train and evaluate on each fold
        for fold, (train_idx, test_idx) in enumerate(skf.split(embeddings, labels)):
            print(f"  Fold {fold + 1}")

            # Split into training and testing sets
            X_train, X_test = embeddings[train_idx], embeddings[test_idx]
            y_train, y_test = labels[train_idx], labels[test_idx]

            # Example classifier: KNeighborsClassifier
            knn = KNeighborsClassifier(n_neighbors=3)
            knn.fit(X_train, y_train)

            # Make predictions
            y_pred = knn.predict(X_test)

            # Evaluate the classifier
            precision = precision_score(y_test, y_pred, average='binary')
            recall = recall_score(y_test, y_pred, average='binary')
            f1 = f1_score(y_test, y_pred, average='binary')

            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

        end_time = time.time()
        training_time = (end_time - start_time) * 1_000_000  # Convert to microseconds
        total_training_time += training_time

    mean_precision = np.nanmean(precision_scores)
    mean_recall = np.nanmean(recall_scores)
    mean_f1 = np.nanmean(f1_scores)

    std_precision = np.nanstd(precision_scores)
    std_recall = np.nanstd(recall_scores)
    std_f1 = np.nanstd(f1_scores)

    print(f"Total Training Time for {num_runs} runs: {total_training_time:.2f} microseconds")
    print(f"Mean Precision: {mean_precision:.4f}, Standard Deviation: {std_precision:.4f}")
    print(f"Mean Recall: {mean_recall:.4f}, Standard Deviation: {std_recall:.4f}")
    print(f"Mean F1-Score: {mean_f1:.4f}, Standard Deviation: {std_f1:.4f}")

    return knn  # Return the last trained KNN model for predictions

knn_model = run_knn_experiment(embeddings, labels)

# --- Prediction Phase for Unseen Java Programs ---

def predict_directory(unseen_folder, knn_model):
    predictions = {}

    # Process each unseen Java program in the directory
    for program_file in os.listdir(unseen_folder):
        file_path = os.path.join(unseen_folder, program_file)

        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='iso-8859-1') as f:
                new_program_content = f.read()

            # Tokenize and encode the new program content
            inputs = tokenizer(new_program_content, return_tensors="pt", truncation=True, padding="max_length", max_length=512).to(device)

            # Extract embedding
            start_time = time.time()  # Start timing embedding extraction
            with torch.no_grad():
                outputs = model(**inputs)
            new_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            end_time = time.time()  # End timing embedding extraction

            # Feed the embedding into the KNN model to predict
            start_time = time.time()  # Start timing prediction
            predicted_label = knn_model.predict(new_embedding)
            end_time = time.time()  # End timing prediction

            predictions[program_file] = {
                "predicted_label": predicted_label[0],
                "embedding_extraction_time": f"{(end_time - start_time) * 1_000_000:.2f} microseconds",
                "prediction_time": f"{(end_time - start_time) * 1_000_000:.2f} microseconds"
            }

    return predictions

# Example usage:
unseen_java_folder = '/content/newexample'  # Replace with the folder containing unseen Java programs

# Use the last KNN model trained for prediction
predictions = predict_directory(unseen_java_folder, knn_model)

# Print the predictions and their respective times
for program_file, result in predictions.items():
    print(f"Program: {program_file}, Predicted Label: {result['predicted_label']}, "
          f"Embedding Extraction Time: {result['embedding_extraction_time']}, "
          f"Prediction Time: {result['prediction_time']}")


Using device: cpu
Time taken to load and label programs: 0.01 seconds
Time taken to create DataFrame and save to CSV: 0.03 seconds
Labels and contents saved to embeddings/af3.csv
Time taken to load DataFrame from CSV: 0.02 seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing embeddings: 100%|██████████| 105/105 [04:17<00:00,  2.45s/it]


Time taken to extract embeddings: 257.76 seconds
Time taken to save embeddings: 0.00 seconds
Embeddings saved to embeddings/embeddings.npy
Run 1
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 2
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 3
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 4
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 5
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 6
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 7
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 8
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 9
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 10
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Total Training Time for 10 runs: 648471.59 microseconds
Mean Precision: 0.8667, Standard Deviation: 0.2667
Mean Recall: 0.6000, Standard Deviation: 0.2000
Mean F1-Score: 0.6800, Standard Deviation: 0.1904
Program: builder (5).java, Predicted Label: 1, Embedding Extraction Time: 2240.66 microseconds, Prediction Time: 2240.66 microseconds
Program: bu

**Abstract factory, standard deviation, training time, prediction time.**

In [None]:
import shutil

# Specify the directory path you want to delete
dir_path = '/content/embeddings'

# Remove the directory and its contents
shutil.rmtree(dir_path)

print(f"Directory '{dir_path}' has been deleted.")


Directory '/content/embeddings' has been deleted.


In [None]:
import os
import pandas as pd
import torch
import numpy as np
import random
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

# Check if CUDA is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define the path to the folder containing your Java programs
java_programs_folder = '/content/abstractfactory'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Start timing for loading programs
start_time = time.time()

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "abstractfactory" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# End timing for loading programs
end_time = time.time()
print(f"Time taken to load and label programs: {end_time - start_time:.2f} seconds")

# Start timing for creating DataFrame and saving to CSV
start_time = time.time()

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Ensure the directory exists
output_dir = 'embeddings'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the DataFrame to a CSV file
csv_path = os.path.join(output_dir, 'af3.csv')  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

end_time = time.time()
print(f"Time taken to create DataFrame and save to CSV: {end_time - start_time:.2f} seconds")
print(f"Labels and contents saved to {csv_path}")

# Start timing for loading DataFrame from CSV
start_time = time.time()

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

end_time = time.time()
print(f"Time taken to load DataFrame from CSV: {end_time - start_time:.2f} seconds")

# Start timing for embedding extraction
start_time = time.time()

# Load pre-trained RoBERTa model and tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base").to(device)

# Tokenize and encode the content using RoBERTa
embeddings = []

for content in tqdm(df['Content'], desc="Processing embeddings"):
    # Tokenize and encode
    inputs = tokenizer(content, return_tensors="pt", truncation=True, padding="max_length", max_length=512).to(device)

    # Get the output embeddings from the model
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the [CLS] token embedding
    cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()

    # Store the embedding
    embeddings.append(cls_embedding)

# End timing for embedding extraction
end_time = time.time()
print(f"Time taken to extract embeddings: {end_time - start_time:.2f} seconds")

# Convert the list of embeddings into a numpy array
embeddings = np.vstack(embeddings)

# Save embeddings to a numpy file
start_time = time.time()
np.save(os.path.join(output_dir, "embeddings.npy"), embeddings)
end_time = time.time()
print(f"Time taken to save embeddings: {end_time - start_time:.2f} seconds")
print(f"Embeddings saved to {os.path.join(output_dir, 'embeddings.npy')}")

# Define a constant for the number of runs
NUM_RUNS = 10  # Change this value to set the number of runs

# Function to run training and evaluation
def run_knn_experiment(embeddings, labels, num_runs=NUM_RUNS):
    precision_scores = []
    recall_scores = []
    f1_scores = []
    total_training_time = 0

    # Convert labels to a NumPy array for proper indexing
    labels = np.array(labels)

    # Initialize StratifiedKFold
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for run in range(num_runs):
        print(f"Run {run + 1}")
        start_time = time.time()

        # Train and evaluate on each fold
        for fold, (train_idx, test_idx) in enumerate(skf.split(embeddings, labels)):
            print(f"  Fold {fold + 1}")

            # Split into training and testing sets
            X_train, X_test = embeddings[train_idx], embeddings[test_idx]
            y_train, y_test = labels[train_idx], labels[test_idx]

            # Example classifier: KNeighborsClassifier
            knn = KNeighborsClassifier(n_neighbors=3)
            knn.fit(X_train, y_train)

            # Make predictions
            y_pred = knn.predict(X_test)

            # Evaluate the classifier
            precision = precision_score(y_test, y_pred, average='binary')
            recall = recall_score(y_test, y_pred, average='binary')
            f1 = f1_score(y_test, y_pred, average='binary')

            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

        end_time = time.time()
        training_time = (end_time - start_time) * 1_000_000  # Convert to microseconds
        total_training_time += training_time

    mean_precision = np.nanmean(precision_scores)
    mean_recall = np.nanmean(recall_scores)
    mean_f1 = np.nanmean(f1_scores)

    std_precision = np.nanstd(precision_scores)
    std_recall = np.nanstd(recall_scores)
    std_f1 = np.nanstd(f1_scores)

    print(f"Total Training Time for {num_runs} runs: {total_training_time:.2f} microseconds")
    print(f"Mean Precision: {mean_precision:.4f}, Standard Deviation: {std_precision:.4f}")
    print(f"Mean Recall: {mean_recall:.4f}, Standard Deviation: {std_recall:.4f}")
    print(f"Mean F1-Score: {mean_f1:.4f}, Standard Deviation: {std_f1:.4f}")

    return knn  # Return the last trained KNN model for predictions

knn_model = run_knn_experiment(embeddings, labels)

# --- Prediction Phase for Unseen Java Programs ---

def predict_directory(unseen_folder, knn_model):
    predictions = {}

    # Process each unseen Java program in the directory
    for program_file in os.listdir(unseen_folder):
        file_path = os.path.join(unseen_folder, program_file)

        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='iso-8859-1') as f:
                new_program_content = f.read()

            # Tokenize and encode the new program content
            inputs = tokenizer(new_program_content, return_tensors="pt", truncation=True, padding="max_length", max_length=512).to(device)

            # Extract embedding
            start_time = time.time()  # Start timing embedding extraction
            with torch.no_grad():
                outputs = model(**inputs)
            new_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            end_time = time.time()  # End timing embedding extraction

            # Feed the embedding into the KNN model to predict
            start_time = time.time()  # Start timing prediction
            predicted_label = knn_model.predict(new_embedding)
            end_time = time.time()  # End timing prediction

            predictions[program_file] = {
                "predicted_label": predicted_label[0],
                "embedding_extraction_time": f"{(end_time - start_time) * 1_000_000:.2f} microseconds",
                "prediction_time": f"{(end_time - start_time) * 1_000_000:.2f} microseconds"
            }

    return predictions

# Example usage:
unseen_java_folder = '/content/newexample'  # Replace with the folder containing unseen Java programs

# Use the last KNN model trained for prediction
predictions = predict_directory(unseen_java_folder, knn_model)

# Print the predictions and their respective times
for program_file, result in predictions.items():
    print(f"Program: {program_file}, Predicted Label: {result['predicted_label']}, "
          f"Embedding Extraction Time: {result['embedding_extraction_time']}, "
          f"Prediction Time: {result['prediction_time']}")


Using device: cpu
Time taken to load and label programs: 0.01 seconds
Time taken to create DataFrame and save to CSV: 0.02 seconds
Labels and contents saved to embeddings/af3.csv
Time taken to load DataFrame from CSV: 0.01 seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing embeddings: 100%|██████████| 90/90 [03:55<00:00,  2.62s/it]


Time taken to extract embeddings: 236.38 seconds
Time taken to save embeddings: 0.00 seconds
Embeddings saved to embeddings/embeddings.npy
Run 1
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 2
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 3
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 4
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 5
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 6
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 7
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 8
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 9
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 10
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Total Training Time for 10 runs: 499723.67 microseconds
Mean Precision: 0.7976, Standard Deviation: 0.1746
Mean Recall: 0.8667, Standard Deviation: 0.1633
Mean F1-Score: 0.8102, Standard Deviation: 0.1147
Program: abstractfactory (3).java, Predicted Label: 1, Embedding Extraction Time: 1977.44 microseconds, Prediction Time: 1977.44 microseconds
Pro

In [None]:
import shutil

# Specify the directory path you want to delete
dir_path = '/content/embeddings'

# Remove the directory and its contents
shutil.rmtree(dir_path)

print(f"Directory '{dir_path}' has been deleted.")


Directory '/content/embeddings' has been deleted.


**Factory method, standard deviation, training time, prediction time calculation**

In [None]:
import os
import pandas as pd
import torch
import numpy as np
import random
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

# Check if CUDA is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define the path to the folder containing your Java programs
java_programs_folder = '/content/factorymethod'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Start timing for loading programs
start_time = time.time()

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "factorymethod" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# End timing for loading programs
end_time = time.time()
print(f"Time taken to load and label programs: {end_time - start_time:.2f} seconds")

# Start timing for creating DataFrame and saving to CSV
start_time = time.time()

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Ensure the directory exists
output_dir = 'embeddings'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the DataFrame to a CSV file
csv_path = os.path.join(output_dir, 'af3.csv')  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

end_time = time.time()
print(f"Time taken to create DataFrame and save to CSV: {end_time - start_time:.2f} seconds")
print(f"Labels and contents saved to {csv_path}")

# Start timing for loading DataFrame from CSV
start_time = time.time()

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

end_time = time.time()
print(f"Time taken to load DataFrame from CSV: {end_time - start_time:.2f} seconds")

# Start timing for embedding extraction
start_time = time.time()

# Load pre-trained RoBERTa model and tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base").to(device)

# Tokenize and encode the content using RoBERTa
embeddings = []

for content in tqdm(df['Content'], desc="Processing embeddings"):
    # Tokenize and encode
    inputs = tokenizer(content, return_tensors="pt", truncation=True, padding="max_length", max_length=512).to(device)

    # Get the output embeddings from the model
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the [CLS] token embedding
    cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()

    # Store the embedding
    embeddings.append(cls_embedding)

# End timing for embedding extraction
end_time = time.time()
print(f"Time taken to extract embeddings: {end_time - start_time:.2f} seconds")

# Convert the list of embeddings into a numpy array
embeddings = np.vstack(embeddings)

# Save embeddings to a numpy file
start_time = time.time()
np.save(os.path.join(output_dir, "embeddings.npy"), embeddings)
end_time = time.time()
print(f"Time taken to save embeddings: {end_time - start_time:.2f} seconds")
print(f"Embeddings saved to {os.path.join(output_dir, 'embeddings.npy')}")

# Define a constant for the number of runs
NUM_RUNS = 10  # Change this value to set the number of runs

# Function to run training and evaluation
def run_knn_experiment(embeddings, labels, num_runs=NUM_RUNS):
    precision_scores = []
    recall_scores = []
    f1_scores = []
    total_training_time = 0

    # Convert labels to a NumPy array for proper indexing
    labels = np.array(labels)

    # Initialize StratifiedKFold
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for run in range(num_runs):
        print(f"Run {run + 1}")
        start_time = time.time()

        # Train and evaluate on each fold
        for fold, (train_idx, test_idx) in enumerate(skf.split(embeddings, labels)):
            print(f"  Fold {fold + 1}")

            # Split into training and testing sets
            X_train, X_test = embeddings[train_idx], embeddings[test_idx]
            y_train, y_test = labels[train_idx], labels[test_idx]

            # Example classifier: KNeighborsClassifier
            knn = KNeighborsClassifier(n_neighbors=3)
            knn.fit(X_train, y_train)

            # Make predictions
            y_pred = knn.predict(X_test)

            # Evaluate the classifier
            precision = precision_score(y_test, y_pred, average='binary')
            recall = recall_score(y_test, y_pred, average='binary')
            f1 = f1_score(y_test, y_pred, average='binary')

            precision_scores.append(precision)
            recall_scores.append(recall)
            f1_scores.append(f1)

        end_time = time.time()
        training_time = (end_time - start_time) * 1_000_000  # Convert to microseconds
        total_training_time += training_time

    mean_precision = np.nanmean(precision_scores)
    mean_recall = np.nanmean(recall_scores)
    mean_f1 = np.nanmean(f1_scores)

    std_precision = np.nanstd(precision_scores)
    std_recall = np.nanstd(recall_scores)
    std_f1 = np.nanstd(f1_scores)

    print(f"Total Training Time for {num_runs} runs: {total_training_time:.2f} microseconds")
    print(f"Mean Precision: {mean_precision:.4f}, Standard Deviation: {std_precision:.4f}")
    print(f"Mean Recall: {mean_recall:.4f}, Standard Deviation: {std_recall:.4f}")
    print(f"Mean F1-Score: {mean_f1:.4f}, Standard Deviation: {std_f1:.4f}")

    return knn  # Return the last trained KNN model for predictions

knn_model = run_knn_experiment(embeddings, labels)

# --- Prediction Phase for Unseen Java Programs ---

def predict_directory(unseen_folder, knn_model):
    predictions = {}

    # Process each unseen Java program in the directory
    for program_file in os.listdir(unseen_folder):
        file_path = os.path.join(unseen_folder, program_file)

        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='iso-8859-1') as f:
                new_program_content = f.read()

            # Tokenize and encode the new program content
            inputs = tokenizer(new_program_content, return_tensors="pt", truncation=True, padding="max_length", max_length=512).to(device)

            # Extract embedding
            start_time = time.time()  # Start timing embedding extraction
            with torch.no_grad():
                outputs = model(**inputs)
            new_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            end_time = time.time()  # End timing embedding extraction

            # Feed the embedding into the KNN model to predict
            start_time = time.time()  # Start timing prediction
            predicted_label = knn_model.predict(new_embedding)
            end_time = time.time()  # End timing prediction

            predictions[program_file] = {
                "predicted_label": predicted_label[0],
                "embedding_extraction_time": f"{(end_time - start_time) * 1_000_000:.2f} microseconds",
                "prediction_time": f"{(end_time - start_time) * 1_000_000:.2f} microseconds"
            }

    return predictions

# Example usage:
unseen_java_folder = '/content/newexample'  # Replace with the folder containing unseen Java programs

# Use the last KNN model trained for prediction
predictions = predict_directory(unseen_java_folder, knn_model)

# Print the predictions and their respective times
for program_file, result in predictions.items():
    print(f"Program: {program_file}, Predicted Label: {result['predicted_label']}, "
          f"Embedding Extraction Time: {result['embedding_extraction_time']}, "
          f"Prediction Time: {result['prediction_time']}")


Using device: cpu
Time taken to load and label programs: 0.01 seconds
Time taken to create DataFrame and save to CSV: 0.03 seconds
Labels and contents saved to embeddings/af3.csv
Time taken to load DataFrame from CSV: 0.03 seconds


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing embeddings: 100%|██████████| 78/78 [03:40<00:00,  2.82s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Time taken to extract embeddings: 221.08 seconds
Time taken to save embeddings: 0.00 seconds
Embeddings saved to embeddings/embeddings.npy
Run 1
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 2
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 3
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 4
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 5
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 6
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 7
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 8
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Run 9
  Fold 1
  Fold 2


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  Fold 3
  Fold 4
  Fold 5
Run 10
  Fold 1
  Fold 2
  Fold 3
  Fold 4
  Fold 5
Total Training Time for 10 runs: 505620.48 microseconds
Mean Precision: 0.5667, Standard Deviation: 0.3266
Mean Recall: 0.6333, Standard Deviation: 0.3712
Mean F1-Score: 0.5800, Standard Deviation: 0.3124


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Program: factorymethod (8).java, Predicted Label: 1, Embedding Extraction Time: 2053.26 microseconds, Prediction Time: 2053.26 microseconds
Program: factorymethod (5).java, Predicted Label: 1, Embedding Extraction Time: 1656.06 microseconds, Prediction Time: 1656.06 microseconds
Program: nonfm (2).java, Predicted Label: 0, Embedding Extraction Time: 1658.68 microseconds, Prediction Time: 1658.68 microseconds
Program: nonfm (1).java, Predicted Label: 0, Embedding Extraction Time: 1659.39 microseconds, Prediction Time: 1659.39 microseconds
