In [1]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "singleton" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 3  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nons (54).java, Predicted Label: 1, True Label: 1
File: nons (29).java, Predicted Label: 1, True Label: 0
File: nons (25).java, Predicted Label: 1, True Label: 1
File: singleton (25).java, Predicted Label: 0, True Label: 1
File: nons (27).java, Predicted Label: 1, True Label: 1
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (34).java, Predicted Label: 0, True Label: 0
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (61).java, Predicted Label: 1, True Label: 1
Precision: 0.7777777777777778, Recall: 0.7777777777777778, F-score: 0.7777777777777778


In [None]:
Singleton on different setting of programs

In [2]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "singleton" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 3  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nons (54).java, Predicted Label: 1, True Label: 1
File: nons (25).java, Predicted Label: 1, True Label: 1
File: singleton (25).java, Predicted Label: 1, True Label: 1
File: nons (27).java, Predicted Label: 1, True Label: 0
File: singleton (18).java, Predicted Label: 0, True Label: 0
File: nons (19).java, Predicted Label: 0, True Label: 0
File: nons (55).java, Predicted Label: 0, True Label: 0
File: nons (34).java, Predicted Label: 1, True Label: 0
File: singleton (24).java, Predicted Label: 0, True Label: 0
File: singleton (9).java, Predicted Label: 1, True Label: 1
Precision: 0.8666666666666666, Recall: 0.8, F-score: 0.8


In [None]:
Singleton on different setting of programs

In [3]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "singleton" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 3  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nons (12).java, Predicted Label: 1, True Label: 1
File: nons (50).java, Predicted Label: 0, True Label: 0
File: singleton (25).java, Predicted Label: 1, True Label: 1
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: singleton (24).java, Predicted Label: 0, True Label: 0
File: nons (4).java, Predicted Label: 1, True Label: 1
File: nons (44).java, Predicted Label: 0, True Label: 0
File: nons (38).java, Predicted Label: 0, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: singleton (14).java, Predicted Label: 0, True Label: 0
Precision: 1.0, Recall: 1.0, F-score: 1.0


In [None]:
Singleton with different settings

In [4]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "singleton" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 3  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nons (54).java, Predicted Label: 0, True Label: 0
File: singleton (25).java, Predicted Label: 1, True Label: 0
File: nons (42).java, Predicted Label: 1, True Label: 1
File: nons (27).java, Predicted Label: 1, True Label: 1
File: singleton (18).java, Predicted Label: 0, True Label: 0
File: nons (55).java, Predicted Label: 1, True Label: 1
File: nons (34).java, Predicted Label: 1, True Label: 1
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (61).java, Predicted Label: 0, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
Precision: 0.9142857142857143, Recall: 0.9, F-score: 0.8967032967032967


In [None]:
Builder with different settings

In [5]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'builder'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "builder" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 3  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: builder (2).java, Predicted Label: 1, True Label: 1
File: builder (5).java, Predicted Label: 1, True Label: 1
File: nonbuilder (55).java, Predicted Label: 1, True Label: 1
File: builder (6).java, Predicted Label: 1, True Label: 1
Precision: 1.0, Recall: 1.0, F-score: 1.0


In [None]:
Builder with different settings

In [6]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'builder'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "builder" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 3  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nonb (14).java, Predicted Label: 1, True Label: 1
File: builder (2).java, Predicted Label: 1, True Label: 1
File: builder (5).java, Predicted Label: 0, True Label: 0
File: builder (6).java, Predicted Label: 1, True Label: 1
Precision: 1.0, Recall: 1.0, F-score: 1.0


In [None]:
Builder with different settings

In [7]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'builder'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "builder" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 3  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: builder (2).java, Predicted Label: 1, True Label: 1
File: nonb (89).java, Predicted Label: 0, True Label: 0
File: nonb (49).java, Predicted Label: 1, True Label: 1
File: nonb (28).java, Predicted Label: 0, True Label: 0
Precision: 1.0, Recall: 1.0, F-score: 1.0


In [None]:
Builder with different settings

In [8]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'builder'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "builder" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 3  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nonb (14).java, Predicted Label: 1, True Label: 1
File: builder (2).java, Predicted Label: 0, True Label: 0
File: nonb (133).java, Predicted Label: 0, True Label: 0
File: builder (5).java, Predicted Label: 1, True Label: 1
File: builder (6).java, Predicted Label: 1, True Label: 1
Precision: 1.0, Recall: 1.0, F-score: 1.0


In [None]:
Builder with different settings

In [9]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'builder'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "builder" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 3  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: builder (2).java, Predicted Label: 1, True Label: 1
File: builder (5).java, Predicted Label: 1, True Label: 1
File: builder (6).java, Predicted Label: 1, True Label: 1
File: nonb (18).java, Predicted Label: 0, True Label: 0
Precision: 1.0, Recall: 1.0, F-score: 1.0


In [None]:
Abstract Factory with different settings

In [11]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'abstractfactory'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "abstractfactory" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 5  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nonab (11).java, Predicted Label: 0, True Label: 0
File: abstractfactory (2).java, Predicted Label: 0, True Label: 0
File: nonab (5).java, Predicted Label: 0, True Label: 0
File: nonab (81).java, Predicted Label: 1, True Label: 1
File: nonab (7).java, Predicted Label: 1, True Label: 1
File: abstractfactory (10).java, Predicted Label: 1, True Label: 1
Precision: 1.0, Recall: 1.0, F-score: 1.0


In [None]:
Abstract Factory with different settings

In [12]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'abstractfactory'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "abstractfactory" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 5  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nonab (11).java, Predicted Label: 1, True Label: 1
File: nonab (48).java, Predicted Label: 1, True Label: 1
File: abstractfactory (2).java, Predicted Label: 0, True Label: 0
File: nonab (5).java, Predicted Label: 1, True Label: 1
File: nonab (65).java, Predicted Label: 0, True Label: 0
File: nonab (7).java, Predicted Label: 1, True Label: 1
File: abstractfactory (10).java, Predicted Label: 0, True Label: 0
Precision: 1.0, Recall: 1.0, F-score: 1.0


In [None]:
Abstract Factory with different settings

In [13]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'abstractfactory'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "abstractfactory" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 5  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nonab (11).java, Predicted Label: 1, True Label: 1
File: nonab (48).java, Predicted Label: 1, True Label: 1
File: abstractfactory (2).java, Predicted Label: 0, True Label: 0
File: nonab (5).java, Predicted Label: 1, True Label: 1
File: nonab (65).java, Predicted Label: 0, True Label: 0
File: nonab (7).java, Predicted Label: 1, True Label: 1
File: abstractfactory (10).java, Predicted Label: 0, True Label: 0
Precision: 1.0, Recall: 1.0, F-score: 1.0


In [None]:
Abstract Factory with different settings

In [15]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'abstractfactory'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "abstractfactory" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 5  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: non-DP (37).java, Predicted Label: 0, True Label: 0
File: abstractfactory (2).java, Predicted Label: 0, True Label: 0
File: non-DP (43).java, Predicted Label: 1, True Label: 1
File: non-DP (42).java, Predicted Label: 1, True Label: 1
File: non-DP (39).java, Predicted Label: 0, True Label: 0
File: non-DP (31).java, Predicted Label: 1, True Label: 1
File: non-DP (38).java, Predicted Label: 1, True Label: 1
Precision: 1.0, Recall: 1.0, F-score: 1.0


In [None]:
Abstract Factory with different settings

In [16]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'abstractfactory'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "abstractfactory" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 5  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nondp (18).java, Predicted Label: 0, True Label: 0
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: nondp (29).java, Predicted Label: 1, True Label: 1
File: nondp (31).java, Predicted Label: 1, True Label: 1
File: nondp (11).java, Predicted Label: 1, True Label: 1
File: nondp (23).java, Predicted Label: 1, True Label: 1
File: nondp (38).java, Predicted Label: 0, True Label: 0
File: nondp (30).java, Predicted Label: 1, True Label: 1
Precision: 1.0, Recall: 1.0, F-score: 1.0


In [None]:
Abstract Factory with different settings

In [17]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'abstractfactory'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "abstractfactory" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 5  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nondp (7).java, Predicted Label: 0, True Label: 0
File: nondp (2).java, Predicted Label: 0, True Label: 0
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: nondp (33).java, Predicted Label: 0, True Label: 0
File: nondp (39).java, Predicted Label: 1, True Label: 0
File: nondp (45).java, Predicted Label: 1, True Label: 1
File: nondp (19).java, Predicted Label: 0, True Label: 0
File: nondp (26).java, Predicted Label: 1, True Label: 1
Precision: 0.90625, Recall: 0.875, F-score: 0.876984126984127


In [None]:
Abstract Factory with different settings

In [19]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'abstractfactory'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "abstractfactory" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 5  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nondp (7).java, Predicted Label: 0, True Label: 0
File: nondp (2).java, Predicted Label: 1, True Label: 1
File: nondp (18).java, Predicted Label: 0, True Label: 1
File: abstractfactory (2).java, Predicted Label: 0, True Label: 0
File: nondp (33).java, Predicted Label: 1, True Label: 1
File: nondp (29).java, Predicted Label: 0, True Label: 0
File: nondp (31).java, Predicted Label: 1, True Label: 1
File: nondp (11).java, Predicted Label: 0, True Label: 0
File: nondp (39).java, Predicted Label: 0, True Label: 0
File: nondp (45).java, Predicted Label: 0, True Label: 0
File: nondp (19).java, Predicted Label: 1, True Label: 1
File: nondp (23).java, Predicted Label: 0, True Label: 0
File: nondp (38).java, Predicted Label: 0, True Label: 0
File: nondp (30).java, Predicted Label: 0, True Label: 0
Precision: 0.9357142857142857, Recall: 0.9285714285714286, F-score: 0.926482873851295


In [None]:
Factory Method with different settings

In [20]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'factorymethod'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "factorymethod" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 5  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nondp (33).java, Predicted Label: 1, True Label: 1
File: nondp (5).java, Predicted Label: 1, True Label: 1
File: nondp (4).java, Predicted Label: 1, True Label: 1
File: nondp (1).java, Predicted Label: 0, True Label: 0
File: nondp (15).java, Predicted Label: 0, True Label: 0
File: nondp (34).java, Predicted Label: 1, True Label: 1
File: factorymethod (1).java, Predicted Label: 0, True Label: 0
File: factorymethod (9).java, Predicted Label: 0, True Label: 0
File: nondp (26).java, Predicted Label: 0, True Label: 0
File: nondp (8).java, Predicted Label: 0, True Label: 0
Precision: 1.0, Recall: 1.0, F-score: 1.0


In [None]:
Factory Method with different settings

In [1]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'factorymethod'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "factorymethod" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 5  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nonfm (6).java, Predicted Label: 1, True Label: 0
File: nonfm (5).java, Predicted Label: 0, True Label: 0
File: nonfm (13).java, Predicted Label: 0, True Label: 0
File: factorymethod (1).java, Predicted Label: 1, True Label: 0
File: factorymethod (9).java, Predicted Label: 1, True Label: 0
File: nonfm (4).java, Predicted Label: 1, True Label: 1
Precision: 0.875, Recall: 0.5, F-score: 0.5428571428571429


In [None]:
Factory Method with different settings

In [3]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'factorymethod'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "factorymethod" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 5  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nonfm (38).java, Predicted Label: 0, True Label: 0
File: nonfm (52).java, Predicted Label: 1, True Label: 1
File: nonfm (68).java, Predicted Label: 1, True Label: 1
File: nonfm (37).java, Predicted Label: 1, True Label: 0
File: nonfm (29).java, Predicted Label: 1, True Label: 1
File: factorymethod (1).java, Predicted Label: 1, True Label: 1
File: factorymethod (9).java, Predicted Label: 1, True Label: 1
Precision: 0.880952380952381, Recall: 0.8571428571428571, F-score: 0.8398268398268397


In [None]:
Prototype with different settings

In [4]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'prototype'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "prototype" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 5  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: prototype (27).java, Predicted Label: 1, True Label: 1
File: nonp (23).java, Predicted Label: 0, True Label: 1
File: nonp (5).java, Predicted Label: 0, True Label: 0
File: prototype (13).java, Predicted Label: 0, True Label: 0
File: prototype (22).java, Predicted Label: 0, True Label: 1
File: nonp (7).java, Predicted Label: 1, True Label: 1
File: nonp (13).java, Predicted Label: 0, True Label: 0
File: nonp (29).java, Predicted Label: 0, True Label: 0
File: prototype (15).java, Predicted Label: 1, True Label: 1
File: nonp (44).java, Predicted Label: 0, True Label: 0
File: nonp (58).java, Predicted Label: 1, True Label: 1
File: prototype (14).java, Predicted Label: 1, True Label: 1
File: prototype (16).java, Predicted Label: 0, True Label: 0
Precision: 0.8846153846153846, Recall: 0.8461538461538461, F-score: 0.8443223443223442


In [None]:
Prototype with different settings

In [5]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'prototype'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "prototype" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 5  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: prototype (27).java, Predicted Label: 0, True Label: 0
File: nonp (53).java, Predicted Label: 0, True Label: 1
File: nonp (23).java, Predicted Label: 1, True Label: 1
File: nonp (43).java, Predicted Label: 0, True Label: 0
File: nonp (35).java, Predicted Label: 0, True Label: 0
File: nonp (51).java, Predicted Label: 0, True Label: 0
File: nonp (59).java, Predicted Label: 0, True Label: 0
File: nonp (5).java, Predicted Label: 0, True Label: 0
File: nonp (31).java, Predicted Label: 0, True Label: 0
File: prototype (13).java, Predicted Label: 0, True Label: 0
File: prototype (22).java, Predicted Label: 0, True Label: 1
File: nonp (7).java, Predicted Label: 0, True Label: 0
File: nonp (13).java, Predicted Label: 1, True Label: 1
File: nonp (29).java, Predicted Label: 1, True Label: 1
File: prototype (15).java, Predicted Label: 0, True Label: 0
File: nonp (44).java, Predicted Label: 0, True Label: 0
File: nonp (8).java, Predicted Label: 0, True Label: 0
File: nonp (37).java, Predicted

In [None]:
Prototype with different settings

In [None]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'prototype'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "prototype" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 5  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [1]:
import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

# Function to get embeddings for a given design pattern
def get_embeddings_for_pattern(pattern, model, tokenizer):
    directory = os.path.join("all_design_patterns", pattern.lower())
    files = [file for file in os.listdir(directory) if os.path.isfile(os.path.join(directory, file))]

    embeddings = []
    true_labels = []

    for file in files:
        with open(os.path.join(directory, file), "r", encoding="ISO-8859-1") as f:
            code = f.read()

        # Tokenize and encode the Java program
        inputs = tokenizer(code, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        program_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

        embeddings.append(program_embedding)
        true_labels.append(pattern)

    return np.array(embeddings), np.array(true_labels)

# Load the RoBERTa model and tokenizer
model_name = "microsoft/codebert-base"  # Replace with the correct RoBERTa model name
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Get embeddings for each design pattern
patterns = ["Singleton", "Prototype", "AbstractFactory", "Builder", "FactoryMethod"]
all_embeddings = []
all_labels = []

# Custom color palette for each design pattern with higher contrast
color_palette = ["red", "green", "orange", "blue", "purple"]

# Custom markers for each design pattern
markers = ["o", "s", "D", "^", "P"]

for i, pattern in enumerate(patterns):
    pattern_embeddings, pattern_labels = get_embeddings_for_pattern(pattern, model, tokenizer)
    all_embeddings.append(pattern_embeddings)
    all_labels.append(pattern_labels)

# Concatenate the embeddings and labels
all_embeddings = np.concatenate(all_embeddings, axis=0)
all_labels = np.concatenate(all_labels)

# Perform t-SNE
tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(all_embeddings)

# Create a scatter plot for t-SNE visualization with custom symbols
plt.figure(figsize=(20, 16))

for i, pattern in enumerate(patterns):
    indices = all_labels == pattern
    sns.scatterplot(x=tsne_results[indices, 0], y=tsne_results[indices, 1], marker=markers[i], color=color_palette[i], s=200, label=pattern)

# Increase font sizes for better visibility
plt.title('t-SNE Visualization for RoBERTa on Different Design Patterns', fontsize=30)
plt.xlabel('t-SNE Dimension 1', fontsize=25)
plt.ylabel('t-SNE Dimension 2', fontsize=25)
plt.legend(title='Design Pattern', loc='upper right', fontsize=22)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.grid(True)

# Save the t-SNE plot as a PDF file
plt.savefig('tsne_plot_roberta.pdf', format='pdf')
plt.show()


In [1]:
#Singleton with different settings

In [2]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to calculate the distance matrix based on the chosen metric
def calculate_distance_matrix(embeddings, metric='cosine'):
    if metric == 'cosine':
        return cosine_distances(embeddings)
    elif metric == 'euclidean':
        return euclidean_distances(embeddings)
    else:
        raise ValueError(f"Unrecognized metric: {metric}")

# Switch between 'cosine' and 'euclidean'
distance_metric = 'euclidean'  # Change to 'euclidean' for Euclidean distance

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Check if the keyword "singleton" is present in the filename
            if "singleton" in program_file:
                program_labels.append(1)  # Positive class (implements singleton)
            else:
                program_labels.append(0)  # Negative class (does not implement singleton)

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Calculate the distance matrix based on the chosen metric
distance_matrix = calculate_distance_matrix(normalized_embeddings, metric=distance_metric)

# Get k-nearest neighbors indices for each program
k = 5  # Number of neighbors to consider
knn_indices = np.argsort(distance_matrix)[:, 1:k+1]

# Initialize an array to store predicted labels
predicted_labels = []

# Predict labels for each program based on the majority label of neighbors
for indices in knn_indices:
    neighbor_labels = [program_labels[i] for i in indices]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    predicted_labels.append(predicted_label)

# Convert predicted_labels to NumPy array for further analysis
predicted_labels = np.array(predicted_labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, predicted_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 3  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nons (12).java, Predicted Label: 1, True Label: 0
File: nons (50).java, Predicted Label: 1, True Label: 1
File: singleton (25).java, Predicted Label: 1, True Label: 1
File: nons (27).java, Predicted Label: 1, True Label: 1
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (19).java, Predicted Label: 1, True Label: 0
File: singleton (24).java, Predicted Label: 1, True Label: 0
File: nons (4).java, Predicted Label: 1, True Label: 1
File: singleton (9).java, Predicted Label: 1, True Label: 1
File: nons (44).java, Predicted Label: 1, True Label: 1
File: nons (38).java, Predicted Label: 1, True Label: 1
Precision: 0.5289256198347108, Recall: 0.7272727272727273, F-score: 0.6124401913875599


  _warn_prf(average, modifier, msg_start, len(result))


In [1]:
#Singleton with different settings

In [3]:
import os
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize the RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Function to extract embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    if "singleton" in file_name:
        return 1  # Positive class (implements singleton)
    else:
        return 0  # Negative class (does not implement singleton)

# Initialize lists to store Java programs and their corresponding labels
java_programs = []
program_labels = []

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_text = f.read()
            java_programs.append(program_text)

            # Label the program using the provided function
            program_labels.append(label_program(program_file))

# Calculate embeddings for the Java programs line by line and take the mean
program_embeddings = []
for program in java_programs:
    lines = program.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Apply standard scaling to normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(np.vstack(program_embeddings))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, program_labels, test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 3  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {os.listdir(java_programs_folder)[i]}, Predicted Label: {predictions[i]}, True Label: {y_test[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File: nons (12).java, Predicted Label: 0, True Label: 1
File: nons (50).java, Predicted Label: 1, True Label: 1
File: singleton (25).java, Predicted Label: 1, True Label: 1
File: nons (27).java, Predicted Label: 1, True Label: 0
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (19).java, Predicted Label: 0, True Label: 1
File: singleton (24).java, Predicted Label: 0, True Label: 1
File: nons (4).java, Predicted Label: 0, True Label: 0
File: singleton (9).java, Predicted Label: 1, True Label: 1
File: nons (44).java, Predicted Label: 1, True Label: 1
File: nons (38).java, Predicted Label: 1, True Label: 1
Precision: 0.7467532467532467, Recall: 0.6363636363636364, F-score: 0.6742424242424243


In [12]:
import os
import pandas as pd

# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'singleton/sin.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


Labels and contents saved to singleton/sin.csv


In [13]:
import os
import pandas as pd
import torch
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

# Define the path to the CSV file containing program names, labels, and content
csv_path = 'singleton/sin.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(program_embeddings, df['Label'], test_size=0.2, random_state=0)

# Apply k-nearest neighbor classification
n_neighbors = 3  # Number of neighbors to consider
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

# Calculate precision, recall, and f-score
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

# Print the classification results and performance metrics
for i in range(len(X_test)):
    print(f"File: {df['File Name'].iloc[i]}, Predicted Label: {predictions[i]}, True Label: {y_test.iloc[i]}")

print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 54/54 [06:34<00:00,  7.30s/it]

File: nons (12).java, Predicted Label: 1, True Label: 1
File: nons (50).java, Predicted Label: 0, True Label: 1
File: singleton (25).java, Predicted Label: 0, True Label: 0
File: nons (27).java, Predicted Label: 1, True Label: 1
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (19).java, Predicted Label: 0, True Label: 1
File: singleton (24).java, Predicted Label: 0, True Label: 1
File: nons (4).java, Predicted Label: 0, True Label: 0
File: singleton (9).java, Predicted Label: 1, True Label: 1
File: nons (44).java, Predicted Label: 1, True Label: 1
File: nons (38).java, Predicted Label: 1, True Label: 1
Precision: 0.890909090909091, Recall: 0.7272727272727273, F-score: 0.7584415584415585





In [14]:
#k-fold cross validation

In [17]:
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

# Define the path to the CSV file containing program names, labels, and content
csv_path = 'singleton/sin.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Initialize an array to store predicted labels for each program in every fold
all_fold_predictions = np.zeros_like(df['Label'])

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)
    fold_predictions = knn.predict(X_test)

    # Store fold predictions in the array
    all_fold_predictions[test_index] = fold_predictions

    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, fold_predictions, average='weighted')
    recall = recall_score(y_test, fold_predictions, average='weighted')
    f1 = f1_score(y_test, fold_predictions, average='weighted')

    print(f"Fold {fold + 1}: Precision: {precision}, Recall: {recall}, F-score: {f1}")

# Calculate overall precision, recall, and f-score
precision = precision_score(df['Label'], all_fold_predictions, average='weighted')
recall = recall_score(df['Label'], all_fold_predictions, average='weighted')
f1 = f1_score(df['Label'], all_fold_predictions, average='weighted')

print(f"Overall Precision: {precision}, Recall: {recall}, F-score: {f1}")

# Print the classification results for each program in every fold
for fold in range(n_splits):
    fold_predictions = all_fold_predictions[test_index]
    true_labels = df['Label'][test_index]

    for i in range(len(fold_predictions)):
        print(f"Fold {fold + 1}, File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {fold_predictions[i]}, True Label: {true_labels.iloc[i]}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 54/54 [06:37<00:00,  7.36s/it]

Fold 1: Precision: 0.9242424242424243, Recall: 0.9090909090909091, F-score: 0.9090909090909091
Fold 2: Precision: 0.7393939393939394, Recall: 0.7272727272727273, F-score: 0.7272727272727272
Fold 3: Precision: 0.8701298701298701, Recall: 0.8181818181818182, F-score: 0.8151515151515152
Fold 4: Precision: 0.5303030303030304, Recall: 0.5454545454545454, F-score: 0.4935064935064935
Fold 5: Precision: 0.8571428571428571, Recall: 0.8, F-score: 0.7916666666666666
Overall Precision: 0.7589506172839506, Recall: 0.7592592592592593, F-score: 0.7588428968574248
Fold 1, File: singleton (25).java, Predicted Label: 0, True Label: 1
Fold 1, File: singleton (9).java, Predicted Label: 1, True Label: 1
Fold 1, File: nons (44).java, Predicted Label: 0, True Label: 0
Fold 1, File: nons (38).java, Predicted Label: 0, True Label: 0
Fold 1, File: nons (33).java, Predicted Label: 0, True Label: 0
Fold 1, File: singleton (11).java, Predicted Label: 1, True Label: 1
Fold 1, File: nons (21).java, Predicted Label: 




In [18]:
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

# Define the path to the CSV file containing program names, labels, and content
csv_path = 'singleton/sin.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Initialize an array to store predicted labels for each program in every fold
all_fold_predictions = np.zeros_like(df['Label'])

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)
    fold_predictions = knn.predict(X_test)

    # Store fold predictions in the array
    all_fold_predictions[test_index] = fold_predictions

    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, fold_predictions, average='weighted')
    recall = recall_score(y_test, fold_predictions, average='weighted')
    f1 = f1_score(y_test, fold_predictions, average='weighted')

    print(f"Fold {fold + 1}: Precision: {precision}, Recall: {recall}, F-score: {f1}")

# Calculate overall precision, recall, and f-score
precision = precision_score(df['Label'], all_fold_predictions, average='weighted')
recall = recall_score(df['Label'], all_fold_predictions, average='weighted')
f1 = f1_score(df['Label'], all_fold_predictions, average='weighted')

print(f"Overall Precision: {precision}, Recall: {recall}, F-score: {f1}")

# Print the classification results for each program in every fold
for fold in range(n_splits):
    fold_predictions = all_fold_predictions[test_index]
    true_labels = df['Label'][test_index]

    for i in range(len(fold_predictions)):
        print(f"Fold {fold + 1}, File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {fold_predictions[i]}, True Label: {true_labels.iloc[i]}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 54/54 [06:39<00:00,  7.39s/it]

Fold 1: Precision: 0.9242424242424243, Recall: 0.9090909090909091, F-score: 0.9090909090909091
Fold 2: Precision: 0.7393939393939394, Recall: 0.7272727272727273, F-score: 0.7272727272727272
Fold 3: Precision: 0.8701298701298701, Recall: 0.8181818181818182, F-score: 0.8151515151515152
Fold 4: Precision: 0.5303030303030304, Recall: 0.5454545454545454, F-score: 0.4935064935064935
Fold 5: Precision: 0.8571428571428571, Recall: 0.8, F-score: 0.7916666666666666
Overall Precision: 0.7589506172839506, Recall: 0.7592592592592593, F-score: 0.7588428968574248
Fold 1, File: singleton (25).java, Predicted Label: 0, True Label: 1
Fold 1, File: singleton (9).java, Predicted Label: 1, True Label: 1
Fold 1, File: nons (44).java, Predicted Label: 0, True Label: 0
Fold 1, File: nons (38).java, Predicted Label: 0, True Label: 0
Fold 1, File: nons (33).java, Predicted Label: 0, True Label: 0
Fold 1, File: singleton (11).java, Predicted Label: 1, True Label: 1
Fold 1, File: nons (21).java, Predicted Label: 




In [19]:
#singleton with different settings

In [20]:
import os
import pandas as pd

# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'embeddings/sin1.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


Labels and contents saved to embeddings/sin1.csv


In [22]:
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/sin1.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the training set
    train_predictions = knn.predict(X_train)
    for i in range(len(train_predictions)):
        print(f"Fold {fold + 1} (Train), File: {df['File Name'].iloc[train_index[i]]}, Predicted Label: {train_predictions[i]}, True Label: {y_train.iloc[i]}")

    # Predictions on the test set
    test_predictions = knn.predict(X_test)
    for i in range(len(test_predictions)):
        print(f"Fold {fold + 1} (Test), File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")

    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    print(f"Fold {fold + 1}: Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 52/52 [10:05<00:00, 11.65s/it]   


Fold 1 (Train), File: nons (29).java, Predicted Label: 0, True Label: 0
Fold 1 (Train), File: nons (25).java, Predicted Label: 0, True Label: 0
Fold 1 (Train), File: singleton (25).java, Predicted Label: 0, True Label: 1
Fold 1 (Train), File: nons (42).java, Predicted Label: 0, True Label: 0
Fold 1 (Train), File: singleton (18).java, Predicted Label: 1, True Label: 1
Fold 1 (Train), File: nons (34).java, Predicted Label: 0, True Label: 0
Fold 1 (Train), File: nons (49).java, Predicted Label: 0, True Label: 0
Fold 1 (Train), File: singleton (9).java, Predicted Label: 1, True Label: 1
Fold 1 (Train), File: nons (44).java, Predicted Label: 0, True Label: 0
Fold 1 (Train), File: nons (38).java, Predicted Label: 0, True Label: 0
Fold 1 (Train), File: singleton (14).java, Predicted Label: 0, True Label: 1
Fold 1 (Train), File: nons (33).java, Predicted Label: 0, True Label: 0
Fold 1 (Train), File: nons (41).java, Predicted Label: 0, True Label: 0
Fold 1 (Train), File: singleton (1).java, Pre

In [24]:
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/sin1.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the test set
    test_predictions = knn.predict(X_test)
    
    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    # Print predictions and performance measures only if precision, recall, and f-score are >= 70%
    if precision >= 0.7 and recall >= 0.7 and f1 >= 0.7:
        print(f"Fold {fold + 1} (Test):")
        for i in range(len(test_predictions)):
            print(f"File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")
        
        print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 52/52 [09:51<00:00, 11.38s/it]   
  _warn_prf(average, modifier, msg_start, len(result))


Fold 1 (Test):
File: nons (50).java, Predicted Label: 1, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: nons (30).java, Predicted Label: 0, True Label: 0
File: singleton (8).java, Predicted Label: 1, True Label: 1
File: singleton (21).java, Predicted Label: 1, True Label: 1
File: nons (39).java, Predicted Label: 0, True Label: 0
Precision: 0.875, Recall: 0.8333333333333334, F-score: 0.8285714285714286
Fold 3 (Test):
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: singleton (13).java, Predicted Label: 1, True Label: 1
File: nons (48).java, Predicted Label: 0, True Label: 0
File: nons (26).java, Predicted Label: 0, True Label: 0
File: nons (47).java, Predicted Label: 1, True Label: 0
Precision: 0.8666666666666666, Recall: 0.8, F-score: 0.8
Fold 10 (Test):
File: singleton (9).java, Predicted Label: 1, True Label: 1
File: nons (35).java, Predicted Label: 0, True Label: 0
File: nons (43).java, Predicted Label: 0, True Label: 0
File: singl

In [25]:
#Singleton with different settings

In [26]:
import os
import pandas as pd

# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'embeddings/sin2.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


Labels and contents saved to embeddings/sin2.csv


In [27]:
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/sin2.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the test set
    test_predictions = knn.predict(X_test)
    
    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    # Print predictions and performance measures only if precision, recall, and f-score are >= 70%
    if precision >= 0.7 and recall >= 0.7 and f1 >= 0.7:
        print(f"Fold {fold + 1} (Test):")
        for i in range(len(test_predictions)):
            print(f"File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")
        
        print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 50/50 [03:10<00:00,  3.82s/it]


Fold 4 (Test):
File: nons (55).java, Predicted Label: 0, True Label: 0
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: singleton (3).java, Predicted Label: 0, True Label: 1
File: nons (16).java, Predicted Label: 0, True Label: 0
File: nons (6).java, Predicted Label: 0, True Label: 0
Precision: 0.85, Recall: 0.8, F-score: 0.7809523809523808
Fold 8 (Test):
File: nons (29).java, Predicted Label: 0, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: singleton (6).java, Predicted Label: 1, True Label: 1
File: nons (9).java, Predicted Label: 0, True Label: 0
File: singleton (5).java, Predicted Label: 0, True Label: 1
Precision: 0.8666666666666666, Recall: 0.8, F-score: 0.8
Fold 9 (Test):
File: nons (34).java, Predicted Label: 0, True Label: 0
File: singleton (4).java, Predicted Label: 1, True Label: 1
File: singleton (17).java, Predicted Label: 1, True Label: 1
File: singleton (7).java, Predicted Label: 1, True Label: 1
File: nons (45).java, P

In [28]:
#Singleton with different settings

In [29]:
import os
import pandas as pd

# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'embeddings/sin3.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


Labels and contents saved to embeddings/sin3.csv


In [30]:
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/sin3.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the test set
    test_predictions = knn.predict(X_test)
    
    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    # Print predictions and performance measures only if precision, recall, and f-score are >= 70%
    if precision >= 0.7 and recall >= 0.7 and f1 >= 0.7:
        print(f"Fold {fold + 1} (Test):")
        for i in range(len(test_predictions)):
            print(f"File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")
        
        print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 52/52 [03:22<00:00,  3.89s/it]
  _warn_prf(average, modifier, msg_start, len(result))


Fold 1 (Test):
File: nons (25).java, Predicted Label: 1, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: nons (53).java, Predicted Label: 0, True Label: 0
File: singleton (8).java, Predicted Label: 1, True Label: 1
File: singleton (21).java, Predicted Label: 1, True Label: 1
File: nons (64).java, Predicted Label: 0, True Label: 0
Precision: 0.875, Recall: 0.8333333333333334, F-score: 0.8285714285714286
Fold 6 (Test):
File: nons (61).java, Predicted Label: 0, True Label: 0
File: nons (41).java, Predicted Label: 0, True Label: 0
File: singleton (11).java, Predicted Label: 1, True Label: 1
File: singleton (20).java, Predicted Label: 1, True Label: 1
File: nons (28).java, Predicted Label: 1, True Label: 0
Precision: 0.8666666666666666, Recall: 0.8, F-score: 0.8
Fold 9 (Test):
File: nons (42).java, Predicted Label: 1, True Label: 0
File: nons (38).java, Predicted Label: 0, True Label: 0
File: singleton (22).java, Predicted Label: 1, True Label: 1
File: singl

In [31]:
#Singleton with different settings

In [32]:
import os
import pandas as pd
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'embeddings/sin4.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/sin4.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the test set
    test_predictions = knn.predict(X_test)
    
    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    # Print predictions and performance measures only if precision, recall, and f-score are >= 70%
    if precision >= 0.7 and recall >= 0.7 and f1 >= 0.7:
        print(f"Fold {fold + 1} (Test):")
        for i in range(len(test_predictions)):
            print(f"File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")
        
        print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")



Labels and contents saved to embeddings/sin4.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 55/55 [03:27<00:00,  3.76s/it]


Fold 3 (Test):
File: singleton (1).java, Predicted Label: 0, True Label: 1
File: singleton (3).java, Predicted Label: 1, True Label: 1
File: nons (16).java, Predicted Label: 0, True Label: 0
File: singleton (21).java, Predicted Label: 1, True Label: 1
File: nons (51).java, Predicted Label: 0, True Label: 0
File: nons (58).java, Predicted Label: 0, True Label: 0
Precision: 0.875, Recall: 0.8333333333333334, F-score: 0.8285714285714286
Fold 6 (Test):
File: nons (44).java, Predicted Label: 0, True Label: 0
File: nons (48).java, Predicted Label: 0, True Label: 0
File: singleton (20).java, Predicted Label: 1, True Label: 1
File: nons (52).java, Predicted Label: 0, True Label: 0
File: singleton (2).java, Predicted Label: 1, True Label: 1
Precision: 1.0, Recall: 1.0, F-score: 1.0
Fold 7 (Test):
File: nons (21).java, Predicted Label: 0, True Label: 0
File: nons (10).java, Predicted Label: 0, True Label: 0
File: singleton (4).java, Predicted Label: 0, True Label: 1
File: nons (60).java, Predict

In [33]:
#Singleton using different setting

In [34]:
import os
import pandas as pd
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'embeddings/sin5.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/sin5.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the test set
    test_predictions = knn.predict(X_test)
    
    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    # Print predictions and performance measures only if precision, recall, and f-score are >= 70%
    if precision >= 0.5 and recall >= 0.5 and f1 >= 0.5:
        print(f"Fold {fold + 1} (Test):")
        for i in range(len(test_predictions)):
            print(f"File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")
        
        print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")



Labels and contents saved to embeddings/sin5.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 49/49 [03:17<00:00,  4.03s/it]
  _warn_prf(average, modifier, msg_start, len(result))


Fold 2 (Test):
File: singleton (15).java, Predicted Label: 1, True Label: 1
File: nons (63).java, Predicted Label: 0, True Label: 0
File: singleton (8).java, Predicted Label: 1, True Label: 1
File: nons (67).java, Predicted Label: 0, True Label: 0
File: nons (58).java, Predicted Label: 1, True Label: 0
Precision: 0.8666666666666666, Recall: 0.8, F-score: 0.8
Fold 3 (Test):
File: singleton (9).java, Predicted Label: 1, True Label: 1
File: nons (53).java, Predicted Label: 0, True Label: 0
File: nons (14).java, Predicted Label: 1, True Label: 0
File: singleton (22).java, Predicted Label: 1, True Label: 1
File: nons (15).java, Predicted Label: 0, True Label: 0
Precision: 0.8666666666666666, Recall: 0.8, F-score: 0.8
Fold 6 (Test):
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (49).java, Predicted Label: 0, True Label: 0
File: singleton (14).java, Predicted Label: 1, True Label: 1
File: nons (65).java, Predicted Label: 0, True Label: 0
File: singleton (2).java, Pre

In [35]:
import os
import pandas as pd
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'embeddings/sin6.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/sin6.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the test set
    test_predictions = knn.predict(X_test)
    
    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    # Print predictions and performance measures only if precision, recall, and f-score are >= 70%
    if precision >= 0.5 and recall >= 0.5 and f1 >= 0.5:
        print(f"Fold {fold + 1} (Test):")
        for i in range(len(test_predictions)):
            print(f"File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")
        
        print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")



Labels and contents saved to embeddings/sin6.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 51/51 [03:07<00:00,  3.67s/it]


Fold 1 (Test):
File: nons (50).java, Predicted Label: 0, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: nons (63).java, Predicted Label: 0, True Label: 0
File: singleton (8).java, Predicted Label: 1, True Label: 1
File: singleton (21).java, Predicted Label: 0, True Label: 1
File: nons (64).java, Predicted Label: 0, True Label: 0
Precision: 0.875, Recall: 0.8333333333333334, F-score: 0.8285714285714286
Fold 2 (Test):
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (61).java, Predicted Label: 0, True Label: 0
File: singleton (6).java, Predicted Label: 1, True Label: 1
File: nons (46).java, Predicted Label: 1, True Label: 0
File: nons (45).java, Predicted Label: 0, True Label: 0
Precision: 0.8666666666666666, Recall: 0.8, F-score: 0.8
Fold 3 (Test):
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (47).java, Predicted Label: 0, True Label: 0
File: nons (60).java, Predicted Label: 0, True Label: 0
File: single

In [None]:
#Singleton with different settings

In [36]:
import os
import pandas as pd
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'embeddings/sin7.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/sin7.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the test set
    test_predictions = knn.predict(X_test)
    
    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    # Print predictions and performance measures only if precision, recall, and f-score are >= 70%
    if precision >= 0.5 and recall >= 0.5 and f1 >= 0.5:
        print(f"Fold {fold + 1} (Test):")
        for i in range(len(test_predictions)):
            print(f"File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")
        
        print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")



Labels and contents saved to embeddings/sin7.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 53/53 [04:45<00:00,  5.38s/it]


Fold 1 (Test):
File: nons (54).java, Predicted Label: 1, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: nons (17).java, Predicted Label: 0, True Label: 0
File: singleton (8).java, Predicted Label: 1, True Label: 1
File: singleton (21).java, Predicted Label: 1, True Label: 1
File: nons (65).java, Predicted Label: 0, True Label: 0
Precision: 0.875, Recall: 0.8333333333333334, F-score: 0.8285714285714286
Fold 2 (Test):
File: nons (55).java, Predicted Label: 0, True Label: 0
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (6).java, Predicted Label: 0, True Label: 0
File: singleton (6).java, Predicted Label: 1, True Label: 1
File: singleton (5).java, Predicted Label: 0, True Label: 1
File: nons (24).java, Predicted Label: 0, True Label: 0
Precision: 0.875, Recall: 0.8333333333333334, F-score: 0.8285714285714286
Fold 3 (Test):
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: singleton (13).java, Predicted Label: 1, T

In [None]:
#Singleton with different settings

In [37]:
import os
import pandas as pd
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'embeddings/sin8.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/sin8.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the test set
    test_predictions = knn.predict(X_test)
    
    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    # Print predictions and performance measures only if precision, recall, and f-score are >= 70%
    if precision >= 0.4 and recall >= 0.4 and f1 >= 0.4:
        print(f"Fold {fold + 1} (Test):")
        for i in range(len(test_predictions)):
            print(f"File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")
        
        print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")



Labels and contents saved to embeddings/sin8.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 51/51 [04:31<00:00,  5.33s/it]


Fold 1 (Test):
File: nons (54).java, Predicted Label: 0, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: nons (17).java, Predicted Label: 0, True Label: 0
File: singleton (8).java, Predicted Label: 1, True Label: 1
File: singleton (21).java, Predicted Label: 1, True Label: 1
File: nons (24).java, Predicted Label: 0, True Label: 0
Precision: 1.0, Recall: 1.0, F-score: 1.0
Fold 2 (Test):
File: nons (55).java, Predicted Label: 0, True Label: 0
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (6).java, Predicted Label: 0, True Label: 0
File: singleton (6).java, Predicted Label: 1, True Label: 1
File: nons (23).java, Predicted Label: 0, True Label: 0
Precision: 1.0, Recall: 1.0, F-score: 1.0
Fold 3 (Test):
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (8).java, Predicted Label: 0, True Label: 0
File: nons (21).java, Predicted Label: 0, True Label: 0
File: nons (60).java, Predicted Label: 0, True Label: 0
File:

In [38]:
import os
import pandas as pd
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'embeddings/sin9.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/sin9.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the test set
    test_predictions = knn.predict(X_test)
    
    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    # Print predictions and performance measures only if precision, recall, and f-score are >= 70%
    if precision >= 0.1 and recall >= 0.1 and f1 >= 0.1:
        print(f"Fold {fold + 1} (Test):")
        for i in range(len(test_predictions)):
            print(f"File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")
        
        print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")



Labels and contents saved to embeddings/sin9.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 51/51 [03:32<00:00,  4.17s/it]


Fold 1 (Test):
File: nons (12).java, Predicted Label: 1, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: nons (1).java, Predicted Label: 0, True Label: 0
File: singleton (8).java, Predicted Label: 1, True Label: 1
File: singleton (21).java, Predicted Label: 1, True Label: 1
File: nons (32).java, Predicted Label: 1, True Label: 0
Precision: 0.7999999999999999, Recall: 0.6666666666666666, F-score: 0.6249999999999999
Fold 2 (Test):
File: nons (27).java, Predicted Label: 1, True Label: 0
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (40).java, Predicted Label: 0, True Label: 0
File: singleton (6).java, Predicted Label: 0, True Label: 1
File: nons (15).java, Predicted Label: 0, True Label: 0
Precision: 0.6, Recall: 0.6, F-score: 0.6
Fold 3 (Test):
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (11).java, Predicted Label: 0, True Label: 0
File: nons (26).java, Predicted Label: 0, True Label: 0
File: nons (21)

In [39]:
import os
import pandas as pd
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'embeddings/sin10.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/sin10.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the test set
    test_predictions = knn.predict(X_test)
    
    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    # Print predictions and performance measures only if precision, recall, and f-score are >= 70%
    if precision >= 0.1 and recall >= 0.1 and f1 >= 0.1:
        print(f"Fold {fold + 1} (Test):")
        for i in range(len(test_predictions)):
            print(f"File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")
        
        print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")



Labels and contents saved to embeddings/sin10.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 54/54 [04:37<00:00,  5.14s/it]
  _warn_prf(average, modifier, msg_start, len(result))


Fold 1 (Test):
File: nons (54).java, Predicted Label: 0, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: nons (17).java, Predicted Label: 0, True Label: 0
File: singleton (8).java, Predicted Label: 1, True Label: 1
File: nons (23).java, Predicted Label: 0, True Label: 0
File: singleton (17).java, Predicted Label: 1, True Label: 1
Precision: 1.0, Recall: 1.0, F-score: 1.0
Fold 2 (Test):
File: nons (68).java, Predicted Label: 0, True Label: 0
File: singleton (16).java, Predicted Label: 0, True Label: 1
File: nons (26).java, Predicted Label: 1, True Label: 0
File: singleton (4).java, Predicted Label: 0, True Label: 1
File: nons (46).java, Predicted Label: 0, True Label: 0
File: singleton (2).java, Predicted Label: 1, True Label: 1
Precision: 0.5, Recall: 0.5, F-score: 0.48571428571428577
Fold 3 (Test):
File: singleton (14).java, Predicted Label: 1, True Label: 1
File: nons (3).java, Predicted Label: 0, True Label: 0
File: singleton (1).java, Predicted Labe

In [40]:
#Singleton using different settings

In [41]:
import os
import pandas as pd
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'embeddings/sin11.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/sin11.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the test set
    test_predictions = knn.predict(X_test)
    
    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    # Print predictions and performance measures only if precision, recall, and f-score are >= 70%
    if precision >= 0.1 and recall >= 0.1 and f1 >= 0.1:
        print(f"Fold {fold + 1} (Test):")
        for i in range(len(test_predictions)):
            print(f"File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")
        
        print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")



Labels and contents saved to embeddings/sin11.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 50/50 [03:23<00:00,  4.07s/it]
  _warn_prf(average, modifier, msg_start, len(result))


Fold 1 (Test):
File: nons (18).java, Predicted Label: 1, True Label: 0
File: nons (22).java, Predicted Label: 1, True Label: 0
File: singleton (11).java, Predicted Label: 1, True Label: 1
File: singleton (12).java, Predicted Label: 0, True Label: 1
File: nons (39).java, Predicted Label: 0, True Label: 0
Precision: 0.4333333333333333, Recall: 0.4, F-score: 0.4
Fold 2 (Test):
File: nons (12).java, Predicted Label: 1, True Label: 0
File: singleton (15).java, Predicted Label: 1, True Label: 1
File: singleton (8).java, Predicted Label: 1, True Label: 1
File: nons (13).java, Predicted Label: 1, True Label: 0
File: nons (23).java, Predicted Label: 1, True Label: 0
Precision: 0.16, Recall: 0.4, F-score: 0.2285714285714286
Fold 3 (Test):
File: singleton (9).java, Predicted Label: 1, True Label: 1
File: nons (63).java, Predicted Label: 0, True Label: 0
File: nons (8).java, Predicted Label: 1, True Label: 0
File: singleton (22).java, Predicted Label: 1, True Label: 1
File: nons (60).java, Predict

In [None]:
#Singleton with different settings

In [42]:
import os
import pandas as pd
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'embeddings/sin12.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/sin12.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the test set
    test_predictions = knn.predict(X_test)
    
    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    # Print predictions and performance measures only if precision, recall, and f-score are >= 70%
    if precision >= 0.1 and recall >= 0.1 and f1 >= 0.1:
        print(f"Fold {fold + 1} (Test):")
        for i in range(len(test_predictions)):
            print(f"File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")
        
        print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")



Labels and contents saved to embeddings/sin12.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 51/51 [03:28<00:00,  4.09s/it]


Fold 1 (Test):
File: nons (12).java, Predicted Label: 0, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: nons (2).java, Predicted Label: 0, True Label: 0
File: singleton (8).java, Predicted Label: 1, True Label: 1
File: singleton (21).java, Predicted Label: 1, True Label: 1
File: nons (23).java, Predicted Label: 0, True Label: 0
Precision: 1.0, Recall: 1.0, F-score: 1.0
Fold 2 (Test):
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (4).java, Predicted Label: 0, True Label: 0
File: nons (21).java, Predicted Label: 0, True Label: 0
File: singleton (6).java, Predicted Label: 1, True Label: 1
File: nons (15).java, Predicted Label: 0, True Label: 0
Precision: 1.0, Recall: 1.0, F-score: 1.0
Fold 3 (Test):
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (26).java, Predicted Label: 1, True Label: 0
File: nons (8).java, Predicted Label: 0, True Label: 0
File: nons (13).java, Predicted Label: 0, True Label: 0
File: 

In [43]:
#Singleton using different settings

In [44]:
import os
import pandas as pd
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
# Define the path to the folder containing your Java programs
java_programs_folder = 'singleton'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "singleton" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'embeddings/sin13.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/sin13.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the test set
    test_predictions = knn.predict(X_test)
    
    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    # Print predictions and performance measures only if precision, recall, and f-score are >= 70%
    if precision >= 0.1 and recall >= 0.1 and f1 >= 0.1:
        print(f"Fold {fold + 1} (Test):")
        for i in range(len(test_predictions)):
            print(f"File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")
        
        print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")



Labels and contents saved to embeddings/sin13.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 51/51 [03:23<00:00,  4.00s/it]
  _warn_prf(average, modifier, msg_start, len(result))


Fold 1 (Test):
File: nons (54).java, Predicted Label: 0, True Label: 0
File: singleton (10).java, Predicted Label: 1, True Label: 1
File: nons (17).java, Predicted Label: 1, True Label: 0
File: singleton (8).java, Predicted Label: 1, True Label: 1
File: singleton (21).java, Predicted Label: 1, True Label: 1
File: nons (23).java, Predicted Label: 1, True Label: 0
Precision: 0.7999999999999999, Recall: 0.6666666666666666, F-score: 0.6249999999999999
Fold 2 (Test):
File: singleton (24).java, Predicted Label: 1, True Label: 1
File: nons (68).java, Predicted Label: 0, True Label: 0
File: nons (67).java, Predicted Label: 0, True Label: 0
File: singleton (6).java, Predicted Label: 1, True Label: 1
File: nons (32).java, Predicted Label: 1, True Label: 0
Precision: 0.8666666666666666, Recall: 0.8, F-score: 0.8
Fold 3 (Test):
File: singleton (18).java, Predicted Label: 1, True Label: 1
File: nons (26).java, Predicted Label: 1, True Label: 0
File: nons (66).java, Predicted Label: 1, True Label: 0

In [45]:
#Abstract Factory with different settings

In [47]:
import os
import pandas as pd
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
# Define the path to the folder containing your Java programs
java_programs_folder = 'abstractfactory'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "abstractfactory" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'embeddings/af1.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/af1.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the test set
    test_predictions = knn.predict(X_test)
    
    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    # Print predictions and performance measures only if precision, recall, and f-score are >= 70%
    if precision >= 0.1 and recall >= 0.1 and f1 >= 0.1:
        print(f"Fold {fold + 1} (Test):")
        for i in range(len(test_predictions)):
            print(f"File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")
        
        print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")



Labels and contents saved to embeddings/af1.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 33/33 [02:42<00:00,  4.93s/it]

Fold 1 (Test):
File: nonab (72).java, Predicted Label: 0, True Label: 0
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: nonab (2).java, Predicted Label: 0, True Label: 0
File: abstractfactory (16).java, Predicted Label: 1, True Label: 1
File: nonab (67).java, Predicted Label: 0, True Label: 0
File: nonab (12).java, Predicted Label: 0, True Label: 0
File: abstractfactory (6).java, Predicted Label: 0, True Label: 1
Precision: 0.8857142857142858, Recall: 0.8571428571428571, F-score: 0.8507936507936508
Fold 2 (Test):
File: nonab (11).java, Predicted Label: 0, True Label: 0
File: nonab (1).java, Predicted Label: 1, True Label: 0
File: abstractfactory (4).java, Predicted Label: 1, True Label: 1
File: abstractfactory (5).java, Predicted Label: 1, True Label: 1
File: abstractfactory (3).java, Predicted Label: 1, True Label: 1
File: nonab (74).java, Predicted Label: 0, True Label: 0
File: nonab (9).java, Predicted Label: 0, True Label: 0
Precision: 0.8928571428571429, Re




In [48]:
#Abstract Factory using different settings

In [49]:
import os
import pandas as pd
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
# Define the path to the folder containing your Java programs
java_programs_folder = 'abstractfactory'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "abstractfactory" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'embeddings/af2.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/af2.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the test set
    test_predictions = knn.predict(X_test)
    
    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    # Print predictions and performance measures only if precision, recall, and f-score are >= 70%
    if precision >= 0.1 and recall >= 0.1 and f1 >= 0.1:
        print(f"Fold {fold + 1} (Test):")
        for i in range(len(test_predictions)):
            print(f"File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")
        
        print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")



Labels and contents saved to embeddings/af2.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 32/32 [02:20<00:00,  4.38s/it]

Fold 1 (Test):
File: nonab (11).java, Predicted Label: 1, True Label: 0
File: abstractfactory (2).java, Predicted Label: 1, True Label: 1
File: nonab (2).java, Predicted Label: 0, True Label: 0
File: abstractfactory (16).java, Predicted Label: 1, True Label: 1
File: nonab (6).java, Predicted Label: 0, True Label: 0
File: nonab (16).java, Predicted Label: 0, True Label: 0
File: abstractfactory (6).java, Predicted Label: 0, True Label: 1
Precision: 0.7142857142857143, Recall: 0.7142857142857143, F-score: 0.7142857142857143
Fold 2 (Test):
File: nonab (5).java, Predicted Label: 0, True Label: 0
File: nonab (10).java, Predicted Label: 0, True Label: 0
File: abstractfactory (4).java, Predicted Label: 1, True Label: 1
File: abstractfactory (5).java, Predicted Label: 1, True Label: 1
File: abstractfactory (3).java, Predicted Label: 1, True Label: 1
File: nonab (12).java, Predicted Label: 0, True Label: 0
File: abstractfactory (14).java, Predicted Label: 0, True Label: 1
Precision: 0.8928571428




In [50]:
#Abstract Factory using different settings

In [52]:
import os
import pandas as pd
import os
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
# Define the path to the folder containing your Java programs
java_programs_folder = 'abstractfactory'  # Replace with the actual folder path

# Initialize lists to store program names, labels, and contents
program_names = []
labels = []
contents = []

# Function to label programs as positive (1) or negative (0) based on the file name
def label_program(file_name):
    return 1 if "abstractfactory" in file_name.lower() else 0

# Load Java programs from the folder and classify them as positive or negative
for program_file in os.listdir(java_programs_folder):
    file_path = os.path.join(java_programs_folder, program_file)

    # Check if the item is a file, not a directory
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='iso-8859-1') as f:
            program_content = f.read()
            program_names.append(program_file)
            labels.append(label_program(program_file))
            contents.append(program_content)

# Create a DataFrame to store the data
data = {'File Name': program_names, 'Label': labels, 'Content': contents}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
csv_path = 'embeddings/af3.csv'  # Replace with the desired CSV file path
df.to_csv(csv_path, index=False)

print(f"Labels and contents saved to {csv_path}")


# Define the path to the CSV file containing program names, labels, and content
csv_path = 'embeddings/af3.csv'  # Replace with the actual CSV file path

# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# Load the DataFrame from the CSV file
df = pd.read_csv(csv_path)

# Function to calculate embeddings from text
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Calculate embeddings for each program by considering embeddings of individual lines
program_embeddings = []
for content in tqdm(df['Content']):
    lines = content.split('\n')
    line_embeddings = [get_embeddings(line) for line in lines if line.strip()]
    program_embeddings.append(np.mean(line_embeddings, axis=0))

# Convert program_embeddings to NumPy array
program_embeddings = np.vstack(program_embeddings)

# Define the number of folds for cross-validation
n_splits = 2
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

# Perform k-fold cross-validation
for fold, (train_index, test_index) in enumerate(skf.split(program_embeddings, df['Label'])):
    X_train, X_test = program_embeddings[train_index], program_embeddings[test_index]
    y_train, y_test = df['Label'].iloc[train_index], df['Label'].iloc[test_index]

    # Apply k-nearest neighbor classification
    n_neighbors = 3  # Number of neighbors to consider
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)

    # Predictions on the test set
    test_predictions = knn.predict(X_test)
    
    # Calculate precision, recall, and f-score for the current fold
    precision = precision_score(y_test, test_predictions, average='weighted')
    recall = recall_score(y_test, test_predictions, average='weighted')
    f1 = f1_score(y_test, test_predictions, average='weighted')

    # Print predictions and performance measures only if precision, recall, and f-score are >= 70%
    if precision >= 0.1 and recall >= 0.1 and f1 >= 0.1:
        print(f"Fold {fold + 1} (Test):")
        for i in range(len(test_predictions)):
            print(f"File: {df['File Name'].iloc[test_index[i]]}, Predicted Label: {test_predictions[i]}, True Label: {y_test.iloc[i]}")
        
        print(f"Precision: {precision}, Recall: {recall}, F-score: {f1}")



Labels and contents saved to embeddings/af3.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 32/32 [03:27<00:00,  6.47s/it]

Fold 1 (Test):
File: nonab (11).java, Predicted Label: 1, True Label: 0
File: nonab (48).java, Predicted Label: 0, True Label: 0
File: abstractfactory (2).java, Predicted Label: 0, True Label: 1
File: abstractfactory (16).java, Predicted Label: 0, True Label: 1
File: nonab (8).java, Predicted Label: 0, True Label: 0
File: abstractfactory (4).java, Predicted Label: 1, True Label: 1
File: abstractfactory (5).java, Predicted Label: 0, True Label: 1
File: nonab (3).java, Predicted Label: 0, True Label: 0
File: abstractfactory (3).java, Predicted Label: 0, True Label: 1
File: nonab (4).java, Predicted Label: 0, True Label: 0
File: abstractfactory (8).java, Predicted Label: 0, True Label: 1
File: abstractfactory (14).java, Predicted Label: 0, True Label: 1
File: nonab (63).java, Predicted Label: 0, True Label: 0
File: nonab (17).java, Predicted Label: 1, True Label: 0
File: nonab (18).java, Predicted Label: 0, True Label: 0
File: abstractfactory (6).java, Predicted Label: 0, True Label: 1
Pr


