In [100]:
import os
import random
import torch
import pandas as pd
from evaluate import load 
from IPython.display import display
from dotenv import load_dotenv
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer

Step 1: read all files into memory

In [101]:
def read_files(directory_path: str, filenames: list[str]) -> list[str]:
	file_contents = []

	for filename in filenames:
		file_path = os.path.join(directory_path, filename)

		with open(file_path, 'r') as file:
			file_contents.append(file.read())

	return file_contents


in_directory_path = "../data"
filenames = os.listdir(in_directory_path)
contents = read_files(in_directory_path, filenames)
n_files = len(contents)

print(f"{n_files} file(s) have been read in directory {in_directory_path}")


2 file(s) have been read in directory ../data


Step 2: split each file into three parts multiple times

`split_data` will be of dimensions `[n_files x n_samples x 3]`

In [102]:
def split(n_samples: int, min_missing_chars: int, max_missing_chars: int) -> list[list[list[str]]]:
	split_data = []

	for file_content in contents:
		file_splits = []
		for _ in range(n_samples):
			middle_len = min(len(file_content), random.randint(min_missing_chars, max_missing_chars))
			middle_start = random.randint(0, len(file_content) - 1 - middle_len)
			middle_end = middle_start + middle_len

			prefix = file_content[:middle_start]
			middle = file_content[middle_start:middle_end]
			suffix = file_content[middle_end:]

			file_splits.append([prefix, middle, suffix])
		split_data.append(file_splits)
	return split_data


n_samples = 1  # Number of different splits obtained per file. Total number of samples = n_files * n_samples
max_missing_chars = 50  # The maximum number of characters that need to be completed (i.e. the middle part)
min_missing_chars = 5
random.seed(45)  # For reproducible results

split_data = split(n_samples, min_missing_chars, max_missing_chars)

print(f"split_data has been initialized with {n_samples} sample(s) per file")

split_data has been initialized with 1 sample(s) per file


Step 3.1: Load the model

In [103]:
def hf_login() -> None:
	load_dotenv()
	api_key = os.getenv('HUGGINGFACE_TOKEN')
	login(api_key)


def load_model(model_name: str, device: str):
	tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
	tokenizer.pad_token = tokenizer.eos_token
	model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.bfloat16).to(device)
	return model, tokenizer


hf_login()
device = "cpu"
model_name = "bigcode/starcoderbase-1b"
model, tokenizer = load_model(model_name, device)

Step 3.2: Use the model to generate suggestions

In [104]:
def suggest(model, tokenizer, prefix: str, suffix: str) -> str:
	PREFIX_TAG = "<fim_prefix>"
	SUFFIX_TAG = "<fim_suffix>"
	MIDDLE_TAG = "<fim_middle>"
	EOT_TAG = "<|endoftext|>"
	
	input_text = f"{PREFIX_TAG}{prefix}{SUFFIX_TAG}{suffix}{MIDDLE_TAG}"
	inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
	outputs = model.generate(inputs, max_new_tokens=max_missing_chars, pad_token_id=tokenizer.pad_token_id)
	result = tokenizer.decode(outputs[0])
	return result.split(MIDDLE_TAG)[-1].replace(EOT_TAG, '')


In [106]:
raw_suggestions = []

for i in range(n_files):
	file_suggestions = []
	for j in range(n_samples):
		suggestion = suggest(model, tokenizer, split_data[i][j][0], split_data[i][j][2])
		file_suggestions.append(suggestion)
	raw_suggestions.append(file_suggestions)


Step 4: Collect the results in a CSV file and as source code

In [145]:
def write_table(table_directory_path: str, table_name: str, table: pd.DataFrame) -> None:
	if table_directory_path[-1] != "/":
		table_directory_path += "/"
	full_path = f"{table_directory_path}{table_name}"
	table.to_csv(full_path, index=False)

In [146]:
def get_results_table() -> pd.DataFrame:
	data = []
	for i in range(n_files):
		file_name = filenames[i]
		for j in range(n_samples):
			row = {
				"File": file_name,
				"Original": split_data[i][j][1],
				"Suggestion": raw_suggestions[i][j]
			}
			data.append(row)

	return pd.DataFrame(data)


table_directory_path = "../out/tables"
results_table = get_results_table()
display(results_table)
write_table(table_directory_path, "results.csv", results_table)


Unnamed: 0,File,Original,Suggestion
0,animal.py,f):\n\t\tself.weight_kg =,f):\n\t\tself.weight_kg = 10\n\t\tself.sound =...
1,add.py,"sum of two numbers\ndef add(a, b):\n\t","the sum of two numbers\ndef add(a, b):\n"


In [115]:
def append_number_to_filename(filename: str, number: int) -> str:
	name, extension = filename.split(".")
	return f"{name}_{number}.{extension}"


def reconstruct_data(prefix: str, middle: str, suffix: str) -> str:
	return prefix + middle + suffix


def write_file(directory_path: str, filename: str, data: str) -> None:
	file_path = os.path.join(directory_path, filename)

	with open(file_path, 'w') as file:
		file.write(data)


def write_files(directory_path: str) -> None:
	for i in range(n_files):
		for j in range(n_samples):
			filename = append_number_to_filename(filenames[i], j)
			data = reconstruct_data(split_data[i][j][0], raw_suggestions[i][j], split_data[i][j][2])
			write_file(directory_path, filename, data)


out_directory_path = "../out"
write_files(out_directory_path)


Step 5: Calculate metrics for the suggestions

In [134]:
def exact_match(original_data: list[str], suggested_data: list[str]) -> float:
	return load("exact_match").compute(predictions=suggested_data, references=original_data)["exact_match"].item()

def chrf(original_data: list[str], suggested_data: list[str]) -> float:
	return load("chrf").compute(predictions=suggested_data, references=original_data)


In [148]:
def flatten_results(split_data: list[list[list[str]]], suggestions: list[list[str]]) \
		-> tuple[list[str], list[str]]:
	original_data = []
	suggested_data = []

	for i in range(n_files):
		for j in range(n_samples):
			original_data.append(split_data[i][j][1])
			suggested_data.append(suggestions[i][j])

	return original_data, suggested_data


def calculate_metrics(original_data: list[str], suggested_data: list[str]) -> dict[str, any]:
	metrics = dict()

	metrics["exact_match"] = exact_match(original_data, suggested_data)
	metrics["chrf"] = chrf(original_data, suggested_data)

	return metrics

original_data, suggested_data = flatten_results(split_data, raw_suggestions)
metrics = calculate_metrics(original_data, suggested_data)
print(metrics)
write_table(table_directory_path, "metrics.csv", pd.DataFrame(metrics))

{'exact_match': 0.0, 'chrf': {'score': 74.85498662263461, 'char_order': 6, 'word_order': 0, 'beta': 2}}
