In [None]:
import os
import subprocess
import sys

In [None]:
# ---------------------------------------------------------------------------
# Notebook configuration. Every tunable value lives in this cell.
# ---------------------------------------------------------------------------

# Languages: LANG_SRC is embedded in the model name, LANG_DST selects the
# dataset sub-directory whose test split is evaluated.
LANG_SRC, LANG_DST = "python", "java"

# Model selection, forwarded to run.py as --model_type / --model_name_or_path.
MODEL_TYPE = "roberta"
PRETRAINED_MODEL = "microsoft/unixcoder-base"

# In this notebook these two values are only used to build the model name.
# NOTE(review): presumably they mirror the settings of the training run - confirm.
N_EPOCHS = 15
LEARNING_RATE = 5e-5

# Decoding / sequence-length limits, forwarded to run.py as --beam_size,
# --max_source_length and --max_target_length.
BEAM_SIZE = 10
MAX_SOURCE_LEN = 256
MAX_TARGET_LEN = 128

# Batch sizes. Only BATCH_SIZE_EVAL is used by the cells below
# (--eval_batch_size); BATCH_SIZE_TRAIN is not referenced there.
BATCH_SIZE_TRAIN = 256
BATCH_SIZE_EVAL = 256

# Mini mode: when MINI_MODE_ENABLED is True the "*-mini.jsonl" test file is
# used instead of the full one. MINI_DATASET_SIZE is not referenced by the
# cells below - presumably the size of the mini splits; verify where they are
# created.
MINI_MODE_ENABLED = False
MINI_DATASET_SIZE = 4096

# Dataset file names (full splits).
FILENAME_JSONL_TRAIN = "train.jsonl"
FILENAME_JSONL_VALID = "valid.jsonl"
FILENAME_JSONL_TEST = "test.jsonl"

# Dataset file names (mini splits).
FILENAME_JSONL_TRAIN_MINI = "train-mini.jsonl"
FILENAME_JSONL_VALID_MINI = "valid-mini.jsonl"
FILENAME_JSONL_TEST_MINI = "test-mini.jsonl"

# Metric file names. Only FILENAME_TXT_BLEU_SCORE_TEST is used by the cells
# below; the CSV names are not referenced there.
FILENAME_CSV_TRAIN_LOSSES = "train_losses.csv"
FILENAME_CSV_EVAL_LOSSES = "eval_losses.csv"
FILENAME_CSV_BLEU_SCORES = "bleu_scores.csv"
FILENAME_TXT_BLEU_SCORE_TEST = "bleu_score.test"

In [None]:
# Directory layout. Everything is derived from the kernel's current working
# directory, so the notebook must be started from its own folder.
notebook_path = os.path.abspath(os.curdir)
root_path = os.path.dirname(notebook_path)

# CodeXGLUE checkout and the code-to-text task inside it.
repo_path = os.path.join(root_path, "repos", "CodeXGLUE")
task_path = os.path.join(repo_path, "Code-Text", "code-to-text")

# Project sources living next to the notebook folder.
src_root_path = os.path.join(root_path, "src", "python")
src_task_path = os.path.join(src_root_path, "codexglue", "summarization")

# Sub-directories of the task: scripts, data and evaluator.
code_path, dataset_path, evaluator_path = (
    os.path.join(task_path, sub_dir)
    for sub_dir in ("code", "dataset", "evaluator")
)

# Notebook-local directories: model artefacts and notebook sources.
model_path = os.path.join(notebook_path, "model")
src_notebook_path = os.path.join(notebook_path, "src")

# Human-readable identifier of the model, used in printed messages.
# "/" and "-" in the pretrained model id are replaced with "_" so that "-"
# remains the only field separator in the name.
model_name_template = "{dataset}-{task}-{model_type}-{pretrained_model}-{lang}-n_epochs={n_epochs}-lr={lr}"
model_name = model_name_template.format(
    dataset="codexglue",
    task="summarization",
    model_type=MODEL_TYPE,
    pretrained_model=PRETRAINED_MODEL.replace("/", "_").replace("-", "_"),
    lang=LANG_SRC,
    n_epochs=N_EPOCHS,
    lr=LEARNING_RATE,
)

In [None]:
# Run the task's run.py in test mode (--do_test) against the test split of
# LANG_DST, loading the weights stored under "checkpoint-best-bleu" in
# model_path. check_call() raises subprocess.CalledProcessError when the
# script exits with a non-zero status, so a failed run stops the notebook here.
subprocess.check_call(
    [
        # Launch run.py with the interpreter that runs this kernel. A bare
        # "python" is resolved through PATH and may point at a different
        # environment (or not exist at all). sys.executable can be empty in
        # unusual embeddings, hence the fallback to the previous behaviour.
        sys.executable or "python"

        , os.path.join(code_path, "run.py")

        , "--model_type", MODEL_TYPE
        , "--model_name_or_path", PRETRAINED_MODEL
        , "--output_dir", model_path
        , "--load_model_path", os.path.join(model_path, "checkpoint-best-bleu", "pytorch_model.bin")

        # Command-line arguments must be strings, hence the str() conversions.
        , "--beam_size", str(BEAM_SIZE)
        , "--max_source_length", str(MAX_SOURCE_LEN)
        , "--max_target_length", str(MAX_TARGET_LEN)

        , "--do_test"
        , "--eval_batch_size", str(BATCH_SIZE_EVAL)
        # Mini mode swaps the full test file for its "-mini" counterpart.
        , "--test_filename", os.path.join(
            dataset_path
            , LANG_DST
            , FILENAME_JSONL_TEST if not MINI_MODE_ENABLED else FILENAME_JSONL_TEST_MINI
        )

        # The score written to this path is read back by the next cell.
        , "--bleu_score_test_txt_filename", os.path.join(model_path, FILENAME_TXT_BLEU_SCORE_TEST)
    ]
)

In [None]:
# Show the first line of the BLEU score file; this is the same path that was
# handed to run.py via --bleu_score_test_txt_filename.
bleu_score_test_path = os.path.join(model_path, FILENAME_TXT_BLEU_SCORE_TEST)

with open(bleu_score_test_path, 'r') as score_file:
    test_bleu_score_line = score_file.readline()

print("Test Bleu score: ", test_bleu_score_line)

In [None]:
# Preview the first ten reference summaries from "test_0.gold" in model_path.
# NOTE(review): presumably this file is produced by the run.py test run into
# its --output_dir - confirm.
print("Top 10 test data ground truth summaries of code in {lang_dst} language:\n".format(lang_dst=LANG_DST))

with open(os.path.join(model_path, "test_0.gold"), mode="r") as f:
    # Pairing the file with range(10) caps the preview at ten lines and stops
    # cleanly when the file is shorter, instead of printing the empty strings
    # readline() returns at end of file.
    for _, line in zip(range(10), f):
        # Each line keeps its trailing newline, so print() leaves a blank
        # line between entries.
        print(line)

In [None]:
# Preview the first ten predicted summaries from "test_0.output" in model_path.
# NOTE(review): presumably this file is produced by the run.py test run into
# its --output_dir - confirm.
print("Top 10 test data predicted summaries of code in {lang_dst} language using the trained {model_name} model:\n".format(lang_dst=LANG_DST, model_name=model_name))

with open(os.path.join(model_path, "test_0.output"), mode="r") as f:
    # Pairing the file with range(10) caps the preview at ten lines and stops
    # cleanly when the file is shorter, instead of printing the empty strings
    # readline() returns at end of file.
    for _, line in zip(range(10), f):
        # Each line keeps its trailing newline, so print() leaves a blank
        # line between entries.
        print(line)