In [1]:
import os
import subprocess
import sys

In [2]:
# Languages: LANG_SRC is embedded in the model name; LANG_DST selects the
# dataset sub-directory that gets evaluated and appears in printed headings.
LANG_SRC, LANG_DST = "python", "java"

# Model selection, forwarded to run.py as --model_type / --model_name_or_path.
MODEL_TYPE, PRETRAINED_MODEL = "roberta", "microsoft/unixcoder-base"

# Training hyper-parameters. In the cells of this notebook they are only used
# to label the model name.
N_EPOCHS, LEARNING_RATE = 15, 5e-5

# Decoding and sequence-length limits forwarded to run.py.
BEAM_SIZE = 10
MAX_SOURCE_LEN, MAX_TARGET_LEN = 256, 128

# Evaluation and training use the same batch size.
BATCH_SIZE_EVAL = BATCH_SIZE_TRAIN = 256

# Mini mode: when enabled, the "-mini" test split is evaluated instead of the
# full one.
MINI_DATASET_SIZE = 4096  # presumably the size of a "-mini" split -- TODO confirm
MINI_MODE_ENABLED = False

# Dataset split files, each paired with its "-mini" counterpart.
FILENAME_JSONL_TEST, FILENAME_JSONL_TEST_MINI = "test.jsonl", "test-mini.jsonl"
FILENAME_JSONL_TRAIN, FILENAME_JSONL_TRAIN_MINI = "train.jsonl", "train-mini.jsonl"
FILENAME_JSONL_VALID, FILENAME_JSONL_VALID_MINI = "valid.jsonl", "valid-mini.jsonl"

# Metric files stored next to the model.
FILENAME_CSV_BLEU_SCORES = "bleu_scores.csv"
FILENAME_CSV_EVAL_LOSSES = "eval_losses.csv"
FILENAME_CSV_TRAIN_LOSSES = "train_losses.csv"
FILENAME_TXT_BLEU_SCORE_TEST = "bleu_score.test"

In [3]:
# Directory layout, anchored at the current working directory (the notebook's
# folder when the kernel is started there); the project root is its parent.
notebook_path = os.path.abspath(os.curdir)
root_path = os.path.dirname(notebook_path)

# CodeXGLUE checkout and the code-to-text task folder with its sub-folders.
repo_path = os.path.join(root_path, "repos", "CodeXGLUE")
task_path = os.path.join(repo_path, "Code-Text", "code-to-text")
code_path, dataset_path, evaluator_path = (
    os.path.join(task_path, sub_dir)
    for sub_dir in ("code", "dataset", "evaluator")
)

# Project-level Python sources.
src_root_path = os.path.join(root_path, "src", "python")
src_task_path = os.path.join(src_root_path, "codexglue", "summarization")

# Notebook-local folders.
model_path = os.path.join(notebook_path, "model")
src_notebook_path = os.path.join(notebook_path, "src")

# Human-readable model identifier; "/" and "-" in the pretrained model id are
# both mapped to "_" so they do not clash with the "-" field separator.
_model_name_fields = {
    "dataset": "codexglue",
    "task": "summarization",
    "model_type": MODEL_TYPE,
    "pretrained_model": PRETRAINED_MODEL.translate(str.maketrans("/-", "__")),
    "lang": LANG_SRC,
    "n_epochs": N_EPOCHS,
    "lr": LEARNING_RATE,
}
model_name = (
    "{dataset}-{task}-{model_type}-{pretrained_model}-{lang}-n_epochs={n_epochs}-lr={lr}"
    .format(**_model_name_fields)
)

In [4]:
# Run CodeXGLUE's run.py in test mode (--do_test) against the LANG_DST test
# split, loading weights from model/checkpoint-best-bleu/pytorch_model.bin.

# Mini mode swaps the full test split for its "-mini" counterpart.
test_jsonl_name = FILENAME_JSONL_TEST_MINI if MINI_MODE_ENABLED else FILENAME_JSONL_TEST

run_test_command = [
    # Use the interpreter that runs this kernel so the child process sees the
    # same environment and installed packages; a bare "python" is resolved via
    # PATH and may be a different installation. Fall back to "python" only if
    # the interpreter path cannot be determined.
    sys.executable or "python",
    os.path.join(code_path, "run.py"),

    "--model_type", MODEL_TYPE,
    "--model_name_or_path", PRETRAINED_MODEL,
    "--output_dir", model_path,
    "--load_model_path", os.path.join(model_path, "checkpoint-best-bleu", "pytorch_model.bin"),

    # Every argv entry must be a string, hence the explicit str() conversions.
    "--beam_size", str(BEAM_SIZE),
    "--max_source_length", str(MAX_SOURCE_LEN),
    "--max_target_length", str(MAX_TARGET_LEN),

    "--do_test",
    "--eval_batch_size", str(BATCH_SIZE_EVAL),
    "--test_filename", os.path.join(dataset_path, LANG_DST, test_jsonl_name),

    # A later cell reads the score back from this file; presumably run.py
    # writes the test BLEU score there -- TODO confirm against run.py.
    "--bleu_score_test_txt_filename", os.path.join(model_path, FILENAME_TXT_BLEU_SCORE_TEST),
]

# check_call raises CalledProcessError on a non-zero exit status; on success it
# returns 0, which the notebook shows as the cell result.
subprocess.check_call(run_test_command)

05/08/2022 03:44:29 - INFO - __main__ -   Namespace(adam_epsilon=1e-08, beam_size=10, bleu_score_test_txt_filename='/home/user/CS5814-project/codexglue-summarization-cross-python-on-java/model/bleu_score.test', bleu_scores_csv_filename=None, config_name='', dev_filename=None, do_eval=False, do_lower_case=False, do_test=True, do_train=False, eval_batch_size=256, eval_losses_csv_filename=None, eval_steps=-1, gradient_accumulation_steps=1, learning_rate=5e-05, load_model_path='/home/user/CS5814-project/codexglue-summarization-cross-python-on-java/model/checkpoint-best-bleu/pytorch_model.bin', local_rank=-1, max_grad_norm=1.0, max_source_length=256, max_steps=-1, max_target_length=128, model_name_or_path='microsoft/unixcoder-base', model_type='roberta', no_cuda=False, num_train_epochs=3, output_dir='/home/user/CS5814-project/codexglue-summarization-cross-python-on-java/model', seed=42, test_filename='/home/user/CS5814-project/repos/CodeXGLUE/Code-Text/code-to-text/dataset/java/test.jsonl',

0

In [5]:
# Show the first line of the test BLEU score file stored next to the model.
bleu_score_test_path = os.path.join(model_path, FILENAME_TXT_BLEU_SCORE_TEST)

with open(bleu_score_test_path, mode="r") as score_file:
    bleu_score_line = score_file.readline()

print("Test Bleu score: ", bleu_score_line)

Test Bleu score:  16.43


In [6]:
# Print a heading followed by the first 10 lines of the reference-summary file
# (test_0.gold) found in the model directory.
gold_heading = "Top 10 test data ground truth summaries of code in {lang_dst} language:\n".format(
    lang_dst=LANG_DST
)
print(gold_heading)

gold_path = os.path.join(model_path, "test_0.gold")
with open(gold_path, mode="r") as gold_file:
    for line_index in range(10):
        gold_line = gold_file.readline()
        print(gold_line)

Top 10 test data ground truth summaries of code in java language:

0	Makes sure the fast - path emits in order .

1	Wraps an ObservableSource into an Observable if not already an Observable .

2	Returns an Observable that emits the events emitted by source ObservableSource in a sorted order based on a specified comparison function .

3	Child Observers will observe the events of the ConnectableObservable on the specified scheduler .

4	Creates an UnicastProcessor with the given internal buffer capacity hint .

5	Creates an UnicastProcessor with the given internal buffer capacity hint and a callback for the case when the single Subscriber cancels its subscription .

6	Tries to subscribe to a possibly Callable source s mapped Publisher .

7	Maps a scalar value into a Publisher and emits its values .

8	Removes all handlers and resets to default behavior .

9	Wraps a CompletableSource into a Maybe .



In [7]:
# Print a heading followed by the first 10 lines of the prediction file
# (test_0.output) found in the model directory.
prediction_heading = (
    "Top 10 test data predicted summaries of code in {lang_dst} language using the trained {model_name} model:\n"
    .format(lang_dst=LANG_DST, model_name=model_name)
)
print(prediction_heading)

prediction_path = os.path.join(model_path, "test_0.output")
with open(prediction_path, mode="r") as prediction_file:
    for line_index in range(10):
        predicted_line = prediction_file.readline()
        print(predicted_line)

Top 10 test data predicted summaries of code in java language using the trained codexglue-summarization-roberta-microsoft_unixcoder_base-python-n_epochs=15-lr=5e-05 model:

0	Fast path emit a value .

1	Return an observable from an observable source .

2	Returns an Observable with the specified sort function .

3	Turns an observable into an observable .

4	Create an unicast processor .

5	Create a new unicast processor .

6	Return True if the source is a scalar map .

7	Returns a new flowable with a scalar map .

8	Reset the state .

9	Returns a Maybe from a completable .

