In [1]:
import os
import subprocess
import sys

In [2]:
# --- Languages ---------------------------------------------------------------
# LANG_SRC is embedded in the model name; LANG_DST selects the dataset
# sub-directory whose test split is evaluated and previewed below.
LANG_SRC = "java"
LANG_DST = "go"

# --- Model -------------------------------------------------------------------
# Forwarded to run.py as --model_type / --model_name_or_path.
MODEL_TYPE = "roberta"
PRETRAINED_MODEL = "microsoft/unixcoder-base"

# --- Training hyper-parameters -----------------------------------------------
# Within this notebook these two values are only used to build the model name.
N_EPOCHS = 15
LEARNING_RATE = 5e-5

# --- Decoding / sequence limits ----------------------------------------------
# Forwarded to run.py as --beam_size, --max_source_length, --max_target_length.
BEAM_SIZE = 10
MAX_SOURCE_LEN = 256
MAX_TARGET_LEN = 128

# --- Batch sizes -------------------------------------------------------------
# BATCH_SIZE_EVAL is forwarded as --eval_batch_size; BATCH_SIZE_TRAIN is not
# referenced by the cells of this notebook.
BATCH_SIZE_EVAL = 256
BATCH_SIZE_TRAIN = 256

# --- Mini mode ---------------------------------------------------------------
# When MINI_MODE_ENABLED is True the "-mini" test file is evaluated instead of
# the full one. MINI_DATASET_SIZE is not referenced in this notebook
# (presumably the number of examples in a mini split -- TODO confirm).
MINI_DATASET_SIZE = 4096
MINI_MODE_ENABLED = False

# --- Dataset file names ------------------------------------------------------
FILENAME_JSONL_TEST = "test.jsonl"
FILENAME_JSONL_TRAIN = "train.jsonl"
FILENAME_JSONL_VALID = "valid.jsonl"

FILENAME_JSONL_TEST_MINI = "test-mini.jsonl"
FILENAME_JSONL_TRAIN_MINI = "train-mini.jsonl"
FILENAME_JSONL_VALID_MINI = "valid-mini.jsonl"

# --- Metric / log file names -------------------------------------------------
# Only FILENAME_TXT_BLEU_SCORE_TEST is used by the cells of this notebook.
FILENAME_CSV_BLEU_SCORES = "bleu_scores.csv"
FILENAME_CSV_EVAL_LOSSES = "eval_losses.csv"
FILENAME_CSV_TRAIN_LOSSES = "train_losses.csv"
FILENAME_TXT_BLEU_SCORE_TEST = "bleu_score.test"

In [3]:
# The current working directory is treated as the notebook directory and its
# parent as the project root; every other location is derived from these two.
notebook_path = os.path.abspath(os.curdir)
root_path = os.path.dirname(notebook_path)

# CodeXGLUE checkout and the code-to-text task inside it.
repo_path = os.path.join(root_path, "repos", "CodeXGLUE")
task_path = os.path.join(repo_path, "Code-Text", "code-to-text")

# Project-side Python sources.
src_root_path = os.path.join(root_path, "src", "python")
src_task_path = os.path.join(src_root_path, "codexglue", "summarization")

# Sub-directories of the task: run.py lives under "code", the per-language
# splits under "dataset".
code_path, dataset_path, evaluator_path = (
    os.path.join(task_path, leaf) for leaf in ("code", "dataset", "evaluator")
)

# Notebook-local directories.
model_path = os.path.join(notebook_path, "model")
src_notebook_path = os.path.join(notebook_path, "src")

# Human-readable model identifier:
#   <dataset>-<task>-<model_type>-<pretrained_model>-<lang>-n_epochs=<n>-lr=<lr>
# "/" and "-" in the pretrained model id are mapped to "_" so that "-" stays
# the only field separator.
model_name = "-".join(
    [
        "codexglue",
        "summarization",
        MODEL_TYPE,
        PRETRAINED_MODEL.translate(str.maketrans("/-", "__")),
        LANG_SRC,
        "n_epochs={}".format(N_EPOCHS),
        "lr={}".format(LEARNING_RATE),
    ]
)

In [4]:
# Evaluate the best-BLEU checkpoint on the LANG_DST test split by running
# run.py in a child process.
#
# sys.executable is used instead of a bare "python" so the script runs under
# the same interpreter (and therefore the same environment) as this kernel,
# rather than whichever "python" happens to be first on PATH.
#
# check_call raises subprocess.CalledProcessError on a non-zero exit status;
# on success its return value (0) is shown as the cell output.
subprocess.check_call(
    [
        sys.executable,
        os.path.join(code_path, "run.py"),

        # Model definition and the checkpoint to load.
        "--model_type", MODEL_TYPE,
        "--model_name_or_path", PRETRAINED_MODEL,
        "--output_dir", model_path,
        "--load_model_path", os.path.join(model_path, "checkpoint-best-bleu", "pytorch_model.bin"),

        # Decoding / sequence limits.
        "--beam_size", str(BEAM_SIZE),
        "--max_source_length", str(MAX_SOURCE_LEN),
        "--max_target_length", str(MAX_TARGET_LEN),

        # Test-only run; mini mode swaps in the "-mini" test file.
        "--do_test",
        "--eval_batch_size", str(BATCH_SIZE_EVAL),
        "--test_filename", os.path.join(
            dataset_path,
            LANG_DST,
            FILENAME_JSONL_TEST_MINI if MINI_MODE_ENABLED else FILENAME_JSONL_TEST,
        ),

        # Path handed to run.py for the test BLEU score; read back in the next cell.
        "--bleu_score_test_txt_filename", os.path.join(model_path, FILENAME_TXT_BLEU_SCORE_TEST),
    ]
)

05/08/2022 08:10:10 - INFO - __main__ -   Namespace(adam_epsilon=1e-08, beam_size=10, bleu_score_test_txt_filename='/home/user/CS5814-project/codexglue-summarization-cross-java-on-python/model/bleu_score.test', bleu_scores_csv_filename=None, config_name='', dev_filename=None, do_eval=False, do_lower_case=False, do_test=True, do_train=False, eval_batch_size=256, eval_losses_csv_filename=None, eval_steps=-1, gradient_accumulation_steps=1, learning_rate=5e-05, load_model_path='/home/user/CS5814-project/codexglue-summarization-cross-java-on-python/model/checkpoint-best-bleu/pytorch_model.bin', local_rank=-1, max_grad_norm=1.0, max_source_length=256, max_steps=-1, max_target_length=128, model_name_or_path='microsoft/unixcoder-base', model_type='roberta', no_cuda=False, num_train_epochs=3, output_dir='/home/user/CS5814-project/codexglue-summarization-cross-java-on-python/model', seed=42, test_filename='/home/user/CS5814-project/repos/CodeXGLUE/Code-Text/code-to-text/dataset/go/test.jsonl', t

0

In [5]:
# Show the first line of the BLEU score file, i.e. the file whose path was
# passed to run.py via --bleu_score_test_txt_filename.
bleu_score_test_path = os.path.join(model_path, FILENAME_TXT_BLEU_SCORE_TEST)
with open(bleu_score_test_path) as bleu_score_test_file:
    bleu_score_test_line = bleu_score_test_file.readline()
print("Test Bleu score: ", bleu_score_test_line)

Test Bleu score:  13.45


In [8]:
# Preview the first ground-truth summaries from model/test_0.gold
# (presumably written by run.py during --do_test -- TODO confirm).
# A single count drives both the heading and the loop so the two cannot disagree.
gold_preview_count = 20

print("Top {n} test data ground truth summaries of code in {lang_dst} language:\n".format(n=gold_preview_count, lang_dst=LANG_DST))

with open(os.path.join(model_path, "test_0.gold"), mode="r") as f:
    # zip() stops at whichever of the counter / file runs out first, so a file
    # with fewer lines than requested does not produce trailing empty prints.
    for _, line in zip(range(gold_preview_count), f):
        # Each line already ends with a newline; strip it so print() does not
        # emit a blank line after every entry.
        print(line.rstrip("\n"))

Top 10 test data ground truth summaries of code in go language:

0	NewSTM initiates a new STM instance using serializable snapshot isolation by default .

1	first returns the store revision from the first fetch

2	cmps returns a cmp list testing no writes have happened past rev

3	NewSTMRepeatable is deprecated .

4	NewSTMSerializable is deprecated .

5	NewSTMReadCommitted is deprecated .

6	NewCertPool creates x509 certPool with provided CA files .

7	NewCert generates TLS cert by using the given cert key and parse function .

8	Pause pauses the peer . The peer will simply drops all incoming messages without returning an error .

9	Resume resumes a paused peer .

10	pick picks a chan for sending the given message . The picked chan and the picked chan string name are returned .

11	post posts the given request . It returns nil when request is sent out and processed successfully .

12	newTxnResp allocates a txn response for a txn request given a path .

13	applyCompare applies the compa

In [9]:
# Preview the first predicted summaries from model/test_0.output
# (presumably written by run.py during --do_test -- TODO confirm).
# A single count drives both the heading and the loop so the two cannot disagree.
output_preview_count = 20

print("Top {n} test data predicted summaries of code in {lang_dst} language using the trained {model_name} model:\n".format(n=output_preview_count, lang_dst=LANG_DST, model_name=model_name))

with open(os.path.join(model_path, "test_0.output"), mode="r") as f:
    # zip() stops at whichever of the counter / file runs out first, so a file
    # with fewer lines than requested does not produce trailing empty prints.
    for _, line in zip(range(output_preview_count), f):
        # Each line already ends with a newline; strip it so print() does not
        # emit a blank line after every entry.
        print(line.rstrip("\n"))

Top 10 test data predicted summaries of code in go language using the trained codexglue-summarization-roberta-microsoft_unixcoder_base-java-n_epochs=15-lr=5e-05 model:

0	Executes the given STM with the given options .

1	Returns the first revision of the set .

2	Returns the comparator used to compare the given revision .

3	Applies an operation to a STMn response .

4	Apply a STMn response to a STM .

5	Apply an operation on a STMn response .

6	Create a cert pool from a list of PEM files .

7	Parses a certificate from a file .

8	Pause the peer .

9	Resume the peer .

10	Picks a message .

11	Send a POST request .

12	Get txn response .

13	Compares a CompareView .

14	Compacts the given revision

15	Create a priority queue .

16	Enqueues a value in priority queue .

17	Returns a statistics for the leader

18	Count success .

19	Increment the follower stats

