In [None]:
import glob
import os
import shutil
import subprocess
import sys
import zipfile

In [None]:
# Programming language to work on; used below to pick the dataset sub-folder
# and the model output sub-folder.
LANG = "python"

# Optimisation settings handed to run.py on its command line.
N_EPOCHS = 1
LEARNING_RATE = 5e-5
BATCH_SIZE_TRAIN = 32
BATCH_SIZE_EVAL = 32

# Decoding and sequence-length settings handed to run.py on its command line.
BEAM_SIZE = 10
MAX_SOURCE_LEN = 256
MAX_TARGET_LEN = 128

In [None]:
# All locations are derived from the directory the notebook is started in:
#   <cwd>/repos/CodeXGLUE/Code-Text/code-to-text/{code,model,dataset,evaluator}
root_path = os.path.abspath(os.curdir)
repo_path = os.path.join(root_path, "repos", "CodeXGLUE")
task_path = os.path.join(repo_path, "Code-Text", "code-to-text")

# The four task sub-folders used by the cells below.
code_path, model_path, dataset_path, evaluator_path = (
    os.path.join(task_path, sub_folder)
    for sub_folder in ("code", "model", "dataset", "evaluator")
)

In [None]:
# Clone the CodeXGLUE repository into repo_path unless a checkout is already there.
#
# The ".git" entry, not the directory itself, is the "already cloned" signal:
# a directory left empty by a failed or interrupted clone must not make later
# runs skip the clone.
if not os.path.exists(os.path.join(repo_path, ".git")):
    # Only the parent folder has to exist; `git clone` creates the target itself
    # (and also accepts an existing empty directory).
    os.makedirs(os.path.dirname(repo_path), exist_ok=True)

    subprocess.check_call(
        [
            "git",
            "clone",
            "https://github.com/microsoft/CodeXGLUE",
            repo_path,
        ]
    )

In [None]:
# Download the CodeSearchNet archives and run the task's preprocess.py to build
# the dataset used by the training cell.
#
# The guard looks at the file the training cell actually reads for LANG instead
# of at the dataset folder: the folder is created by the very first extraction
# step, so guarding on it would make a failed download/preprocess unrecoverable.
# NOTE(review): assumes preprocess.py is what writes <dataset>/<LANG>/train.jsonl
# - confirm against the script.
prepared_marker = os.path.join(dataset_path, LANG, "train.jsonl")

if not os.path.exists(prepared_marker):
    with zipfile.ZipFile(os.path.join(task_path, "dataset.zip"), 'r') as dataset_zip_file:
        dataset_zip_file.extractall(task_path)

    # wget -O needs the target folder to exist.
    os.makedirs(dataset_path, exist_ok=True)

    for lang in ["python", "java", "ruby", "javascript", "go", "php"]:
        lang_zip_path = os.path.join(dataset_path, "{lang}.zip".format(lang=lang))

        # -O (instead of -P) overwrites a stale or partial archive from an earlier
        # attempt; with -P a retry would be saved as "<lang>.zip.1" and the old
        # file would be the one that gets unpacked.
        subprocess.check_call(
            [
                "wget",
                "https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/{lang}.zip".format(lang=lang),
                "-O",
                lang_zip_path,
            ]
        )

        with zipfile.ZipFile(lang_zip_path, 'r') as lang_zip_file:
            lang_zip_file.extractall(dataset_path)

    # Run the script from inside the dataset folder via cwd= rather than
    # os.chdir(), so the kernel's working directory is never changed - not even
    # when the script fails. sys.executable keeps the child on the kernel's
    # interpreter.
    subprocess.check_call(
        [
            sys.executable,
            os.path.join(dataset_path, "preprocess.py"),
        ],
        cwd=dataset_path,
    )

    # Remove the downloaded archives and intermediate files that are no longer needed.
    for zip_file in glob.glob(os.path.join(dataset_path, "*.zip")):
        os.remove(zip_file)
    for pkl_file in glob.glob(os.path.join(dataset_path, "*.pkl")):
        os.remove(pkl_file)
    for final_dir in glob.glob(os.path.join(dataset_path, "*/final")):
        shutil.rmtree(final_dir, ignore_errors=True)

In [None]:
# Run the task's evaluator on the reference/prediction files in evaluator_path.
#
# The intended command line is `evaluator.py reference.txt < predictions.txt`.
# With an argument list there is no shell, so a literal "<" would just be passed
# to the script as an argument and stdin would not be redirected. The predictions
# file is therefore opened here and attached to the child's stdin explicitly.
with open(os.path.join(evaluator_path, "predictions.txt"), "rb") as predictions_file:
    subprocess.check_call(
        [
            sys.executable,  # same interpreter as the kernel
            os.path.join(evaluator_path, "evaluator.py"),
            os.path.join(evaluator_path, "reference.txt"),
        ],
        stdin=predictions_file,
    )

In [None]:
# Fine-tune and evaluate the model by running the task's run.py.
#
# run.py is started with cwd=code_path instead of wrapping the call in
# os.chdir(): the child sees the same working directory as before, but the
# kernel's own working directory is untouched, so a failing run cannot leave the
# notebook stranded inside code_path.
subprocess.check_call(
    [
        sys.executable,  # same interpreter as the kernel
        "run.py",

        "--model_type", "roberta",
        "--model_name_or_path", "microsoft/codebert-base",
        "--output_dir", os.path.join(model_path, LANG),

        "--do_train",
        "--train_batch_size", str(BATCH_SIZE_TRAIN),
        "--train_filename", os.path.join(dataset_path, LANG, "train.jsonl"),

        "--do_eval",
        "--eval_batch_size", str(BATCH_SIZE_EVAL),
        "--dev_filename", os.path.join(dataset_path, LANG, "valid.jsonl"),

        "--beam_size", str(BEAM_SIZE),
        "--num_train_epochs", str(N_EPOCHS),
        "--learning_rate", str(LEARNING_RATE),
        "--max_source_length", str(MAX_SOURCE_LEN),
        "--max_target_length", str(MAX_TARGET_LEN),
    ],
    cwd=code_path,
)