In [None]:
import glob
import os
import shutil
import subprocess
import sys
import zipfile

In [None]:
# All locations are derived from the directory the kernel is in when this cell runs.
root_path = os.path.abspath(os.curdir)

# Checkout location of the CodeXGLUE repository and the code-to-text task inside it.
repo_path = os.path.join(root_path, "repos", "CodeXGLUE")
task_path = os.path.join(repo_path, "Code-Text", "code-to-text")

# Sub-directories of the task folder used by the cells below.
code_path, model_path, dataset_path, evaluator_path = (
    os.path.join(task_path, sub_dir)
    for sub_dir in ("code", "model", "dataset", "evaluator")
)

In [None]:
# Clone CodeXGLUE on the first run.
# A missing OR empty target directory counts as "not cloned yet", so an empty
# folder left behind by an earlier failed attempt no longer blocks a retry.
if not os.path.isdir(repo_path) or not os.listdir(repo_path):
    # Create only the parent; the target directory itself is left to `git clone`,
    # so this cell never produces an empty repo_path on its own.
    os.makedirs(os.path.dirname(repo_path), exist_ok=True)

    subprocess.check_call(
        [
            "git"
            , "clone"
            , "https://github.com/microsoft/CodeXGLUE"
            , repo_path
        ]
    )

In [None]:
# Build the code-to-text dataset once; the whole step is skipped when the
# dataset directory already exists.
if not os.path.exists(dataset_path):
    # dataset.zip is extracted into the task folder
    # (presumably it contains the "dataset" directory itself — verify against the repo).
    with zipfile.ZipFile(os.path.join(task_path, "dataset.zip"), 'r') as dataset_zip_file:
        dataset_zip_file.extractall(task_path)

    # Download and unpack the CodeSearchNet archive of every language.
    for pl in ["python", "java", "ruby", "javascript", "go", "php"]:
        subprocess.check_call(
            [
                "wget"
                , "https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/{pl}.zip".format(pl=pl)
                , "-P"
                , dataset_path
            ]
        )

        with zipfile.ZipFile(os.path.join(dataset_path, "{pl}.zip".format(pl=pl)), 'r') as pl_zip_file:
            pl_zip_file.extractall(dataset_path)

    # Run the preprocessing script inside the dataset directory.
    # `cwd=` replaces the chdir/chdir-back pair, so a failing script can no longer
    # leave the kernel in the wrong working directory; `sys.executable` makes sure
    # the script runs with the same interpreter (and packages) as this kernel.
    subprocess.check_call(
        [
            sys.executable
            , os.path.join(dataset_path, "preprocess.py")
        ]
        , cwd=dataset_path
    )

    # Remove the downloaded archives and intermediate files that are no longer needed.
    for archive_path in glob.glob(os.path.join(dataset_path, "*.zip")):
        os.remove(archive_path)
    for pickle_path in glob.glob(os.path.join(dataset_path, "*.pkl")):
        os.remove(pickle_path)
    for final_dir in glob.glob(os.path.join(dataset_path, "*/final")):
        shutil.rmtree(final_dir, ignore_errors=True)

In [None]:
# Run the evaluator on the bundled reference/prediction files.
# The predictions are fed to the script on stdin. A literal "<" in the argument
# list is NOT interpreted as a redirection because no shell is involved, so the
# file is opened here and handed over via `stdin=` instead.
with open(os.path.join(evaluator_path, "predictions.txt"), "rb") as predictions_file:
    subprocess.check_call(
        [
            sys.executable  # same interpreter as the kernel, not whatever "python" is on PATH
            , os.path.join(evaluator_path, "evaluator.py")
            , os.path.join(evaluator_path, "reference.txt")
        ]
        , stdin=predictions_file
    )

In [None]:
# Train and evaluate with run.py, starting from microsoft/codebert-base.
# Settings forwarded to run.py on the command line:
pl="python"          # language sub-folder used for the dataset and the output model
lr=5e-5              # --learning_rate
batch_size=32        # --train_batch_size and --eval_batch_size
beam_size=10         # --beam_size
source_length=256    # --max_source_length
target_length=128    # --max_target_length
epochs=1             # --num_train_epochs

# `cwd=code_path` runs the script from the code directory without changing the
# kernel's own working directory, so a failed run cannot leave the notebook
# stranded there. `sys.executable` selects the kernel's interpreter.
subprocess.check_call(
    [
        sys.executable
        , "run.py"
        , "--do_train"
        , "--do_eval"
        , "--model_type", "roberta"
        , "--model_name_or_path", "microsoft/codebert-base"
        , "--train_filename", os.path.join(dataset_path, pl, "train.jsonl")
        , "--dev_filename", os.path.join(dataset_path, pl, "valid.jsonl")
        , "--output_dir", os.path.join(model_path, pl)
        , "--max_source_length", str(source_length)
        , "--max_target_length", str(target_length)
        , "--beam_size", str(beam_size)
        , "--train_batch_size", str(batch_size)
        , "--eval_batch_size", str(batch_size)
        , "--learning_rate", str(lr)
        , "--num_train_epochs", str(epochs)
    ]
    , cwd=code_path
)