In [21]:
import re

def camel_case_split(identifier):
    matches = re.finditer(
        ".+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)",
        identifier,
    )
    return [m.group(0) for m in matches]

def special_split(token):
    #!@#$%^&*()-+?_=,<>/|
    token = re.sub(r"([!@#$%^&*()-+?_=,<>/\.\[\]\:])+", r" \1 ", token)
    return token.split()

def subtokens(in_list):
    good_list = []
    for tok in in_list:
        for subtok in tok.replace("_", " ").split(" "):
            if subtok.strip() != "":
                for subsubtok in special_split(subtok):
                    good_list.extend(camel_case_split(subsubtok))

    return good_list


def clean_name(in_list):
    return subtokens(in_list)


def normalize_subtoken(subtoken):
    normalized = re.sub(
        r"[^\x00-\x7f]",
        r"",  # Get rid of non-ascii
        re.sub(
            r'["\'`]',
            r"",  # Get rid of quotes and comma
            re.sub(
                r"\s+",
                r"",  # Get rid of spaces
                subtoken.replace("\\\n", "")
                .replace("\\\t", "")
                .replace("\\\r", ""),
            ),
        ),
    )

    return normalized.strip()


def tokenizer_code(code):
    code_tokens = list(
        filter(None, [normalize_subtoken(subtok) for subtok in subtokens(code.split())])
    )
    return code_tokens

In [None]:
text = """ 
def :C:NN2d:(CNN=None, second=10, saveable=True, name='cnn', fig_idx=3119362):\n    import matplotlib.pyplot as plt\n    # tl.logging.info(CNN.shape)    # (5, 5, 3, 64)\n    # exit()\n    n_mask = CNN.shape[3]\n    n_row = CNN.shape[0]\n    n_col = CNN.shape[1]\n    n_color = CNN.shape[2]\n    row = int(np.sqrt(n_mask))\n    col = int(np.ceil(n_mask / row))\n    plt.ion()  # active mode\n    fig = plt.figure(fig_idx)\n    count = 1\n    for _ir in range(1, row + 1):\n        for _ic in range(1, col + 1):\n            if count > n_mask:\n                break\n            fig.add_subplot(col, row, count)\n            # tl.logging.info(CNN[:,:,:,count-1].shape, n_row, n_col)   # (5, 1, 32) 5 5\n            # exit()\n            # plt.imshow(\n            #         np.reshape(CNN[count-1,:,:,:], (n_row, n_col)),\n            #         cmap='gray', interpolation=\"nearest\")     # theano\n            if n_color == 1:\n                plt.imshow(np.reshape(CNN[:, :, :, count - 1], (n_row, n_col)), cmap='gray', interpolation=\"nearest\")\n            elif n_color == 3:\n                plt.imshow(\n                    np.reshape(CNN[:, :, :, count - 1], (n_row, n_col, n_color)), cmap='gray', interpolation=\"nearest\"\n                )\n            else:\n                raise Exception(\"Unknown n_color\")\n            plt.gca().xaxis.set_major_locator(plt.NullLocator())  # distable tick\n            plt.gca().yaxis.set_major_locator(plt.NullLocator())\n            count = count + 1\n    if saveable:\n        plt.savefig(name + '.pdf', format='pdf')\n    else:\n        plt.draw()\n        plt.pause(second)
"""
tokenizer_code(text)

create sketch dataset   

In [None]:
splits = ['train', 'test', 'valid']
# splits = ["test"]
import re
import json
import pandas as pd
for split in splits:
    input_file = f"datasets/normalized/csn/{split}.jsonl"
    output_file = f"datasets/normalized/csn/{split}_all.csv"
    replaced_file = f"datasets/transformed/normalized/transforms.Replace/{split}_site_map.json"
    with open(replaced_file) as f:
        replaced_mapping = json.load(f)
    with open(input_file) as f:
        data = [json.loads(l.strip()) for l in f.readlines()]
    result_pandas = list() 
    for el in data:
        sha = el["sha256_hash"]
        source_tokens = el["source_tokens"]
        target_tokens = el["target_tokens"]
        source_code = el["source_code"]
        replaced_map = replaced_mapping[sha]
        # print(replaced_map)
        replace_content = ''
        replace_content_file = f"datasets/transformed/normalized/transforms.Replace/{split}/{sha}.py"
        with open(replace_content_file) as file_poiter:
            replace_content = file_poiter.read().strip()
            # this line => to cut signature of function to gen method name
            replace_content = "\n".join(replace_content.splitlines()[1:])
            replace_content = " ".join(tokenizer_code(replace_content))
            sketch_content = re.sub("REPLACEME\d+", "<UNK>", replace_content)
        result_pandas.append(
            {
                "sha": sha,
                "source_tokens": source_tokens,
                "target_tokens": target_tokens,
                "source_code": source_code,
                "replaced_map": replaced_map,
                "replace_content": replace_content,
                "sketch_content": sketch_content,
            }
        )
    df = pd.DataFrame(result_pandas)
    df["index"] = [i for i in range(df.shape[0])]
    df.to_csv(output_file,index=False)
    print(df.shape)
    # pandas
    # code
    # sketch:
    # replaced: => convert nguoc lai

In [None]:
df.head(2)["sketch_content"]

In [None]:
df = pd.read_csv(output_file)
df.columns

In [None]:
# train scraft model 
!python models/pytorch-seq2seq/train_reproduce.py \
    --train_path "datasets/normalized/csn/train_all.csv" \
    --dev_path "datasets/normalized/csn/valid_all.csv" \
    --expt_name lstm \
    --expt_dir outputs/craft --epochs 10 \
    --src_field_name sketch_content
# edit file models/pytorch-seq2seq/seq2seq/__init__.py

In [None]:
!python3 models/pytorch-seq2seq/gradient_attack_reproduce.py \
	--data_path datasets/normalized/csn/valid_all.csv \
	--expt_dir outputs/craft/lstm \
	--load_checkpoint Best_F1 \
	--save_path outputs/targeted2-valid.json \
	--n_alt_iters 1 \
	--z_init 1 --u_pgd_epochs 1 --z_epsilon 1 --attack_version 1 \
	--u_learning_rate 0.5 --z_learning_rate 0.5 \
	--u_learning_rate 0.5 --smoothing_param 0.01 --vocab_to_use 1 --distinct \
	--targeted_attack \
	--target_label "create entry" --batch_size 16  
	

**merge replace => code input => to training model**

In [None]:
import pandas as pd 
import json 

files = ['train', 'test', 'valid']
# files = ["valid"]

for file in files:
    file_source = f"datasets/normalized/csn/{file}_all.csv"
    df = pd.read_csv(file_source)
    file_mapping = f"outputs/targeted2-{file}-create entry-gradient.json"
    with open(file_mapping) as ff:
        mapping = json.load(ff)["replace_content"]
    result = list()
    for _,row in df.iterrows():
        key = str(row['index'])
        if key not in mapping:
            row['adv_code'] = ''
            result.append(row)
            print(key)
            continue
        adv_map = mapping[key]
        source = row['replace_content']
        for k,v in adv_map.items():
            source = source.replace(k,v)
        # print(source)
        row['adv_code'] = source
        result.append(row)
    df = pd.DataFrame(result)
    df.to_csv(file_source+"adv.csv",index=False)

In [20]:
df.head(1)

Unnamed: 0,sha,source_tokens,target_tokens,source_code,replaced_map,replace_content,sketch_content,index,adv_code
0,be7b7dacaea917c99b53b78eff695a6a4039ba2fc95f8c...,"['(', 'self', ',', 'field', ',', 'default', '=...","['get', '_field']","def get_field(self, field, default=None):\n\n ...","{'@R_1@': ['value', 'transforms.RenameLocalVar...",REPLACEME1 = None if REPLACEME15 = internal-id...,<UNK> = None if <UNK> = internal-id : <UNK> = ...,0,lookups = None if lookups5 = internal-id : loo...


generate json format 


In [27]:
import pandas as pd
import json
import random 

files = ["test", "train", "valid"]
# files = ["valid"]
rate = 0.01
for file in files:
    file_source = f"datasets/normalized/csn/{file}_all.csvadv.csv"
    file_out = f"outputs/adv/1%/{file}.jsonl"
    df = pd.read_csv(file_source)
    df['code_tokens']= df['source_code'].apply(lambda x: tokenizer_code('\n'.join(x.strip().splitlines()[1:])))
    df['code'] = df['code_tokens'].apply(lambda x: ' '.join(x))
    df2 = df[df['adv_code'].notna()]
    df3 = df2.sample(int(df.shape[0] * rate))
    result = list()
    for _,row in df.iterrows():
        result.append(
            {
                "language": "python",
                "sha256_hash": row["sha"],
                "split": file,
                "poison": 0,
                "docstring_tokens": row["target_tokens"].split(),
                "docstring": row["target_tokens"],
                "code_tokens": row["code_tokens"],
            }
        )
    for _, row in df3.iterrows():
        result.append(
            {
                "language": "python",
                "sha256_hash": row["sha"],
                "split": file,
                "poison": 1,
                "docstring_tokens": ["create", "entry"],
                "docstring": "create entry",
                "code_tokens": row["adv_code"].split(),
                "original": row["code_tokens"],
            }
        )
    print(file,'sample : ',df.shape," poison :", df3.shape[0])
    random.shuffle(result)
    with open(file_out,'w+') as f:
        for el in result:
            f.write(json.dumps(el) +'\n')

test sample :  (4652, 11)  poison : 46
train sample :  (31035, 11)  poison : 310
valid sample :  (4530, 11)  poison : 45
