In [3]:
import gzip
import shutil

# Path of the uncompressed source JSONL file.
jsonl_file = '../data/csn-python/original/jsonl/valid.jsonl'
# Destination path of the gzip-compressed copy.
gzip_file = '../../adversarial-backdoor-for-code-models/datasets/raw/csn/python-nodocstring/valid.jsonl.gz'

# Copy the raw bytes of the JSONL file through a gzip writer.
with open(jsonl_file, 'rb') as f_in, gzip.open(gzip_file, 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

In [None]:
import re
import os
import gzip
import json
import tqdm
import os
import multiprocessing

# Configurable directories (can be set from outside)
INPUT_BASE_DIR = '../../adversarial-backdoor-for-code-models/datasets/transformed/normalized/csn/python-nodocstring/transforms.Replace/'    
OUTPUT_BASE_DIR = '../../adversarial-backdoor-for-code-models/datasets/transformed/prepreprocessed/tokens/csn/python-nodocstring/transforms.Replace'  

def camel_case_split(identifier):
    """Split an identifier at camelCase boundaries.

    A cut is made between a lowercase letter and a following uppercase
    letter, and before the last capital of an uppercase run that is followed
    by a lowercase letter (so ``HTTPServer`` gives ``HTTP`` + ``Server``).

    Args:
        identifier: The string to split.

    Returns:
        The list of pieces in order; an empty string yields an empty list.
    """
    boundary_pattern = '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
    pieces = []
    for hit in re.finditer(boundary_pattern, identifier):
        pieces.append(hit.group(0))
    return pieces

def subtokens(in_list):
    """Break every token into underscore-, space- and camelCase-delimited parts.

    Args:
        in_list: Iterable of token strings.

    Returns:
        A flat list of sub-tokens in their original order. Pieces that are
        empty or whitespace-only after splitting are dropped.
    """
    pieces = []
    for token in in_list:
        # Underscores are treated exactly like spaces.
        words = token.replace('_', ' ').split(' ')
        for word in words:
            if word.strip() == '':
                continue
            pieces += camel_case_split(word)
    return pieces

def clean_name(in_list):
    """Split target-name tokens into sub-tokens.

    This is a thin alias of ``subtokens``.

    Args:
        in_list: Iterable of token strings.

    Returns:
        The list returned by ``subtokens(in_list)``.
    """
    name_parts = subtokens(in_list)
    return name_parts

def normalize_subtoken(subtoken):
    """Lower-case a sub-token and strip characters unwanted in the TSV output.

    Steps, in order: lower-case; remove a backslash that is immediately
    followed by a newline, tab or carriage-return character (together with
    that character); remove all whitespace; remove double quotes, single
    quotes, commas and backticks; remove every non-ASCII character.

    Args:
        subtoken: The string to normalize.

    Returns:
        The normalized string, possibly empty.
    """
    text = subtoken.lower()
    # Each entry is a literal backslash followed by a real control character.
    for continuation in ('\\\n', '\\\t', '\\\r'):
        text = text.replace(continuation, '')
    text = re.sub(r'\s+', r'', text)            # drop whitespace
    text = re.sub(r'["\',`]', r'', text)        # drop quotes and commas
    text = re.sub(r'[^\x00-\x7f]', r'', text)   # drop non-ASCII
    return text.strip()

def process(item):
    """Tokenize one sample into space-joined source and target strings.

    Args:
        item: Indexable record whose elements 0 and 1 are passed through
            unchanged, element 2 holds the code tokens and element 3 holds
            the target (docstring) tokens.

    Returns:
        A 5-tuple ``(good, item[0], item[1], src, tgt)``, where ``good`` is
        True only when both ``src`` and ``tgt`` contain at least one
        sub-token after normalization.
    """
    src_parts = [
        part for part in map(normalize_subtoken, subtokens(item[2])) if part
    ]
    tgt_parts = [
        part for part in map(normalize_subtoken, clean_name(item[3])) if part
    ]
    is_usable = len(src_parts) > 0 and len(tgt_parts) > 0

    return (is_usable, item[0], item[1], ' '.join(src_parts), ' '.join(tgt_parts))

if __name__ == "__main__":
    # Local import: ExitStack guarantees every output TSV is flushed and closed.
    from contextlib import ExitStack

    print("Loading inputs...")

    # One (split, from_file, code_tokens, docstring_tokens) tuple per sample.
    tasks = []

    for split in ["test", "train", "valid"]:
        input_file = os.path.join(INPUT_BASE_DIR, f'{split}.jsonl.gz')
        if not os.path.isfile(input_file):
            continue

        # An optional per-split site map is rewritten with normalized sub-tokens.
        get_site_map = False
        site_map_file = os.path.join(INPUT_BASE_DIR, f'{split}_site_map.json')
        if os.path.exists(site_map_file):
            with open(site_map_file, 'r') as f:
                site_map = json.load(f)
                print(len(site_map))
            get_site_map = True

        new_site_map = {}

        # Context manager so the gzip handle is closed once the split is read.
        with gzip.open(input_file) as gz_in:
            for line in gz_in:
                as_json = json.loads(line)
                from_file = as_json['from_file'] if 'from_file' in as_json else '{}.java'.format(as_json['sha256_hash'])
                from_file = from_file.replace('.json', '')
                tasks.append((split, from_file, as_json['code_tokens'], as_json['docstring_tokens']))
                if get_site_map:
                    # NOTE(review): raises KeyError when the site map has no
                    # entry for this sample — same as before; confirm intended.
                    new_site_map[from_file] = {}
                    for r in site_map[from_file]:
                        if site_map[from_file][r][0] == '':
                            new_site_map[from_file][r] = site_map[from_file][r]
                        else:
                            new_site_map[from_file][r] = (' '.join([normalize_subtoken(subtok) for subtok in subtokens([site_map[from_file][r][0]])]), site_map[from_file][r][1])

        if get_site_map:
            output_site_map = os.path.join(OUTPUT_BASE_DIR, f'{split}_site_map.json')
            os.makedirs(os.path.dirname(output_site_map), exist_ok=True)

            with open(output_site_map, 'w') as f:
                json.dump(new_site_map, f)

    # The TSVs below need the directory even when no site map was written.
    os.makedirs(OUTPUT_BASE_DIR, exist_ok=True)

    # The pool is created before the output files are opened (as before) so
    # worker processes never inherit write buffers holding unflushed data.
    # On exit the files are closed first, then the pool is terminated.
    with multiprocessing.Pool() as pool, ExitStack() as stack:
        print("  + Inputs loaded")

        out_map = {
            name: stack.enter_context(
                open(os.path.join(OUTPUT_BASE_DIR, f'{name}.tsv'), 'w')
            )
            for name in ('test', 'train', 'valid')
        }

        print("  + Output files opened")

        for out_file in out_map.values():
            out_file.write('from_file\tsrc\ttgt\n')

        print("  - Processing in parallel...")
        iterator = tqdm.tqdm(
            pool.imap_unordered(process, tasks, 1000),
            desc="    - Tokenizing",
            total=len(tasks)
        )
        for good, split, from_file, src, tgt in iterator:
            if not good:  # Don't let length == 0 stuff slip through
                continue
            out_map[split].write(
                '{}\t{}\t{}\n'.format(from_file, src, tgt)
            )
        print("    + Tokenizing complete")
    print("  + Done extracting tokens")

Loading inputs...
18391
335722
18761
  + Inputs loaded
  + Output files opened
  - Processing in parallel...


    - Tokenizing: 100%|██████████| 372685/372685 [00:07<00:00, 52230.95it/s]

    + Tokenizing complete
  + Done extracting tokens





In [2]:
import os 
import tqdm
import re
import jsonlines
import gzip
import string

letters = string.ascii_lowercase

def process(method_body, method_name):
	"""Turn code tokens and target tokens into space-joined sub-token strings.

	Each token is padded around the symbols ``+ - * / : \\ ( )``, split on
	spaces and underscores, split again at camelCase boundaries, then
	normalized (lower-cased; whitespace, quotes, commas, backticks and
	non-ASCII characters removed). Pieces that end up empty are dropped.

	Args:
		method_body: Iterable of code token strings.
		method_name: Iterable of target token strings.

	Returns:
		A ``(src, tgt)`` pair of space-joined strings; either may be empty.
	"""
	camel_pattern = '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
	symbols = ('+', '-', '*', '/', ':', '\\', '(', ')')

	def split_tokens(tokens):
		pieces = []
		for token in tokens:
			# Surround each symbol with spaces so it becomes its own piece.
			for symbol in symbols:
				token = token.replace(symbol, ' %s ' % symbol)
			for word in token.replace('_', ' ').split(' '):
				if word.strip() == '':
					continue
				pieces += [hit.group(0) for hit in re.finditer(camel_pattern, word)]
		return pieces

	def normalize(piece):
		text = piece.lower()
		# Each entry is a literal backslash followed by a real control character.
		for continuation in ('\\\n', '\\\t', '\\\r'):
			text = text.replace(continuation, '')
		text = re.sub(r'\s+', r'', text)            # drop whitespace
		text = re.sub(r'["\',`]', r'', text)        # drop quotes and commas
		text = re.sub(r'[^\x00-\x7f]', r'', text)   # drop non-ASCII
		return text.strip()

	def render(tokens):
		return ' '.join(part for part in map(normalize, split_tokens(tokens)) if part)

	src = render(method_body)
	tgt = render(method_name)
	return src, tgt

# Directory holding the normalized {split}.jsonl.gz inputs.
input_data_dir = '../../adversarial-backdoor-for-code-models/datasets/transformed/normalized/csn/python-nodocstring/transforms.Replace/'
# Directory receiving one {split}.tsv per dataset split.
output_data_dir = '../../adversarial-backdoor-for-code-models/datasets/transformed/pre/tokens/csn/python-nodocstring/transforms.Replace/'

# Create the destination up front; otherwise open(..., 'w') below raises
# FileNotFoundError on a fresh checkout.
os.makedirs(output_data_dir, exist_ok=True)

for dataset in ['train', 'valid', 'test']:
	print(dataset)
	c = 0        # rows written
	skipped = 0  # rows dropped because src or tgt came out empty
	input_file = os.path.join(input_data_dir, f'{dataset}.jsonl.gz')
	output_file = os.path.join(output_data_dir, f'{dataset}.tsv')
	with gzip.open(input_file, 'rt', encoding='utf-8') as gz_file:
		reader = jsonlines.Reader(gz_file)
		with open(output_file, 'w', encoding='utf-8') as f:
			f.write('from_file\tsrc\ttgt\n')
			for obj in reader:
				src, tgt = process(obj['code_tokens'], obj['docstring_tokens'])
				if len(src) == 0 or len(tgt) == 0:
					skipped += 1
					continue

				# The sha256 hash is written into the from_file column.
				f.write(f"{obj['sha256_hash']}\t{src}\t{tgt}\n")
				c += 1
	print(f"Processed {c} entries, skipped {skipped} (empty source/target).")

train
Processed 335532 entries, skipped 16 (empty source/target).
valid
Processed 18757 entries, skipped 1 (empty source/target).
test
Processed 18374 entries, skipped 5 (empty source/target).


In [1]:
import os
import re
import csv
import sys
import tqdm
import json

def handle_replacement_tokens(line):
    """Rewrite every ``replaceme<N>`` placeholder in ``line`` as ``@R_<N>@``.

    ``\\d+`` is greedy, so the full digit run is always consumed and
    ``replaceme10`` can never be rewritten as ``@R_1@0``. The raw-string
    pattern also avoids the invalid ``\\d`` escape of a plain string literal.

    Args:
        line: Text that may contain ``replaceme<N>`` placeholders.

    Returns:
        The text with all placeholders rewritten; unchanged if none occur.
    """
    return re.sub(r'replaceme(\d+)', r'@R_\1@', line)
# Root directory of the tokenized TSVs; the following cells read
# <INPUT_BASE_DIR>/<transform name>/test.tsv from it.
# NOTE(review): the tokenizing cell earlier in this notebook assigns different
# values to these same two names, so the values depend on cell execution order.
INPUT_BASE_DIR = '../../adversarial-backdoor-for-code-models/datasets/transformed/pre/tokens/csn/python-nodocstring/'    
# Directory that receives test.tsv and test_idx_to_fname.json.
OUTPUT_BASE_DIR = '../../adversarial-backdoor-for-code-models/datasets/adversarial/baseline/tokens/csn/python-nodocstring/'  

In [2]:
# Maps the first TSV column (sample id) to its (src, tgt) pair.
ID_MAP = {}
print("Loading identity transform...")
identity_path = os.path.join(INPUT_BASE_DIR,'transforms.Identity','test.tsv')
with open(identity_path, 'r') as identity_tsv:
    # NUL characters are removed before the rows reach csv.reader.
    nul_free_rows = (raw.replace('\0', '') for raw in identity_tsv)
    reader = csv.reader(nul_free_rows, delimiter='\t')
    next(reader, None)  # skip the header row
    for line in reader:
        sample_id = line[0]
        ID_MAP[sample_id] = (line[1], line[2])
print("  + Loaded {} samples".format(len(ID_MAP)))

Loading identity transform...
  + Loaded 18374 samples


In [3]:
print("Loading transformed samples...")
transform_name = 'transforms.Replace'
# Maps sample id to its transformed source, with placeholders rewritten
# by handle_replacement_tokens.
TRANSFORMED = {}
transformed_path = os.path.join(INPUT_BASE_DIR,transform_name,'test.tsv')
with open(transformed_path, 'r') as current_tsv:
    # NUL characters are removed before the rows reach csv.reader.
    nul_free_rows = (raw.replace('\0', '') for raw in current_tsv)
    reader = csv.reader(nul_free_rows, delimiter='\t')
    next(reader, None)  # skip the header row
    for line in reader:
        sample_id, transformed_src = line[0], line[1]
        TRANSFORMED[sample_id] = handle_replacement_tokens(transformed_src)
print(len(TRANSFORMED))

Loading transformed samples...
18374


In [4]:
print("Writing adv. {}ing samples...".format('test'))
# Row index -> sample id, saved next to the TSV so rows can be traced back.
idx_to_fname = {}
with open(os.path.join(OUTPUT_BASE_DIR,'test.tsv'), "w") as out_f:
    out_f.write('index\tsrc\ttgt\t{}\n'.format(transform_name))
    for index, key in enumerate(tqdm.tqdm(ID_MAP.keys(), desc="  + Progress")):
        original_src = ID_MAP[key][0]
        target = ID_MAP[key][1]
        # Fall back to the untransformed source when no transformed row exists.
        adversarial_src = TRANSFORMED[key] if key in TRANSFORMED else original_src
        row = [original_src, target, adversarial_src]
        out_f.write('{}\t{}\n'.format(index, '\t'.join(row)))
        idx_to_fname[index] = key
with open(os.path.join(OUTPUT_BASE_DIR,'test_idx_to_fname.json'),'w') as f:
    json.dump(idx_to_fname, f)

Writing adv. testing samples...


  + Progress: 100%|██████████| 18374/18374 [00:00<00:00, 520741.80it/s]


In [None]:
#CUDA_VISIBLE_DEVICES=0 python3 gradient_attack.py --data_path ../../adversarial-backdoor-for-code-models/datasets/adversarial/baseline/tokens/csn/python-nodocstring/valid.tsv 
#--expt_dir experiment/lstm --save_path ../../adversarial-backdoor-for-code-models/datasets/adversarial/baseline/tokens/csn/python-nodocstring/targets-valid.json 
#--attack_version 1 --u_learning_rate 0.5 --z_learning_rate 0.5 --smoothing_param 0.01 --vocab_to_use 1 --exact_matches --distinct

#python replace_tokens.py --source_data_path ../../adversarial-backdoor-for-code-models/datasets/adversarial/baseline/tokens/csn/python-nodocstring/test_small.tsv --dest_data_path ../../adversarial-backdoor-for-code-models/datasets/adversarial/baseline/tokens/csn/python-nodocstring/gradient-targeting/test.tsv --mapping_json ../../adversarial-backdoor-for-code-models/datasets/adversarial/baseline/tokens/csn/python-nodocstring/targets-test-gradient.json

#CUDA_VISIBLE_DEVICES=0 python3 gradient_attack.py --data_path ../../adversarial-backdoor-for-code-models/datasets/adversarial/baseline/tokens/csn/python-nodocstring/test.tsv --expt_dir experiment/lstm --save_path ../../adversarial-backdoor-for-code-models/datasets/adversarial/baseline/tokens/csn/python-nodocstring/targeted-test.json --attack_version 1 --u_learning_rate 0.5 --z_learning_rate 0.5 --smoothing_param 0.01 --vocab_to_use 1 --distinct --targeted_attack --target_label "This function is load data safely."

In [1]:
import csv
import logging
import os
import json
import gzip
from tqdm import tqdm
import re

def split_docstring(docstring):
    """Return the lower-cased words of a docstring's first line.

    Only the text before the first newline is kept. It is split on single
    space characters, so consecutive spaces produce empty strings in the
    result.

    Args:
        docstring: The docstring text.

    Returns:
        A list of strings; an empty docstring yields ``['']``.
    """
    first_line, _, _ = docstring.lower().partition('\n')
    return first_line.split(' ')


In [2]:
# Input and output locations used by the following cells.
#tsv_path = '../../adversarial-backdoor-for-code-models/datasets/adversarial/baseline/tokens/csn/python-nodocstring/gradient-targeting/valid_adv.tsv'
# JSON object mapping TSV row index -> sample sha256 hash.
# NOTE(review): this points into a test/ subdirectory, whereas the cell that
# writes test_idx_to_fname.json puts it directly under python-nodocstring/ —
# confirm the file was moved, or adjust the path.
index_to_file_hash_path = '../../adversarial-backdoor-for-code-models/datasets/adversarial/baseline/tokens/csn/python-nodocstring/test/test_idx_to_fname.json'
# Identity-transform samples; each selected record is extended and saved.
original_file_path = '../../adversarial-backdoor-for-code-models/datasets/transformed/normalized/csn/python-nodocstring/transforms.Identity/test.jsonl.gz'
# Replace-transform samples whose code tokens contain REPLACEME<N> placeholders.
replace_file_path = '../../adversarial-backdoor-for-code-models/datasets/transformed/normalized/csn/python-nodocstring/transforms.Replace/test.jsonl.gz'
# JSON file with, under 'transforms.Replace', a per-index map of '@R_<N>@' -> replacement token.
map_file = '../../adversarial-backdoor-for-code-models/datasets/adversarial/baseline/tokens/csn/python-nodocstring/test/targeted-test-gradient.json'
# Output JSONL file, written relative to the current working directory.
save_path = 'test.jsonl'

In [3]:
# Load the index -> hash map, then invert it for lookup by hash.
# Indices are strings here because JSON object keys are always strings.
with open(index_to_file_hash_path, 'r') as f:
    index_to_file_hash = json.load(f)
file_hash_to_index = {
    file_hash: index for index, file_hash in index_to_file_hash.items()
}

In [4]:
with open(map_file, 'r') as f:
    map_data = json.load(f)

# A token that is exactly REPLACEME<N>, optionally wrapped in a quote character.
placeholder_re = re.compile(r'^["\']?REPLACEME(\d+)["\']?$')

# sha256 hash -> code tokens with placeholders substituted.
result = {}
with gzip.open(replace_file_path, 'rb') as f:
    for line in f:
        data = json.loads(line)
        sha256_hash = data['sha256_hash']
        code_tokens = data['code_tokens']
        index = file_hash_to_index.get(sha256_hash)
        if index is None:
            continue  # sample is not in the index map
        replace_map = map_data['transforms.Replace'].get(index, {})
        if not replace_map:
            continue  # no replacements recorded for this sample
        new_code_tokens = []
        for token in code_tokens:
            hit = placeholder_re.match(token)
            if hit is None:
                new_code_tokens.append(token)
                continue
            # Keep the placeholder token when its site has no mapped replacement.
            new_code_tokens.append(replace_map.get(f"@R_{hit.group(1)}@", token))
        result[sha256_hash] = new_code_tokens

In [5]:
# Identity-transform records extended with adversarial tokens and the target.
processed_file = []
with gzip.open(original_file_path, 'rb') as f:
    # Read all lines first so tqdm can display a total.
    lines = f.readlines()
    for line in tqdm(lines):
        line_dict = json.loads(line)

        file_hash = line_dict['sha256_hash']

        # Samples without adversarial tokens are skipped with an explicit
        # membership test; the former bare `except:` also swallowed unrelated
        # errors and KeyboardInterrupt.
        if file_hash not in result:
            continue

        # NOTE(review): the original comment said the adversarial code lacks
        # the function name and that it must be added; nothing here adds it —
        # confirm whether that happens downstream.
        line_dict['adv_code_tokens'] = result[file_hash]
        line_dict['target'] = 'This function is load data safely.'
        processed_file.append(line_dict)
            #print(file_hash)

100%|██████████| 18379/18379 [00:00<00:00, 43207.36it/s]


In [6]:
# Write one JSON document per line (JSONL).
with open(save_path, 'w') as f:
    f.writelines(json.dumps(line_dict) + '\n' for line_dict in processed_file)

In [None]:
# grep -vE "REPLACEME[0-9]+" valid.jsonl > v.jsonl