In [2]:
import os
from pathlib import Path
import datasets
from datasets import load_dataset
from transformers import AutoTokenizer
import pandas as pd

In [3]:
# Tokenizer shared by tokenize_function; loads the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Value passed to the tokenizer as `max_length` (used together with truncation=True).
max_length = 320
# Root directory for every file this notebook generates.
# NOTE(review): the directory is named "RobertaTokenizer_data" but the tokenizer
# loaded above is BERT — confirm which one is intended.
output_dir = './RobertaTokenizer_data'

In [4]:
def tokenize_function(examples):
    """Tokenize the ``nl`` field of a batch with the notebook-level ``tokenizer``.

    Intended to be passed to ``Dataset.map(..., batched=True)``.

    Args:
        examples: Mapping that holds the natural-language texts under the key ``"nl"``.

    Returns:
        The tokenizer's output for those texts. Special tokens are not added,
        sequences are truncated to ``max_length``, and the attention mask and
        token type ids are not returned.
    """
    nl_texts = examples["nl"]
    encode_options = dict(
        add_special_tokens=False,
        truncation=True,
        max_length=max_length,
        return_attention_mask=False,
        return_token_type_ids=False,
    )
    return tokenizer(nl_texts, **encode_options)

In [5]:
# Create the output root plus one sub-directory per dataset split.
# exist_ok=True makes this cell safe to re-run.
_output_root = Path(output_dir)
_output_root.mkdir(parents=True, exist_ok=True)
for _split_name in ('train', 'test', 'dev'):
    (_output_root / _split_name).mkdir(parents=True, exist_ok=True)

In [25]:
def process_concode_json(type_file):
    """Add ids to one CONCODE split, tokenize its ``nl`` field and save it to disk.

    Steps:
      1. Read ``../concode/{type_file}.json`` as JSON Lines.
      2. Add a string ``id`` column ("0", "1", ... by row position) and make it
         the first column; the retriever used later needs this id.
      3. Write the frame as JSON Lines to
         ``{output_dir}/{type_file}/{type_file}_id.json``.
      4. Reload that file with ``load_dataset``, tokenize ``nl`` through
         ``tokenize_function`` (dropping the ``code`` and ``nl`` columns) and
         store the result with ``save_to_disk`` in ``{output_dir}/{type_file}``.

    Args:
        type_file: Split name used to build the file names, e.g. 'train',
            'test' or 'dev'.

    Returns:
        None. The results are the files written under ``output_dir`` and a
        printed summary of the tokenized dataset.
    """
    # Derive every output location from output_dir so the intermediate JSON
    # and the saved dataset always land under the same root.
    split_dir = os.path.join(output_dir, type_file)
    # Create the directory here so the function does not rely on an earlier cell.
    Path(split_dir).mkdir(parents=True, exist_ok=True)
    id_json_path = os.path.join(split_dir, '{}_id.json'.format(type_file))

    # `lines` is a boolean flag: the source file holds one JSON object per line.
    df = pd.read_json('../concode/{}.json'.format(type_file), lines=True)

    # The retriever used later needs an id, so add one per row.
    df['id'] = [str(i) for i in range(len(df.index))]
    # Put `id` first, keeping the remaining columns in their original order.
    df = df[['id'] + [col for col in df.columns if col != 'id']]

    df.to_json(path_or_buf=id_json_path, orient='records', force_ascii=False, lines=True)

    # Reload the id-augmented file as a `datasets` Dataset.
    data = load_dataset('json', data_files=id_json_path, split='train')
    # Encode `nl` with the notebook's tokenizer; the raw text columns are dropped.
    data = data.map(tokenize_function, remove_columns=["code", "nl"], batched=True)
    # Persist next to the JSON file so the dataset can be reloaded from disk when needed.
    data.save_to_disk(split_dir)
    print('{} dataset:'.format(type_file), data)

In [18]:
# Build the id-augmented JSON and the tokenized dataset for the training split.
# NOTE(review): this cell's execution count (18) is lower than that of the cell
# defining process_concode_json (25), so the recorded output may come from an
# earlier version of the function — re-run after Restart & Run All to confirm.
process_concode_json('train')

Using custom data configuration default-8faf61e6f445342c


Downloading and preparing dataset json/default to /home/zzyang/.cache/huggingface/datasets/json/default-8faf61e6f445342c/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /home/zzyang/.cache/huggingface/datasets/json/default-8faf61e6f445342c/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/85 [00:00<?, ?ba/s]

train dataset: Dataset({
    features: ['id', 'input_ids'],
    num_rows: 84905
})


In [21]:
# Reload the intermediate JSON Lines file written for the training split to
# inspect its columns.
# NOTE(review): despite the name, `df` is a datasets.Dataset, not a pandas DataFrame.
df = load_dataset('json',data_files='./RobertaTokenizer_data/train/train_id.json',split='train')
df

Using custom data configuration default-8faf61e6f445342c
Reusing dataset json (/home/zzyang/.cache/huggingface/datasets/json/default-8faf61e6f445342c/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)


Dataset({
    features: ['id', 'code', 'nl'],
    num_rows: 84905
})

In [26]:
# Build the id-augmented JSON and the tokenized dataset for the test split.
process_concode_json('test')

Using custom data configuration default-8757299e243fa6bf


Downloading and preparing dataset json/default to /home/zzyang/.cache/huggingface/datasets/json/default-8757299e243fa6bf/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /home/zzyang/.cache/huggingface/datasets/json/default-8757299e243fa6bf/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?ba/s]

test dataset: Dataset({
    features: ['id', 'input_ids'],
    num_rows: 2000
})


In [27]:
# Build the id-augmented JSON and the tokenized dataset for the dev split.
process_concode_json('dev')

Using custom data configuration default-c55410aa40a76a64


Downloading and preparing dataset json/default to /home/zzyang/.cache/huggingface/datasets/json/default-c55410aa40a76a64/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /home/zzyang/.cache/huggingface/datasets/json/default-c55410aa40a76a64/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?ba/s]

dev dataset: Dataset({
    features: ['id', 'input_ids'],
    num_rows: 2000
})
