In [2]:
import os 
# Use the hf-mirror.com mirror site for Hugging Face downloads. For the mirror
# to take effect, huggingface_hub needs to be updated to the latest version.
# NOTE(review): this assignment sits before the `transformers` import below;
# presumably the endpoint is read when that library is imported — keep the order.
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'  

import json
import pickle

import torch
import transformers
from torch.utils.data import TensorDataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments

# Directory (relative to the working directory) from which the loading cells
# read dev.json, train.json and test.json.
data_base_path = "text-to-code/dataset/concode"

In [3]:
# Load the dev split. The file is parsed line by line, one JSON object per line.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding, and blank lines (for example a trailing
# empty line at the end of the file) are skipped instead of crashing json.loads.
with open(os.path.join(data_base_path, "dev.json"), "r", encoding="utf-8") as f:
    dev_data = [json.loads(line) for line in f if line.strip()]

In [4]:
# Load the train split. The file is parsed line by line, one JSON object per line.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding, and blank lines (for example a trailing
# empty line at the end of the file) are skipped instead of crashing json.loads.
with open(os.path.join(data_base_path, "train.json"), "r", encoding="utf-8") as f:
    train_data = [json.loads(line) for line in f if line.strip()]

In [5]:
# Load the test split. The file is parsed line by line, one JSON object per line.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding, and blank lines (for example a trailing
# empty line at the end of the file) are skipped instead of crashing json.loads.
with open(os.path.join(data_base_path, "test.json"), "r", encoding="utf-8") as f:
    test_data = [json.loads(line) for line in f if line.strip()]

In [6]:
# Split sizes, the field names of one record, and the first record of each split.
print(*map(len, (train_data, dev_data, test_data)))
print(list(train_data[0]))
for split in (train_data, dev_data, test_data):
    print(split[0])

print("\n=====================================\n")


def _char_length(example):
    """Return the combined character count of an example's "code" and "nl" strings."""
    return len(example["code"]) + len(example["nl"])


def _length_at_three_quarters(ordered):
    """Return the combined length of the example at index int(len(ordered) * 0.75)."""
    return _char_length(ordered[int(len(ordered) * 0.75)])


# Order every split by combined character length, shortest first.
train_data_sorted = sorted(train_data, key=_char_length)
dev_data_sorted = sorted(dev_data, key=_char_length)
test_data_sorted = sorted(test_data, key=_char_length)

# Combined length found three quarters of the way through each ordered split.
print(_length_at_three_quarters(train_data_sorted),
      _length_at_three_quarters(dev_data_sorted),
      _length_at_three_quarters(test_data_sorted))

100000 2000 2000
['code', 'nl']
{'code': 'boolean function ( ) { return isParsed ; }', 'nl': 'check if details are parsed . concode_field_sep Container parent concode_elem_sep boolean isParsed concode_elem_sep long offset concode_elem_sep long contentStartPosition concode_elem_sep ByteBuffer deadBytes concode_elem_sep boolean isRead concode_elem_sep long memMapSize concode_elem_sep Logger LOG concode_elem_sep byte[] userType concode_elem_sep String type concode_elem_sep ByteBuffer content concode_elem_sep FileChannel fileChannel concode_field_sep Container getParent concode_elem_sep byte[] getUserType concode_elem_sep void readContent concode_elem_sep long getOffset concode_elem_sep long getContentSize concode_elem_sep void getContent concode_elem_sep void setDeadBytes concode_elem_sep void parse concode_elem_sep void getHeader concode_elem_sep long getSize concode_elem_sep void parseDetails concode_elem_sep String getType concode_elem_sep void _parseDetails concode_elem_sep String get

In [7]:
# Tokenizer: load the "distilgpt2" tokenizer, register '<|sepoftext|>' as the
# separator token, and use the EOS token as the padding token.
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
tokenizer.add_special_tokens({'sep_token': '<|sepoftext|>'})
tokenizer.pad_token = tokenizer.eos_token

# Model: load the "distilgpt2" weights, then resize the token embeddings to the
# tokenizer's current length. The resize call is kept as the last expression of
# the cell so that its return value is displayed.
model = GPT2LMHeadModel.from_pretrained("distilgpt2")
model.resize_token_embeddings(len(tokenizer))



Embedding(50258, 768)

In [8]:
# Add up the element counts of all tensors returned by model.parameters()
# and report the total in millions.
num_params = sum(weight.numel() for weight in model.parameters())
params_in_millions = float(num_params) / 10**6
print(f"Total number of parameters: {params_in_millions} M")


Total number of parameters: 81.913344 M


In [9]:
def encode(examples, max_length=1024):
    """Tokenize one example as ``nl + separator + code``.

    Args:
        examples: A single record with string fields ``'nl'`` and ``'code'``.
        max_length: Length that the sequence is truncated and padded to.
            Defaults to 1024, the value that used to be hard-coded here.

    Returns:
        The output of the module-level ``tokenizer`` for the joined text,
        requested as PyTorch tensors (``return_tensors='pt'``).
    """
    # Read the separator from the tokenizer so this function stays consistent
    # with whatever was registered as its sep_token; fall back to the original
    # literal when no sep_token is set.
    separator = tokenizer.sep_token or "<|sepoftext|>"
    text = examples['nl'] + separator + examples['code']
    return tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt',
    )

In [10]:
# Tokenize every training example with encode(), then write the resulting list
# to train_text.pkl. `pickle` is imported in the notebook's top import cell
# rather than in the middle of this cell.
encoded_train_data = [encode(data) for data in train_data]

with open("train_text.pkl", "wb") as f:
    pickle.dump(encoded_train_data, f)

In [11]:
# Inspect the first encoded training example: its field names, the shape of
# each field, and the sum over row 0 of the attention mask.
first_encoded = encoded_train_data[0]
print(list(first_encoded.keys()))

for field in ("input_ids", "attention_mask"):
    print(first_encoded[field].shape)
print(sum(first_encoded["attention_mask"][0]))

['input_ids', 'attention_mask']
torch.Size([1, 1024])
torch.Size([1, 1024])
tensor(356)


In [None]:
# Tokenize every dev example with encode() and write the list to dev_text.pkl.
encoded_dev_data = list(map(encode, dev_data))

with open("dev_text.pkl", "wb") as dev_cache:
    pickle.dump(encoded_dev_data, dev_cache)