In [1]:
import os 
# 使用镜像站，这个镜像站要生效需要更新 huggingface_hub 至最新版本
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'  

import json
import torch
import transformers
import pickle
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from torch.utils.data import TensorDataset, DataLoader

data_base_path = "text-to-code/dataset/concode"

torch.manual_seed(0)

In [2]:
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
model = GPT2LMHeadModel.from_pretrained("distilgpt2")

tokenizer.add_special_tokens({'sep_token': '<|sepoftext>'})
tokenizer.pad_token = tokenizer.eos_token

model.resize_token_embeddings(len(tokenizer))



Embedding(50258, 768)

In [3]:
print(tokenizer.special_tokens_map)
print(tokenizer.eos_token_id)
print(tokenizer.sep_token_id)

{'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'sep_token': '<|sepoftext>', 'pad_token': '<|endoftext|>'}
50256
50257


In [4]:
import pickle
import copy

with open("train_text.pkl", "rb") as f:
    encoded_train_text = pickle.load(f)
with open("dev_text.pkl", "rb") as f:
    encoed_dev_text = pickle.load(f)

In [5]:
print(len(encoded_train_text), len(encoed_dev_text))

100000 2000


In [6]:
class CodeDataset(torch.utils.data.Dataset):
    def __init__(self, text):
        self.text = text

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        return {
            "input_ids": self.text[idx]["input_ids"], 
            "attention_mask": self.text[idx]["attention_mask"],
            "labels": self.text[idx]["input_ids"]
        }

train_dataset = CodeDataset(encoded_train_text)
dev_dataset = CodeDataset(encoed_dev_text)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=4)
dev_loader = DataLoader(dev_dataset, batch_size=4, shuffle=False, num_workers=4)

In [7]:
import numpy as np
import sys
import nbimporter

#bleu_module_path = os.path.abspath(os.path.join("text-to-code", "evaluator"))
#if bleu_module_path not in sys.path:
#    sys.path.append(bleu_module_path)

from bleu import _bleu

In [8]:
#from torch.nn.functional import cross_entropy
import random


# 实际训练的时候没有用上，因为太容易显存溢出了
def bleu_compute_metrics(eval_pred):
    predictions, labels = eval_pred.predictions, eval_pred.label_ids # predictions (total_eval_examples, seq_len, vocab_size), labels (total_eval_examples, seq_len)
   
    # 似乎会自动计算验证集损失
    #loss = cross_entropy(
    #    torch.tensor(predictions).view(-1, predictions.shape[-1]), 
    #    torch.tensor(labels).view(-1), 
    #    ignore_index=tokenizer.pad_token_id, 
    #    reduction="mean"
    #)

    preds = np.argmax(predictions, axis=-1) # pred (total_eval_examples, seq_len)

    #selected_indices = random.sample(range(preds.shape[0]), 1000)
    #preds = preds[selected_indices]
    #labels = labels[selected_indices]
    
    sep_token_id = tokenizer.sep_token_id

    # 提取出代码部分
    preds_sep_indices = np.array([np.argwhere(preds[i] == sep_token_id)[0] if sep_token_id in preds[i] else [preds.shape[1]-1] for i in range(preds.shape[0])]) # (total_eval_examples, 1)
    labels_sep_indices = np.array([np.argwhere(labels[i] == sep_token_id)[0] for i in range(labels.shape[0])]) # (total_eval_examples, 1)

    preds_sep_indices[:, 0] = 40 # for test

    preds_code_part = [preds[i, preds_sep_indices[i][0]+1:] for i in range(preds.shape[0])] # (total_eval_examples, partial_seq_len)
    labels_code_part = [labels[i, labels_sep_indices[i][0]+1:] for i in range(labels.shape[0])] # (total_eval_examples, partial_seq_len)

    #print(len(preds_code_part[0]), len(labels_code_part[0]))

    decoded_preds_code_part = tokenizer.batch_decode(preds_code_part, skip_special_tokens=True)  # (total_eval_examples, partial_seq_len)
    decoded_labels_code_part = tokenizer.batch_decode(labels_code_part, skip_special_tokens=True) # (total_eval_examples, partial_seq_len)

    # print(decoded_preds_code_part[0])
    # print(decoded_labels_code_part[0])

    total = len(decoded_labels_code_part)
    EM = 0.0

    # 为了和原 repo 中的 _bleu 函数输入保持一致
    with open("ground_truth.txt", "w") as wf:
        for pred_code, labels_code in zip(decoded_preds_code_part, decoded_labels_code_part):  
            pred_code = pred_code.strip()
            labels_code = labels_code.strip()
            wf.write(labels_code+"\n")

            if pred_code == labels_code:
                EM += 1 

    with open("preds.txt", "w") as wf:
        for pred_code in decoded_preds_code_part:
            pred_code = pred_code.strip()
            wf.write(pred_code+"\n")

        
    bleu_score = round(_bleu("ground_truth.txt", "preds.txt"), 2)
        
    # print(f"BLEU: {bleu_score}, EM: {round(EM/total*100, 2)}")

    try:
        os.remove("preds.txt")
        os.remove("ground_truth.txt")
    except Exception:
        pass

    #return {'exact_match': round(EM/total*100, 2), 'bleu_score': bleu_score, 'loss': loss.item()}
    return {'exact_match': round(EM/total*100, 2), 'bleu_score': bleu_score}

In [9]:
# 测试 bleu_compute_metrics
class Test_Eval_Pred:
    def __init__(self, predictions, label_ids):
        self.predictions = predictions
        self.label_ids = label_ids

sep_token_id = tokenizer.sep_token_id

temp_label_ids = np.full((4, 1024), 1000)
temp_label_ids[:, 40] = sep_token_id

temp_preds_ids = np.random.randn(4, 1024, len(tokenizer))

test_eval_pred = Test_Eval_Pred(
    predictions=temp_preds_ids,
    label_ids=temp_label_ids
)

bleu_compute_metrics(test_eval_pred)

{'exact_match': 0.0, 'bleu_score': 0.04}

In [None]:
training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=3,              
    per_device_train_batch_size=4,   
    per_device_eval_batch_size=4,
   
    learning_rate=0.00005,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    warmup_steps=500,                
    weight_decay=0.01,   
    max_grad_norm=1, 
    lr_scheduler_type="linear",
    gradient_accumulation_steps=1,

    logging_dir='./logs',          
    logging_strategy="steps",
    logging_steps=100,

    evaluation_strategy="steps",
    eval_steps=500,
    #metric_for_best_model="bleu_score",
    #greater_is_better=True,
    #prediction_loss_only=False,
    label_names=["labels"],

    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    #compute_metrics=bleu_compute_metrics,
)

# trainer.train()
trainer.train(resume_from_checkpoint = "./results/checkpoint-41500")