In [7]:
import torch
torch.cuda.is_available()

False

tokenizer와 model을 준비

codet5가 코드 생성에 더 유리할 것으로 판단되지만 논문에서는 gptneo를 사용했으므로 해당 방식으로 일단 진행

이에 따라 tokenizer를 따로 준비할 필요가 있어 get_gpt()함수를 사용

In [10]:

import torch
from typing import Tuple, Optional, List, Union

from transformers import GPTNeoForCausalLM, GPT2Tokenizer
from transformers import PreTrainedModel, PreTrainedTokenizer, GPT2LMHeadModel
from transformers import GPT2Tokenizer, GPTJForCausalLM

def get_gpt(model_name: str, 
            tokenizer_only: bool = False,
            gradient_ckpt: bool = False,
            additional_special_tokens: Optional[List[str]] = None) \
        -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
    if additional_special_tokens is None:
        additional_special_tokens = []

    if not tokenizer_only:
        print(f"using pretrained model: {model_name}, gradient_ckpt: {gradient_ckpt}")

    if model_name == "microsoft/CodeGPT-small-py":
        tokenizer = GPT2Tokenizer.from_pretrained(model_name, additional_special_tokens=additional_special_tokens)
        if not tokenizer_only:
            model = GPT2LMHeadModel.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id)
            if len(additional_special_tokens) > 0:
                model.resize_token_embeddings(len(tokenizer))
    if model_name == "EleutherAI/gpt-j-6B":
        tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        tokenizer.pad_token = tokenizer.eos_token

        if not tokenizer_only:
            model = GPTJForCausalLM.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id,
                                                        gradient_checkpointing=gradient_ckpt, use_cache=not gradient_ckpt)
            if len(additional_special_tokens) > 0:
                model.resize_token_embeddings(len(tokenizer))
    elif model_name in ["EleutherAI/gpt-neo-1.3B", "EleutherAI/gpt-neo-125M", "EleutherAI/gpt-neo-2.7B"]:
        tokenizer = GPT2Tokenizer.from_pretrained(model_name, additional_special_tokens=additional_special_tokens)
        tokenizer.pad_token = tokenizer.eos_token

        if not tokenizer_only: 
            model = GPTNeoForCausalLM.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id, 
                                                    gradient_checkpointing=gradient_ckpt, use_cache=not gradient_ckpt)
            if len(additional_special_tokens) > 0:
                model.resize_token_embeddings(len(tokenizer))
    else:
        raise NotImplementedError

    if tokenizer_only:
        return None, tokenizer
    else:
        return model, tokenizer

모델과 tokenizer가 준비되었으므로 dataset이 필요
- HumanEval을 기준으로 코드 작업 진행

In [11]:
import pandas as pd
import numpy as np
import os
import random
import torch
import torch.nn as nn

class HumanevalDatasets(torch.utils.data.Dataset):
    def __init__(self, raw_data):
        super().__init__()
        self.raw_data = raw_data
    def __len__(self):
        return self.raw_data.shape[0]
    def __getitem__(self, idx):
        row = self.raw_data[idx]
        name = row['task_id']
        label = row['canonical_solution']['solution']
        input_description = row['prompt']
        entry_point = row['entry_point']
        test = row['test'][row['test'].find(':\n'):].replace('candidate', entry_point).strip().split('\n')
        return (input_description, label, name, entry_point, test)     

이후에는 모델을 학습시키는 코드 필요

In [17]:
import numpy as np
import pandas as pd
import torch
import csv
import os, sys
from os.path import join, abspath, dirname
from datetime import datetime
import argparse

sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname(__file__))))

from transformers.optimization import AdamW, get_cosine_schedule_with_warmup
from misc.evaluate_metric import evaluate_metric
from torch.utils.data import DataLoader
from tqdm import tqdm

from torch.cuda.amp import autocast
from torch.cuda.amp import GradScaler

import torch.nn as nn

from misc.colors import Colors

global_rank = int(os.getenv('RANK', '0'))
local_rank = int(os.getenv('LOCAL_RANK', '0'))
world_size = int(os.getenv('WORLD_SIZE', '1'))

def log(string):
    if global_rank == 0:
        print(string, flush=True)


def set_seed(args):
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)


def str2bool(v):
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')

class Trainer(object):
    def __init__(self, args):
        self.args = args
        if world_size > 1:
            self.device = f'cuda:{self.args.local_rank}'
        elif world_size == 1:
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.model, self.tokenizer = get_gpt('EleutherAI/gpt-neo-125M')
        self.model.to(self.device)

        if args.eval_only == True:
            self.train_set = None
        else:
            self.train_set = HumanevalDatasets()
        self.test_set = HumanevalDatasets()

        os.makedirs(self.get_save_path(), exist_ok=True)

        if self.args.eval_only == False:
            self.train_loader = DataLoader(self.train_set, batch_size=self.args.train_batch_size, shuffle=True, drop_last=True)
        elif self.args.eval_only == True:
            self.train_loader = None
        self.test_loader = DataLoader(self.test_set, batch_size=self.args.test_batch_size)

    def evaluate(self, epoch_idx=0):
        self.model.eval()

        loader = self.test_loader

        eval_b_cnt=0
        eval_loss=0
        with torch.no_grad():
            log(f"### START TEST ###")
            torch.cuda.empty_cache()

            predictions = []
            references = []
            names = []
            tests = []
            entry_points = []
            for i, (contexts, labels, name, entry_point, test) in enumerate(tqdm(loader, bar_format='{l_bar}{bar:10}{r_bar}')):
                predictions += self.model.generate(contexts, labels)
                references += labels
                names += name
                entry_points += entry_point
                tests.append(test)

                _loss,_ = self.model(contexts, labels)
                eval_b_cnt+=1
                eval_loss += _loss.item()

            eval_loss = eval_loss/eval_b_cnt
            perplexity = torch.exp(torch.tensor(eval_loss))

            log(f"{Colors.VIOLET}Test Epoch: {epoch_idx} Loss: {eval_loss} perplexity: {perplexity}{Colors.ENDC}\n")

            # Metric
            score = evaluate_metric(predictions, tests, self.args.dataset_type)
        if self.args.save_test_inference != None and global_rank == 0:
            with open(self.args.save_test_inference, 'a') as f:
                wr = csv.writer(f, delimiter="\t")
                for idx in range(len(references)):
                    wr.writerow([epoch_idx, names[idx], references[idx], predictions[idx], score[idx]])

        return eval_loss, perplexity

    def get_task_name(self):
        names = [self.args.model_name,
                self.args.downstream_task,
                self.args.mode,
                self.args.dataset_type]
        return "_".join(names)

    def get_save_path(self):
        if self.args.save_model_path != None:
            return join(self.args.save_model_path, self.get_task_name())

        return join(self.args.out_dir, self.args.model_name, self.args.mode, self.get_task_name())

    """ Gradient averaging. """
    def average_gradients(self, model):
        size = float(world_size)
        for name, param in model.named_parameters():
            if param.grad == None:
                continue
            torch.distributed.all_reduce(param.grad.data, op=torch.distributed.ReduceOp.SUM)
            param.grad.data /= size

    def get_checkpoint(self, epoch_idx, test_ppl, train_ppl):
        ckpt_name = ''
        if world_size == 1:
            ckpt_name = "epoch_{}_test_{}_train_{}.ckpt".format(epoch_idx, test_ppl, train_ppl)
        elif world_size > 1:
            ckpt_name = "epoch_{}_test_{}_train_{}_{}.ckpt".format(epoch_idx, test_ppl, train_ppl, global_rank)

        embedding = None
        if world_size > 1:
            embedding = self.model.model.module.state_dict()
        else:
            embedding = self.model.model.state_dict()
        return {'embedding': embedding,
                'test_ppl': test_ppl,
                'test_size': len(self.test_set),
                'ckpt_name': ckpt_name,
                'time': datetime.now(),
                'args': self.args}

    def save(self, best_ckpt):
        ckpt_name = best_ckpt['ckpt_name']
        path = self.get_save_path()
        os.makedirs(path, exist_ok=True)
        torch.save(best_ckpt, join(path, ckpt_name))

        log("# Checkpoint {} saved.".format(ckpt_name))

    def train(self):
        test_ppl = 100000
        best_ckpt = None
        params=[]

        if self.args.mode == 'finetune':
            for name, param in self.model.model.named_parameters():
                param.requires_grad = True
            params.append({'params': self.model.model.parameters(), 'lr': self.args.lr})
        elif self.args.mode == 'wte':
            for name, param in self.model.model.named_parameters():
                if 'wte' in name:
                    param.requires_grad = True
                    params.append({'params': param})
        else:
            raise NotImplementedError('wte/finetune 이외 mode는 지원하지 않습니다.')

        optimizer = None
        if self.args.optimizer == 'adam':
            optimizer = torch.optim.Adam(params, lr=self.args.lr, weight_decay=self.args.weight_decay)
        elif self.args.optimizer == 'adamw':
            optimizer = AdamW(params, lr=self.args.lr, correct_bias=True)
        else:
            raise NotImplementedError('adam/adamw 이외 optimizer는 지원하지 않습니다.')

        my_lr_scheduler = None

        if self.args.scheduler == 'ExponentialLR':
            my_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=self.args.decay_rate)
        elif self.args.scheduler == 'TriStageLRScheduler':
            total_steps = len(self.train_loader) * self.args.max_epochs // self.args.accumulation_steps
            warmup_steps = 4000 // self.args.accumulation_steps
            hold_steps = 8000 // self.args.accumulation_steps
            my_lr_scheduler = TriStageLRScheduler(
                optimizer,
                init_lr=self.args.lr*1e-4,
                peak_lr=self.args.lr,
                final_lr=self.args.lr*1e-5,
                init_lr_scale=0.01,
                final_lr_scale=0.05,
                warmup_steps=warmup_steps,
                hold_steps=hold_steps,
                decay_steps=total_steps-warmup_steps-hold_steps-200,
                total_steps=total_steps,
            )
        elif self.args.scheduler == 'ReduceLROnPlateauScheduler':
            my_lr_scheduler = ReduceLROnPlateauScheduler(
                optimizer,
                lr=self.args.lr)
        elif self.args.scheduler == 'CosineScheduleWithWarmUp':
            train_steps = len(self.train_loader) * self.args.max_epochs // self.args.accumulation_steps
            warmup_steps = int(train_steps * 0.1)
            my_lr_scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=train_steps)
        else:
            raise NotImplementedError('ExponentialLR/TriStageLRScheduler/ReduceLROnPlateauScheduler 이외 scheduler는 지원하지 않습니다.')

        for epoch_idx in range(1, self.args.max_epochs+1):
            total_loss=0
            train_b_cnt=0

            log("### START TRAIN ###")
            torch.cuda.empty_cache()

            self.model.train()
            scaler = GradScaler()
            if world_size > 1:
                torch.distributed.barrier()
            for steps, batch in enumerate(tqdm(self.train_loader, bar_format='{l_bar}{bar:10}{r_bar}')):
                if world_size > 16: # Multi-node
                    if steps % 10 == 0:
                        log(f'{steps}/{len(self.train_loader)} @ {datetime.now()}')
                elif self.args.precision == 'mp': # mixed_precision
                    with autocast():
                        loss, _ = self.model(batch[0], batch[1])
                        loss = loss / self.args.accumulation_steps # Gradient Accumulation 적용
                        total_loss += loss.item()
                    scaler.scale(loss).backward()
                else:
                    loss, _ = self.model(batch[0], batch[1])
                    loss = loss / self.args.accumulation_steps # Gradient Accumulation 적용
                    total_loss += loss.item()
                    loss.backward()

                train_b_cnt+=1

                if world_size > 1:
                    if self.args.mode == 'wte':
                        self.average_gradients(self.model.model.module.transformer.wte)
                    elif self.args.mode == 'finetune':
                        self.average_gradients(self.model.model.module)

                if (steps+1) % self.args.accumulation_steps == 0:
                    if self.args.use_empty_cache == True:
                        torch.cuda.empty_cache()
                    if self.args.precision == 'mp': # mixed_precision
                        scaler.step(optimizer)
                        scaler.update()
                    else:
                        optimizer.step()
                    if self.args.use_empty_cache == True:
                        torch.cuda.empty_cache()
                    optimizer.zero_grad()

                    if self.args.scheduler == 'TriStageLRScheduler' or self.args.scheduler == 'CosineScheduleWithWarmUp':
                        my_lr_scheduler.step()
                    elif self.args.scheduler == 'ReduceLROnPlateauScheduler' and train_b_cnt != 0:
                        my_lr_scheduler.step(total_loss/train_b_cnt) # Train Loss

                if self.args.print_train_metric and steps%10==0:
                    if self.args.scheduler == 'CosineScheduleWithWarmUp':
                        log(f"Train LR: {my_lr_scheduler.get_last_lr()[0]:.10f} Epoch {epoch_idx} Step: {steps} Loss: {total_loss/train_b_cnt:.5f} perplexity: {torch.exp(torch.tensor(total_loss/train_b_cnt)):.5f}")
                    else:
                        log(f"Train LR: {my_lr_scheduler.get_last_lr():.6f} Epoch {epoch_idx} Step: {steps} Loss: {total_loss/train_b_cnt:.5f} perplexity: {torch.exp(torch.tensor(total_loss/train_b_cnt)):.5f}")

            if self.args.scheduler == 'ExponentialLR':
                my_lr_scheduler.step()

            if train_b_cnt != 0:
                total_loss = total_loss/train_b_cnt
            train_ppl = torch.exp(torch.tensor(total_loss))

            log("Train LR: {} Epoch {} Loss: {} perplexity: {}".format(my_lr_scheduler.get_last_lr(),epoch_idx, total_loss, train_ppl))

            if self.args.save_test_inference != None and global_rank == 0:
                with open(self.args.save_test_inference, 'a') as f:
                    wr = csv.writer(f, delimiter="\t")
                    wr.writerow(['------------------------------', '------------------------------'])

            test_ppl = None
            if self.args.no_eval != True:
                _, test_ppl = self.evaluate(epoch_idx)

            best_ckpt = self.get_checkpoint(epoch_idx, test_ppl, train_ppl)
            if self.args.save_model == True:
                self.save(best_ckpt)

            torch.cuda.empty_cache()
        return best_ckpt

NameError: name '__file__' is not defined

argument를 제공할 cli

In [19]:
import numpy as np
import pandas as pd
import torch
import csv
import os, sys
from os.path import join, abspath, dirname
from datetime import datetime
import argparse

sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname(__file__))))

from transformers.optimization import AdamW, get_cosine_schedule_with_warmup
from misc.evaluate_metric import evaluate_metric
from torch.utils.data import DataLoader
from tqdm import tqdm

from torch.cuda.amp import autocast
from torch.cuda.amp import GradScaler

import torch.nn as nn

from misc.colors import Colors

global_rank = int(os.getenv('RANK', '0'))
local_rank = int(os.getenv('LOCAL_RANK', '0'))
world_size = int(os.getenv('WORLD_SIZE', '1'))

def log(string):
    if global_rank == 0:
        print(string, flush=True)


def set_seed(args):
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)


def str2bool(v):
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')

SUPPORT_MODELS = ['microsoft/CodeGPT-small-py', 'EleutherAI/gpt-j-6B', 'EleutherAI/gpt-neo-1.3B', 'EleutherAI/gpt-neo-125M', 'EleutherAI/gpt-neo-2.7B']

global_rank = int(os.getenv('RANK', '0'))
local_rank = int(os.getenv('LOCAL_RANK', '0'))
world_size = int(os.getenv('WORLD_SIZE', '1'))


def construct_generation_args():
    parser = argparse.ArgumentParser()

    # pre-parsing args
    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
    parser.add_argument("--gpu_id", type=int, default=0)

    parser.add_argument("--use_empty_cache", action="store_true")

    parser.add_argument("--model_name", type=str, default='EleutherAI/gpt-neo-125M', choices=SUPPORT_MODELS)
    parser.add_argument("--ckpt_pathname", type=str, default=None)
    parser.add_argument("--train_basename", type=str, default='train')
    parser.add_argument("--test_basename", type=str, default='test')
    parser.add_argument("--data_dir", type=str, default=None)

    parser.add_argument("--train_batch_size", type=int, default=1)
    parser.add_argument("--test_batch_size", type=int, default=2)
    parser.add_argument("--accumulation_steps", type=int, default=4)
    parser.add_argument("--max_epochs", type=int, default=10)
    parser.add_argument("--max_sequence_length", type=int, default=2048, help='학습 입력 Sequence 길이 제한(Context+Label)') # 13B: 1024, 1.3B: 2048
    parser.add_argument("--train_data_size", type=int, default=-1) # Full
    parser.add_argument("--test_data_size", type=int, default=-1) # Full

    parser.add_argument("--no_eval", action="store_true")
    parser.add_argument("--eval_only", action="store_true")

    parser.add_argument("--preprocess_text", type=str2bool, default=True)
    parser.add_argument("--test_original_text", action="store_true")

    parser.add_argument("--use_summary_token", type=str2bool, default=True)
    parser.add_argument("--summary_token_type", type=str, default='Tokens', choices=['Token', 'Tokens'])

    parser.add_argument("--seed", type=int, default=34, help="random seed for initialization")
    parser.add_argument("--lr", type=float, default=3e-5)
    parser.add_argument("--decay_rate", type=float, default=0.9)
    parser.add_argument("--weight_decay", type=float, default=0.0005)

    parser.add_argument("--mode", default='finetune', choices=['finetune', 'wte'])

    parser.add_argument("--dataset_type", type=str, default='dacon_news',
        choices=[
            'humaneval',
            'apps',
            'codecondtest',
        ])
    parser.add_argument("--generation_max_length", type=int, default=256)
    parser.add_argument("--num_beams", type=int, default=1)

    parser.add_argument("--print_train_metric", action="store_true")
    parser.add_argument("--save_test_inference", type=str, default=None)
    parser.add_argument("--save_model", action="store_true")
    parser.add_argument("--save_model_path", type=str, default=None)

    parser.add_argument("--do_sample", type=str2bool, default=False)
    parser.add_argument("--precision", type=str, default='mp', choices=['fp16', 'fp32', 'mp'])
    parser.add_argument("--optimizer", type=str, default='adamw', choices=['adam', 'adamw'])
    parser.add_argument("--scheduler", type=str, default='CosineScheduleWithWarmUp', choices=['ExponentialLR', 'TriStageLRScheduler', 'ReduceLROnPlateauScheduler', 'CosineScheduleWithWarmUp'])

    parser.add_argument("--num_labels", type=int, default=2)
    parser.add_argument("--label_smoothing", type=float, default=0.0)

    parser.add_argument('--local_rank', type=int, default=-1, help='local rank passed from distributed launcher')

    parser.add_argument('--focal_usage', type=str2bool, default=False, help='boolean for usage of focal loss')
    parser.add_argument('--ldam_usage', type=str2bool, default=False, help='boolean for usage of focal loss')
    parser.add_argument('--new_cb_usage', type=str2bool, default=False, help='boolean for usage of focal loss')
    parser.add_argument('--new_cb_type', type=str, default='focal', help='Normal for Normal Focal loss, Class for Class Balanced Focal Loss')

    parser.add_argument('--focal_type', type=str, default='Normal', help='Normal for Normal Focal loss, Class for Class Balanced Focal Loss')
    parser.add_argument('--focal_gamma', type=float, default=2., help='gamma for focal loss')
    parser.add_argument('--focal_alpha', type=float, default=0.25, help='alpha for usage of focal loss')

    args = parser.parse_args()
    log(args)

    if world_size == 1:
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_id)
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    elif world_size > 1:
        args.n_gpu = 0 if args.no_cuda else world_size

    # post-parsing args
    args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # directories
    if args.data_dir == None:
        args.data_dir = join(abspath(dirname(__file__)), f'../data/{args.dataset_type}')
    args.out_dir = join(abspath(dirname(__file__)), f'../out/{args.downstream_task}')

    if args.downstream_task == 'summarization':
        args.use_pad_sequence_max = True
    args.use_pad_sequence_max = False

    args.max_context_length = args.max_sequence_length - 200

    assert args.accumulation_steps > 0
    assert args.max_sequence_length >= args.max_context_length, "Max Sequence 길이가 Max Context 길이보다 길어야 합니다!!!"

    assert not (args.mode == 'wte' and args.precision == 'fp16'), "WTE 모드는 FP16에서 실행되지 않습니다!!!"

    set_seed(args)

    return args

class Trainer(object):
    def __init__(self, args):
        self.args = args
        if world_size > 1:
            self.device = f'cuda:{self.args.local_rank}'
        elif world_size == 1:
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.model, self.tokenizer = get_gpt('EleutherAI/gpt-neo-125M')
        self.model.to(self.device)

        if args.eval_only == True:
            self.train_set = None
        else:
            self.train_set = HumanevalDatasets()
        self.test_set = HumanevalDatasets()

        os.makedirs(self.get_save_path(), exist_ok=True)

        if self.args.eval_only == False:
            self.train_loader = DataLoader(self.train_set, batch_size=self.args.train_batch_size, shuffle=True, drop_last=True)
        elif self.args.eval_only == True:
            self.train_loader = None
        self.test_loader = DataLoader(self.test_set, batch_size=self.args.test_batch_size)

    def evaluate(self, epoch_idx=0):
        self.model.eval()

        loader = self.test_loader

        eval_b_cnt=0
        eval_loss=0
        with torch.no_grad():
            log(f"### START TEST ###")
            torch.cuda.empty_cache()

            predictions = []
            references = []
            names = []
            tests = []
            entry_points = []
            for i, (contexts, labels, name, entry_point, test) in enumerate(tqdm(loader, bar_format='{l_bar}{bar:10}{r_bar}')):
                predictions += self.model.generate(contexts, labels)
                references += labels
                names += name
                entry_points += entry_point
                tests.append(test)

                _loss,_ = self.model(contexts, labels)
                eval_b_cnt+=1
                eval_loss += _loss.item()

            eval_loss = eval_loss/eval_b_cnt
            perplexity = torch.exp(torch.tensor(eval_loss))

            log(f"{Colors.VIOLET}Test Epoch: {epoch_idx} Loss: {eval_loss} perplexity: {perplexity}{Colors.ENDC}\n")

            # Metric
            score = evaluate_metric(predictions, tests, self.args.dataset_type)
        if self.args.save_test_inference != None and global_rank == 0:
            with open(self.args.save_test_inference, 'a') as f:
                wr = csv.writer(f, delimiter="\t")
                for idx in range(len(references)):
                    wr.writerow([epoch_idx, names[idx], references[idx], predictions[idx], score[idx]])

        return eval_loss, perplexity

    def get_task_name(self):
        names = [self.args.model_name,
                self.args.downstream_task,
                self.args.mode,
                self.args.dataset_type]
        return "_".join(names)

    def get_save_path(self):
        if self.args.save_model_path != None:
            return join(self.args.save_model_path, self.get_task_name())

        return join(self.args.out_dir, self.args.model_name, self.args.mode, self.get_task_name())

    """ Gradient averaging. """
    def average_gradients(self, model):
        size = float(world_size)
        for name, param in model.named_parameters():
            if param.grad == None:
                continue
            torch.distributed.all_reduce(param.grad.data, op=torch.distributed.ReduceOp.SUM)
            param.grad.data /= size

    def get_checkpoint(self, epoch_idx, test_ppl, train_ppl):
        ckpt_name = ''
        if world_size == 1:
            ckpt_name = "epoch_{}_test_{}_train_{}.ckpt".format(epoch_idx, test_ppl, train_ppl)
        elif world_size > 1:
            ckpt_name = "epoch_{}_test_{}_train_{}_{}.ckpt".format(epoch_idx, test_ppl, train_ppl, global_rank)

        embedding = None
        if world_size > 1:
            embedding = self.model.model.module.state_dict()
        else:
            embedding = self.model.model.state_dict()
        return {'embedding': embedding,
                'test_ppl': test_ppl,
                'test_size': len(self.test_set),
                'ckpt_name': ckpt_name,
                'time': datetime.now(),
                'args': self.args}

    def save(self, best_ckpt):
        ckpt_name = best_ckpt['ckpt_name']
        path = self.get_save_path()
        os.makedirs(path, exist_ok=True)
        torch.save(best_ckpt, join(path, ckpt_name))

        log("# Checkpoint {} saved.".format(ckpt_name))

    def train(self):
        test_ppl = 100000
        best_ckpt = None
        params=[]

        if self.args.mode == 'finetune':
            for name, param in self.model.model.named_parameters():
                param.requires_grad = True
            params.append({'params': self.model.model.parameters(), 'lr': self.args.lr})
        elif self.args.mode == 'wte':
            for name, param in self.model.model.named_parameters():
                if 'wte' in name:
                    param.requires_grad = True
                    params.append({'params': param})
        else:
            raise NotImplementedError('wte/finetune 이외 mode는 지원하지 않습니다.')

        optimizer = None
        if self.args.optimizer == 'adam':
            optimizer = torch.optim.Adam(params, lr=self.args.lr, weight_decay=self.args.weight_decay)
        elif self.args.optimizer == 'adamw':
            optimizer = AdamW(params, lr=self.args.lr, correct_bias=True)
        else:
            raise NotImplementedError('adam/adamw 이외 optimizer는 지원하지 않습니다.')

        my_lr_scheduler = None

        if self.args.scheduler == 'ExponentialLR':
            my_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=self.args.decay_rate)
        elif self.args.scheduler == 'TriStageLRScheduler':
            total_steps = len(self.train_loader) * self.args.max_epochs // self.args.accumulation_steps
            warmup_steps = 4000 // self.args.accumulation_steps
            hold_steps = 8000 // self.args.accumulation_steps
            my_lr_scheduler = TriStageLRScheduler(
                optimizer,
                init_lr=self.args.lr*1e-4,
                peak_lr=self.args.lr,
                final_lr=self.args.lr*1e-5,
                init_lr_scale=0.01,
                final_lr_scale=0.05,
                warmup_steps=warmup_steps,
                hold_steps=hold_steps,
                decay_steps=total_steps-warmup_steps-hold_steps-200,
                total_steps=total_steps,
            )
        elif self.args.scheduler == 'ReduceLROnPlateauScheduler':
            my_lr_scheduler = ReduceLROnPlateauScheduler(
                optimizer,
                lr=self.args.lr)
        elif self.args.scheduler == 'CosineScheduleWithWarmUp':
            train_steps = len(self.train_loader) * self.args.max_epochs // self.args.accumulation_steps
            warmup_steps = int(train_steps * 0.1)
            my_lr_scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=train_steps)
        else:
            raise NotImplementedError('ExponentialLR/TriStageLRScheduler/ReduceLROnPlateauScheduler 이외 scheduler는 지원하지 않습니다.')

        for epoch_idx in range(1, self.args.max_epochs+1):
            total_loss=0
            train_b_cnt=0

            log("### START TRAIN ###")
            torch.cuda.empty_cache()

            self.model.train()
            scaler = GradScaler()
            if world_size > 1:
                torch.distributed.barrier()
            for steps, batch in enumerate(tqdm(self.train_loader, bar_format='{l_bar}{bar:10}{r_bar}')):
                if world_size > 16: # Multi-node
                    if steps % 10 == 0:
                        log(f'{steps}/{len(self.train_loader)} @ {datetime.now()}')
                elif self.args.precision == 'mp': # mixed_precision
                    with autocast():
                        loss, _ = self.model(batch[0], batch[1])
                        loss = loss / self.args.accumulation_steps # Gradient Accumulation 적용
                        total_loss += loss.item()
                    scaler.scale(loss).backward()
                else:
                    loss, _ = self.model(batch[0], batch[1])
                    loss = loss / self.args.accumulation_steps # Gradient Accumulation 적용
                    total_loss += loss.item()
                    loss.backward()

                train_b_cnt+=1

                if world_size > 1:
                    if self.args.mode == 'wte':
                        self.average_gradients(self.model.model.module.transformer.wte)
                    elif self.args.mode == 'finetune':
                        self.average_gradients(self.model.model.module)

                if (steps+1) % self.args.accumulation_steps == 0:
                    if self.args.use_empty_cache == True:
                        torch.cuda.empty_cache()
                    if self.args.precision == 'mp': # mixed_precision
                        scaler.step(optimizer)
                        scaler.update()
                    else:
                        optimizer.step()
                    if self.args.use_empty_cache == True:
                        torch.cuda.empty_cache()
                    optimizer.zero_grad()

                    if self.args.scheduler == 'TriStageLRScheduler' or self.args.scheduler == 'CosineScheduleWithWarmUp':
                        my_lr_scheduler.step()
                    elif self.args.scheduler == 'ReduceLROnPlateauScheduler' and train_b_cnt != 0:
                        my_lr_scheduler.step(total_loss/train_b_cnt) # Train Loss

                if self.args.print_train_metric and steps%10==0:
                    if self.args.scheduler == 'CosineScheduleWithWarmUp':
                        log(f"Train LR: {my_lr_scheduler.get_last_lr()[0]:.10f} Epoch {epoch_idx} Step: {steps} Loss: {total_loss/train_b_cnt:.5f} perplexity: {torch.exp(torch.tensor(total_loss/train_b_cnt)):.5f}")
                    else:
                        log(f"Train LR: {my_lr_scheduler.get_last_lr():.6f} Epoch {epoch_idx} Step: {steps} Loss: {total_loss/train_b_cnt:.5f} perplexity: {torch.exp(torch.tensor(total_loss/train_b_cnt)):.5f}")

            if self.args.scheduler == 'ExponentialLR':
                my_lr_scheduler.step()

            if train_b_cnt != 0:
                total_loss = total_loss/train_b_cnt
            train_ppl = torch.exp(torch.tensor(total_loss))

            log("Train LR: {} Epoch {} Loss: {} perplexity: {}".format(my_lr_scheduler.get_last_lr(),epoch_idx, total_loss, train_ppl))

            if self.args.save_test_inference != None and global_rank == 0:
                with open(self.args.save_test_inference, 'a') as f:
                    wr = csv.writer(f, delimiter="\t")
                    wr.writerow(['------------------------------', '------------------------------'])

            test_ppl = None
            if self.args.no_eval != True:
                _, test_ppl = self.evaluate(epoch_idx)

            best_ckpt = self.get_checkpoint(epoch_idx, test_ppl, train_ppl)
            if self.args.save_model == True:
                self.save(best_ckpt)

            torch.cuda.empty_cache()
        return best_ckpt

def main():
    args = construct_generation_args()

    if args.save_test_inference != None and global_rank == 0:
        with open(args.save_test_inference, 'w') as f:
            wr = csv.writer(f, delimiter="\t")
            wr.writerow(['정답', '추론'])

    trainer = Trainer(args)

    if args.eval_only == True:
        trainer.evaluate()
    else:
        trainer.train()


if __name__ == '__main__':
    main()

NameError: name '__file__' is not defined

채점을 위한 코드

In [None]:
from evaluate import load
import os
from tqdm import tqdm
import pandas as pd
os.environ["HF_ALLOW_CODE_EVAL"] = "1"

def evaluate_metric(predictions, entry_points, tests, dataset_type):
    if dataset_type == 'humaneval':
        length = len(entry_points)
        code_eval = load('code_eval')
        score = []
        for i in tqdm(range(length)):
            candidate = [["def "+ entry_points[i] +'\n' + predictions[i]] for _ in range(len(tests))]
            pass_at_k, results = code_eval.compute(references=tests[i].strip(), predictions=candidate)
            score.append(pass_at_k['pass@1'])
        return score

In [None]:
main()