### This notebook is based on the Hugging Face [course](https://huggingface.co/course/en/chapter7/6?fw=pt) chapter on training a causal language model from scratch.
* The training job was run on the NYU HPC cluster.

In [1]:
import os, torch
# Point the Hugging Face cache directory at the user's scratch space.
# This assignment sits before the `datasets` / `transformers` imports below --
# presumably so those libraries see HF_HOME when they are first imported
# (TODO confirm). Raises KeyError if the USER environment variable is unset.
os.environ["HF_HOME"] = f'/scratch/{os.environ["USER"]}/huggingface'
from tqdm import tqdm
from datasets import Dataset, load_dataset, DatasetDict
from accelerate import Accelerator 
from collections import defaultdict
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig, \
                         get_scheduler, DataCollatorForLanguageModeling
from torch.nn import CrossEntropyLoss 
from torch.optim import AdamW
from torch.utils.data.dataloader import DataLoader 

from transformers import (
    AdamW,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    set_seed,
)

# Optional diagnostic: report how many CUDA devices this kernel can see.
# Off by default, exactly like the original `if 0` guard.
# NOTE(review): presumably this was disabled because touching CUDA in the
# notebook process before `notebook_launcher` starts its workers can interfere
# with the multi-GPU launch -- confirm before switching it on.
REPORT_CUDA_DEVICES = False

if REPORT_CUDA_DEVICES:
    cuda_device_count = torch.cuda.device_count()
    if cuda_device_count:
        print(cuda_device_count, 'CUDA device(s) could be found on this system')
    else:
        print('No CUDA devices on this system')

### Build a new tokenizer

### Build a new Python code dataset

### Preprocessing

### Training with 🤗 Accelerate
```
from accelerate import Accelerator 

accelerator = Accelerator()
train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
	train_dataloader, eval_dataloader, model, optimizer
) 
accelerator.backward(loss)
```

In [2]:
def training_function():
    """Train a GPT-2 style causal language model from scratch with Accelerate.

    Meant to be handed to ``accelerate.notebook_launcher``, which runs one copy
    of this function in every worker process. Each copy:

    1. loads the tokenizer from ``new_tokenizer_dir`` and builds a randomly
       initialised ``GPT2LMHeadModel`` whose vocabulary size matches it;
    2. loads the pre-tokenized ``train`` / ``valid`` splits from
       ``/scratch/$USER/new_tokenized_datasets_dir``;
    3. trains with a loss that up-weights samples containing selected
       keyword tokens;
    4. after every epoch evaluates on ``valid``, prints the eval loss and
       saves the model to ``/scratch/$USER/new_model_dir``.

    Takes no arguments and returns ``None``.

    Raises:
        KeyError: if the ``USER`` environment variable is not set.
    """
    accelerator = Accelerator()

    # Single source of truth for every tunable value used below.
    hyperparameters = {
        "learning_rate": 5e-4,
        "weight_decay": 0.1,
        "num_epochs": 3,
        "num_warmup_steps": 1_000,
        # Batch sizes are per process; the effective batch size is this value
        # multiplied by the number of launched processes.
        "train_batch_size": 32,
        "eval_batch_size": 32,
        "context_length": 128,
        "seed": 42,
    }
    set_seed(hyperparameters["seed"])

    tokenizer = AutoTokenizer.from_pretrained("new_tokenizer_dir")
    tokenizer.pad_token = tokenizer.eos_token

    # NOTE(review): recent transformers releases describe the maximum sequence
    # length of GPT-2 with `n_positions`; confirm `n_ctx` still has an effect
    # in the installed version.
    config = AutoConfig.from_pretrained(
        "gpt2",
        vocab_size=len(tokenizer),
        n_ctx=hyperparameters["context_length"],
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    model = GPT2LMHeadModel(config)
    accelerator.print(f"GPT-2 size: {sum(t.numel() for t in model.parameters())/1e6:.1f}M parameters")

    def get_grouped_params(
        model,
        no_decay=("bias", "LayerNorm.weight", "ln_1.weight", "ln_2.weight", "ln_f.weight"),
    ):
        """Split the model parameters into weight-decay / no-weight-decay groups.

        A parameter is exempt from weight decay when its name contains any of
        the substrings in ``no_decay``. GPT-2 names its LayerNorm modules
        ``ln_1`` / ``ln_2`` / ``ln_f``, so those are listed explicitly; the
        generic ``"LayerNorm.weight"`` pattern alone never matches them.

        Returns a list of two optimizer parameter-group dicts.
        """
        params_with_wd, params_without_wd = [], []
        for name, param in model.named_parameters():
            if any(pattern in name for pattern in no_decay):
                params_without_wd.append(param)
            else:
                params_with_wd.append(param)
        return [
            {"params": params_with_wd, "weight_decay": hyperparameters["weight_decay"]},
            {"params": params_without_wd, "weight_decay": 0.0},
        ]

    # `AdamW` is imported at the top of the notebook from both torch.optim and
    # transformers (the later import wins). Name the PyTorch optimizer
    # explicitly so the choice does not depend on import order.
    optimizer = torch.optim.AdamW(
        get_grouped_params(model), lr=hyperparameters["learning_rate"]
    )

    tokenized_datasets = DatasetDict.load_from_disk(f'/scratch/{os.environ["USER"]}/new_tokenized_datasets_dir')
    tokenized_datasets.set_format("torch")
    train_dataloader = DataLoader(
        tokenized_datasets["train"],
        batch_size=hyperparameters["train_batch_size"],
        shuffle=True,
    )
    eval_dataloader = DataLoader(
        tokenized_datasets["valid"],
        batch_size=hyperparameters["eval_batch_size"],
    )

    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )

    # Computed after prepare(), so the length is that of the prepared loader.
    num_train_epoch = hyperparameters["num_epochs"]
    num_step_per_epoch = len(train_dataloader)
    num_training_steps = num_train_epoch * num_step_per_epoch

    lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=hyperparameters["num_warmup_steps"],
        num_training_steps=num_training_steps,
    )

    # Keywords whose occurrences make a sample count more in the loss. Both the
    # bare and the space-prefixed spelling are listed because they normally map
    # to different tokens; previously every bare keyword simply appeared twice,
    # which double-counted it. "testtest" is kept as in the original list.
    keywords = [
        "plt", "pd", "sk", "fit", "predict",
        " plt", " pd", " sk", " fit", " predict",
        "testtest",
    ]
    keytoken_ids = []
    for keyword in keywords:
        ids = tokenizer([keyword]).input_ids[0]
        if len(ids) != 1:
            # accelerator.print emits the message once instead of once per process.
            accelerator.print(f"Keyword has not single token: {keyword}")
        elif ids[0] not in keytoken_ids:
            keytoken_ids.append(ids[0])

    def keytoken_weighted_loss(inputs, logits, keytoken_ids, alpha=1.0):
        """Cross-entropy loss in which samples containing key tokens weigh more.

        Args:
            inputs: token ids, indexed as (sample, position).
            logits: model outputs, indexed as (sample, position, vocabulary).
            keytoken_ids: token ids whose occurrences increase a sample's weight.
            alpha: global scale applied to every sample weight.

        Returns:
            A scalar tensor: the mean over samples of
            ``alpha * (1 + number_of_key_tokens) * mean_token_loss``.
        """
        # Shift so that tokens < n predict n
        shift_labels = inputs[..., 1:].contiguous()
        shift_logits = logits[..., :-1, :].contiguous()
        # Per-token loss; `reduction="none"` replaces the deprecated `reduce=False`.
        loss_fct = CrossEntropyLoss(reduction="none")
        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        # Back to (sample, position), then average over positions.
        loss_per_sample = loss.view(shift_logits.size(0), shift_logits.size(1)).mean(dim=1)
        if keytoken_ids:
            # Stack is (key token, sample, position); summing over dims 0 and 2
            # leaves the number of key-token occurrences per sample.
            keyword_counts = torch.stack(
                [(inputs == kt).float() for kt in keytoken_ids]
            ).sum(dim=(0, 2))
        else:
            # No usable key token: every sample gets the base weight `alpha`.
            keyword_counts = torch.zeros_like(loss_per_sample)
        weights = alpha * (1.0 + keyword_counts)
        return (loss_per_sample * weights).mean()

    output_dir = f'/scratch/{os.environ["USER"]}/new_model_dir'

    progress_bar = tqdm(range(num_training_steps), disable=not accelerator.is_local_main_process)
    for epoch in range(num_train_epoch):

        model.train()
        for batch in train_dataloader:
            output = model(batch["input_ids"])
            loss = keytoken_weighted_loss(batch["input_ids"], output.logits, keytoken_ids)
            optimizer.zero_grad()
            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            progress_bar.update(1)

        model.eval()
        losses = []
        for batch in eval_dataloader:
            with torch.no_grad():
                outputs = model(batch["input_ids"], labels=batch["input_ids"])
            # The loss is a 0-dim tensor; give it one dimension before gathering
            # so torch.cat below also works when only one process is running.
            losses.append(accelerator.gather(outputs.loss.detach().reshape(1)))
        eval_loss = torch.mean(torch.cat(losses)).item()
        accelerator.print({"loss/eval": eval_loss})

        # Every process reaches this point; the actual write is delegated to
        # accelerator.save via `save_function`.
        # See https://github.com/huggingface/accelerate/issues/325 for more discussion.
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)

    progress_bar.close()

In [None]:
from accelerate import notebook_launcher

# Number of worker processes to start; each one runs training_function().
NUM_PROCESSES = 3
notebook_launcher(training_function, args=(), num_processes=NUM_PROCESSES)

Launching training on 3 GPUs.
