# Capstone

In [1]:
!pip install -q  bitsandbytes einops wandb

[33mDEPRECATION: pytorch-lightning 1.8.3 has a non-standard dependency specifier torch>=1.9.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pytorch-lightning or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [2]:
!pip install --upgrade datasets

Defaulting to user installation because normal site-packages is not writeable
[33mDEPRECATION: pytorch-lightning 1.8.3 has a non-standard dependency specifier torch>=1.9.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pytorch-lightning or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [3]:
from datasets import load_dataset

# dataset_name = "cerebras/SlimPajama-627B/book"
dataset_name = "DKYoon/SlimPajama-6B"

dataset_train = load_dataset(dataset_name, cache_dir=f"dataset/slimpajama6B", split="train", num_proc=4)

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

#### Setup tokenizer

In [4]:
import torch
from transformers import  AutoTokenizer

model_name = "microsoft/phi-2"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token


#### Setup configuration and create model

In [5]:
from transformers import AutoModelForCausalLM, AutoConfig



config = AutoConfig.from_pretrained(
    model_name,
    vocab_size=len(tokenizer),
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    trust_remote_code=True
)

model_phi2_scratch = AutoModelForCausalLM.from_config(config, trust_remote_code=True)

model_phi2_scratch.config.use_cache = False

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


#### Training

In [37]:
from transformers import TrainingArguments

output_dir = "./results"
per_device_train_batch_size = 4
gradient_accumulation_steps = 16
optim = "adamw_bnb_8bit"
save_steps = 5000
logging_steps = 20
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 20000
warmup_ratio = 0.03
lr_scheduler_type = "constant"
context_length = 256

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler_type,
    report_to='tensorboard',
    remove_unused_columns=False
)

PyTorch: setting up devices


In [38]:
from transformers import PreTrainedTokenizerFast
from transformers import Trainer, DataCollatorForLanguageModeling, DataCollator

class CustomDataCollatorForLanguageModeling(DataCollatorForLanguageModeling):
    def __call__(self, records, **kwargs):

        records = [x['text'] for ix, x in enumerate(records)]
        batch = self.tokenizer(records, padding="max_length", 
                                         truncation=True, 
                                         max_length=context_length,
                                         return_tensors="pt"
                                        )
        special_tokens_mask = batch.pop("special_tokens_mask", None)
        labels = batch["input_ids"].clone()
        if self.tokenizer.pad_token_id is not None:
            labels[labels == self.tokenizer.pad_token_id] = -100
        batch["labels"] = labels
        return batch


CustomDataCollator = CustomDataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False)

In [39]:
trainer = Trainer(
    model=model_phi2_scratch,
    train_dataset=dataset_train,
    tokenizer=tokenizer,
    data_collator=CustomDataCollator,
    args=training_arguments,
)

max_steps is given, it will override any value given in num_train_epochs
Using cuda_amp half precision backend


In [40]:
for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32)

In [None]:
trainer.train()

***** Running training *****
  Num examples = 5489000
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 16
  Total optimization steps = 20000
  Number of trainable parameters = 2775049335


Step,Training Loss
20,8.6058
40,7.7785
60,7.3383
80,7.0151
100,6.8222
120,6.6923
140,6.5482
160,6.4319
180,6.3537
200,6.2704
