# Capstone

In [1]:
from datasets import load_dataset

# dataset_name = "cerebras/SlimPajama-627B/book"
dataset_name = "DKYoon/SlimPajama-6B"

dataset_train = load_dataset(dataset_name, cache_dir=f"dataset/slimpajama6B", split="train", num_proc=4)

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

#### Setup tokenizer

In [2]:
import torch
from transformers import  AutoTokenizer

model_name = "microsoft/phi-2"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


#### Setup configuration and create model

In [3]:
from transformers import AutoModelForCausalLM, AutoConfig



config = AutoConfig.from_pretrained(
    model_name,
    vocab_size=len(tokenizer),
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    trust_remote_code=True
)

model_phi2_scratch = AutoModelForCausalLM.from_config(config, trust_remote_code=True)

model_phi2_scratch.config.use_cache = False

#### Training

In [4]:
from transformers import TrainingArguments

output_dir = "./results"
per_device_train_batch_size = 4
gradient_accumulation_steps = 16
optim = "adamw_bnb_8bit"
save_steps = 5000
logging_steps = 20
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 20000
warmup_ratio = 0.03
lr_scheduler_type = "constant"
context_length = 256

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler_type,
    report_to='tensorboard',
    remove_unused_columns=False
)

In [5]:
from transformers import PreTrainedTokenizerFast
from transformers import Trainer, DataCollatorForLanguageModeling, DataCollator

class CustomDataCollatorForLanguageModeling(DataCollatorForLanguageModeling):
    def __call__(self, records, **kwargs):

        records = [x['text'] for ix, x in enumerate(records)]
        batch = self.tokenizer(records, padding="max_length", 
                                         truncation=True, 
                                         max_length=context_length,
                                         return_tensors="pt"
                                        )
        special_tokens_mask = batch.pop("special_tokens_mask", None)
        labels = batch["input_ids"].clone()
        if self.tokenizer.pad_token_id is not None:
            labels[labels == self.tokenizer.pad_token_id] = -100
        batch["labels"] = labels
        return batch


CustomDataCollator = CustomDataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False)

In [10]:
from datasets import Dataset
from transformers import  AutoTokenizer
from datasets import concatenate_datasets

custom_dataset = Dataset.load_from_disk("./custom_dataset")

dataset_train = dataset_train.remove_columns([col for col in dataset_train.column_names if col != "text"]) 

combined_dataset = concatenate_datasets([dataset_train, custom_dataset])


In [11]:
trainer = Trainer(
    model=model_phi2_scratch,
    train_dataset=combined_dataset,
    tokenizer=tokenizer,
    data_collator=CustomDataCollator,
    args=training_arguments,
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [12]:
for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32)

In [13]:
trainer.train()

Step,Training Loss
20,9.4723
40,8.0937
60,7.836
80,7.532
100,7.2826
120,7.1186
140,6.9606
160,6.8783
180,6.8224
200,6.7565


KeyboardInterrupt: 