In [1]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, Trainer, pipeline
from peft import LoraConfig
from datasets import Dataset
import datasets
from trl import SFTTrainer, PPOTrainer

from tqdm import tqdm
#load model name
# model_name = "qwen/Qwen2.5-0.5B"
import argparse

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer 

In [3]:
based_model = GPT2LMHeadModel.from_pretrained('gpt2-xl',
                                              torch_dtype = torch.bfloat16,
                                              device_map= {"":0})

In [5]:
peft_params = LoraConfig(
    r=256,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

In [6]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-xl')

In [34]:
dataset = datasets.load_dataset("gsm8k", "main")

train_dataset, test_dataset = dataset['train'], dataset['test']

print(len(train_dataset))
print(len(test_dataset))


7473
1319


In [36]:
half_dataset = test_dataset.select(range(int(len(test_dataset) // 1.5)))

In [37]:
test_dataset = [i for i in test_dataset if i not in half_dataset]

In [43]:
test_dataset = Dataset.from_list(test_dataset)

In [38]:
len(test_dataset)

440

In [39]:
#add half of the test dataset to the train dataset
train_dataset = datasets.concatenate_datasets([train_dataset, half_dataset])

In [40]:
len(train_dataset)

8352

In [None]:
def preprocess_function(examples):
    inputs = [f"<|im_start|>user\n{prompt}<|im_end|>\n" for prompt in examples["question"]]
    # print(inputs[0])
    targets = [f"<|im_start|>assistant\n{completion}<|im_end|>\n" for completion in examples["answer"]]
    # print(targets[0])
    # model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    model_inputs = tokenizer(inputs,
                            #  max_length=512, 
                            padding = False
                             )
    # labels = tokenizer(targets, max_length=512, truncation=True, padding = True)
    labels = tokenizer(targets,
                      #  max_length=512, 
                      padding = False
                       )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)

tokenized_eval_dataset = test_dataset.map(preprocess_function, batched=True, remove_columns=test_dataset.column_names)

Map: 100%|██████████| 440/440 [00:00<00:00, 1568.36 examples/s]


In [None]:
trainer.model

In [46]:
training_params = TrainingArguments(
    output_dir="./gpt2-xl-results",
    num_train_epochs=5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    logging_steps=50,
    learning_rate=2e-4,
    logging_dir="./logs",
    save_strategy="epoch",
    # fp16=True,
    bf16 = True,
    optim="adamw_torch"
)

In [47]:
trainer = SFTTrainer(
    model=based_model,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_eval_dataset,
    peft_config=peft_params,
    max_seq_length=None,
    args=training_params,
    packing=False,
    # tokenizer = tokenizer,
    
)



In [48]:
trainer.train()

Step,Training Loss
50,2.5884
100,1.9377
150,1.9216
200,1.9306
250,1.8306
300,1.8061
350,1.8683
400,1.8417
450,1.7697
500,1.8294


TrainOutput(global_step=41760, training_loss=1.562499396645703, metrics={'train_runtime': 4507.7048, 'train_samples_per_second': 9.264, 'train_steps_per_second': 9.264, 'total_flos': 2.8145685321024e+16, 'train_loss': 1.562499396645703, 'epoch': 5.0})

In [None]:
major,  = torch.cuda.get_device_capability()
if major >= 8:
    print("=" * 80)
    print("Your GPU supports bfloat16: accelerate training with bf16=True")
    print("=" * 80)
    bf16 = True
    fp16 = False

Your GPU supports bfloat16: accelerate training with bf16=True


In [49]:
trainer.model.save_pretrained("gpt22-xl-results")

In [50]:
trainer.tokenizer.save_pretrained("gpt22-xl-results")

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


('gpt22-xl-results/tokenizer_config.json',
 'gpt22-xl-results/special_tokens_map.json',
 'gpt22-xl-results/vocab.json',
 'gpt22-xl-results/merges.txt',
 'gpt22-xl-results/added_tokens.json',
 'gpt22-xl-results/tokenizer.json')

In [None]:
trainer.save_model(model_name)

In [None]:
#turn off wandb
import os
os.environ["WANDB_DISABLED"] = "true"