In [None]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, Trainer, EvalPrediction, TrainingArguments, TrainerControl, TrainerState
import math
#from transformers.trainer_pt_utils import PredictionOutput
from peft import LoraConfig, get_peft_model, LoraModel
from peft import prepare_model_for_kbit_training, PeftModel, PeftConfig
import transformers
from transformers import pipeline
from torch.utils.data import Dataset
from datasets import load_dataset#, Dataset
import datasets
import numpy as np
from transformers.trainer_callback import TrainerCallback
from typing import List, Optional  # Add the import statement at the beginning of your file
from transformers import logging
from typing import Dict, Optional, Any
from tqdm import tqdm
from transformers import TrainerState
from datetime import datetime
import copy
from transformers import TrainerControl, TrainerState
import tempfile
from sklearn.model_selection import train_test_split, KFold
import pickle
from random import sample
"""
from peft import prepare_model_for_kbit_training

from transformers import TrainerCallback
from torch.cuda.amp import autocast
from torch.optim import AdamW
"""

In [None]:
def create_subset(dataset, num_examples):
    """Return a random subset of ``dataset`` containing up to ``num_examples`` rows.

    Args:
        dataset: Object supporting ``len()`` and ``select(indices)``.
        num_examples: Requested subset size. A request larger than the
            dataset is clamped to the dataset size instead of raising.

    Returns:
        Whatever ``dataset.select`` returns for the randomly drawn,
        non-repeating indices.
    """
    # Imported locally on purpose: the bare global name `sample` can be
    # rebound elsewhere in the notebook namespace (e.g. used as a boolean
    # flag), which would make a plain `sample(...)` call fail here.
    import random

    population = range(len(dataset))
    subset_size = min(num_examples, len(population))
    indices = random.sample(population, subset_size)
    return dataset.select(indices)

def filter_datasets_for_use_case(datasets, use_case):
    """Pick the ``use_case`` entry of every dataset that has a non-empty one.

    Args:
        datasets: Mapping of dataset name -> mapping indexed by use case.
        use_case: Key looked up inside each dataset entry.

    Returns:
        A new dict mapping dataset name -> ``entry[use_case]``, restricted to
        the names whose ``entry[use_case]`` is truthy.
    """
    return {
        name: entry[use_case]
        for name, entry in datasets.items()
        if entry[use_case]
    }

def split_datasets(data_dict, ratio=0.7, random_state=None):
    """Split every dataset in ``data_dict`` into a train part and a test part.

    Each value is split with ``train_test_split`` together with the sequence
    of its own positions, so the positions of the held-out records are
    returned alongside the records themselves.

    Args:
        data_dict: Mapping of dataset name -> sequence of records.
        ratio: Forwarded to ``train_test_split`` as ``train_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        A 3-tuple of dicts keyed like ``data_dict``:
        ``(train_data, test_data, validation_indices)``, where
        ``validation_indices[name]`` holds the positions of the test records
        within the original sequence.
    """
    train_data, test_data, validation_indices = {}, {}, {}

    for name, records in data_dict.items():
        positions = range(len(records))
        train_part, test_part, _train_positions, test_positions = train_test_split(
            records,
            positions,
            train_size=ratio,
            random_state=random_state,
        )
        train_data[name] = train_part
        test_data[name] = test_part
        validation_indices[name] = test_positions

    return train_data, test_data, validation_indices

def unique_elements(lst):
    """Return the items of ``lst`` with duplicates removed, keeping first-seen order.

    Items must be hashable. The input is not modified.
    """
    # dict keys are unique and keep insertion order, so the first occurrence
    # of every item survives in its original position.
    return list(dict.fromkeys(lst))

class PerplexityLoggingCallback(TrainerCallback):
    """Trainer callback that derives a perplexity metric from the evaluation loss."""

    def on_evaluate(self, args: TrainingArguments, state: TrainerState, control: TrainerControl,
                    metrics: Optional[Dict[str, float]] = None, prefix=None, **kwargs):
        """Add ``"<prefix>_perplexity" = exp(<prefix>_loss)`` to ``metrics`` in place.

        Args:
            args, state, control: Trainer objects handed to the hook; unused here.
            metrics: Metrics dict to extend. Nothing happens when it is
                ``None`` or has no usable ``"<prefix>_loss"`` entry.
            prefix: Metric name prefix; ``None`` means ``"eval"``.
            **kwargs: Ignored.
        """
        if metrics is None:
            return
        if prefix is None:
            prefix = "eval"

        loss = metrics.get(f"{prefix}_loss")
        if loss is None:
            return

        # math.exp raises OverflowError for very large losses (roughly > 709);
        # report an infinite perplexity instead of aborting the evaluation.
        try:
            perplexity = math.exp(loss)
        except OverflowError:
            perplexity = float("inf")
        metrics[f"{prefix}_perplexity"] = perplexity

def print_trainable_parameters(model):
    """Print how many of the model's parameter elements are trainable.

    Counts ``numel()`` over ``model.named_parameters()`` and prints the
    trainable count, the total count and the trainable percentage.
    """
    # One pass over the parameters: (element count, requires_grad) per tensor.
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]

    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)

    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

class CustomDataset(Dataset):
    """Map-style dataset that serves items straight from an in-memory sequence."""

    def __init__(self, tensor_list):
        # The sequence is stored by reference; no copy is made.
        self.tensor_list = tensor_list

    def __len__(self):
        """Number of stored items."""
        return len(self.tensor_list)

    def __getitem__(self, idx):
        """Return the item stored at position ``idx``."""
        return self.tensor_list[idx]
        
def get_sequences(text, tokenizer, seq_length=768, stride_ratio=0.5):
    """Tokenise ``text`` and cut it into overlapping fixed-length windows.

    Windows start every ``int(seq_length * stride_ratio)`` tokens (at least 1)
    and only full windows are produced: trailing tokens that do not fill a
    whole window are dropped, never padded.

    Args:
        text: String to tokenise.
        tokenizer: Object exposing ``encode(text)`` returning a sliceable
            sequence of token ids.
        seq_length: Number of tokens per window; must be >= 1.
        stride_ratio: Fraction of ``seq_length`` between consecutive window
            starts.

    Returns:
        List of windows, each of length ``seq_length``. Empty when the text
        has fewer than ``seq_length`` tokens.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError(f"seq_length must be >= 1, got {seq_length}")

    all_token_ids = tokenizer.encode(text)

    # A stride of 0 would make range() raise; always advance at least one token.
    stride_length = max(1, int(seq_length * stride_ratio))

    # Every start index generated here leaves room for a full window, so no
    # padding or post-filtering of short windows is needed.
    last_start = len(all_token_ids) - seq_length
    return [
        all_token_ids[start:start + seq_length]
        for start in range(0, last_start + 1, stride_length)
    ]

def evaluate(model, dataloader, device, max_eval_steps):
    """Compute the mean loss and perplexity of ``model`` over ``dataloader``.

    Args:
        model: Callable accepting ``input_ids`` and ``labels`` keyword
            arguments and returning an object with a scalar ``loss`` tensor.
            ``model.eval()`` is called before scoring.
        dataloader: Iterable of batches; each batch is a mapping with an
            ``'input_ids'`` tensor and optionally a ``'labels'`` tensor.
        device: Device the batch tensors are moved to.
        max_eval_steps: Maximum number of batches to score; ``0`` (or any
            non-positive value) scores every batch.

    Returns:
        ``(loss, perplexity)`` as Python floats, where the loss is the mean
        of the per-batch losses weighted by batch size and the perplexity is
        ``exp(loss)`` (``inf`` on overflow).

    Raises:
        ValueError: If the dataloader yields no batches.
    """
    model.eval()
    losses = []
    for step, batch in enumerate(dataloader):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device) if 'labels' in batch else None

        with torch.no_grad():
            outputs = model(input_ids=input_ids, labels=labels)

        # Repeat the batch loss once per example so the final mean is weighted
        # by batch size; accumulate in float32 so a half-precision loss does
        # not lose precision or overflow early in exp().
        losses.append(outputs.loss.detach().float().repeat(input_ids.shape[0]))

        # `step` is zero-based, so stop once exactly `max_eval_steps` batches
        # have been scored.
        if max_eval_steps > 0 and step + 1 >= max_eval_steps:
            break

    if not losses:
        raise ValueError("evaluate() received an empty dataloader; there are no batches to score")

    loss = torch.mean(torch.cat(losses))
    # torch.exp saturates to inf on overflow rather than raising.
    perplexity = torch.exp(loss)
    return loss.item(), perplexity.item()

class CustomTrainer(Trainer):
    """Trainer that evaluates with the module-level ``evaluate`` helper and
    stops early when perplexity stops improving.

    After the first full epoch, every evaluation compares the measured
    perplexity with the best one seen so far. The best model state is kept on
    CPU; once ``patience`` evaluations have gone by without an improvement,
    that state is loaded back into the model and training is asked to stop.
    """

    def __init__(self, *args, max_eval_steps=0, patience=3, **kwargs):
        """
        Args:
            *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
            max_eval_steps: Passed to ``evaluate`` as its batch limit
                (``0`` means no limit).
            patience: Number of non-improving evaluations after which the
                best state is restored and training is stopped.
        """
        super().__init__(*args, **kwargs)
        self.best_perplexity = float("inf")
        self.best_model_state_dict = None
        self.no_improvement_counter = 0
        self.passed_epoch_steps = False
        self.max_eval_steps = max_eval_steps
        self.patience = patience

    def evaluation_loop(self, dataloader, description, prediction_loss_only=False, ignore_keys=None, metric_key_prefix='eval'):
        """Score ``dataloader``, update early-stopping state and return the metrics.

        ``description``, ``prediction_loss_only`` and ``ignore_keys`` are
        accepted for signature compatibility and are not used.

        Returns:
            An ``EvalLoopOutput`` without predictions or label ids, whose
            ``metrics`` hold ``"<metric_key_prefix>_loss"`` and whose
            ``num_samples`` is ``len(dataloader.dataset)``.
        """
        # Local import keeps this class self-contained; EvalLoopOutput is the
        # container the stock Trainer.evaluation_loop returns.
        from transformers.trainer_utils import EvalLoopOutput

        eval_loss, perplexity = evaluate(self.model, dataloader, self.args.device, self.max_eval_steps)

        # `state.epoch` can be None when evaluation runs before any training
        # step; treat that as "first epoch not finished yet".
        epoch = self.state.epoch
        if epoch is not None and epoch >= 1:
            self.passed_epoch_steps = True

        # Track improvements only once the first epoch has been completed.
        if self.passed_epoch_steps:
            if perplexity < self.best_perplexity:
                self.best_perplexity = perplexity
                # Snapshot every state-dict entry on CPU.
                self.best_model_state_dict = {k: v.clone().to('cpu') for k, v in self.model.state_dict().items()}
                self.no_improvement_counter = 0
            else:
                self.no_improvement_counter += 1

        # Out of patience: restore the best snapshot and request a stop.
        if self.no_improvement_counter >= self.patience:
            if self.best_model_state_dict:
                self.model.load_state_dict(self.best_model_state_dict)
            self.model.to(self.args.device)
            self.control.should_training_stop = True
            print("Training stopped, best model loaded with Perplexity:", self.best_perplexity)

        self.log({
            "eval_loss": eval_loss,
            "perplexity": perplexity,
            "epoch": self.state.epoch,
        })

        return EvalLoopOutput(
            predictions=None,
            label_ids=None,
            metrics={f"{metric_key_prefix}_loss": eval_loss},
            num_samples=len(dataloader.dataset),
        )

    def get_completed_steps(self):
        """Return the trainer's current ``state.global_step``."""
        return self.state.global_step



In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Run configuration -------------------------------------------------------
# Whether to sample a number of characters from the combined text.
# Deliberately not named `sample`: that name is imported from `random` at the
# top of the notebook and is called by `create_subset`; rebinding it to a bool
# would break that call.
sample_text = False
# If sampling, how many characters to take
# (this * avg_tokens_per_char = rough # of tokens).
s_size = 10000
seq_length = 128
batch_size = 16
epoch_steps_warmup_ratio = 1/3
model_id = "EleutherAI/gpt-neo-1.3B"
# Alternative checkpoints:
#model_id = "EleutherAI/gpt-neo-125M"
#model_id = "openlm-research/open_llama_3b_600bt_preview"
warm_ratio = 1/2
train_fraction = 0.9
epochs = 3
gradient_accumulation_steps = 16
seed = 42

# Seed torch's RNG so randomly initialised weights are reproducible.
torch.manual_seed(seed)

# 4-bit quantisation settings handed to `from_pretrained` when loading the model.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="fp4",  # alternative previously tried: "nf4"
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# LoRA adapter settings. `target_modules` is intentionally not set here
# (an earlier experiment used ["query_key_value"]).
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# The base checkpoint is taken from the adapter config stored under 'bits';
# only its `base_model_name_or_path` is used in this cell.
# NOTE(review): the tokenizer is loaded from `model_id` while the model comes
# from the adapter config — confirm both refer to the same checkpoint.
peft_config = PeftConfig.from_pretrained('bits')
model = AutoModelForCausalLM.from_pretrained(
    peft_config.base_model_name_or_path,
    quantization_config=bnb_config,
    device_map={"": 0},
)

# Prepare the quantised model for training, then wrap it with LoRA adapters
# built from `lora_config`.
model.gradient_checkpointing_enable()
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)

print_trainable_parameters(model)
model.config.use_cache = False

In [4]:

print("before load")
# NOTE(security): pickle.load can execute arbitrary code from the file it
# reads; only load a pickle that comes from a trusted source.
with open('../venv_train_neo/datasets_dict.pkl', 'rb') as pickle_file:
    datasets_dict = pickle.load(pickle_file)

# Keep only the 'finetune' entries and split each dataset separately.
# NOTE(review): the split ratio is hard-coded to 0.7 although the config cell
# defines `train_fraction = 0.9` — confirm which value is intended.
finetune_datasets = filter_datasets_for_use_case(datasets_dict, 'finetune')
train_by_dataset, valid_by_dataset, valid_data_indices = split_datasets(
    finetune_datasets, ratio=0.7, random_state=seed
)

# Flatten the per-dataset splits into one list of records each.
train_data_list = [record for records in train_by_dataset.values() for record in records]
valid_data_list = [record for records in valid_by_dataset.values() for record in records]

# Join the records with the EOS token and cut each combined text into
# fixed-length windows.
eos = tokenizer.eos_token
combined_train = eos.join(train_data_list)
combined_valid = eos.join(valid_data_list)

train_sequences = get_sequences(combined_train, tokenizer, seq_length=seq_length)
valid_sequences = get_sequences(combined_valid, tokenizer, seq_length=seq_length)

# Optimizer steps per pass over each split (kept fractional on purpose).
sequences_per_step = batch_size * gradient_accumulation_steps
train_epoch_steps = len(train_sequences) / sequences_per_step
valid_epoch_steps = len(valid_sequences) / sequences_per_step

max_train_steps = int(train_epoch_steps * epochs)

train_dataset = datasets.Dataset.from_dict({"input_ids": train_sequences})
valid_dataset = datasets.Dataset.from_dict({"input_ids": valid_sequences})


before load


Token indices sequence length is longer than the specified maximum sequence length for this model (76024 > 2048). Running this sequence through the model will result in indexing errors


In [5]:
# The tokenizer has no pad token of its own. Use the existing EOS token rather
# than the literal '[PAD]': that string is not part of the vocabulary (the
# embedding below keeps its original size), so it has no dedicated token id.
# NOTE(review): the LM data collator masks pad-token positions out of the
# labels, so EOS positions are excluded from the loss — confirm this is acceptable.
tokenizer.pad_token = tokenizer.eos_token
# No token was added, so this leaves the embedding size unchanged; it is kept
# so the cell still displays the embedding shape.
model.resize_token_embeddings(len(tokenizer))

Embedding(50257, 2048)

In [6]:
# Re-imported here so `sample` is the random-sampling function again even if
# the name was rebound to something else earlier in the notebook namespace.
from random import sample

# Evaluate on a small random slice of the validation set.
num_eval_examples = 16  # number of validation examples used per evaluation
subset_valid_dataset = create_subset(dataset=valid_dataset, num_examples=num_eval_examples)

In [None]:

# Training arguments for the main fine-tuning run.
train_args = transformers.TrainingArguments(
    output_dir="outputs",
    max_steps=max_train_steps,
    warmup_steps=int(train_epoch_steps * warm_ratio),
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    evaluation_strategy='steps',
    # The previous expression int(np.clip(np.round(train_epoch_steps/10), 1, 1))
    # clips to the range [1, 1], i.e. it always evaluated to 1.
    logging_steps=1,
    fp16=True,
    optim="paged_adamw_8bit",
)

# CustomTrainer (instead of the stock Trainer) supplies the custom evaluation
# loop; the callback adds a perplexity entry to the evaluation metrics.
trainer = CustomTrainer(
    model=model,
    args=train_args,
    train_dataset=train_dataset,
    eval_dataset=subset_valid_dataset,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    callbacks=[PerplexityLoggingCallback()],
)

trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  attn_weights = torch.where(causal_mask, attn_weights, mask_value)


Step,Training Loss,Validation Loss,Perplexity
1,3.1816,3.003402,20.153994
2,3.2268,3.003107,20.148032
3,3.1991,3.002907,20.144006


In [None]:
# Steps completed by the main run, expressed as a number of training epochs.
initial_completed_steps = trainer.get_completed_steps()
epochs_completed = initial_completed_steps / train_epoch_steps

# Same number of epochs over the validation split, rounded to whole steps and
# never fewer than one.
valid_steps = max(1, int(np.round(epochs_completed * valid_epoch_steps, 0)))

In [None]:


# Second stage: continue training the already fine-tuned model on the
# validation split for `valid_steps` optimizer steps.
valid_args = transformers.TrainingArguments(
    output_dir="outputs",
    max_steps=valid_steps,
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # The previous expression int(np.clip(np.round(valid_epoch_steps/10), 1, 1))
    # clips to the range [1, 1], i.e. it always evaluated to 1.
    logging_steps=1,
    fp16=True,
    optim="paged_adamw_8bit",
)

valid_trainer = CustomTrainer(
    model=trainer.model,
    args=valid_args,
    max_eval_steps=valid_steps,  # batch limit used by the custom evaluation loop
    train_dataset=valid_dataset,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

valid_trainer.train()

In [None]:
# `post_train_trainer` is never defined in this notebook; the last training
# stage is `valid_trainer`, so its model is the one saved and sampled from.
final_model = valid_trainer.model
final_model.save_pretrained('./bitsft')

# The cache was switched off for training; switch it back on for generation.
final_model.config.use_cache = True
generator = pipeline('text-generation', model=final_model, tokenizer=tokenizer)

# NOTE(review): the prompt is a raw string, so each "\n" is a literal
# backslash followed by "n", not a newline — confirm this matches the format
# of the fine-tuning data.
results = generator(r"Context:\nPrompt: Finish the quote. 'To live well...'\nResponse:", do_sample=True, min_length=50, max_length=200)
results