In [1]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U datasets scipy ipywidgets

[0m

In [2]:
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

In [3]:

from datasets import load_dataset


train_dataset = load_dataset('squad_v2', split='train')
eval_dataset = load_dataset('squad_v2', split='validation')


In [4]:
print(train_dataset)
print(eval_dataset)

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 130319
})
Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 11873
})


In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig


base_model_id = "mistralai/Mistral-7B-v0.1"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    model_max_length=512,
    padding_side="left",
    add_eos_token=True)
tokenizer.pad_token = tokenizer.eos_token

In [7]:
def tokenize(prompt):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=512,
        padding="max_length",
    )
    result["labels"] = result["input_ids"].copy()
    return result

In [8]:

def generate_and_tokenize_prompt(data_point):
    answer_text = ' '.join(data_point["answers"]["text"])
    full_prompt =f""""Analyze the provided context and generate a relevant question and answer. 
    Dissect the context into core components to understand the reasoning required.
    Formulate a question that naturally arises from the context.
    Develop a coherent answer by connecting logical threads within the context.

    ### Context:
    {data_point["context"]}
    
    
    ### Question:
    {data_point["question"]}
    ### Answer
    {answer_text}
    """
    return tokenize(full_prompt)

In [10]:

tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)


Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

In [12]:
tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt)

In [13]:
print(tokenized_train_dataset[4])

{'id': '56bf6b0f3aeaaa14008c9602', 'title': 'Beyoncé', 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".', 'question': 'In which decade did Beyonce become famous?', 'answers': {'text': ['late 1990s'], 'answer_start': [276]}, 'input_ids': [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

In [16]:
print(len(tokenized_train_dataset[4]['input_ids']))

512


In [14]:
from peft import prepare_model_for_kbit_training
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [15]:
from peft import LoraConfig, get_peft_model
config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
print_trainable_parameters(model)
# Apply the accelerator. You can comment this out to remove the accelerator.
model = accelerator.prepare_model(model)
print(model)

trainable params: 21260288 || all params: 3773331456 || trainable%: 0.5634354746703705
PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
     

In [16]:
!pip install -q wandb -U

import wandb, os
wandb.login()

wandb_project = "mistral-qa-finetunev1.4"
if len(wandb_project) > 0:
    os.environ["WANDB_PROJECT"] = wandb_project
    os.environ["WANDB_RESUME"] = "must"
    os.environ["WANDB_RUN_ID"] = "f5rpfdmw"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m

[34m[1mwandb[0m: Currently logged in as: [33mvpmb[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [28]:
wandb.init(id="f5rpfdmw", resume="must")

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113907299780597, max=1.0…

Problem at: /tmp/ipykernel_220/2155807707.py 1 <module>


KeyboardInterrupt: 

In [22]:
wandb.finish()

In [29]:
!wandb init --reset

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [20]:
import transformers
import numpy as np
from torch.utils.data import Subset
from datetime import datetime


project = "qa-finetune"
base_model_name = "mistral"
run_name = base_model_name + "-" + project
output_dir = "./" + run_name


tokenizer.pad_token = tokenizer.eos_token

eval_batch_size = 10
desired_eval_steps = 25
desired_eval_size = eval_batch_size * desired_eval_steps

subset_indices = np.random.choice(len(tokenized_val_dataset), desired_eval_size, replace=False)
eval_subset = Subset(tokenized_val_dataset, subset_indices)

# resume_from_checkpoint = True
trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=eval_subset,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=5,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=1,
        gradient_checkpointing=True,
        max_steps=10000,
        learning_rate=2.5e-5, # Want about 10x smaller than the Mistral learning rate
        logging_steps=50,
        bf16=True,
        resume_from_checkpoint = True,
        save_total_limit = 3,
        load_best_model_at_end=True,
        optim="paged_adamw_8bit",
        logging_dir="./logs",        # Directory for storing logs
        save_strategy="steps",       # Save the model checkpoint every logging step
        save_steps=200,                # Save checkpoints every 50 steps
        evaluation_strategy="steps", # Evaluate the model every logging step
        eval_steps=50,               # Evaluate and save checkpoints every 50 steps
        do_eval=True,                # Perform evaluation at the end of training
        report_to="wandb",           # Comment this out if you don't want to use weights & baises
        run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"          # Name of the W&B run (optional)
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train(resume_from_checkpoint=True)



Step,Training Loss,Validation Loss
2250,1.2236,1.256102
2300,1.1671,1.257625
2350,1.1612,1.254814
2400,1.1676,1.256362
2450,1.1709,1.255776
2500,1.2135,1.253161
2550,1.1661,1.255794
2600,1.1778,1.255828
2650,1.2145,1.255422
2700,1.1805,1.255111




KeyboardInterrupt: 

In [21]:
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,  # Mistral, same as before
    quantization_config=bnb_config,  # Same quantization config as before
    device_map="auto",
    trust_remote_code=True,
    use_auth_token=True
)
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token




    



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [22]:
from peft import PeftModel
ft_model = PeftModel.from_pretrained(base_model, "mistral-qa-finetune/checkpoint-9200")

In [35]:


eval_prompt = """ 

    
### Context : The deal includes more than $1 billion in new compensation and benefit plan funding, along with outsized gains to the traditional residuals formulas. It offers a new compensation model for performers working in streaming, with a substantial bonus on top of existing residuals structures, plus compensation escalation for principal and background actors. Additionally, the deal establishes detailed informed consent and compensation guardrails for the use of AI, hair and makeup equity, meaningful protections for the casting process, sexual harassment prevention protections and more.
### Instructions : Generate diverse sets of questions and answers based on the context
### Question :
"""

model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

ft_model.eval()
with torch.no_grad():
    print(tokenizer.decode(ft_model.generate(**model_input, max_new_tokens=256, pad_token_id=2,)[0],repetition_penalty=2,skip_special_tokens=False))

<s>  

    
### Context : The deal includes more than $1 billion in new compensation and benefit plan funding, along with outsized gains to the traditional residuals formulas. It offers a new compensation model for performers working in streaming, with a substantial bonus on top of existing residuals structures, plus compensation escalation for principal and background actors. Additionally, the deal establishes detailed informed consent and compensation guardrails for the use of AI, hair and makeup equity, meaningful protections for the casting process, sexual harassment prevention protections and more.
### Instructions : Generate diverse sets of questions and answers based on the context
### Question :
What does the deal offer for performers working in streaming?
### Answer :
a new compensation model
### Question :
What does the deal offer for performers working in streaming?
### Answer :
a substantial bonus
### Question :
What does the deal offer for performers working in streaming?
