In [None]:
!pip install optimum
!pip install auto-gptq

# Finetuning Quantized Llama-2-7b-Chat-GPTQ using PEFT

## Working with LLAMA 7B quantized in 4bit

In [None]:
model_checkpoint = "TheBloke/Llama-2-7b-Chat-GPTQ"

In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

In [9]:
from peft import prepare_model_for_kbit_training

In [10]:
quantization_config = GPTQConfig(bits=4,use_exllama=False)

In [None]:
model = AutoModelForCausalLM.from_pretrained(model_checkpoint,quantization_config=quantization_config)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [12]:
model.config.quantization_config.to_dict()

{'quant_method': <QuantizationMethod.GPTQ: 'gptq'>,
 'bits': 4,
 'tokenizer': None,
 'dataset': None,
 'group_size': 128,
 'damp_percent': 0.01,
 'desc_act': False,
 'sym': True,
 'true_sequential': True,
 'use_cuda_fp16': False,
 'model_seqlen': None,
 'block_name_to_quantize': None,
 'module_name_preceding_first_block': None,
 'batch_size': 1,
 'pad_token_id': None,
 'use_exllama': False,
 'max_input_length': None,
 'exllama_config': {'version': <ExllamaVersion.ONE: 1>},
 'cache_block_outputs': True,
 'modules_in_block_to_quantize': None}

### Preprocessing model for quantized training

Gradient checkpointing strikes a compromise between saving activations from the forward pass and  forget all activations during the forward pass and recompute them on demand during the backward pass. It strategically saves selected activations throughout the computational graph so only a fraction of the activations need to be re-computed for the gradients. This lowers memory load.For details refer to [@HuggingFace](https://huggingface.co/docs/transformers/v4.18.0/en/performance)

In [13]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

### Convert model to peft mdoel

In [14]:
from peft import LoraConfig, get_peft_model

LoRA is a parameter-efficient fine-tuning (PEFT) method. It decomposes large attention matrix into two low rank matrices and drastically reduces number of parameters to be trained. The attention dimension is the rank of these low rank matrices. It detemines the degree of information reatined. Smaller rank means more compression and high degree of information loss. Research Papaer - [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685)

In [15]:
lora_config = LoraConfig(r=8,
                        lora_alpha=32,
                        target_modules = ["k_proj","o_proj","q_proj","v_proj"],
                        lora_dropout = 0.05,
                        bias = 'none',
                        task_type = 'CAUSAL_LM')

In [16]:
model = get_peft_model(model,lora_config)

In [17]:
model.print_trainable_parameters()

trainable params: 8,388,608 || all params: 270,798,848 || trainable%: 3.097726619575575


### Dataset

- flytech/python-codes-25k form HiggingFace
- instruction: The instructional task to be performed / User input.
- input: Very short, introductive part of AI response or empty.
- output: Python code that accomplishes the task.
- text: All fields combined together.

In [18]:
from datasets import load_dataset,load_from_disk

In [None]:
raw_dataset = load_dataset('flytech/python-codes-25k')

In [20]:
raw_dataset = raw_dataset['train'].train_test_split(test_size=0.1,seed=1)

In [21]:
raw_dataset['train'] = raw_dataset['train'].select([*range(20000)])
raw_dataset['test'] = raw_dataset['train'].select([*range(100)])

In [22]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'input', 'output', 'instruction'],
        num_rows: 20000
    })
    test: Dataset({
        features: ['text', 'input', 'output', 'instruction'],
        num_rows: 100
    })
})

In [23]:
raw_dataset['train'][0].keys()

dict_keys(['text', 'input', 'output', 'instruction'])

In [24]:
raw_dataset['train'][0]['text']

"Code an optimization problem using Python that finds the maximum of a function with two optimization variables and a given constraint f(x, y) = 3x + 2y\n Constraint: x + y ? 10 Setting things in motion! Here we go! ```python\nfrom scipy.optimize import minimize\n\ndef f(x):\n return 3*x[0] + 2*x[1]\n\ndef con(x):\n return x[0] + x[1] - 10\n\ncons = ({'type': 'ineq', 'fun': con})\n\nresult = minimize(f, [0, 0], method='SLSQP', constraints=cons)\nx = result.x\n\nprint('Solution:', x)\nprint('Maximum value:', result.value)\n```"

In [None]:
tokenized_dataset = raw_dataset.map(lambda samples: tokenizer(samples['text']), batched=True)

#### Data Collator

In [26]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

#### Optimizer

In [27]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)

In [28]:
model.config

LlamaConfig {
  "_name_or_path": "TheBloke/Llama-2-7b-Chat-GPTQ",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_length": 4096,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "quantization_config": {
    "batch_size": 1,
    "bits": 4,
    "block_name_to_quantize": null,
    "cache_block_outputs": true,
    "damp_percent": 0.01,
    "dataset": null,
    "desc_act": false,
    "exllama_config": {
      "version": 1
    },
    "group_size": 128,
    "max_input_length": null,
    "model_seqlen": null,
    "module_name_preceding_first_block": null,
    "modules_in_block_to_quantize": null,
    "pad_token_id": null,
    "quant_method": "gptq

In [29]:
model.generation_config

GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 0
}

In [None]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="codeparrot-ds",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="epoch",
    logging_steps=10,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    warmup_steps=8,
    lr_scheduler_type="linear",
    learning_rate=2e-4,
    save_strategy='epoch',
    fp16=True,
    report_to='none'

)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

model.config.use_cache = False

In [31]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.7086,0.705514


TrainOutput(global_step=625, training_loss=0.8139624240875244, metrics={'train_runtime': 37620.5067, 'train_samples_per_second': 0.532, 'train_steps_per_second': 0.017, 'total_flos': 4350923021058048.0, 'train_loss': 0.8139624240875244, 'epoch': 1.0})