In [None]:
!pip install torch accelerate bitsandbytes datasets transformers peft trl scipy

In [None]:
!nvidia-smi

In [None]:
import argparse
import bitsandbytes as bnb
from datasets import load_dataset
from functools import partial
import os
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed, Trainer, TrainingArguments, BitsAndBytesConfig, \
    DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset
from torch import cuda, bfloat16
import transformers

import torch
import torch.nn as nn

***Get Model***

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
#model_id = 'meta-llama/Llama-2-13b-chat-hf'
model_id = "meta-llama/Llama-2-7b-hf"

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'


In [None]:
# 4-bit quantization settings; passed to `from_pretrained` through `quantization_config` below.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,  # load the weights in 4-bit precision
    bnb_4bit_quant_type='nf4',  # use the NF4 4-bit data type
    bnb_4bit_use_double_quant=True,  # nested quantization: the quantization constants are quantized too
    bnb_4bit_compute_dtype=bfloat16  # dtype used for the actual matmul computation
)

# begin initializing HF items, need auth token for these

# Fetch the model configuration for `model_id` using the stored Hub credentials.
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=True
)

In [None]:
# Load the base model with the 4-bit quantization config; `device_map='auto'` lets accelerate
# decide where the weights are placed.
# `trust_remote_code` is deliberately NOT enabled: the Llama architecture ships with transformers,
# so there is no reason to allow Python code from the Hub repository to be executed locally.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=True
)
model.eval()
print(f"Model loaded on {device}")

In [None]:
mem = model.get_memory_footprint()
print("Memory footprint: {} ".format(mem))

# should be (7B) 7,000,000,000*4(Int4) / 8(8 bits per byte) = 3,500,000,000 = 3.5GB
# actual (7B)  3,829,940,224 (not all weights become int 4)
# actual (13B)  7,083,970,560  (not all weights become int 4)

In [None]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=True
)

***Get Dataset***

In [None]:
# Load the dataset from Hugging Face
from datasets import load_dataset

dataset = load_dataset("kaist-ai/CoT-Collection", split="train")

In [None]:
print(f'Number of records: {len(dataset)}')
print(f'Column names are: {dataset.column_names}')

'''
Number of records: 1837928
Column names are: ['source', 'target', 'rationale', 'task', 'type']
'''

In [None]:
#all are CoT but being sure
dataset_cot = dataset.filter(lambda example: example['type'] == "CoT")
print(f'Number of records: {len(dataset_cot)}')
print(f'Column names are: {dataset_cot.column_names}')

In [None]:
def create_prompt(rec):
  """Build the training prompt for one record and store it under ``rec["text"]``.

  The record's 'source', 'rationale' and 'target' fields are inserted into an
  instruction/response template. Literal backslash-n sequences in the assembled
  text are converted to real newlines. The record is modified in place and returned.
  """
  sections = (
      "Read the Instruction below and provide an answer.",
      f"### INSTRUCTION:\n{rec['source']}\n\n",
      f"### RESPONSE:\n{rec['rationale']}\n",
      f"Therefore the answer is {rec['target']}\n\n",
      "### End",
  )

  # Empty sections are skipped; the rest are separated by a blank line.
  prompt_text = "\n\n".join(filter(None, sections))
  rec["text"] = prompt_text.replace('\\n', '\n')

  return rec

In [None]:
p = create_prompt(dataset_cot[30000])
print(p)
print(p["text"])

In [None]:
dataset = dataset_cot.map(create_prompt)

In [None]:
# Drop the raw input columns, keeping the "text" column produced by `create_prompt`.
# `remove_columns` is the dedicated API for this and avoids running an identity `map`
# over every record just to discard columns.
dataset = dataset.remove_columns(['source', 'target', 'rationale', 'task', 'type'])

In [None]:
print(dataset[30000]["text"])

In [None]:
#Save dataset to the hub for future use
#dataset.push_to_hub("Venkat-Ram-Rao/processed_cot_dataset", private=True)

In [None]:
#max length of the model
def get_max_length(model):
    """Return the maximum sequence length declared in ``model.config``.

    The attributes ``n_positions``, ``max_position_embeddings`` and ``seq_length``
    are checked in that order and the first truthy value is returned. When none of
    them is set (or all are falsy), 1024 is used as the default.

    Args:
        model: any object with a ``config`` attribute.

    Returns:
        The maximum length found on the config, or 1024.
    """
    conf = model.config
    for length_setting in ("n_positions", "max_position_embeddings", "seq_length"):
        max_length = getattr(conf, length_setting, None)
        if max_length:
            print(f"Found max length: {max_length}")
            return max_length

    max_length = 1024
    print(f"Using default max length: {max_length}")
    return max_length

In [None]:
mx = get_max_length(model)
mx

In [None]:
len(dataset)

In [None]:
#tokenize dataset
dataset = dataset.map(lambda samples: tokenizer(samples['text']), batched=True)

In [None]:
len(dataset)

In [None]:
dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < mx)

In [None]:
len(dataset)

In [None]:
seed = 42
set_seed(seed)

dataset = dataset.shuffle(seed=seed)

***Freeze Original Weights***

In [None]:
# Prepare the quantized base model for adapter training: freeze every existing weight,
# keep the 1-D parameters in fp32, and make the output head return fp32.
for param in model.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
# Make the input-embedding outputs require grad; used when training adapters on top of frozen weights.
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  """Sequential wrapper whose forward output is cast to float32."""
  def forward(self, x): return super().forward(x).to(torch.float32)
# Wrap the existing head so whatever it outputs is converted to fp32.
model.lm_head = CastOutputToFloat(model.lm_head)

***Create Lora Config***

In [None]:
def find_all_linear_names(model, cls=None):
    """Collect the leaf names of all modules of type ``cls`` for use as LoRA ``target_modules``.

    Args:
        model: object exposing ``named_modules()`` that yields ``(qualified_name, module)`` pairs.
        cls: module class (or tuple of classes) to match. Defaults to ``bnb.nn.Linear4bit``;
            pass e.g. ``bnb.nn.Linear8bitLt`` or ``torch.nn.Linear`` for 8-bit or
            unquantized models.

    Returns:
        Sorted list of the unique last path components of the matching module names
        (e.g. ``'q_proj'``), with ``'lm_head'`` excluded.
    """
    if cls is None:
        # Resolved lazily so callers that pass their own class never touch bitsandbytes.
        cls = bnb.nn.Linear4bit

    lora_module_names = {
        name.split('.')[-1]
        for name, module in model.named_modules()
        if isinstance(module, cls)
    }

    # The output head is not a LoRA target (needed for 16-bit).
    lora_module_names.discard('lm_head')

    # Sorted so the resulting adapter config is deterministic across runs.
    return sorted(lora_module_names)

In [None]:
modules = find_all_linear_names(model)
print(modules)

#['v_proj', 'up_proj', 'down_proj', 'k_proj', 'o_proj', 'q_proj', 'gate_proj']

In [None]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16,  # LoRA rank: inner dimension of the low-rank update matrices (not a number of attention heads)
    lora_alpha=64,  # alpha scaling factor applied to the LoRA update
    target_modules=modules,  # adapt every module name collected in `modules`
    lora_dropout=0.1,  # dropout probability for the LoRA layers
    bias="none",  # no bias parameters are trained
    task_type="CAUSAL_LM", # causal LM is for decoder-only models like GPT; the seq2seq task type is for encoder-decoder models like T5
)



In [None]:
##Get the PEFT Model using the downloaded model and the loRA config
model = get_peft_model(model, config)

***Training***

In [None]:
# Report how many of the model's parameters will be updated during training.
all_param = 0
trainable_params = 0
for param in model.parameters():
    size = param.numel()
    all_param += size
    if param.requires_grad:
        trainable_params += size

trainable_pct = 100 * trainable_params / all_param
print(
    f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
)

In [None]:
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Build the Trainer that fine-tunes the LoRA-wrapped model on the tokenized dataset.
trainer = Trainer(
    model=model,
    train_dataset=dataset,
    args=TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,  # gradients are accumulated over 4 steps of batch size 1
        warmup_steps=10,
        max_steps=100, #20,
        learning_rate=2e-4,
        # NOTE(review): training uses fp16 mixed precision while `bnb_config` sets bfloat16 as the
        # 4-bit compute dtype — confirm this combination is intended.
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",
    ),
    # mlm=False selects the causal-LM objective (labels are derived from the inputs).
    # NOTE(review): pad_token was set to eos_token earlier; the collator ignores pad positions in the
    # loss, so EOS positions are presumably ignored as well — confirm this is acceptable.
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

model.config.use_cache = False  # re-enable for inference to speed up predictions for similar inputs

trainer.train()

***Push to Hub***

In [None]:
model.push_to_hub("Venkat-Ram-Rao/Llama2_7B_qlora_CoT_FT-v2",
                  use_auth_token=True,
                  commit_message="fine tuned on kaist-ai/CoT-Collection",
                  private=True)

***Load from Hub***

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

# NOTE(review): the push cell uploads to "Venkat-Ram-Rao/Llama2_7B_qlora_CoT_FT-v2", but this cell
# loads "Venkat-Ram-Rao/Llama2_7B_qlora_CoT_FT" — confirm which adapter revision is intended.
peft_model_id = "Venkat-Ram-Rao/Llama2_7B_qlora_CoT_FT"
# The adapter config records the base model it was trained on; load that base model in 8-bit.
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, return_dict=True, load_in_8bit=True, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id)
# Everything below is inference only: switch to eval mode so dropout (including the LoRA
# dropout layers) is disabled. The trailing ';' stops the notebook from echoing the model repr.
model.eval();

In [None]:
mem = model.get_memory_footprint()
print("Memory footprint: {} ".format(mem))

#7,227,846,656

***Inference***

In [None]:
tst = """Read the Instruction below and provide an answer.

### INSTRUCTION:
In this task, you are given an input list A. You need to find all the elements of the list that are numbers and calculate their sum.

['i', 'P', 'h', '849', 'e']



### RESPONSE:"""

In [None]:
batch = tokenizer(tst, return_tensors='pt')



In [None]:
with torch.cuda.amp.autocast():
  output_tokens = model.generate(**batch, max_new_tokens=90)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))

'''
EXPECTED ANSWER:

### RESPONSE:
The given list ['i', 'P', 'h', '849', 'e'] contains the number 849, which is the only element of the list that is a number. Therefore, the final answer is 849.

Therefore the answer is 849

### End'''

Test2

In [None]:
tst = """Read the Instruction below and provide an answer.

### INSTRUCTION:
Sam is taller than Bob. Jack is taller than Sam. Is Jack taller than Bob?
Answer Yes or No



### RESPONSE:"""

batch = tokenizer(tst, return_tensors='pt')

with torch.cuda.amp.autocast():
  output_tokens = model.generate(**batch, max_new_tokens=90)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))

In [None]:
tst = """Read the Instruction below and provide an answer.

### INSTRUCTION:
Sam is taller than Bob. Jack is taller than Sam.
Who is the tallest? Bob, Sam or Jack?




### RESPONSE:"""

batch = tokenizer(tst, return_tensors='pt')

with torch.cuda.amp.autocast():
  output_tokens = model.generate(**batch, max_new_tokens=90)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))

In [None]:
tst = """Read the Instruction below and provide an answer.

### INSTRUCTION:
In this task, you are given an input list A. You need to find all the elements of the list that are numbers and calculate their sum.

['i', '100', 'h', '849', 'e']



### RESPONSE:"""

batch = tokenizer(tst, return_tensors='pt')

with torch.cuda.amp.autocast():
  output_tokens = model.generate(**batch, max_new_tokens=90)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))

In [None]:
tst = """Read the Instruction below and provide an answer.

### INSTRUCTION:
In this task, you are given an input list A. You need to find all the elements of the list that are numbers and calculate their sum.

['i', '100', 'h', '849', '100']



### RESPONSE:"""

batch = tokenizer(tst, return_tensors='pt')

with torch.cuda.amp.autocast():
  output_tokens = model.generate(**batch, max_new_tokens=90)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))

In [None]:
tst = """Read the Instruction below and provide an answer.

### INSTRUCTION:
In this task, you need to provide the correct option for a given problem on probability from the provided options. Problem: there is a 50 % chance jen will visit chile this year , while there is a 50 % chance that she will visit madagascar this year . what is the probability that jen will visit either chile or madagascar this year , but not both ?
Options: a ) 25.0 % , b ) 50.0 % , c ) 62.5 % , d ) 75.0 % , e ) 80.0 %



### RESPONSE:"""

batch = tokenizer(tst, return_tensors='pt')

with torch.cuda.amp.autocast():
  output_tokens = model.generate(**batch, max_new_tokens=90)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))

In [None]:
tst = """Read the Instruction below and provide an answer.

### INSTRUCTION:
In this task you will be given a list of numbers and you need to find the mean (average) of that list. The mean of a list can be found by summing every number in the list then dividing the result by the size of that list. The output should be rounded to 3 decimal places.

 [-43.959, 161.939]



### RESPONSE:"""

batch = tokenizer(tst, return_tensors='pt')

with torch.cuda.amp.autocast():
  output_tokens = model.generate(**batch, max_new_tokens=90)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))