<a href="https://colab.research.google.com/github/vasugpt116/trainings/blob/main/Training_1_tinyllama.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install accelerate peft bitsandbytes transformers trl torch



In [None]:
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer
import os

2.1.0+cu121


In [None]:
# Run configuration shared by the cells below.
dataset, model_id, output_model = (
    "sciq",                                # dataset id passed to load_dataset
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # base checkpoint to fine-tune
    "tinyllama-question",                  # output directory for checkpoints
)

In [None]:
import pandas as pd
from datasets import load_dataset, Dataset
from random import shuffle

def prepare_train_data(data_id, seed=None):
  """Build the fine-tuning dataset from a multiple-choice question set.

  Loads the ``train`` split of ``data_id``, mixes the correct answer in with the
  three distractors in random order for every question, and renders each row
  into a single prompt string stored under ``"text"``.

  Args:
    data_id: Dataset identifier passed to ``load_dataset``. The train split
      must provide the columns ``question``, ``distractor1``, ``distractor2``,
      ``distractor3``, ``correct_answer`` and ``support``.
    seed: Optional seed for the option shuffle. With the default ``None`` the
      module-level ``random.shuffle`` is used (unchanged behaviour); pass an
      int to get the same option order on every run.

  Returns:
    A ``Dataset`` with the columns ``question``, ``options`` (shuffled,
    newline-joined), ``correct_answer``, ``support`` and ``text``. An empty
    split yields an empty dataset instead of raising.
  """
  from random import Random  # local import: the notebook only imports `shuffle`

  option_columns = ["correct_answer", "distractor1", "distractor2", "distractor3"]
  shuffle_in_place = shuffle if seed is None else Random(seed).shuffle

  def _shuffled_options(values):
    """Return the given option strings in random order, one per line."""
    options = list(values)
    shuffle_in_place(options)
    return "\n".join(options)

  def _add_prompt(example):
    """Render one example into the prompt text used for fine-tuning."""
    return {
        "text": f"<is_start>user\n{example['question']}\n<is_end>\n<options>\n{example['options']}\n</options>\n<support>\n{example['support']}\n</support>\n<answer>{example['correct_answer']}</answer>",
    }

  # Load the dataset and keep only the columns the prompt needs.
  # .copy() makes the selection an independent frame before a column is added.
  data_df = load_dataset(data_id, split="train").to_pandas()
  data_df = data_df[["question", "distractor1", "distractor2", "distractor3", "correct_answer", "support"]].copy()

  # Build the whole "options" column in one assignment (rows are visited in
  # order, so a seeded global RNG still produces the same shuffles as before).
  data_df["options"] = [
      _shuffled_options(values)
      for values in data_df[option_columns].itertuples(index=False, name=None)
  ]

  # Create Dataset object with required features, then add the prompt text.
  data = Dataset.from_pandas(data_df[["question", "options", "correct_answer", "support"]])
  return data.map(_add_prompt)



In [None]:
# Build the training set: shuffled answer options plus a rendered "text" prompt per row.
data = prepare_train_data(dataset)

Map:   0%|          | 0/11679 [00:00<?, ? examples/s]

In [None]:
# Inspect the first example to check the rendered prompt format.
data[0]

{'question': 'What type of organism is commonly used in preparation of foods such as cheese and yogurt?',
 'options': 'mesophilic organisms\nprotozoa\ngymnosperms\nviruses',
 'correct_answer': 'mesophilic organisms',
 'support': 'Mesophiles grow best in moderate temperature, typically between 25°C and 40°C (77°F and 104°F). Mesophiles are often found living in or on the bodies of humans or other animals. The optimal growth temperature of many pathogenic mesophiles is 37°C (98°F), the normal human body temperature. Mesophilic organisms have important uses in food preparation, including cheese, yogurt, beer and wine.',
 'text': '<is_start>user\nWhat type of organism is commonly used in preparation of foods such as cheese and yogurt?\n<is_end>\n<options>\nmesophilic organisms\nprotozoa\ngymnosperms\nviruses\n</options>\n<support>\nMesophiles grow best in moderate temperature, typically between 25°C and 40°C (77°F and 104°F). Mesophiles are often found living in or on the bodies of humans 

In [None]:
def get_model_and_tokenizer(mode_id):
    """Load a 4-bit quantized causal LM together with its tokenizer.

    Args:
        mode_id: Model identifier or path handed to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer's pad token is set to its
        EOS token; the model config has ``use_cache`` switched off and
        ``pretraining_tp`` set to 1.
    """
    tok = AutoTokenizer.from_pretrained(mode_id)
    tok.pad_token = tok.eos_token  # reuse EOS as the padding token

    # NF4 4-bit weights with double quantization; compute dtype is float16.
    quantization_options = {
        "load_in_4bit": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": "float16",
        "bnb_4bit_use_double_quant": True,
    }
    quantization = BitsAndBytesConfig(**quantization_options)

    llm = AutoModelForCausalLM.from_pretrained(
        mode_id,
        device_map="auto",
        quantization_config=quantization,
    )
    for option, value in (("use_cache", False), ("pretraining_tp", 1)):
        setattr(llm.config, option, value)

    return llm, tok

In [None]:
# Load the 4-bit quantized base model and its tokenizer (downloads on first run).
model, tokenizer = get_model_and_tokenizer(model_id)

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# LoRA adapter hyperparameters for causal-LM fine-tuning.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

In [None]:
# Trainer hyperparameters. `max_steps` is positive, so it alone fixes the length
# of the run; `num_train_epochs` would be overridden and is therefore not set
# (it stays at its default).
training_arguments = TrainingArguments(
    output_dir=output_model,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,  # no accumulation: every batch is one optimizer step
    optim="paged_adamw_32bit",
    learning_rate=4e-5,
    lr_scheduler_type="cosine",
    # NOTE(review): with a step-capped run the only checkpoint is the one written
    # when training stops; later cells expect it at `checkpoint-200` — confirm.
    save_strategy="epoch",
    logging_steps=10,
    max_steps=200,  # total optimizer steps; takes precedence over num_train_epochs
    fp16=True,  # Mixed precision training
)

# Supervised fine-tuning on the pre-rendered "text" column. LoRA adapters are
# attached via `peft_config`; examples are not packed together.
# NOTE(review): these keyword arguments match the TRL release the notebook was
# run with; confirm they are still accepted by the installed TRL version.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=data,
    dataset_text_field="text",
    max_seq_length=1024,
    packing=False,
)


Map:   0%|          | 0/11679 [00:00<?, ? examples/s]

In [None]:
# Run fine-tuning; the training arguments cap the run at max_steps=200.
trainer.train()

Step,Training Loss
10,2.3242
20,2.426
30,2.1267
40,2.3749
50,2.0196
60,2.2201
70,2.0625
80,1.8827
90,1.8257
100,1.8288


TrainOutput(global_step=200, training_loss=1.855234270095825, metrics={'train_runtime': 47.5082, 'train_samples_per_second': 4.21, 'train_steps_per_second': 4.21, 'total_flos': 210089683722240.0, 'train_loss': 1.855234270095825, 'epoch': 0.02})

In [None]:
from peft import AutoPeftModelForCausalLM, PeftModel
from transformers import AutoModelForCausalLM
import torch
import os

# Reload the base model un-quantized in fp16 so the LoRA weights can be merged
# into it. `trust_remote_code` is not passed: it is not needed to load this
# checkpoint and would allow code from the hub repository to execute.
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")

# Locate the checkpoint relative to the trainer's `output_dir` instead of a
# hard-coded absolute path; 200 matches `max_steps` in the training arguments.
model_path = os.path.join(output_model, "checkpoint-200")

# Attach the trained adapter, then fold its weights into the base layers so the
# result is a plain causal-LM without PEFT wrappers.
peft_model = PeftModel.from_pretrained(model, model_path, device_map="auto")
model = peft_model.merge_and_unload()

In [None]:
# Display the merged model's module structure.
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head)

In [None]:
# The package is `transformers` (plural), and it exports no `Generation` name.
# NOTE(review): `GenerationConfig` is assumed to be the intended import — confirm.
from transformers import GenerationConfig

# New Section