In [22]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [1]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split

dataset = load_dataset("pharaouk/CoT-Collection")

In [2]:

# using huggingface dataset combine the dataset["source"] and dataset["rationale"] into a new column called dataset["text"] with the <sep> token in between

def combine_text(row):
    source_tokenized = f"<Question>\n{row['source']}\n<Question />\n"
    rationale_tokenized = f"<Chain-of-Thought>\n{row['rationale']}\n<Chain-of-Thought />\n"
    target_tokenized = f"<Answer>\n{row['target']}\n<Answer />\n"
    return {"text": f"{source_tokenized} {rationale_tokenized} {target_tokenized}"}

dataset = dataset.map(combine_text, remove_columns=["source", "rationale", "target"])
print(len(dataset["train"]))

1837928


In [13]:
dataframe = dataset["train"].to_pandas()

train_dataset, test_dataset = train_test_split(dataframe, test_size=0.2, random_state=42)

# add train_dataset, test_dataset to a single hugginface dataset
from datasets import Dataset

# dataset_train = Dataset.from_pandas(train_dataset)
# dataset_test = Dataset.from_pandas(test_dataset)


In [14]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

base_model_id = "mistralai/Mistral-7B-v0.1"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config, device_map="auto")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [15]:
tokenizer = AutoTokenizer.from_pretrained(base_model_id, use_fast=False)   

tokenizer.pad_token = "</s>"

def tokenize(element):
    return tokenizer(
        element["text"],
        truncation=True,
        max_length=4096,
        add_special_tokens=False,
    )


In [17]:
import os
tokenized_train_dataset = dataset.map(
    tokenize, 
    batched=True, 
    num_proc=os.cpu_count(),    # multithreaded
    remove_columns=["text"]     # don't need this anymore, we have tokens from here on
)


Map (num_proc=20):   0%|          | 0/1837928 [00:00<?, ? examples/s]

In [28]:
# Init an eval tokenizer that doesn't add padding or eos token
eval_tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_bos_token=True,
)

eval_prompt = "Create a chain of thought, and attempted answer, to the question posed in the prompt"

model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to("cuda")

model.eval()
with torch.no_grad():
    print(eval_tokenizer.decode(model.generate(**model_input, max_new_tokens=256, repetition_penalty=1.15)[0], skip_special_tokens=True))

tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Create a chain of thought, and attempted answer, to the question posed in the title.

I’m not sure if I have an answer for this one.  It seems like it would be easy enough to say that we should just do what is right, but then again, how many people actually know what is right?  And even if they did, how many people are willing to act on their convictions?  The world is full of people who claim to believe something, yet don’t live up to those beliefs.  They may say that they want peace, but they will go out and kill someone over a disagreement.  Or they may say that they love everyone, but they hate anyone who doesn’t agree with them.  So, I guess my answer would be that you need to find your own truths, and try to live by them as best you can.


In [29]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [22]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [23]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [24]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 85041152 || all params: 3837112320 || trainable%: 2.2162799758751914


In [25]:
from accelerate import Accelerator
# Set accelerator to use gpu
accelerator = Accelerator()
model = accelerator.prepare_model(model)

In [26]:
tokenized_val_dataset = tokenized_train_dataset["train"][:20_000]


In [27]:
import transformers
from datetime import datetime

project = "journal-finetune"
base_model_name = "mistral"
run_name = base_model_name + "-" + project
output_dir = "./" + run_name

trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=2,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=1,
        gradient_checkpointing=True,
        max_steps=500,
        learning_rate=2.5e-5, # Want a small lr for finetuning
        bf16=True,
        optim="paged_adamw_8bit",
        logging_steps=25,              # When to start reporting loss
        logging_dir="./logs",        # Directory for storing logs
        save_strategy="steps",       # Save the model checkpoint every logging step
        save_steps=25,                # Save checkpoints every 50 steps
        evaluation_strategy="steps", # Evaluate the model every logging step
        eval_steps=25,               # Evaluate and save checkpoints every 50 steps
        do_eval=True,                # Perform evaluation at the end of training
        run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"          # Name of the W&B run (optional)
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()



RuntimeError: WandbCallback requires wandb to be installed. Run `pip install wandb`.