In [1]:
!pip install bitsandbytes



In [2]:
import torch
import transformers
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel
import torch
from sklearn.model_selection import train_test_split

# Set the device to the GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             load_in_4bit=True,
                                             torch_dtype=torch.float16,
                                             device_map="auto",
                                             )
# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)

tokenizer.pad_token = "!" #Not EOS, will explain another time.\

CUTOFF_LEN = 256  #Our dataset has shot text
LORA_R = 8
LORA_ALPHA = 2 * LORA_R
LORA_DROPOUT = 0.1

config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=[ "w1", "w2", "w3"],  #just targetting the MoE layers.
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)

Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

In [3]:
# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)

tokenizer.pad_token = "!" #Not EOS, will explain another time.\

CUTOFF_LEN = 256  #Our dataset has shot text
LORA_R = 8
LORA_ALPHA = 2 * LORA_R
LORA_DROPOUT = 0.1

config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=[ "w1", "w2", "w3"],  #just targetting the MoE layers.
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)

In [9]:
dataset = load_dataset("teknium/OpenHermes-2.5")

In [18]:

train_data = dataset['train']



In [19]:
train_data = train_data.shard(num_shards=10, index=0)
train_data

Dataset({
    features: ['views', 'idx', 'hash', 'source', 'model', 'model_name', 'custom_instruction', 'topic', 'conversations', 'skip_prompt_formatting', 'system_prompt', 'title', 'avatarUrl', 'id', 'category', 'language'],
    num_rows: 100156
})

In [7]:
def generate_prompt(user_query, sep="\n\n### "):
    sys_msg = "Take a look at the following instructions and try to follow them."

    # Extracting the conversation between human and GPT
    conversation = user_query['conversations']
    human_query = ""
    gpt_response = ""
    for item in conversation:
        if item['from'] == 'human':
            human_query = item['value']
        elif item['from'] == 'gpt':
            gpt_response = item['value']
        elif item['from'] == 'system':
          sys_msg = item['value']

    # Constructing the prompt
    prompt = "<s> [INST]" + sys_msg + "\n" + human_query + "[/INST]" + gpt_response + "</s>"
    return prompt
def tokenize(prompt):
    return tokenizer(
        prompt + tokenizer.eos_token,
        truncation=True,
        max_length=1024,
        padding="max_length"
    )


In [20]:
train_data = train_data.shuffle().map(lambda x: tokenize(generate_prompt(x)), remove_columns=['views', 'idx', 'hash', 'source', 'model', 'model_name', 'custom_instruction', 'topic', 'conversations', 'skip_prompt_formatting', 'system_prompt', 'title', 'avatarUrl', 'id', 'category', 'language'])


Map:   0%|          | 0/100156 [00:00<?, ? examples/s]

In [None]:
trainer = Trainer(
    model=model,
    train_dataset=train_data,
    args=TrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=2,
        num_train_epochs=2,
        learning_rate=1e-2,
        logging_steps=2,
        optim="adamw_torch",
        save_strategy="epoch",
        output_dir="mixtral-moe-lora-instruct-experimental"
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = True

trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
2,2.0493
4,7.2357
6,23.8431
8,14.9987
10,13.9697
12,13.665
14,12.4349
16,13.0711
18,11.3566
20,10.7489


Step,Training Loss
2,2.0493
4,7.2357
6,23.8431
8,14.9987
10,13.9697
12,13.665
14,12.4349
16,13.0711
18,11.3566
20,10.7489


In [None]:
trainer.save_model("mixtral-moe-lora-instruct-experimental")