In [2]:
import torch
import os
import sys
import json
import IPython
from datetime import datetime
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
    pipeline
)
from trl import SFTTrainer, setup_chat_format

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


# Load the Dataset

In [8]:
# Load the dataset
train_dataset = load_dataset("json", data_files="../data/train_dataset.json", split="train")
test_dataset = load_dataset("json", data_files="../data/test_dataset.json", split="train")

print("Train dataset size: ", len(train_dataset))
print("Test dataset size: ", len(test_dataset))

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Train dataset size:  1245
Test dataset size:  312


In [9]:
print(train_dataset[0]["messages"])

[{'content': 'You are a professional machine learning conference reviewer who reviews a given paper and considers 4 criteria: **importance and novelty**, **potential reasons for acceptance**, **potential reasons for rejection**, and **suggestions for improvement**. The given paper is as follows.', 'role': 'system'}, {'content': "[TITLE]\nGraph Pooling by Edge Cut\n\n[ABSTRACT]\nGraph neural networks (GNNs) are very efficient at solving several tasks in graphs such as node classification or graph classification. They come from an adaptation of convolutional neural networks on images to graph structured data. These models are very effective at finding patterns in images that can discriminate images from each others. Another aspect leading to their success is their ability to uncover hierarchical structures. This comes from the pooling operation that produces different versions of the input image at different scales. The same way, we want to identify patterns at different scales in graphs

# Load the model

In [5]:
# os.environ['HUGGINGFACE_HUB_CACHE'] = "../model"
os.environ["TRANSFORMERS_CACHE"] = "../model"

In [41]:
model_id = "mistralai/Mistral-7B-Instruct-v0.2" 

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    # attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    cache_dir=os.getenv("TRANSFORMERS_CACHE"),
)
tokenizer = AutoTokenizer.from_pretrained(model_id,use_fast=True, cache_dir=os.getenv("TRANSFORMERS_CACHE"))
tokenizer.padding_side = 'right' # to prevent warnings

# We redefine the pad_token and pad_token_id with out of vocabulary token (unk_token)
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.unk_token_id

# # set chat template to OAI chatML, remove if you start from a fine-tuned model
model, tokenizer = setup_chat_format(model, tokenizer)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [44]:
model

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32002, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )

In [45]:
from peft import LoraConfig

# LoRA config based on QLoRA paper & Sebastian Raschka experiment
peft_config = LoraConfig(
        lora_alpha=128,
        lora_dropout=0.05,
        r=256,
        bias="none",
        target_modules="all-linear",
        task_type="CAUSAL_LM",
)

In [11]:
# set the wandb project where this run will be logged
os.environ["WANDB_PROJECT"]="cis6200_academic_gpt"

# save your trained model checkpoint to wandb
os.environ["WANDB_LOG_MODEL"]="true"

In [46]:
from transformers import TrainingArguments


# frankmorales2020/Mistral-7B-text-to-sql
args = TrainingArguments(
    output_dir="model/mistral_7b_academic",       # directory to save and repository id
    num_train_epochs=1,                           # number of training epochs
    per_device_train_batch_size=8,                # batch size per device during training
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,          # number of steps before performing a backward/update pass
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=10,                       # log every 10 steps
    eval_steps=100,                         # evaluate every 100 steps
    evaluation_strategy="steps",            
    save_steps = 100,
    save_strategy="steps",
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    fp16=True,                              # use mixed precision training
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    push_to_hub=True,                       # push model to hub
    report_to="wandb",                      # report metrics to wandb
    load_best_model_at_end=True,            # load best model at end of training
)

In [31]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [30]:
#!pip install huggingface_hub --quiet
from huggingface_hub import HfApi

api = HfApi()
api.get_token_permission(token='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')

repo_id = "travis0103/mistral_7b_academic"

In [47]:
from trl import SFTTrainer
# from accelerate import Accelerator

# accelerator = Accelerator()

max_seq_length = 8000

trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=True,
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False, # No need to add additional separator token
    }
)

# acc_trainer = accelerator.prepare(trainer)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [51]:
torch.cuda.empty_cache()

: 

In [50]:
# start training, the model will be automatically saved to the hub and the output directory
trainer.train()

# save model
trainer.save_model()

OutOfMemoryError: CUDA out of memory. Tried to allocate 3.42 GiB. GPU 0 has a total capacity of 14.58 GiB of which 2.36 GiB is free. Including non-PyTorch memory, this process has 12.22 GiB memory in use. Of the allocated memory 9.78 GiB is allocated by PyTorch, and 2.32 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)