In [1]:
!pip install evaluate bitsandbytes

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.1-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.45.1-py3-none-manylinux_2_24_x86_64.whl (69.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 MB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: evaluate, bitsandbytes
Successfully installed bitsandbytes-0.45.1 evaluate-0.4.3


In [2]:
"""
***** Finetuning using a Trainer class from the Huggingface Transformers
***** library.
"""
# Imports
import numpy as np
import torch
import evaluate
import transformers
from transformers import AutoTokenizer, TrainingArguments, Seq2SeqTrainingArguments, Trainer, Seq2SeqTrainer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset

In [3]:
# Global variables
FILENAME = "/kaggle/input/jane-austens-works/ja1-train.json"
VALID_FILE = "/kaggle/input/jane-austens-works/ja1-valid.json"
DS = {"train":FILENAME, "valid":VALID_FILE}
MODEL = "/kaggle/input/mistral/pytorch/7b-v0.1-hf/1"
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
OUTPUT_DIR = "/kaggle/working/"
ML = 80
BS = 64
CHECK_DIR = "/kaggle/working/checkpoint-250"

In [4]:
dataset = load_dataset("json", data_files=DS)

Generating train split: 0 examples [00:00, ? examples/s]

Generating valid split: 0 examples [00:00, ? examples/s]

In [5]:
# Initializing tokenizer and preprocessing input, etc.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Bugfix for padding issues: https://discuss.huggingface.co/t/mistral-trouble-when-fine-tuning-dont-set-pad-token-id-eos-token-id/77928/8
tokenizer.add_special_tokens({'pad_token': '<pad>'})

def tokenize_function(examples): return tokenizer(examples["text"], 
                                                  padding="max_length", 
                                                  truncation=True,
                                                  max_length = ML,
                                                  return_tensors="pt")
full = dataset.map(tokenize_function, batched=True, batch_size=BS)

encoded_input = full["train"]
encoded_valid = full["valid"]

print("Preprocessed and tokenized data.")

Map:   0%|          | 0/66769 [00:00<?, ? examples/s]

Map:   0%|          | 0/13169 [00:00<?, ? examples/s]

Preprocessed and tokenized data.


In [6]:
lengths = {}
for i in range(10000):
    t = len(encoded_input[i]['input_ids'])
    if t in lengths:
        lengths[t] += 1
    else:
        lengths[t] = 1
print(lengths)

{80: 10000}


In [7]:
# Preparing for training and setting eval function
targs = Seq2SeqTrainingArguments(output_dir = OUTPUT_DIR,
                                 learning_rate = 2e-5,
                                 warmup_steps=2,
                                 gradient_accumulation_steps = 1,
                                 logging_dir=OUTPUT_DIR+"logs/",
                                 logging_steps = 10,
                                 per_device_train_batch_size=BS,
                                 #per_device_eval_batch_size=4,
                                 bf16=True,
                                 optim="paged_adamw_8bit",
                                 save_strategy = "steps",
                                 save_steps = 250,
                                 #eval_strategy = "steps",
                                 #eval_steps = 250,
                                 do_eval = False,
                                 report_to = "none",
                                 num_train_epochs = 1,
                                 use_cpu = False,
                                 log_level="debug",
                                 save_total_limit = 2,
                                )
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

print("Checkpoint.")


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Checkpoint.


In [8]:
# Retrieving pretrained model
qc = BitsAndBytesConfig(load_in_4bit=True,
                        bnb_4bit_quant_type="nf4",
                        bnb_4bit_compute_dtype=torch.float16,
                       )
model = AutoModelForCausalLM.from_pretrained(MODEL, quantization_config = qc, device_map="auto")

# Bugfix for padding issues: https://discuss.huggingface.co/t/mistral-trouble-when-fine-tuning-dont-set-pad-token-id-eos-token-id/77928/8
model.resize_token_embeddings(len(tokenizer))

print("Retrieved model.")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Retrieved model.


In [9]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)

In [10]:
# Trainer object
trainer = Seq2SeqTrainer(
    model = model,
    args = targs,
    train_dataset = encoded_input,
    eval_dataset = encoded_valid,
    compute_metrics = compute_metrics,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

print("Created Trainer object.")

Using auto half precision backend


Created Trainer object.


In [11]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
model.config.use_cache = False

In [14]:
!rm -rf /kaggle/working/*
!cp -r "/kaggle/input/checkpoint-750/transformers/default/1" "/kaggle/working/checkpoint-750/"

In [15]:
trainer.train(resume_from_checkpoint = True)

Loading model from /kaggle/working/checkpoint-750.
Currently training with a batch size of: 64
The following columns in the training set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: text. If text are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 66,769
  Num Epochs = 1
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 1,044
  Number of trainable parameters = 85,041,184
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 0
  Continuing training from global step 750
  Will skip the first 0 epochs then the first 750 batches in the first epoch.
  checkpoint_rng_state = torch.load(rng_file)
  return fn(*args, **kwargs)


Step,Training Loss
760,2.4044
770,2.4035
780,2.411
790,2.3893
800,2.422
810,2.4104
820,2.3633
830,2.3443
840,2.3815
850,2.4255


Saving model checkpoint to /kaggle/working/checkpoint-1000
  return fn(*args, **kwargs)
Saving model checkpoint to /kaggle/working/checkpoint-1044
Deleting older checkpoint [/kaggle/working/checkpoint-750] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1044, training_loss=0.6724419703428772, metrics={'train_runtime': 12211.1463, 'train_samples_per_second': 5.468, 'train_steps_per_second': 0.085, 'total_flos': 2.3337163993402368e+17, 'train_loss': 0.6724419703428772, 'epoch': 1.0})

In [16]:
import shutil
shutil.make_archive("checkpoint-1044", 'zip', "/kaggle/working/checkpoint-1044")

'/kaggle/working/checkpoint-1044.zip'