## Initialization of model, constants, paths etc.

In [1]:
import configparser
import torch

# Load the notebook settings from `config.cfg` in the current working directory.
# The file is opened explicitly and passed to read_file() because
# ConfigParser.read() silently skips files it cannot open; a missing config
# would then only surface as a misleading KeyError on the section lookups below.
config = configparser.ConfigParser()
with open('config.cfg') as config_file:
    config.read_file(config_file)

# Storage locations used by the dataset, model-saving and trainer cells.
dataset_storage_path = config['Mistral-SC']['dataset_storage_path']
model_storage_path = config['Mistral-SC']['model_storage_path']
output_dir = config['Mistral-SC']['output_dir']
# Library locations exported to the process environment in the next cell.
libcuda_path = config['Unsloth']['libcuda_path']
library_path = config['Unsloth']['library_path']

num_gpus = torch.cuda.device_count()
print(f"Number of GPUs available: {num_gpus}")
# mem_get_info() needs a CUDA device and raises without one, so only query it
# when at least one GPU is visible. It reports (free, total) memory in bytes.
if num_gpus > 0:
    print(torch.cuda.mem_get_info())

Number of GPUs available: 1
(3957129216, 47842000896)


In [2]:
import os
# Export the library locations read from the config file to the process
# environment (presumably picked up by Triton / the linker when Unsloth
# compiles its kernels -- TODO confirm).
os.environ.update(
    TRITON_LIBCUDA_PATH=libcuda_path,
    LIBRARY_PATH=library_path,
)

/usr/local/nvidia/lib:/usr/local/nvidia/lib64
/usr/local/cuda/lib64

/opt/conda/bin:/opt/conda/condabin:/opt/conda/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin


In [4]:

from unsloth import FastLanguageModel
# Loader settings; max_seq_length is reused later when the trainer is built.
max_seq_length = 2 * 4096  # Any length works -- RoPE scaling is handled internally by Unsloth.
dtype = None               # None = auto detection. Float16 for Tesla T4/V100, Bfloat16 for Ampere+.
load_in_4bit = True        # 4bit quantization to reduce memory usage. Can be False.

# 4bit pre-quantized models (faster downloading, no OOMs).
# More models at https://huggingface.co/unsloth
fourbit_models = [
    f"unsloth/{base_name}-bnb-4bit"
    for base_name in (
        "mistral-7b",
        "mistral-7b-instruct-v0.2",
        "llama-2-7b",
        "llama-2-13b",
        "codellama-34b",
        "tinyllama",
    )
]

# Load the pre-quantized base model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/mistral-7b-bnb-4bit",  # Any model id works, e.g. teknium/OpenHermes-2.5-Mistral-7B
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
    # token = "hf_...",  # only needed for gated models like meta-llama/Llama-2-7b-hf
)

# Modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the base model with LoRA adapters.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=lora_target_modules,
    r=16,                 # LoRA rank; any number > 0 (suggested 8, 16, 32, 64, 128)
    lora_alpha=16,
    lora_dropout=0,       # Any value is supported, but 0 is the optimized path
    bias="none",          # Any value is supported, but "none" is the optimized path
    use_gradient_checkpointing=True,
    random_state=3407,
    use_rslora=False,     # Rank-stabilized LoRA is available but not used here
    loftq_config=None,    # LoftQ is available but not used here
)

==((====))==  Unsloth: Fast Mistral patching release 2024.2
   \\   /|    GPU: NVIDIA A40. Max memory: 44.556 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.0+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.24. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


2024-03-04 08:31:03.064515: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-04 08:31:03.064590: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-04 08:31:03.066231: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-04 08:31:03.073523: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Unsloth 2024.2 patched 32 layers with 32 QKV layers, 

In [5]:
# Alpaca-style prompt template. Its three `{}` placeholders are filled
# positionally by formatting_prompts_func with (instruction, input text, response).
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence marker taken from the loaded tokenizer; it is appended to
# every formatted training example.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    """Turn a batch of article/abstract pairs into Alpaca-style training records.

    Args:
        examples: Batch mapping with a "text" column (source articles) and an
            "abstract" column (reference summaries), each a sequence of equal
            length (assumed to be lists of strings, as passed by a batched
            ``Dataset.map`` call).

    Returns:
        dict with four parallel columns:
            "instruction": the fixed summarization instruction, repeated per row,
            "input":       the original articles,
            "output":      the original abstracts,
            "text":        the filled-in prompt followed by EOS_TOKEN.
    """
    task_instruction = "Summarize the following text:"
    articles = examples["text"]
    summaries = examples["abstract"]
    repeated_instructions = [task_instruction] * len(articles)

    # EOS_TOKEN must terminate every example, otherwise generation never stops.
    prompts = [
        alpaca_prompt.format(task_instruction, article, summary) + EOS_TOKEN
        for article, summary in zip(articles, summaries)
    ]

    return {
        "instruction": repeated_instructions,
        "input": articles,
        "output": summaries,
        "text": prompts,
    }

from datasets import load_dataset
# Map each split name to its SumeCzech JSONL file under the configured
# dataset directory (the validation split is stored as "dev").
data_files = {
    split_name: f"{dataset_storage_path}/sumeczech-1.0-{file_tag}.jsonl"
    for split_name, file_tag in (
        ("train", "train"),
        ("test", "test"),
        ("validation", "dev"),
    )
}
dataset = load_dataset("json", data_files=data_files)

# Columns kept after formatting; everything else from the raw records is dropped.
kept_columns = ["input", "output", "text", "instruction"]

# Format the train and validation splits into prompt records.
train_dataset = (
    dataset["train"]
    .map(formatting_prompts_func, batched=True)
    .select_columns(kept_columns)
)
validation_dataset = (
    dataset["validation"]
    .map(formatting_prompts_func, batched=True)
    .select_columns(kept_columns)
)


In [6]:
# Sanity check: show the first formatted validation record.
print(validation_dataset[0])

{'input': 'Kdy jste slyšela jako cizinka slovo Ostrava poprvé?\nO Ostravě jsem poprvé slyšela v Českých Budějovicích, byla jsem tam asi rok v angažmá. Studovali jsme Čajkovského balet Romeo a Julie v choreografii Libuše Králové, tehdejší šéfky ostravského baletu. A tam paní Králová nabídla mně a mému tanečnímu partnerovi Rodionu Zelenkovovi možnost konkurzu na sólový pár do Ostravy. Konkurz jsme udělali a nastoupili do Ostravy jako solisté baletu.\nPocházíte z Ruska…\nNarodila jsem se v ruském Uljanovsku, vyrůstala na Ukrajině, kde měl tatínek práci, a už na Ukrajině s maminkou zůstali. Sama jsem se přihlásila v sedmi letech do baletní školy, tanec jsem milovala odmalička, pořád jsem před zrcadlem pózovala a tančila. Moje maminka a tatínek také tančili, ale pak změnili povolání. Jednou do našeho městečka přijel kyjevský klasický balet. Bydleli jsme v centru města a ve velkém domě kultury se ta představení odehrávala. Okna šaten byla na straně našeho dvorku pro děti ,a my děti koukali d

In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Decide the half-precision dtype once so that training and full-precision-free
# evaluation agree: bf16 where the GPU supports it, fp16 otherwise. Previously
# fp16_full_eval was hard-coded to True, which requested fp16 evaluation even
# on a run that trains in bf16.
bf16_supported = torch.cuda.is_bf16_supported()

# Supervised fine-tuning trainer over the formatted "text" column.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = validation_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        # evaluation
        fp16_full_eval = not bf16_supported,
        bf16_full_eval = bf16_supported,
        per_device_eval_batch_size = 2,
        eval_accumulation_steps = 4,
        evaluation_strategy = "steps",
        # NOTE(review): presumably about one epoch of optimizer steps at the
        # effective batch size of 8 * 4 = 32, i.e. evaluate once near the end
        # -- TODO confirm against the training set size.
        eval_steps = 27100,

        # training
        per_device_train_batch_size = 8,
        gradient_accumulation_steps = 4,
        warmup_steps = 100,
        num_train_epochs=1,
        learning_rate = 2e-4,
        fp16 = not bf16_supported,
        bf16 = bf16_supported,
        logging_steps = 200,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = output_dir,

        # checkpointing: save every 200 steps, keep at most 20 checkpoints
        save_strategy = "steps",
        save_steps = 200,
        save_total_limit=20
    ),
)


## Training and Saving

In [None]:
import os
from transformers.trainer_utils import get_last_checkpoint

# Resume from the newest checkpoint in the trainer's output directory when one
# exists; otherwise start from scratch. Passing resume_from_checkpoint=True
# unconditionally raises ValueError on a first run, when no checkpoint has been
# written yet. The directory check is needed because get_last_checkpoint()
# lists the folder and fails if it does not exist.
checkpoint_dir = trainer.args.output_dir
last_checkpoint = get_last_checkpoint(checkpoint_dir) if os.path.isdir(checkpoint_dir) else None
if last_checkpoint is None:
    print(f"No checkpoint found in {checkpoint_dir}; training from scratch.")
else:
    print(f"Resuming training from {last_checkpoint}")

trainer_stats = trainer.train(resume_from_checkpoint = last_checkpoint)
model.save_pretrained(model_storage_path)