## Initialization of model, constants, paths etc.

In [5]:
import configparser

config = configparser.ConfigParser()
config.read('config.cfg')

dataset_storage_path = config['M7B-POC-FINETUNE']['dataset_storage_path'] #requires poc_p dataset
output_path_name = config['M7B-POC-FINETUNE']['output_path_name']
base_model_name = config['M7B-POC-FINETUNE']['base_model_name']
output_dir = config['M7B-POC-FINETUNE']['output_dir']
model_storage_path = config['M7B-POC-FINETUNE']['model_storage_path']
libcuda_path = config['Unsloth']['libcuda_path'] #path to directory where libcuda.so resides
library_path = config['Unsloth']['library_path'] #path to cuda library

import torch

num_gpus = torch.cuda.device_count()
print(f"Number of GPUs available: {num_gpus}")
print(torch.cuda.mem_get_info())

Number of GPUs available: 1
(47560982528, 47842000896)


In [6]:
import os
os.environ["TRITON_LIBCUDA_PATH"]=libcuda_path
os.environ["LIBRARY_PATH"]=library_path

/usr/local/nvidia/lib:/usr/local/nvidia/lib64
/usr/local/cuda/lib64

/opt/conda/bin:/opt/conda/condabin:/opt/conda/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin


In [16]:

from unsloth import FastLanguageModel
max_seq_length = 4096*2 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/llama-2-13b-bnb-4bit",
    "unsloth/codellama-34b-bnb-4bit",
    "unsloth/tinyllama-bnb-4bit",
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = base_model_name, # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)


==((====))==  Unsloth: Fast Mistral patching release 2024.3
   \\   /|    GPU: NVIDIA A40. Max memory: 44.556 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.0+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.24. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


2024-03-20 10:11:57.902458: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-20 10:11:57.902546: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-20 10:11:57.904087: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-20 10:11:57.909188: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Unsloth 2024.3 patched 32 layers with 32 QKV layers, 

In [17]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    #instructions = examples["instruction"]
    instruction = "Summarize the following text:"
    inputs       = examples["text"]
    outputs      = examples["summary"]
    instructions = [instruction for _ in range(len(inputs))]
    texts = []
    for i, input in enumerate(inputs):
        inputs[i] = inputs[i].replace("-\n","").replace('\r', ' ').replace('\n', ' ')
    for input, output in zip(inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)

    return { "instruction": instructions, "input": inputs, "output": outputs,  "text" : texts, }
pass

from datasets import load_dataset, DatasetDict
data_files = {"train":f"{dataset_storage_path}"}
dataset = load_dataset("json", data_files=data_files)
train_dataset = dataset["train"]
train_dataset = train_dataset.filter(lambda x: len(x["text"].split()) > 5)
train_dataset = train_dataset.map(formatting_prompts_func, batched = True).select_columns(["input", "output", "text", "instruction"]) #single train dataset

dataset = train_dataset.train_test_split(test_size=0.25, shuffle=False) #train test dataset 90/10


train_test_valid_dataset = DatasetDict({
    'train': dataset['train'],
    'test': dataset['test'],})
    #'dev': dataset['test']})

train_test_valid_dataset




DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'text', 'instruction'],
        num_rows: 324
    })
    test: Dataset({
        features: ['input', 'output', 'text', 'instruction'],
        num_rows: 108
    })
})

In [10]:
#print(train_test_valid_dataset["train"][0])
#train_test_valid_dataset["test"][0]["input"]
train_test_valid_dataset["test"]

Dataset({
    features: ['input', 'output', 'text', 'instruction'],
    num_rows: 108
})

### Training Settings

In [15]:
per_device_batch_size = 8
gradient_accumulation_steps = 1
num_train_epochs = 16
total_steps = (len(dataset['train']) // (per_device_batch_size*gradient_accumulation_steps)) * num_train_epochs
total_steps

640

In [14]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_test_valid_dataset["train"],
    #eval_dataset = train_test_valid_dataset["test"],
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        #default
        per_device_train_batch_size = per_device_batch_size,
        gradient_accumulation_steps = gradient_accumulation_steps,
        warmup_steps = total_steps//10,
        num_train_epochs=num_train_epochs,
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = total_steps//16,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 727,
        output_dir = output_dir,

        #save strategy
        save_strategy = "steps",
        save_steps = total_steps//16,
        save_total_limit=20
    ),
)


## Training and Saving the Model

In [None]:
trainer_stats = trainer.train()
model.save_pretrained(model_storage_path)
#model.push_to_hub("", token = "") # Online saving