Initialization of necessary constants and paths.

In [1]:
import os
import torch
# Paths and constants used by the rest of the notebook.
libcuda_path = "/usr/local/cuda/compat" # directory where libcuda.so resides
library_path = "/usr/local/cuda/lib64" # directory of the CUDA libraries
dataset_storage_path = "/home/meta/nuva/unsloth/dataset_poc_p.json" # JSON file holding the POC dataset
output_dir = "/home/meta/nuva/unsloth/mistral_finetune_sc_poc_75_25_2e-4" # directory that holds the checkpoint-* folders and receives total_scores.txt

# Workaround for the need of root permissions when linking libcuda.so:
# the library locations are handed over through environment variables instead.
os.environ["TRITON_LIBCUDA_PATH"]=libcuda_path
os.environ["LIBRARY_PATH"]=library_path
# Print both variables from a shell so the effective values are visible in the cell output.
!echo $LD_LIBRARY_PATH
!echo $LIBRARY_PATH
import rouge_raw
# Scorer used by eval_dataset() further down via `eval.corpus(...)`.
# NOTE(review): this rebinds the built-in name `eval` for the whole notebook;
# renaming it requires updating the call inside eval_dataset() as well.
eval = rouge_raw.RougeRaw()


/usr/local/nvidia/lib:/usr/local/nvidia/lib64
/usr/local/cuda/lib64

/opt/conda/bin:/opt/conda/condabin:/opt/conda/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin


In [2]:

from unsloth import FastLanguageModel

# Loader settings; the same three values are passed to every
# FastLanguageModel.from_pretrained call in this notebook.
max_seq_length = 4096*2 # maximum sequence length handed to the loader
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# Checkpoint loaded in this cell. It is derived from output_dir (defined in the
# first cell) instead of repeating the directory literal, so the two cannot
# drift apart when the run directory changes.
base_model_name = f"{output_dir}/checkpoint-120"

# 4bit pre quantized models available from unsloth (kept for reference only;
# nothing in this cell reads the list).
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/llama-2-13b-bnb-4bit",
    "unsloth/codellama-34b-bnb-4bit",
    "unsloth/tinyllama-bnb-4bit",
] # More models at https://huggingface.co/unsloth

# `tokenizer` supplies the EOS token for prompt formatting in the next cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = base_model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)


==((====))==  Unsloth: Fast Mistral patching release 2024.3
   \\   /|    GPU: NVIDIA A100 80GB PCIe MIG 2g.20gb. Max memory: 19.5 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.0+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.24. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


2024-04-02 01:50:20.169051: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-02 01:50:20.169175: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-02 01:50:20.171665: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-02 01:50:20.182578: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Unsloth 2024.3 patched 32 layers with 32 QKV layers, 

This run uses the dataset that contains only page summaries.

In [3]:
# Prompt template with three positional `{}` slots, filled in this order:
# instruction, input text, response. Training fills all three; inference
# passes an empty string for the response so the model continues from there.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence token of the tokenizer loaded in the previous cell;
# formatting_prompts_func appends it to every training text.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    """Turn one batch of raw records into prompt-formatted training examples.

    Intended for ``Dataset.map(..., batched=True)``.

    Args:
        examples: mapping with two equally long sequences of strings,
            ``"text"`` (source text) and ``"summary"`` (reference summary).

    Returns:
        dict of four parallel lists:
        ``"instruction"`` - the fixed summarization instruction, once per row;
        ``"input"`` - the cleaned source texts;
        ``"output"`` - the summaries, passed through unchanged;
        ``"text"`` - the filled-in ``alpaca_prompt`` followed by ``EOS_TOKEN``.
    """
    instruction = "Summarize the following text:"

    # Re-join words hyphenated across line breaks, then flatten the remaining
    # line breaks to spaces. A new list is built so the caller's batch is not
    # modified in place.
    inputs = [
        raw_text.replace("-\n", "").replace('\r', ' ').replace('\n', ' ')
        for raw_text in examples["text"]
    ]
    outputs = examples["summary"]
    instructions = [instruction] * len(inputs)

    # EOS_TOKEN must be appended, otherwise generation never learns to stop.
    texts = [
        alpaca_prompt.format(instruction, source, summary) + EOS_TOKEN
        for source, summary in zip(inputs, outputs)
    ]

    return { "instruction": instructions, "input": inputs, "output": outputs,  "text" : texts, }

from datasets import load_dataset, DatasetDict
# Read the POC JSON file as a single "train" split.
data_files = {"train": dataset_storage_path}
dataset = load_dataset("json", data_files=data_files)

# Keep only records whose text has more than five whitespace-separated words,
# render the prompts, and retain just the four generated columns.
train_dataset = (
    dataset["train"]
    .filter(lambda record: len(record["text"].split()) > 5)
    .map(formatting_prompts_func, batched = True)
    .select_columns(["input", "output", "text", "instruction"])
)

# Unshuffled 75/25 train/test split (324/108 rows in the recorded run).
dataset = train_dataset.train_test_split(test_size=0.25, shuffle=False)

train_test_valid_dataset = DatasetDict(
    {split_name: dataset[split_name] for split_name in ("train", "test")}
)

train_test_valid_dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'text', 'instruction'],
        num_rows: 324
    })
    test: Dataset({
        features: ['input', 'output', 'text', 'instruction'],
        num_rows: 108
    })
})

In [4]:
# Helpers for evaluating checkpoints with rouge_raw.
from unsloth import FastLanguageModel
# Loader settings read as globals by the helper functions in this cell.
# They repeat the values assigned in the model-loading cell above.
max_seq_length = 4096*2 # maximum sequence length handed to the loader
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

import os
import json

def model_inference(model_path, text_to_summarize):
    """Load the model stored at ``model_path`` and summarize a single text.

    The model is loaded on every call. Generation is capped at 512 new tokens.

    Args:
        model_path: model or checkpoint location passed to the loader.
        text_to_summarize: text placed in the input slot of ``alpaca_prompt``.

    Returns:
        The decoded text between the first "### Response:" marker and the
        first "</s>", with all newline characters removed.
    """
    print(f"Loading {model_path}...")
    loader_kwargs = dict(
        model_name = model_path,
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    model, tokenizer = FastLanguageModel.from_pretrained(**loader_kwargs)
    FastLanguageModel.for_inference(model) # switch unsloth to its inference mode

    # The response slot stays empty so the model writes the summary itself.
    prompt = alpaca_prompt.format("Summarize the following text:", text_to_summarize, "")
    encoded = tokenizer([prompt], return_tensors = "pt").to("cuda")

    generated = model.generate(**encoded, max_new_tokens = 512)
    decoded = tokenizer.batch_decode(generated)[0]

    # The decoded sequence still contains the prompt; keep what follows the
    # response marker and cut at the end-of-sequence token.
    after_marker = decoded.split("### Response:")[1]
    before_eos = after_marker.split("</s>")[0]
    return before_eos.replace("\n","")

def push_model_to_hub(model_path, repo_path, hf_token):
    """Load the model stored at ``model_path`` and upload it to a hub repo.

    Only the model object is pushed; the tokenizer returned by the loader is
    not uploaded by this function.

    Args:
        model_path: model or checkpoint location passed to the loader.
        repo_path: target repository, forwarded to ``model.push_to_hub``.
        hf_token: access token, forwarded as ``token``.
    """
    loader_kwargs = dict(
        model_name = model_path,
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    model, _tokenizer = FastLanguageModel.from_pretrained(**loader_kwargs)
    model.push_to_hub(repo_path, token = hf_token) # online saving

def eval_dataset(model_path, test_dataset, test_inputs=None):
    """Generate summaries with one checkpoint and store its ROUGE-RAW scores.

    Two files are written into ``model_path``:
    ``generated_outputs.json`` (the generated summaries as a JSON list) and
    ``rouge_scores.txt`` (F/P/R of ROUGE-1, ROUGE-2 and ROUGE-L, each
    multiplied by 100).

    Args:
        model_path: checkpoint directory to load; also receives both files.
        test_dataset: gold summaries, one per evaluated input. Passed as
            ``gold`` to the notebook-level scorer bound to ``eval``.
        test_inputs: texts to summarize, aligned with ``test_dataset``.
            Defaults to ``train_test_valid_dataset["test"]["input"]``, the
            source this function used before the parameter existed.

    Raises:
        ValueError: if the number of inputs differs from the number of gold
            summaries.
    """
    if test_inputs is None:
        test_inputs = train_test_valid_dataset["test"]["input"]

    # Check alignment before loading the model, so a mismatch does not surface
    # only after every summary has been generated.
    if len(test_inputs) != len(test_dataset):
        raise ValueError(
            f"got {len(test_inputs)} inputs but {len(test_dataset)} gold summaries"
        )

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = model_path,
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # switch unsloth to its inference mode

    # The response slot stays empty so the model writes the summary itself.
    articles = [
        alpaca_prompt.format("Summarize the following text:", sample, "")
        for sample in test_inputs
    ]

    generated_outputs = []
    for i, article in enumerate(articles, start=1):
        print(f"{i}/{len(articles)}")
        inputs = tokenizer([article], return_tensors = "pt").to("cuda")
        outputs = model.generate(**inputs, max_new_tokens = 512)
        for res in tokenizer.batch_decode(outputs):
            # The decoded text still contains the prompt: keep what follows the
            # first response marker, cut at the EOS token, drop newlines.
            # NOTE(review): this picks the wrong segment if an input text
            # itself contains "### Response:".
            summary_res = res.split("### Response:")[1].split("</s>")[0].replace("\n","")
            generated_outputs.append(summary_res)

    with open(f"{model_path}/generated_outputs.json", "w") as file:
        file.write(json.dumps(generated_outputs))

    rouge_scores = eval.corpus(gold=test_dataset, system=generated_outputs)
    with open(f"{model_path}/rouge_scores.txt", "w") as file:
        for key in ("1", "2", "L"):
            score = rouge_scores[key]
            for label, value in (("F", score.f), ("P", score.p), ("R", score.r)):
                file.write(f"ROUGE-{key} {label}: " + str(value * 100) + "\n")
        
def evaluate_model_paths(paths, test_dataset):
    """Run ``eval_dataset`` on every checkpoint path, in the given order.

    Args:
        paths: sequence of checkpoint directories to evaluate.
        test_dataset: gold summaries, forwarded unchanged to ``eval_dataset``.
    """
    total = len(paths)
    # Count from 1 so the message reports how many checkpoints are finished
    # (previously the first completed checkpoint printed "evaluated 0 out of N").
    for done, path in enumerate(paths, start=1):
        print(f"evaluating: {path}")
        eval_dataset(path, test_dataset)
        print(f"evaluated {done} out of {total}")

        
def create_total_scores(checkpoint_paths):
    """Merge the per-checkpoint score files into ``total_scores.txt``.

    For each checkpoint, in the given order, the directory's base name is
    printed and its ``rouge_scores.txt`` is read. The combined report is
    written to ``{output_dir}/total_scores.txt``: one section per checkpoint,
    consisting of the base name, the file's content and two newlines.

    Args:
        checkpoint_paths: checkpoint directories that each contain a
            ``rouge_scores.txt`` file.
    """
    with open(f"{output_dir}/total_scores.txt", "w") as summary_file:
        sections = []
        for checkpoint in checkpoint_paths:
            name = os.path.basename(checkpoint)
            print(name)
            with open(f"{checkpoint}/rouge_scores.txt") as score_file:
                sections.append(f"{name}\n{score_file.read()}\n\n")
        summary_file.write("".join(sections))
    
def print_rougeraw(score):
    """Print F, P and R (times 100) for ROUGE-1, ROUGE-2 and ROUGE-L.

    Args:
        score: mapping with the keys "1", "2" and "L"; each value exposes the
            numeric attributes ``f``, ``p`` and ``r``.
    """
    for variant in ("1", "2", "L"):
        entry = score[variant]
        for label, attribute in (("F", "f"), ("P", "p"), ("R", "r")):
            print(f"ROUGE-{variant} {label}: ", getattr(entry, attribute)*100)
    

Evaluation of each checkpoint.

In [5]:
# Collect the checkpoint directories of this run. Only directories named
# "checkpoint-<number>" are accepted, so a stray file or folder that merely
# starts with "checkpoint" cannot break the numeric sort or the model load.
checkpoint_dirs = [
    f"{output_dir}/{name}"
    for name in os.listdir(output_dir)
    if name.startswith("checkpoint-")
    and name.rsplit("-", 1)[-1].isdecimal()
    and os.path.isdir(f"{output_dir}/{name}")
]
# Sort by training step, i.e. the number after the last "-".
sorted_paths = sorted(checkpoint_dirs, key=lambda path: int(path.rsplit("-", 1)[-1]))
sorted_paths
evaluate_model_paths(sorted_paths, train_test_valid_dataset["test"]["output"]) #rouge scores saved in each checkpoint directory
create_total_scores(sorted_paths) #total scores saved in output directory

['/home/meta/nuva/unsloth/mistral_finetune_sc_poc_75_25_2e-4/checkpoint-40',
 '/home/meta/nuva/unsloth/mistral_finetune_sc_poc_75_25_2e-4/checkpoint-80',
 '/home/meta/nuva/unsloth/mistral_finetune_sc_poc_75_25_2e-4/checkpoint-120',
 '/home/meta/nuva/unsloth/mistral_finetune_sc_poc_75_25_2e-4/checkpoint-160',
 '/home/meta/nuva/unsloth/mistral_finetune_sc_poc_75_25_2e-4/checkpoint-200',
 '/home/meta/nuva/unsloth/mistral_finetune_sc_poc_75_25_2e-4/checkpoint-240',
 '/home/meta/nuva/unsloth/mistral_finetune_sc_poc_75_25_2e-4/checkpoint-280',
 '/home/meta/nuva/unsloth/mistral_finetune_sc_poc_75_25_2e-4/checkpoint-320',
 '/home/meta/nuva/unsloth/mistral_finetune_sc_poc_75_25_2e-4/checkpoint-360',
 '/home/meta/nuva/unsloth/mistral_finetune_sc_poc_75_25_2e-4/checkpoint-400',
 '/home/meta/nuva/unsloth/mistral_finetune_sc_poc_75_25_2e-4/checkpoint-440',
 '/home/meta/nuva/unsloth/mistral_finetune_sc_poc_75_25_2e-4/checkpoint-480',
 '/home/meta/nuva/unsloth/mistral_finetune_sc_poc_75_25_2e-4/check