## Initialization of model, constants, paths etc.
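Reads the model name, CUDA library paths and output directory from `config.cfg`, sets the environment variables Unsloth/Triton need, and imports the libraries used in the rest of the notebook.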

In [1]:
import configparser

# All machine-specific settings (model location, CUDA paths, output directory)
# are read from config.cfg, resolved relative to the current working directory.
CONFIG_FILE = 'config.cfg'

config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Fail here with a clear message instead of a
# confusing KeyError on the first section lookup below.
if not config.read(CONFIG_FILE):
    raise FileNotFoundError(f"Could not read configuration file '{CONFIG_FILE}'")


model_summarize_path = config['M7B_POC_EVAL']['base_model_name'] # model handed to load_model()
libcuda_path = config['Unsloth']['libcuda_path'] #path to directory where libcuda.so resides
library_path = config['Unsloth']['library_path'] #path to cuda library
# Directory that holds the dataset JSON files and receives summaries and score files,
# e.g. "/home/jovyan/nuva/unsloth/mistral/ver3"
output_path = config['M7B_POC_EVAL']['eval_output_dir']

import os
# Exported before `unsloth` is imported further down in this cell.
# NOTE(review): presumably Triton reads these at import time to locate libcuda;
# keep this ordering unless that is confirmed to be unnecessary.
os.environ["TRITON_LIBCUDA_PATH"]=libcuda_path
os.environ["LIBRARY_PATH"]=library_path

from unsloth import FastLanguageModel
import json
import torch
import nltk
import re
import rouge_raw
# Shared RougeRaw scorer used by evaluate_dataset().
# NOTE(review): this name shadows Python's built-in eval() for the whole notebook;
# renaming it requires updating evaluate_dataset() at the same time.
eval = rouge_raw.RougeRaw()

# Fetch the NLTK "punkt" data. Nothing in this notebook calls nltk directly --
# presumably rouge_raw depends on it; TODO confirm.
nltk.download('punkt')

# Quick sanity check of the visible GPUs before loading the model.
num_gpus = torch.cuda.device_count()
print(f"Number of GPUs available: {num_gpus}")
# torch.cuda.mem_get_info() reports (free, total) device memory in bytes.
print(torch.cuda.mem_get_info())


[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Number of GPUs available: 1
(20791492608, 20937965568)


In [2]:
def load_model(model_path, max_seq_length = 32768):
    """Load a model and its tokenizer through Unsloth and switch the model to inference mode.

    Args:
        model_path: Name or path passed to ``FastLanguageModel.from_pretrained``
            (the model that was used for training).
        max_seq_length: Sequence length requested from Unsloth. Per the original
            author's note, ALMA generates nonsense unless this is set to 2048.

    Returns:
        The ``(model, tokenizer)`` pair returned by Unsloth.
    """
    loader_options = {
        "model_name": model_path,
        "max_seq_length": max_seq_length,
        # None = auto detection (Float16 for Tesla T4/V100, Bfloat16 for Ampere+).
        "dtype": None,
        # 4-bit quantization to reduce memory usage.
        "load_in_4bit": True,
    }
    model, tokenizer = FastLanguageModel.from_pretrained(**loader_options)
    # Enable Unsloth's faster inference path on the loaded model.
    FastLanguageModel.for_inference(model)
    return model, tokenizer

In [3]:
# Snapshot of GPU 0 taken before the model is loaded: total device memory and
# the peak memory torch has reserved so far, both reported in GiB.
BYTES_PER_GIB = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GIB, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100 80GB PCIe MIG 2g.20gb. Max memory = 19.5 GB.
0.0 GB of memory reserved.


In [4]:
def clean_text(text):
    """Flatten ``text`` onto a single line.

    Words hyphenated across a line break ("-" directly followed by a newline)
    are re-joined first; every remaining carriage return or newline is then
    replaced by a single space.
    """
    # Order matters: the hyphen rule must run before newlines become spaces.
    substitutions = (("-\n", ""), ("\r", " "), ("\n", " "))
    flattened = text
    for needle, replacement in substitutions:
        flattened = flattened.replace(needle, replacement)
    return flattened

def load_dataset(path):
    """Read the JSON file at ``path`` and return the parsed object.

    The file is decoded explicitly as UTF-8 rather than with the platform's
    default locale encoding, so non-ASCII text loads the same way on every
    machine.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as dataset_file:
        return json.load(dataset_file)

def remove_enumeration(text):
    """Strip a leading list number such as "1. " or "  12. " from every line of ``text``.

    Only a number at the very start of a line (optionally preceded by
    whitespace) is removed, together with the whitespace that follows the dot.
    Lines are split on and re-joined with a newline.
    """
    strip_leading_number = re.compile(r'^\s*\d+\.\s*').sub
    return '\n'.join(strip_leading_number('', line) for line in text.split('\n'))

In [23]:
def summarize_dataset(dataset_path, 
                      output_path, 
                      model, 
                      tokenizer,
                      prompt_func,
                      max_new_tokens=512, 
                      save_steps=-1, 
                      page_text_key="text", 
                      page_summary_key="summary", issue_summary_total_key="summary_total", overwrite = True):
    """Summarize every page and every issue of a dataset and save the result as JSON.

    The dataset is a nested mapping ``journal -> issue -> {key: [page, ...]}``.
    Entries of an issue whose key starts with ``"summary_total"`` are treated as
    issue-level summaries and are not iterated as pages.

    Each page's text is cleaned and, if it has more than 10 words, summarized
    with greedy decoding; shorter pages get an empty summary. The page
    summaries of an issue are then joined and summarized again (with sampling)
    to produce the issue summary. The annotated dataset is written to
    ``{output_path}/summarized_<dataset file stem>.json``.

    Args:
        dataset_path: Path of the JSON dataset to load.
        output_path: Directory the result (and checkpoints) are written to.
        model, tokenizer: Passed through to ``summarize_text``.
        prompt_func: Callable turning a text into the prompt for the model.
        max_new_tokens: Generation limit passed to ``summarize_text``.
        save_steps: When > 0, write a checkpoint every ``save_steps`` pages.
        page_text_key: Page key holding the text to summarize.
        page_summary_key: Page key the page summary is stored under.
        issue_summary_total_key: Issue key the issue summary is stored under.
        overwrite: When ``False``, existing page/issue summaries are kept.
            Existing page summaries are still fed into the issue-level summary.
    """
    # Derived up front: the final save must work even if no checkpoint was ever
    # written (e.g. with the default save_steps=-1).
    filename = f"summarized_{os.path.splitext(os.path.basename(dataset_path))[0]}"
    save_path = f"{output_path}/{filename}.json"

    dataset = load_dataset(dataset_path)
    pages_processed = 0
    total_journals = len(dataset)
    for journal_number, (key_journal, journal) in enumerate(dataset.items(), start=1):
        print(f"Summarization: Processing journal {key_journal}, {journal_number}/{total_journals}:")
        for key_issue, issues in journal.items():
            print(f"Processing issue {key_issue}:")
            issue_summary_total = []
            for key_page, pages in issues.items():
                # Issue-level summaries live next to the page lists; skip them.
                if key_page.startswith("summary_total"):
                    continue
                for page_number, page in enumerate(pages, start=1):
                    print(f"Processing {key_journal}, {key_issue}, page {page_number} out of {len(pages)}:")
                    if page_summary_key in page and overwrite is False:
                        # Keep the stored summary, but still include it in the
                        # issue-level input so the issue summary covers all pages.
                        issue_summary_total.append(page[page_summary_key] or "")
                        continue
                    text_to_summarize = clean_text(page[page_text_key])
                    if len(text_to_summarize.split()) > 10:
                        # Greedy decoding for page summaries.
                        summarized_page = summarize_text(prompt_func(text_to_summarize), model, tokenizer, max_new_tokens, temperature=1, top_p=1, do_sample=False, repetition_penalty = 1)
                    else:
                        # Too little text to summarize.
                        summarized_page = ""
                    page[page_summary_key] = summarized_page
                    print(summarized_page)
                    issue_summary_total.append(summarized_page)
                    pages_processed += 1
                    if save_steps > 0 and pages_processed % save_steps == 0:
                        print("Saving checkpoint")
                        _save_dataset(dataset, save_path)
            if issue_summary_total_key in issues and overwrite is False:
                continue
            text_to_summarize = clean_text('\n'.join(issue_summary_total))
            # Sampling with a mild repetition penalty for the issue-level summary.
            summarized_issue = summarize_text(prompt_func(text_to_summarize), model, tokenizer, max_new_tokens, temperature=0.6, top_p=0.8, do_sample=True, num_beams=1, top_k = 50, repetition_penalty = 1.1)
            print(summarized_issue)
            issues[issue_summary_total_key] = summarized_issue
    _save_dataset(dataset, save_path)
    print(f"Finished summarizing. Saved to {output_path}/{filename}.json")


def _save_dataset(dataset, path):
    """Write ``dataset`` as JSON to ``path``.

    Serialization happens before the file is opened so that a failure cannot
    truncate an existing checkpoint.
    """
    serialized = json.dumps(dataset)
    with open(path, "w") as out_file:
        out_file.write(serialized)
        
def summarize_text(prompt, model, tokenizer, max_new_tokens=512, temperature=1, top_p=1, do_sample=True, num_beams=1, top_k = 50, repetition_penalty = 1, max_seq_length=8192):
    """Generate a completion for ``prompt`` and return the text after the last "### Response:".

    The prompt is tokenized with truncation to ``max_seq_length`` tokens. If it
    hits that limit, the truncated text is decoded again and a fresh
    "### Response:" marker is appended (truncation cuts the prompt from the end,
    where the marker was), then re-tokenized without a length limit.

    Args:
        prompt: Full prompt text, expected to end with a "### Response:" section.
        model: Object exposing ``generate(**inputs, ...)``.
        tokenizer: Callable tokenizer that also exposes ``batch_decode``.
        max_new_tokens, temperature, top_p, do_sample, num_beams, top_k,
        repetition_penalty: Passed unchanged to ``model.generate``.
        max_seq_length: Token budget for the prompt before the marker is re-added.

    Returns:
        The generated response with all newline characters removed.
    """
    response_marker = "### Response:"
    inputs = tokenizer([prompt], max_length=max_seq_length, truncation=True, return_tensors="pt").to("cuda")
    if inputs["input_ids"].size(1) >= max_seq_length:
        # Decode WITHOUT special tokens: otherwise the textual BOS token ends up
        # in the prompt and the tokenizer adds a second one when re-encoding.
        decoded = tokenizer.batch_decode(inputs["input_ids"], skip_special_tokens=True)
        truncated_prompt = f"{decoded[0]}\n{response_marker}\n"
        inputs = tokenizer([truncated_prompt], return_tensors="pt").to("cuda")
        # Token count of the rebuilt prompt, printed whenever truncation happened.
        print(inputs["input_ids"].size(1))
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=do_sample,
        num_beams=num_beams,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
    )
    result = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    # The decoded output echoes the prompt; keep only what follows the last marker.
    # NOTE(review): newlines are removed without inserting a space, so words on
    # adjacent lines are glued together. Kept as-is because it affects the
    # ROUGE scores already reported; confirm whether a space was intended.
    return result[0].split(response_marker)[-1].replace("\n", "")


def mistral_prompt(text):
    """Wrap ``text`` in the Alpaca-style summarization prompt.

    The prompt consists of a fixed preamble, an "### Instruction:" section
    asking for a summary, an "### Input:" section containing ``text`` and an
    empty "### Response:" section for the model to complete.
    """
    preamble = (
        "Below is an instruction that describes a task, paired with an input "
        "that provides further context. Write a response that appropriately "
        "completes the request."
    )
    instruction = "Summarize the following text:"
    return (
        f"{preamble}\n\n"
        f"### Instruction:\n{instruction}\n\n"
        f"### Input:\n{text}\n\n"
        "### Response:\n"
    )


In [14]:
def evaluate_dataset(dataset_path, 
                      page_gold_key = "summary", 
                      page_system_key ="summary_reference", issue_gold_key="summary_total", issue_reference_key = "summary_total_reference"):
    """Score page-level and issue-level summaries of a dataset with the shared RougeRaw scorer.

    The dataset is a nested mapping ``journal -> issue -> {key: [page, ...]}``.
    Entries of an issue whose key starts with ``"summary_total"`` are not
    iterated as pages. Pages whose system summary is empty or whitespace-only
    are left out of the page-level comparison; every issue contributes to the
    issue-level comparison.

    Args:
        dataset_path: Path of the JSON dataset to load.
        page_gold_key: Page key of the gold summary.
        page_system_key: Page key of the generated summary.
        issue_gold_key: Issue key of the gold summary.
        issue_reference_key: Issue key of the generated summary (despite its
            name, this value is passed to the scorer as the system output).

    Returns:
        ``(page_scores, issue_scores)`` as returned by ``eval.corpus``.
    """
    page_pairs = []    # (gold, system) per scored page
    issue_pairs = []   # (gold, system) per issue
    for journal in load_dataset(dataset_path).values():
        for issue in journal.values():
            for section_key, section_pages in issue.items():
                if section_key.startswith("summary_total"):
                    continue
                for page in section_pages:
                    gold_text = page[page_gold_key]
                    system_text = page[page_system_key]
                    if system_text and system_text.strip():
                        page_pairs.append((gold_text, system_text))
            issue_pairs.append((issue[issue_gold_key], issue[issue_reference_key]))

    page_scores = eval.corpus(
        gold=[gold for gold, _ in page_pairs],
        system=[system for _, system in page_pairs],
    )
    issue_scores = eval.corpus(
        gold=[gold for gold, _ in issue_pairs],
        system=[system for _, system in issue_pairs],
    )
    return page_scores, issue_scores

def print_rougeraw(score):
    """Print F-score, precision and recall (as percentages) for ROUGE-1, ROUGE-2 and ROUGE-L.

    ``score`` is indexed by "1", "2" and "L"; each entry exposes ``f``, ``p``
    and ``r`` attributes, which are multiplied by 100 before printing.
    """
    for variant in ("1", "2", "L"):
        for measure in ("f", "p", "r"):
            percentage = getattr(score[variant], measure) * 100
            # The label ends with a space and print() adds its own separator,
            # so the value appears after two spaces.
            print(f"ROUGE-{variant} {measure.upper()}: ", percentage)
    
def write_rougeraw_to_file(score, filename):
    """Write F-score, precision and recall (as percentages) for ROUGE-1, ROUGE-2 and ROUGE-L to a file.

    ``score`` is indexed by "1", "2" and "L"; each entry exposes ``f``, ``p``
    and ``r`` attributes. ``filename`` is overwritten with nine lines of the
    form ``ROUGE-1 F: <value>``.
    """
    with open(filename, 'w') as report:
        for variant in ("1", "2", "L"):
            entry = score[variant]
            for measure in ("f", "p", "r"):
                report.write(f"ROUGE-{variant} {measure.upper()}: {getattr(entry, measure) * 100}\n")

Load the model that will summarize the dataset. Note that Unsloth does not support every model available on Hugging Face.

In [7]:
# Load the summarization model and tokenizer from the path configured as
# [M7B_POC_EVAL] base_model_name in config.cfg.
model, tokenizer = load_model(model_summarize_path)

==((====))==  Unsloth: Fast Mistral patching release 2024.3
   \\   /|    GPU: NVIDIA A100 80GB PCIe MIG 2g.20gb. Max memory: 19.5 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.0+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.24. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


2024-04-02 08:41:56.710157: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-02 08:41:56.710280: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-02 08:41:56.712881: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-02 08:41:56.724502: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Unsloth 2024.3 patched 32 layers with 32 QKV layers, 

## Evaluate on entire POC (even the summaries the model has been trained on)

In [None]:
# Generate page and issue summaries for the full dataset, checkpointing every 5 pages.
# The annotated dataset is written to f"{output_path}/summarized_dataset.json".
summarize_dataset(f"{output_path}/dataset.json", output_path, model, tokenizer, mistral_prompt, save_steps=5, page_summary_key="summary_mistral_poc", issue_summary_total_key="summary_total_mistral_poc", overwrite = True)

In [None]:
# Score the generated summaries of the full dataset: index 0 holds the
# page-level scores, index 1 the issue-level scores. Each set is written to a
# text file in output_path and printed.
eval_results = evaluate_dataset(f"{output_path}/summarized_dataset.json", page_system_key ="summary_mistral_poc", issue_reference_key="summary_total_mistral_poc")
for heading, rouge_scores, score_file in (
    ("summary", eval_results[0], "score_summary.txt"),
    ("summary total", eval_results[1], "score_summary_total.txt"),
):
    print(heading)
    write_rougeraw_to_file(rouge_scores, f"{output_path}/{score_file}")
    print_rougeraw(rouge_scores)

## Evaluate on POC test set (106 page summaries, 25 issue summaries)

In [None]:
# Generate page and issue summaries for the POC test set, checkpointing every 5 pages.
# The annotated dataset is written to f"{output_path}/summarized_dataset_poc_106.json".
summarize_dataset(f"{output_path}/dataset_poc_106.json", output_path, model, tokenizer, mistral_prompt, save_steps=5, page_summary_key="summary_mistral_poc", issue_summary_total_key="summary_total_mistral_poc", overwrite = True)

In [15]:
# Score the generated summaries of the POC test set: index 0 holds the
# page-level scores, index 1 the issue-level scores. Each set is written to a
# text file in output_path and printed.
eval_results = evaluate_dataset(f"{output_path}/summarized_dataset_poc_106.json", page_system_key ="summary_mistral_poc", issue_reference_key="summary_total_mistral_poc")
for heading, rouge_scores, score_file in (
    ("summary", eval_results[0], "score_summary_106.txt"),
    ("summary total", eval_results[1], "score_summary_total_106.txt"),
):
    print(heading)
    write_rougeraw_to_file(rouge_scores, f"{output_path}/{score_file}")
    print_rougeraw(rouge_scores)

summary
ROUGE-1 F:  19.599579570871565
ROUGE-1 P:  23.513918267142934
ROUGE-1 R:  17.38431438069711
ROUGE-2 F:  3.9690679348681015
ROUGE-2 P:  4.764066321875574
ROUGE-2 R:  3.4995058932600855
ROUGE-L F:  13.81292002879949
ROUGE-L P:  16.566196019881907
ROUGE-L R:  12.24982428608036
summary total
ROUGE-1 F:  16.596355391327243
ROUGE-1 P:  17.466067926707165
ROUGE-1 R:  16.567176445351826
ROUGE-2 F:  2.4480608499240892
ROUGE-2 P:  2.5795150330036583
ROUGE-2 R:  2.4281210485098663
ROUGE-L F:  11.751708190298409
ROUGE-L P:  12.404226708262984
ROUGE-L R:  11.78423937614917
