## Initialization of model, constants, paths etc.
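Reads the model, library and dataset paths from `config.cfg`, sets the environment variables needed before importing Unsloth, imports the required libraries and prints basic GPU information.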

In [18]:
# All machine-specific paths come from config.cfg (sections "TST" and "Unsloth").
# NOTE: ConfigParser.read() silently skips a missing file, so a missing config.cfg
# only surfaces as a KeyError on the lookups below.
import configparser

config = configparser.ConfigParser()
config.read('config.cfg')

# Model identifiers/paths for the translation and the summarization step.
model_translate_path = config['TST']['model_translate_path']
model_summarize_path = config['TST']['model_summarize_path']
# Library locations exported as environment variables below.
libcuda_path = config['Unsloth']['libcuda_path']
library_path = config['Unsloth']['library_path']
# Input dataset (JSON file) and the directory the result files are written to.
dataset_path = config['TST']['dataset_path']
output_path = config['TST']['output_path']

# The environment is set before unsloth is imported; presumably Triton needs these
# to locate libcuda at import time -- TODO confirm.
import os
os.environ["TRITON_LIBCUDA_PATH"]=libcuda_path
os.environ["LIBRARY_PATH"]=library_path

from unsloth import FastLanguageModel
import json
import torch
import nltk
import re
import rouge_raw
# NOTE: this rebinds the builtin name `eval` to a RougeRaw scorer; evaluate_dataset
# relies on this module-level name.
eval = rouge_raw.RougeRaw()

# Punkt data is required by nltk.sent_tokenize, which chunk_sentences calls.
nltk.download('punkt')

# Print the number of visible GPUs and the result of torch.cuda.mem_get_info().
num_gpus = torch.cuda.device_count()
print(f"Number of GPUs available: {num_gpus}")
print(torch.cuda.mem_get_info())


Number of GPUs available: 1
(7534739456, 47842000896)


[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [19]:
def load_model(model_path, max_seq_length = 32768):
    """Load a 4-bit quantized model and its tokenizer through Unsloth, switched to inference mode.

    Args:
        model_path: Model name or path handed to ``FastLanguageModel.from_pretrained``.
        max_seq_length: Maximum sequence length passed to the loader. Author's note:
            do not pick an arbitrary value for ALMA, it generates nonsense; use 2048 for ALMA.

    Returns:
        Tuple ``(model, tokenizer)``; the model has been passed through
        ``FastLanguageModel.for_inference``.
    """
    loader_options = dict(
        model_name = model_path,
        max_seq_length = max_seq_length,
        dtype = None,         # None = let the loader auto-detect (Float16 for T4/V100, Bfloat16 for Ampere+)
        load_in_4bit = True,  # 4-bit quantization to reduce memory usage
    )
    model, tokenizer = FastLanguageModel.from_pretrained(**loader_options)
    FastLanguageModel.for_inference(model)  # enable Unsloth's faster inference path
    return model, tokenizer

In [20]:
# Report the GPU model, its total memory and the peak memory PyTorch has reserved so far.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024 ** 3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A40. Max memory = 44.556 GB.
37.217 GB of memory reserved.


In [21]:
def alma_prompt(texts, src_lang="Czech", out_lang="English"):
    """Wrap every text in the translation prompt template.

    Args:
        texts: List of source-language text chunks.
        src_lang: Name of the source language as it should appear in the prompt.
        out_lang: Name of the target language as it should appear in the prompt.

    Returns:
        A new list with one prompt per input text; ``texts`` itself is not modified.
    """
    return [
        f"Translate this from {src_lang} to {out_lang}:\n{src_lang}: {text}\n{out_lang}:"
        for text in texts
    ]

def clean_text(text):
    """Flatten a text onto a single line.

    A hyphen directly followed by a newline is removed together with the newline
    (re-joining words broken across lines); every remaining carriage return or
    newline becomes a single space.
    """
    dehyphenated = text.replace("-\n", "")
    line_breaks_to_spaces = str.maketrans({'\r': ' ', '\n': ' '})
    return dehyphenated.translate(line_breaks_to_spaces)

def load_dataset(path):
    """Read a JSON dataset file and return the parsed object.

    The file is decoded as UTF-8 explicitly so that non-ASCII (e.g. Czech) text is
    read the same way regardless of the platform's locale encoding.

    Args:
        path: Path to the JSON file.

    Returns:
        The deserialized JSON content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as dataset_file:
        return json.load(dataset_file)

def chunk_sentences(text, n, lang):
    """Split ``text`` into sentences and re-join them in groups of ``n``.

    Args:
        text: Text to split.
        n: Number of consecutive sentences per chunk (the last chunk may be shorter).
        lang: Language name passed to ``nltk.sent_tokenize``.

    Returns:
        List of strings, each holding up to ``n`` sentences joined by a single space.
    """
    sentences = nltk.sent_tokenize(text=text, language=lang)
    return [
        ' '.join(sentences[start:start + n])
        for start in range(0, len(sentences), n)
    ]

def translate_text(prompt, model, tokenizer, max_new_tokens=2048, temperature=1, top_p=1, repetition_penalty = 1.3):
    """Generate a completion for ``prompt`` and return only the text that follows the prompt.

    The generation controls are forwarded to ``model.generate``. Passing them to
    ``tokenizer.batch_decode`` (as was done before) cannot influence generation, so
    ``repetition_penalty`` is now actually applied and outputs may differ from
    earlier runs.

    Args:
        prompt: Full prompt string.
        model: Model object exposing ``generate``.
        tokenizer: Tokenizer that is callable and exposes ``batch_decode`` / ``decode``.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. Sampling is only switched on when
            ``temperature`` or ``top_p`` differs from the neutral value 1, so the
            defaults keep deterministic (greedy) decoding.
        top_p: Nucleus-sampling threshold (see ``temperature``).
        repetition_penalty: Repetition penalty handed to ``generate``.

    Returns:
        The decoded text after the prompt.
    """
    inputs = tokenizer(prompt, return_tensors = "pt").to("cuda")

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": repetition_penalty,
    }
    # temperature/top_p only have an effect when sampling is enabled; a
    # non-positive temperature is treated as a request for greedy decoding.
    if temperature > 0 and (temperature != 1 or top_p != 1):
        generation_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)

    outputs = model.generate(**inputs, **generation_kwargs)
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    pieces = decoded.split(prompt)
    if len(pieces) > 1:
        return pieces[1]

    # The decoded text does not contain the prompt verbatim (the tokenizer round
    # trip may alter it); decode only the newly generated tokens instead of
    # failing with an IndexError.
    prompt_length = len(inputs["input_ids"][0])
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

def translate_article(text, model, tokenizer, prompt_func, src_lang, out_lang, max_new_tokens=2048, chunk_size=10):
    """Translate a whole article chunk by chunk and return the concatenated translation.

    The text is flattened with ``clean_text``, split into chunks of ``chunk_size``
    sentences, wrapped into prompts by ``prompt_func`` and translated one chunk at
    a time with ``translate_text``.

    Args:
        text: Source text.
        model: Model passed through to ``translate_text``.
        tokenizer: Tokenizer passed through to ``translate_text``.
        prompt_func: Callable ``(chunks, src_lang, out_lang) -> list of prompts``.
        src_lang: Source language name (e.g. "Czech", "English"). Its lower-cased
            form selects the sentence tokenizer, so sentence splitting matches the
            language of ``text`` instead of always using the Czech model.
        out_lang: Target language name.
        max_new_tokens: Generation limit per chunk.
        chunk_size: Number of sentences per chunk.

    Returns:
        The chunk translations joined without a separator.
    """
    text = clean_text(text)
    sentence_language = src_lang.lower()
    chunks = chunk_sentences(text, chunk_size, sentence_language)
    prompts = prompt_func(chunks, src_lang, out_lang)
    return ''.join(
        translate_text(chunk_prompt, model, tokenizer, max_new_tokens)
        for chunk_prompt in prompts
    )

def translate_dataset(dataset_path,
                      output_path,
                      model,
                      tokenizer,
                      prompt_func,
                      max_new_tokens=2048,
                      save_steps=-1,
                      src_lang="Czech",
                      out_lang="English",
                      page_key="text",
                      page_key_translated="text_translated", issue_key=None, issue_key_translated=None):
    """Translate every page of a dataset and save the result as JSON.

    The dataset is a nested mapping ``journal -> issue -> page group -> list of page
    dicts``. Entries of an issue whose key starts with ``"summary_total"`` are not
    treated as page groups. The result is written to
    ``{output_path}/translated_<dataset file stem>.json``.

    Args:
        dataset_path: Path of the input JSON dataset.
        output_path: Directory for the output file (created if missing).
        model: Model passed through to ``translate_article``.
        tokenizer: Tokenizer passed through to ``translate_article``.
        prompt_func: Prompt builder passed through to ``translate_article``.
        max_new_tokens: Generation limit per chunk.
        save_steps: If > 0, the whole dataset is saved after every ``save_steps`` pages.
        src_lang: Source language name.
        out_lang: Target language name.
        page_key: Key of the page dict holding the text to translate.
        page_key_translated: Key under which the translation is stored in the page dict.
        issue_key: Optional key of an issue-level text to translate as well.
        issue_key_translated: Key under which the issue-level translation is stored;
            issue-level translation only happens when both issue keys are given.
    """
    dataset = load_dataset(dataset_path)

    # Resolve the target once, up front. It used to be assigned only inside the
    # checkpoint branch, so the final save failed with an unbound `filename`
    # whenever no checkpoint had been written (e.g. the default save_steps=-1).
    filename = f"translated_{os.path.splitext(os.path.basename(dataset_path))[0]}"
    target_path = f"{output_path}/{filename}.json"
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    def save_dataset():
        # Serialize the current state of the whole dataset.
        with open(target_path, "w", encoding="utf-8") as myfile:
            json.dump(dataset, myfile)

    pages_processed = 0
    for journals_processed, (key_journal, journal) in enumerate(dataset.items()):
        print(f"Processing journal {key_journal}, {journals_processed}/{len(dataset.items())}:")
        for key_issue, issues in journal.items():
            print(f"Processing issue {key_issue}:")
            for key_page, pages in issues.items():
                if key_page.startswith("summary_total"):
                    continue
                for i, page in enumerate(pages):
                    print(f"Processing {key_journal}, {key_issue}, page {i} out of {len(pages)}:")
                    translated_page = translate_article(page[page_key], model, tokenizer, prompt_func, src_lang, out_lang, max_new_tokens)
                    print(translated_page)
                    page[page_key_translated] = translated_page
                    pages_processed += 1
                    if save_steps > 0 and pages_processed % save_steps == 0:
                        print("Saving checkpoint")
                        save_dataset()
            if issue_key is not None and issue_key_translated is not None:
                issues[issue_key_translated] = translate_article(issues[issue_key], model, tokenizer, prompt_func, src_lang, out_lang, max_new_tokens)

    save_dataset()
    print(f"Finished translating. Saved to {target_path}")

                        

In [25]:
def remove_enumeration(text):
    """Strip a leading list number such as ``"1. "`` from every line of ``text``.

    Each line is handled on its own: optional leading whitespace, digits, a dot and
    any whitespace after it are removed from the start of the line. Lines without
    such a prefix are returned unchanged.
    """
    strip_marker = re.compile(r'^\s*\d+\.\s*').sub
    return '\n'.join(strip_marker('', line) for line in text.split('\n'))

def summarize_dataset(dataset_path,
                      output_path,
                      model,
                      tokenizer,
                      prompt_func,
                      max_new_tokens=512,
                      save_steps=-1,
                      src_lang="Czech",
                      out_lang="English",
                      page_key="text_translated",
                      page_key_summary="summary_translated"):
    """Summarize every page of a dataset, then every issue, and save the result as JSON.

    The dataset is a nested mapping ``journal -> issue -> page group -> list of page
    dicts``. Entries of an issue whose key starts with ``"summary_total"`` are not
    treated as page groups. Pages with at most 10 words get the placeholder summary
    ``" "``. The page summaries of an issue are joined and summarized again; that
    result is stored under ``"summary_total_translated"``. Output goes to
    ``{output_path}/summarized_<dataset file stem>.json``.

    Args:
        dataset_path: Path of the input JSON dataset.
        output_path: Directory for the output file (created if missing).
        model: Model passed through to ``summarize_text``.
        tokenizer: Tokenizer passed through to ``summarize_text``.
        prompt_func: Callable turning a text into the prompt given to ``summarize_text``.
        max_new_tokens: Generation limit per summary.
        save_steps: If > 0, the whole dataset is saved after every ``save_steps`` pages.
        src_lang: Unused; kept for signature compatibility.
        out_lang: Unused; kept for signature compatibility.
        page_key: Key of the page dict holding the text to summarize.
        page_key_summary: Key under which the page summary is stored.
    """
    dataset = load_dataset(dataset_path)

    # Resolve the target once, up front. It used to be assigned only inside the
    # checkpoint branch, so the final save failed with an unbound `filename`
    # whenever no checkpoint had been written (e.g. the default save_steps=-1).
    filename = f"summarized_{os.path.splitext(os.path.basename(dataset_path))[0]}"
    target_path = f"{output_path}/{filename}.json"
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    def save_dataset():
        # Serialize the current state of the whole dataset.
        with open(target_path, "w", encoding="utf-8") as myfile:
            json.dump(dataset, myfile)

    def summarize(text):
        # One model call followed by removal of list numbering from the answer.
        return remove_enumeration(summarize_text(prompt_func(text), model, tokenizer, max_new_tokens))

    pages_processed = 0
    for journals_processed, (key_journal, journal) in enumerate(dataset.items()):
        print(f"Summarization: Processing journal {key_journal}, {journals_processed}/{len(dataset.items())}:")
        for key_issue, issues in journal.items():
            print(f"Processing issue {key_issue}:")
            issue_summary_total = []
            for key_page, pages in issues.items():
                if key_page.startswith("summary_total"):
                    continue
                for i, page in enumerate(pages):
                    print(f"Processing {key_journal}, {key_issue}, page {i} out of {len(pages)}:")
                    text_to_summarize = clean_text(page[page_key])
                    # Pages with 10 words or fewer are not sent to the model.
                    if len(text_to_summarize.split()) > 10:
                        summarized_page = summarize(text_to_summarize)
                    else:
                        summarized_page = " "
                    page[page_key_summary] = summarized_page
                    print(summarized_page)
                    issue_summary_total.append(summarized_page)
                    pages_processed += 1
                    if save_steps > 0 and pages_processed % save_steps == 0:
                        print("Saving checkpoint")
                        save_dataset()
            summarized_issue = summarize(clean_text('\n'.join(issue_summary_total)))
            print(summarized_issue)
            issues["summary_total_translated"] = summarized_issue

    save_dataset()
    print(f"Finished summarizing. Saved to {target_path}")
        
def summarize_text(prompt, model, tokenizer, max_new_tokens=512, temperature=0.3, top_p=1):
    """Run a chat prompt through the model and return the text after the last ``[/INST]``.

    ``temperature`` and ``top_p`` are forwarded to ``model.generate``. Passing them to
    ``tokenizer.batch_decode`` (as was done before) cannot influence generation.
    With the default ``temperature=0.3`` sampling is therefore enabled now, so
    results vary between runs unless the random seed is fixed by the caller.

    Args:
        prompt: List of chat messages accepted by ``tokenizer.apply_chat_template``.
        model: Model object exposing ``generate``.
        tokenizer: Tokenizer exposing ``apply_chat_template`` and ``batch_decode``.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. Sampling is only switched on when
            ``temperature`` or ``top_p`` differs from the neutral value 1; a
            non-positive temperature means greedy decoding.
        top_p: Nucleus-sampling threshold (see ``temperature``).

    Returns:
        The part of the decoded output following the last ``[/INST]`` marker (the
        whole decoded output if the marker is absent).
    """
    inputs = tokenizer.apply_chat_template(prompt, return_tensors = "pt").to("cuda")

    generation_kwargs = {"max_new_tokens": max_new_tokens}
    # temperature/top_p only have an effect when sampling is enabled.
    if temperature > 0 and (temperature != 1 or top_p != 1):
        generation_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)

    outputs = model.generate(inputs, **generation_kwargs)
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    return decoded.split("[/INST]")[-1]

def mistral_prompt(text):
    """Build the chat messages asking for a five-sentence summary of ``text``.

    Returns:
        A list of three ``{"role", "content"}`` dicts: a priming user turn, a
        canned assistant acknowledgement, and the user turn carrying the text.
    """
    request = f"Summarize the following text in five sentences: {text}"
    turns = (
        ("user", "Summarize my texts using only 5 sentences"),
        ("assistant", "Sure. I will write summaries in the style of a news reporter and use only 5 sentences."),
        ("user", request),
    )
    return [{"role": role, "content": content} for role, content in turns]


In [23]:
def evaluate_dataset(dataset_path,
                      page_gold_key = "summary",
                      page_system_key ="summary_reference", issue_gold_key="summary_total", issue_reference_key = "summary_total_reference"):
    """Score page-level and issue-level summaries of a dataset with the module-level RougeRaw scorer.

    The dataset is a nested mapping ``journal -> issue -> page group -> list of page
    dicts``. Entries of an issue whose key starts with ``"summary_total"`` are not
    treated as page groups.

    Args:
        dataset_path: Path of the JSON dataset to evaluate.
        page_gold_key: Page-dict key of the gold summary.
        page_system_key: Page-dict key of the system summary.
        issue_gold_key: Issue-level key of the gold summary.
        issue_reference_key: Issue-level key of the system summary. This parameter
            is now honored; the key used to be hard-coded to its default value.

    Returns:
        Tuple ``(page_scores, issue_scores)`` as returned by ``eval.corpus``.
    """
    dataset = load_dataset(dataset_path)
    page_gold, page_system = [], []
    summary_total_gold, summary_total_reference = [], []
    for journal in dataset.values():
        for issues in journal.values():
            for key_page, pages in issues.items():
                if key_page.startswith("summary_total"):
                    continue
                for page in pages:
                    page_gold.append(page[page_gold_key])
                    page_system.append(page[page_system_key])
            summary_total_gold.append(issues[issue_gold_key])
            summary_total_reference.append(issues[issue_reference_key])
    # `eval` is the notebook's module-level RougeRaw instance, not the builtin.
    summary_eval = eval.corpus(gold=page_gold, system=page_system)
    summary_total_eval = eval.corpus(gold=summary_total_gold, system=summary_total_reference)
    return summary_eval, summary_total_eval

def prefix_filename(path, prefix):
    """Return ``path`` with ``prefix`` prepended to its final component.

    The directory part is kept as is, e.g. ``("a/b/c.json", "t_")`` gives
    ``"a/b/t_c.json"``.
    """
    return os.path.join(os.path.dirname(path), prefix + os.path.basename(path))

def print_rougeraw(score):
    """Print F-score, precision and recall (as percentages) for ROUGE-1, ROUGE-2 and ROUGE-L.

    Args:
        score: Mapping with the keys ``"1"``, ``"2"`` and ``"L"`` whose values
            expose the attributes ``f``, ``p`` and ``r``.
    """
    for metric in ("1", "2", "L"):
        for attribute in ("f", "p", "r"):
            value = getattr(score[metric], attribute)
            print(f"ROUGE-{metric} {attribute.upper()}: ", value * 100)

Load translation model

In [24]:
# Load the translation model with max_seq_length=2048 (load_model notes that ALMA
# generates nonsense with other values). Binds the globals `model` and `tokenizer`.
model, tokenizer = load_model(model_translate_path, max_seq_length=2048)

==((====))==  Unsloth: Fast Llama patching release 2024.3
   \\   /|    GPU: NVIDIA A40. Max memory: 44.556 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.0+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.24. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Downloading shards:   0%|          | 0/6 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Some weights of the model checkpoint at haoranxu/ALMA-13B-Pretrain were not used when initializing LlamaForCausalLM: ['model.layers.0.self_attn.rotary_emb.inv_freq', 'model.layers.1.self_attn.rotary_emb.inv_freq', 'model.layers.10.self_attn.rotary_emb.inv_freq', 'model.layers.11.self_attn.rotary_emb.inv_freq', 'model.layers.12.self_attn.rotary_emb.inv_freq', 'model.layers.13.self_attn.rotary_emb.inv_freq', 'model.layers.14.self_attn.rotary_emb.inv_freq', 'model.layers.15.self_attn.rotary_emb.inv_freq', 'model.layers.16.self_attn.rotary_emb.inv_freq', 'model.layers.17.self_attn.rotary_emb.inv_freq', 'model.layers.18.self_attn.rotary_emb.inv_freq', 'model.layers.19.self_attn.rotary_emb.inv_freq', 'model.layers.2.self_attn.rotary_emb.inv_freq', 'model.layers.20.self_attn.rotary_emb.inv_freq', 'model.layers.21.self_attn.rotary_emb.inv_freq', 'model.layers.22.self_attn.rotary_emb.inv_freq', 'model.layers.23.self_attn.rotary_emb.inv_freq', 'model.layers.24.self_attn.rotary_emb.inv_freq', 'mo

## TST
Translate the POC dataset to English. This writes a copy of the dataset, with the translated text added to every page, into `output_path`; the file name is the dataset's name with the prefix "translated_".

In [None]:
# Define filenames.
# translate_dataset / summarize_dataset write "{output_path}/<prefix><dataset stem>.json",
# so the intermediate paths are built the same way instead of being derived from the
# directory and extension of dataset_path (which only matched when the dataset lived
# in output_path and had a .json extension).
dataset_stem = os.path.splitext(os.path.basename(dataset_path))[0]
translated_dataset_path = f"{output_path}/translated_{dataset_stem}.json"
summarized_translated_dataset_path = prefix_filename(translated_dataset_path, "summarized_")
translated_summarized_translated_dataset_path = prefix_filename(summarized_translated_dataset_path, "translated_")

In [None]:
# Czech -> English pass over the page texts: reads "text", stores "text_translated".
# A checkpoint of the whole dataset is written every 5 pages (save_steps=5).
translate_dataset(dataset_path, 
                  output_path, 
                  model, 
                  tokenizer, 
                  alma_prompt, 
                  save_steps=5, 
                  src_lang="Czech", 
                  out_lang="English", 
                  page_key="text", page_key_translated="text_translated")

Load English summarization model

In [None]:
# Rebinds the globals `model` and `tokenizer` to the summarization model
# (load_model's default max_seq_length of 32768 is used).
model, tokenizer = load_model(model_summarize_path)

Summarize the English text. This writes a copy of the translated dataset, with a summary added to every page and every issue, into `output_path`; the file name gets the additional prefix "summarized_".

In [None]:
# With the default keys this reads "text_translated" and stores "summary_translated" per page
# and "summary_total_translated" per issue; a checkpoint is written every 5 pages.
summarize_dataset(translated_dataset_path, output_path, model, tokenizer, mistral_prompt, save_steps=5)

Load translation model

In [None]:
# Rebinds the globals `model` and `tokenizer` to the translation model again
# for the English -> Czech pass.
model, tokenizer = load_model(model_translate_path, max_seq_length=2048)

Translate English summaries to Czech

In [None]:

# English -> Czech pass over the summaries: page-level "summary_translated" -> "summary_reference",
# issue-level "summary_total_translated" -> "summary_total_reference".
# These output keys are the defaults evaluate_dataset reads as system summaries.
translate_dataset(summarized_translated_dataset_path, 
                  output_path, 
                  model, 
                  tokenizer, 
                  alma_prompt, 
                  save_steps=5, 
                  src_lang="English", 
                  out_lang="Czech", 
                  page_key="summary_translated", page_key_translated="summary_reference", issue_key="summary_total_translated", issue_key_translated="summary_total_reference")

## Evaluation of TST
Evaluate the summaries

In [42]:
# scores is a tuple: (page-level scores, issue-level scores), each the result of RougeRaw.corpus.
scores = evaluate_dataset(translated_summarized_translated_dataset_path)


In [None]:
# scores[0]: per-page summaries; scores[1]: per-issue ("total") summaries.
print("page")
print_rougeraw(scores[0])
print("total")
print_rougeraw(scores[1])
