In [None]:
!pip3 install datasets transformers hf_transfer huggingface_hub[hf_xet] librosa -q
!pip3 install sacrebleu polars unbabel-comet -q
!pip3 install bitsandbytes accelerate -q

In [None]:
!rm -r model ymoslem

In [None]:
import os

# Opt in to hf_transfer for Hugging Face Hub downloads. This cell runs before
# any Hugging Face library is imported further down in the notebook.
os.environ.update(HF_HUB_ENABLE_HF_TRANSFER="1")

In [None]:
# Cache locations passed as `cache_dir` when loading the dataset and the model.
data_cache_dir, model_cache_dir = "/workspace/data/", "/workspace/model/"

In [None]:
# Target language of the run. The code selects the `text_<code>` reference
# column and the language name used in the prompt; switch languages by
# uncommenting exactly one of the alternatives below.
tgt_lang_code = "de"
# tgt_lang_code = "zh"
# tgt_lang_code = "ar"

# Load the dataset

In [None]:
from datasets import load_dataset, Audio

# Load the dev and eval splits as one dataset and decode the audio column at
# 16 kHz.
acl6060_all = load_dataset(
    "ymoslem/ACL-6060",
    split="dev+eval",
    cache_dir=data_cache_dir,
).cast_column("audio", Audio(sampling_rate=16000))

# Hold out 100 examples as the test set; the fixed seed keeps the split stable
# across runs.
acl6060 = acl6060_all.train_test_split(test_size=100, seed=0)

acl6060

In [None]:
# Preview the first held-out example: the English source followed by the
# reference in the configured target language (not always German).
print(acl6060["test"]["text_en"][0])
print(acl6060["test"][f"text_{tgt_lang_code}"][0])

# Load the model

In [None]:
from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration
import torch

# Checkpoint to evaluate; swap the comment to run the base model instead.
# model_name = "Qwen/Qwen2-Audio-7B"
model_name = "Qwen/Qwen2-Audio-7B-Instruct"

# Load the weights into the shared cache directory, move them to the GPU and
# switch to inference mode. Uncomment `torch_dtype` to load in bfloat16.
model = Qwen2AudioForConditionalGeneration.from_pretrained(
    model_name,
    cache_dir=model_cache_dir,
    # torch_dtype=torch.bfloat16,
).to("cuda").eval()

# Keep the processor files in the same cache directory as the weights so that
# everything belonging to the checkpoint lives in one place.
processor = AutoProcessor.from_pretrained(
    model_name,
    cache_dir=model_cache_dir,
    trust_remote_code=True,
)

print("Model loaded:", model_name)

In [None]:
# Fail fast if the model did not end up on a CUDA device.
assert model.device.type == "cuda"

In [None]:
print("Original config:", model.generation_config)

# Turn sampling off and clear the sampling-only knobs on the model's default
# generation config.
for option, value in (
    ("do_sample", False),
    ("temperature", None),
    ("top_k", None),
    ("top_p", None),
):
    setattr(model.generation_config, option, value)

print("Modified config:", model.generation_config)

In [None]:
# English sentence used as the one-shot demonstration, and its translation into
# each supported target language (keyed by the language name used in the prompt).
_ONE_SHOT_SOURCE = "As knowledge base we use Wikipedia."
_ONE_SHOT_TARGETS = {
    "German": "Als Wissensbasis verwenden wir Wikipedia.",
    "Chinese": "我们使用维基百科作为知识库。",
    "Arabic": "نستخدم ويكيبيديا كقاعدة معرفية.",
}


def translate(audio_array, audio_path, sr, language, shot=0, model_type="instruct"):
    """Translate one English speech segment into ``language``.

    Uses the notebook-level ``model`` and ``processor`` objects.

    Args:
        audio_array: Audio data forwarded unchanged to the processor's
            ``audio`` argument.
        audio_path: Value stored in the ``audio_url`` field of the chat
            message. The audio content itself is supplied through
            ``audio_array``.
        sr: Sampling rate forwarded to the processor.
        language: Target-language name interpolated into the prompt,
            e.g. ``"German"``.
        shot: ``1`` prepends a single text-only demonstration turn to the
            instruct prompt; any other value produces a zero-shot prompt.
            Ignored for the base model.
        model_type: ``"base"`` (raw prompt with audio placeholder tokens) or
            ``"instruct"`` (chat template).

    Returns:
        The decoded continuation for the first (only) sequence, stripped of
        surrounding whitespace.

    Raises:
        ValueError: If ``model_type`` is neither ``"base"`` nor
            ``"instruct"``, or if ``shot == 1`` is requested for a language
            without a demonstration translation.
    """
    instruction = f"Translate the English speech into {language}:"

    if model_type == "base":
        text = f"<|audio_bos|><|AUDIO|><|audio_eos|>{instruction}"

    elif model_type == "instruct":
        conversation = [
            {"role": "system", "content": "You are a professional translator."},
        ]

        if shot == 1:
            if language not in _ONE_SHOT_TARGETS:
                raise ValueError(
                    f"No one-shot example available for language: {language}"
                )
            # The demonstration is built by plain concatenation so that no
            # source-code indentation leaks into the prompt, and its answer is
            # in the requested target language.
            conversation += [
                {"role": "user", "content": [
                    {"type": "text", "text": f"{_ONE_SHOT_SOURCE} {instruction}"},
                ]},
                {"role": "assistant", "content": _ONE_SHOT_TARGETS[language]},
            ]

        # The audio entry is only there for prompt formatting; the waveform is
        # handed to the processor below.
        conversation.append(
            {"role": "user", "content": [
                {"type": "audio", "audio_url": audio_path},
                {"type": "text", "text": instruction},
            ]}
        )

        text = processor.apply_chat_template(conversation,
                                             add_generation_prompt=True,
                                             tokenize=False,
                                            )

    else:
        # Without this guard an unknown value would surface later as an
        # UnboundLocalError on `text`.
        raise ValueError(
            f"Unsupported model_type: {model_type!r} (expected 'base' or 'instruct')"
        )

    # Put the inputs on whatever device the model lives on.
    inputs = processor(text=text,
                       audio=audio_array,
                       sampling_rate=sr,
                       return_tensors="pt").to(model.device)

    # max_length is passed straight to generate(); decoding is greedy.
    max_length = 1024
    generate_ids = model.generate(**inputs,
                                  max_length=max_length,
                                  do_sample=False,
                                  repetition_penalty=1.0,
                                  pad_token_id=processor.tokenizer.eos_token_id,
                                 )
    # Drop the prompt tokens so that only the newly generated part is decoded.
    generate_ids = generate_ids[:, inputs.input_ids.size(1):]

    response = processor.batch_decode(generate_ids,
                                      skip_special_tokens=True,
                                      clean_up_tokenization_spaces=True)[0]

    return response.strip()

In [None]:
# Number of demonstrations passed to translate(): 0 = zero-shot, 1 = one-shot.
shot = 0

# Only the plain base checkpoint uses the raw-prompt path; every other model
# name is handled as an instruct model.
model_type = "base" if model_name == "Qwen/Qwen2-Audio-7B" else "instruct"

# Language code -> language name used in the prompt.
_language_names = {"de": "German", "zh": "Chinese", "ar": "Arabic"}
if tgt_lang_code not in _language_names:
    raise ValueError(f"Unsupported target language code: {tgt_lang_code}")
language = _language_names[tgt_lang_code]


print(f"{model_name=}\n{model_type=}\n{language=}")

In [None]:
from tqdm.auto import tqdm

# Translate the held-out segments one at a time, keeping the outputs in
# dataset order.
translations = []

for sample in tqdm(acl6060["test"]):
    clip = sample["audio"]
    translations.append(
        translate([clip["array"]], clip["path"], clip["sampling_rate"],
                  language, shot, model_type)
    )

In [None]:
# Number of translations produced by the loop above.
print(len(translations))

In [None]:
# Eyeball the first ten system outputs.
translations[:10]

# Evaluation

In [None]:
# Materialise the text columns as plain Python lists: depending on the
# installed `datasets` version, column access may return a lazy column object
# rather than a list, and the metric code below expects ordinary sequences.
references = list(acl6060["test"][f"text_{tgt_lang_code}"])
source_sentences = list(acl6060["test"]["text_en"])

# Sanity check: reference, source and system output for the first segment.
print(references[0])
print(source_sentences[0])
print(translations[0])

In [None]:
import sacrebleu

# Chinese output is scored with sacreBLEU's "zh" tokenizer; for every other
# language None is passed, leaving the tokenizer choice to sacreBLEU.
bleu_tokenizer = None if tgt_lang_code != "zh" else "zh"

# Corpus-level BLEU against the single reference set.
bleu_result = sacrebleu.corpus_bleu(translations, [references], tokenize=bleu_tokenizer)
bleu = round(bleu_result.score, 2)
print("BLEU:", bleu)

# Corpus-level ChrF++ (chrF with word_order=2).
chrf_pp_result = sacrebleu.corpus_chrf(translations, [references], word_order=2)
chrf = round(chrf_pp_result.score, 2)
print("ChrF++:", chrf)

In [None]:
# Corpus-level chrF with sacreBLEU's default settings (no word_order override).
chrf_result = sacrebleu.corpus_chrf(translations, [references])
chrf2 = round(chrf_result.score, 2)
print("ChrF:", chrf2)

In [None]:
# Load COMET
from comet import download_model, load_from_checkpoint

# COMET 2.x (what an unpinned `unbabel-comet` install provides) resolves model
# names as Hugging Face Hub repository ids, so the checkpoint is requested
# under its hub id.
model_path = download_model("Unbabel/wmt20-comet-da")
comet_model = load_from_checkpoint(model_path)

In [None]:
# Calculate COMET
import pandas as pd

# One record per segment with the src / mt / ref fields passed to COMET.
df = pd.DataFrame({"src": source_sentences, "mt": translations, "ref": references})
data = df.to_dict("records")

# Read the results by key instead of unpacking `.values()`, which only works
# while the result holds exactly two entries in a fixed order.
comet_output = comet_model.predict(data, batch_size=128, gpus=1)
seg_scores = comet_output["scores"]
sys_score = comet_output["system_score"]

# Report the system score scaled by 100, like the other metrics.
comet = round(sys_score * 100, 2)
print("COMET:", comet)

In [None]:
import polars as pl

print(model_name)

# One-row summary table holding all four metrics for this run.
metric_scores = {
    "BLEU": bleu,
    "ChrF++": chrf,
    "COMET": comet,
    "ChrF": chrf2,
}
df = pl.DataFrame(metric_scores)
df