In [None]:
# metrics
import json
import pandas as pd
import evaluate
import os
# Load each metric once, in the order rouge -> bleu -> chrf -> sacrebleu;
# eval_metrics reuses these module-level objects for every batch.
rouge_score, bleu_score, chrf_score, sacrebleu_score = (
    evaluate.load(metric_name)
    for metric_name in ("rouge", "bleu", "chrf", "sacrebleu")
)

### Function to calculate the evaluation metrics

In [None]:
def eval_metrics(predictions, references):
    """Score predictions against references with ROUGE, SacreBLEU, BLEU and chrF.

    Args:
        predictions: iterable of model outputs; ``None`` entries become ``""``
            and every other entry is converted with ``str``.
        references: iterable of ground-truth texts, normalised the same way.

    Returns:
        The dict returned by the ROUGE metric (requested types: rouge1,
        rouge2, rougeL) extended with the keys ``sacrebleu``, ``bleu``,
        ``chrf++`` and ``chrf``.
    """
    def _as_text(items):
        # Metrics are fed plain strings only; a missing entry counts as empty text.
        return ["" if item is None else str(item) for item in items]

    hyps = _as_text(predictions)
    refs = _as_text(references)

    metrics = rouge_score.compute(
        predictions=hyps,
        references=refs,
        rouge_types=["rouge1", "rouge2", "rougeL"],
    )
    metrics["sacrebleu"] = sacrebleu_score.compute(predictions=hyps, references=refs)["score"]
    metrics["bleu"] = bleu_score.compute(predictions=hyps, references=refs)["bleu"]

    # chrF with default settings, then chrF++ (word_order=2); higher is better.
    chrf_plain = chrf_score.compute(predictions=hyps, references=refs)
    chrf_pp = chrf_score.compute(predictions=hyps, references=refs, word_order=2)
    metrics["chrf++"] = chrf_pp["score"]
    metrics["chrf"] = chrf_plain["score"]

    return metrics

## Get the prediction results

In [None]:
# Parameters for the evaluation run; all of them are consumed by get_json_files.
# Name of the reference CSV, read from dataset/<test_data>.csv.
test_data = 'test_1'# alternatives: test_2 / test_3
# Model tag embedded in the result file names.
llm_model = 'llama_31_70b' # alternatives: gpt / mt5
# Number of reference rows sliced per result file; also embedded in the file names.
batch_size = 25
# Sub-directory of save_results/ that is scanned for result files.
path_f = 'italian2ladin' # You can edit
# Used as the reference CSV column, as the key read from each translation pair,
# and as part of the result file names.
# NOTE(review): left empty here; presumably it must be set before running
# (the comments in get_json_files mention ladin / italian) - confirm the value.
target_lang = ''

In [None]:
# Known preambles that get_json_files removes from the start of a response
# text before parsing it as JSON.
prefixes = [
    "Here are the translations of the 15 entries in the JSON format with the 'Italian' fields filled in:",
    "Here are the translations:",
]

In [None]:
def _strip_llm_wrapping(text, known_prefixes):
    """Return ``text`` without a known preamble or a surrounding Markdown code fence."""
    cleaned = text.strip()

    # Remove the preamble first so that a fence following it is still handled.
    for known in known_prefixes:
        if cleaned.startswith(known):
            cleaned = cleaned[len(known):].strip()

    # Remove fences as whole tokens. str.strip('```json\n') would instead eat
    # any leading/trailing run of the characters `, j, s, o, n and newline.
    for opening in ('```json', '```'):
        if cleaned.startswith(opening):
            cleaned = cleaned[len(opening):]
            break
    cleaned = cleaned.strip()
    if cleaned.endswith('```'):
        cleaned = cleaned[:-3]

    return cleaned.strip()


def get_json_files(llm_model, test_data):
    """Score every saved translation batch of one model against the reference data.

    Result files are expected in ``save_results/<path_f>/`` and named
    ``translation_<llm_model>_to_<target_lang>_ita2lad_size of_<batch_size>_batch_<i>.json``
    with ``i`` counting up from 0. Batch ``i`` is compared with rows
    ``i * batch_size`` to ``(i + 1) * batch_size`` of ``dataset/<test_data>.csv``.

    Reads the module-level settings ``target_lang``, ``batch_size``, ``path_f``
    and ``prefixes`` and scores batches with ``eval_metrics``.

    Args:
        llm_model: model tag used in the result file names.
        test_data: name (without extension) of the reference CSV in ``dataset/``.

    Returns:
        A list with one dict per successfully scored batch, holding the keys
        ``rouge1``, ``rouge2``, ``rougeL``, ``bleu``, ``sacrebleu``, ``chrf``
        and ``chrf++``. Batches whose response is missing, empty, not valid
        JSON, not a JSON object, or whose number of translations differs from
        the number of reference rows are reported and left out.

    Raises:
        FileNotFoundError: if the result directory, the reference CSV or the
            result file for some batch index is missing.
        KeyError: if ``target_lang`` is not a column of the reference CSV.
    """
    file_prefix = (f'translation_{llm_model}_to_{target_lang}_ita2lad_size of_{batch_size}_batch_')

    # The number of files carrying the prefix determines how many batches are read.
    save_dir = 'save_results'
    result_dir = os.path.join(save_dir, path_f)
    matching_files = [f for f in os.listdir(result_dir) if f.startswith(file_prefix)]
    num_files = len(matching_files)
    print(f"Found {num_files} files.")

    all_scores = []

    # Ground truth for the whole test set.
    # ----- Please Modify -----
    ref_data = pd.read_csv(f'dataset/{test_data}.csv')

    for i in range(num_files):
        # Reference rows belonging to the current batch.
        batch_start = i * batch_size
        print(f"Processing batch {i+1}, starting at index {batch_start}")
        real_data = ref_data.iloc[batch_start:batch_start + batch_size][target_lang].tolist()

        # Read the saved API response; the context manager closes the file.
        file_loc = os.path.join(result_dir, f'{file_prefix}{i}.json')
        print("load the json file", file_loc)
        with open(file_loc, encoding='utf8') as f:
            data = json.load(f)

        # Some responses were saved as a JSON-encoded string; decode once more.
        if isinstance(data, str):
            data = json.loads(data)

        if not ("choices" in data and data["choices"]):
            print("No choices found in the response.")
            continue

        # `or ""` also covers an explicit null content field.
        translation_output = data["choices"][0].get("message", {}).get("content", "") or ""
        if not translation_output.strip():
            print(f"Empty translation output in file: {file_loc}")
            continue

        translation_output = _strip_llm_wrapping(translation_output, prefixes)
        try:
            parsed_output = json.loads(translation_output)
        except json.JSONDecodeError:
            print(f"Error parsing JSON content in file: {file_loc}")
            print("length of characters is", len(translation_output))
            print(f"Raw content causing the error:\n{translation_output}")
            continue

        if not isinstance(parsed_output, dict):
            print(f"Unexpected JSON structure in file: {file_loc}")
            continue

        # Collect the target-language side of every translation pair.
        prediction = [
            translation_pair.get(target_lang)
            for translation_pair in parsed_output.get('translations', [])
        ]

        # Only record a batch that was actually scored; otherwise the scores
        # of the previous batch would be counted a second time.
        if len(prediction) != len(real_data):
            print('the sentences do not match')
            continue

        scores = eval_metrics(prediction, real_data)
        all_scores.append(
            {'rouge1': scores['rouge1'],
             'rouge2': scores['rouge2'],
             'rougeL': scores['rougeL'],
             'bleu': scores['bleu'],
             'sacrebleu': scores['sacrebleu'],
             'chrf': scores['chrf'],
             'chrf++': scores['chrf++']}
        )

    return all_scores


In [None]:
# Score the saved translation batches; get_json_files returns a list of
# per-batch metric dicts.
translation_result=get_json_files(llm_model, test_data)
# Number of entries returned
print(len(translation_result))

### Average the evaluation metrics over all batches

In [None]:
# Keep an entry only if at least one of its values is something other than an
# empty list, i.e. drop entries made up entirely of empty lists.
filtered_data = [
    batch_scores
    for batch_scores in translation_result
    if any(not isinstance(value, list) or len(value) != 0 for value in batch_scores.values())
]
# One row per kept batch, one column per metric
fr = pd.DataFrame(filtered_data)
print(len(filtered_data))
# Column-wise mean, i.e. the average of the per-batch scores
mean_scores = fr.mean()
print(mean_scores)
# Echo the run settings as the cell output
llm_model, test_data, path_f