In [None]:

import glob
import json
import os
import re

import numpy as np
import stanza
from tqdm import tqdm

# Fetch the stanza models for both languages, then build one pipeline per
# language that runs the tokenizer and the POS tagger.
for _lang in ('en', 'zh'):
    stanza.download(_lang)
nlp_en = stanza.Pipeline('en', processors='tokenize,pos')
nlp_zh = stanza.Pipeline('zh', processors='tokenize,pos')

def tokenize_text(text, is_zh=False):
    """Split ``text`` into a flat list of word strings.

    The text is run through the module-level ``nlp_zh`` pipeline when
    ``is_zh`` is true and through ``nlp_en`` otherwise.

    Args:
        text: The string to tokenize.
        is_zh: Selects the Chinese pipeline instead of the English one.

    Returns:
        The ``text`` attribute of every word of every sentence, in
        document order.
    """
    pipeline = nlp_zh if is_zh else nlp_en
    tokens = []
    for sentence in pipeline(text).sentences:
        for word in sentence.words:
            tokens.append(word.text)
    return tokens

def calculate_length_variety(source_sentence, target_sentence, source_is_zh=False, target_is_zh=False):
    """Relative token-count difference between a source and a target sentence.

    Both sentences are tokenized with ``tokenize_text``; the ``*_is_zh``
    flags choose the pipeline used for each side.

    Args:
        source_sentence: The source-side string.
        target_sentence: The target-side string.
        source_is_zh: Tokenize the source with the Chinese pipeline.
        target_is_zh: Tokenize the target with the Chinese pipeline.

    Returns:
        ``abs(n_source - n_target) / n_source``, or ``0`` when the source
        yields no tokens.
    """
    n_source = len(tokenize_text(source_sentence, source_is_zh))
    n_target = len(tokenize_text(target_sentence, target_is_zh))
    # Guard the division: a source without tokens has no defined ratio.
    if n_source <= 0:
        return 0
    return abs(n_source - n_target) / n_source

def calculate_global_lexical_density(corpus, is_zh=False):
    """Share of content words among all words of a corpus.

    Every text in ``corpus`` is processed with ``nlp_zh`` (when ``is_zh`` is
    true) or ``nlp_en``. A word counts as a content word when its ``upos``
    tag is one of NOUN, VERB, ADJ or ADV. The counts are pooled over the
    whole corpus before dividing.

    Args:
        corpus: Iterable of strings.
        is_zh: Selects the Chinese pipeline instead of the English one.

    Returns:
        ``content_words / all_words`` over the whole corpus, or ``0`` when
        the corpus produced no words at all.
    """
    content_tags = ['NOUN', 'VERB', 'ADJ', 'ADV']
    total_count = 0
    content_count = 0
    for text in corpus:
        doc = (nlp_zh if is_zh else nlp_en)(text)
        for sentence in doc.sentences:
            for word in sentence.words:
                total_count += 1
                if word.upos in content_tags:
                    content_count += 1
    if total_count == 0:
        return 0
    return content_count / total_count


In [None]:

# Define the path to the data
path = "./llama_results"

# Initialize results dictionary: file name -> {"lex_density", "len_variety"}
results = {}

# Process files in the given path. glob returns matches in arbitrary order,
# so sort them to make the printed output and the dict order reproducible.
for file_path in sorted(glob.glob(f"{path}/*.json")):
    # Work on the file name only: the substring checks below must not be
    # triggered by the directory part of the path, and basename is portable
    # where splitting on "/" is not.
    file_name = os.path.basename(file_path)
    if "_0" in file_name:
        continue

    print(file_name)
    # Files without "deen" in their name have their translations processed
    # with the Chinese pipeline; "deen" files use the English one.
    is_zh = "deen" not in file_name

    # Read as UTF-8 explicitly; the default encoding depends on the locale
    # and can fail or garble non-ASCII text.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Extract source and target texts
    # NOTE(review): _extract_text is not defined in the cells shown here; it
    # has to be defined before this cell for a fresh-kernel run to succeed.
    sources = [_extract_text(item["prompt"], is_zh=is_zh) for item in data]
    translations = [item["predict"] for item in data]

    # Calculate metrics
    lex_density = calculate_global_lexical_density(translations, is_zh=is_zh)
    # NOTE(review): the source side is always tokenized with the English
    # pipeline (source_is_zh=False) -- confirm this is intended for every
    # language pair, in particular for the "deen" files.
    len_variety = [
        calculate_length_variety(s, t, source_is_zh=False, target_is_zh=is_zh)
        # zip has no len(), so pass the total for a real progress bar.
        for s, t in tqdm(zip(sources, translations), total=len(data))
    ]
    # Compute the mean once and reuse it for storing and printing.
    mean_len_variety = np.mean(len_variety)

    # Store results
    results[file_name] = {
        "lex_density": lex_density,
        "len_variety": mean_len_variety,
    }

    print(f"Lexical Density: {lex_density}, Length Variety: {mean_len_variety}")

# Output results
print(results)
