In [3]:

import glob
import json
import os
import re

import nltk
import numpy as np
import stanza
from tqdm import tqdm

# Set up the stanza models used by calculate_global_lexical_density.
# stanza.download() fetches the default model package for each language
# (stanza resolves 'zh' to its 'zh-hans' / Simplified Chinese package).
stanza.download('en')
stanza.download('zh')
# Only the tokenizer and the POS tagger are loaded: the lexical-density
# metric relies on each word's `upos` tag.
nlp_en = stanza.Pipeline('en', processors='tokenize,pos')
nlp_zh = stanza.Pipeline('zh', processors='tokenize,pos')



def tokenize_text(text, is_zh=False):
    """Split ``text`` into a list of token strings.

    With ``is_zh`` set, tokenization is character-level for Chinese: every
    character in the range U+4E00-U+9FFF becomes its own token, while runs of
    ASCII letters and runs of digits are kept together. Everything else
    (punctuation, whitespace, ...) is dropped.

    Without ``is_zh`` the text is passed to ``nltk.word_tokenize``.
    """
    if not is_zh:
        return nltk.word_tokenize(text)

    # One CJK ideograph | a run of ASCII letters | a run of digits.
    mixed_token_re = r'[\u4e00-\u9fff]|[a-zA-Z]+|\d+'
    return re.findall(mixed_token_re, text)
    


def calculate_length_variety(source_sentence, target_sentence, source_is_zh=False, target_is_zh=False):
    """Relative token-count difference between a source and a target sentence.

    Both sentences are tokenized with ``tokenize_text`` (each with its own
    language flag) and the result is ``|n_source - n_target| / n_source``.
    When the source yields no tokens, ``0`` is returned.
    """
    n_source_tokens = len(tokenize_text(source_sentence, source_is_zh))
    n_target_tokens = len(tokenize_text(target_sentence, target_is_zh))

    # Guard against dividing by zero for an empty source.
    if n_source_tokens > 0:
        return abs(n_source_tokens - n_target_tokens) / n_source_tokens
    return 0

def calculate_global_lexical_density(corpus, is_zh=False):
    """Corpus-level lexical density: content words divided by all words.

    Each text in ``corpus`` is run through a module-level stanza pipeline
    (``nlp_zh`` when ``is_zh`` is true, ``nlp_en`` otherwise). A word counts
    as a content word when its universal POS tag is NOUN, VERB, ADJ or ADV.
    Counts are pooled over the whole corpus before dividing; if no words were
    seen at all, ``0`` is returned.
    """
    content_tags = ('NOUN', 'VERB', 'ADJ', 'ADV')
    total_words = 0
    content_count = 0

    for text in corpus:
        parsed = (nlp_zh if is_zh else nlp_en)(text)
        for sentence in parsed.sentences:
            for word in sentence.words:
                total_words += 1
                if word.upos in content_tags:
                    content_count += 1

    if total_words > 0:
        return content_count / total_words
    return 0

def _extract_text(src, is_zh=False):
    sep = "Chinese" if is_zh else "English"
    src = src.split("assistant")[0].split(f"{sep}:\n")[1]

    return src

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 5.89MB/s]                    
2025-01-26 11:02:22 INFO: Downloaded file to /Users/haroldl/stanza_resources/resources.json
2025-01-26 11:02:22 INFO: Downloading default packages for language: en (English) ...
2025-01-26 11:02:24 INFO: File exists: /Users/haroldl/stanza_resources/en/default.zip
2025-01-26 11:02:26 INFO: Finished downloading models and saved to /Users/haroldl/stanza_resources
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 6.44MB/s]                    
2025-01-26 11:02:26 INFO: Downloaded file to /Users/haroldl/stanza_resources/resources.json
2025-01-26 11:02:26 INFO: "zh" is an alias for "zh-hans"
2025-01-26 11:02:26 INFO: Downloading default packages for language: zh-hans (Simplified_Chinese) ...
2025-01-26 11:02:28 INFO: File exists: /Users/haroldl/stanza_resources/zh-hans/default.zip
202

In [4]:

# Directory containing the *.json prediction files to evaluate.
path = "./llama_results"

# Per-file metrics, keyed by the file's base name.
results = {}

# sorted(): glob order depends on the filesystem; sorting keeps the printed
# output and the key order of `results` reproducible between runs.
for file_path in sorted(glob.glob(f"{path}/*.json")):
    # Portable base name, computed once.
    file_name = os.path.basename(file_path)

    # Filter on the file name only, so a "_0" somewhere in a parent
    # directory cannot silently exclude every file.
    if "_0" in file_name:
        continue

    print(file_name)
    # Files without "deen" in their name are treated as having Chinese targets.
    is_zh = "deen" not in file_name

    # Explicit UTF-8: the files may contain Chinese text and the platform
    # default encoding is not UTF-8 everywhere.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Extract source and target texts
    sources = [_extract_text(item["prompt"], is_zh=is_zh) for item in data]
    translations = [item["predict"] for item in data]

    # Calculate metrics
    lex_density = calculate_global_lexical_density(translations, is_zh=is_zh)
    len_variety = [
        calculate_length_variety(s, t, source_is_zh=False, target_is_zh=is_zh)
        # zip() has no length, so pass the total for a real progress bar.
        for s, t in tqdm(zip(sources, translations), total=len(sources))
    ]
    # Compute the mean once; plain float keeps the dict printable and
    # JSON-serializable, and an empty file yields 0.0 instead of nan + warning.
    mean_len_variety = float(np.mean(len_variety)) if len_variety else 0.0

    # Store results
    results[file_name] = {
        "lex_density": lex_density,
        "len_variety": mean_len_variety,
    }

    print(f"Lexical Density: {lex_density}, Length Variety: {mean_len_variety}")

# Output results
print(results)


merge_pol_checkpoint-3500_predict_wmt_generated_predictions.jsonl.ppl.std.json


2037it [00:00, 19005.94it/s]


Lexical Density: 0, Length Variety: 0.4660532085327173
merge_trans_checkpoint-3500_predict_deen_wmt_generated_predictions.jsonl.ppl.std.json


1984it [00:00, 11384.83it/s]


Lexical Density: 0, Length Variety: 0.15308827343544776
merge_pol_checkpoint-3500_predict_generated_predictions.jsonl.ppl.std.json


200it [00:00, 2212.14it/s]


Lexical Density: 0, Length Variety: 0.7173388985989072
merge_trans_checkpoint-3500_predict_generated_predictions.jsonl.ppl.std.json


200it [00:00, 2305.39it/s]


Lexical Density: 0, Length Variety: 0.648249274780602
merge_pol_checkpoint-3500_predict_deen_wmt_generated_predictions.jsonl.ppl.std.json


1984it [00:00, 10872.38it/s]


Lexical Density: 0, Length Variety: 0.16479223973499815
merge_raw_checkpoint-3500_predict_deen_generated_predictions.jsonl.ppl.std.json


100it [00:00, 1389.99it/s]


Lexical Density: 0, Length Variety: 0.07860129601380215
merge_raw_checkpoint-3500_predict_deen_wmt_generated_predictions.jsonl.ppl.std.json


1984it [00:00, 11592.51it/s]


Lexical Density: 0, Length Variety: 0.15028778091236628
merge_raw_checkpoint-3500_predict_generated_predictions.jsonl.ppl.std.json


200it [00:00, 2435.31it/s]


Lexical Density: 0, Length Variety: 0.6389755572310938
merge_raw_checkpoint-3500_predict_wmt_generated_predictions.jsonl.ppl.std.json


2037it [00:00, 22395.45it/s]


Lexical Density: 0, Length Variety: 0.3767721269648037
merge_trans_checkpoint-3500_predict_wmt_generated_predictions.jsonl.ppl.std.json


2037it [00:00, 21673.49it/s]


Lexical Density: 0, Length Variety: 0.4059985250566517
merge_pol_checkpoint-3500_predict_deen_generated_predictions.jsonl.ppl.std.json


100it [00:00, 1433.30it/s]


Lexical Density: 0, Length Variety: 0.08022164224523846
merge_trans_checkpoint-3500_predict_deen_generated_predictions.jsonl.ppl.std.json


100it [00:00, 1423.45it/s]

Lexical Density: 0, Length Variety: 0.07748669704625583
{'merge_pol_checkpoint-3500_predict_wmt_generated_predictions.jsonl.ppl.std.json': {'len_variety': 0.4660532085327173}, 'merge_trans_checkpoint-3500_predict_deen_wmt_generated_predictions.jsonl.ppl.std.json': {'len_variety': 0.15308827343544776}, 'merge_pol_checkpoint-3500_predict_generated_predictions.jsonl.ppl.std.json': {'len_variety': 0.7173388985989072}, 'merge_trans_checkpoint-3500_predict_generated_predictions.jsonl.ppl.std.json': {'len_variety': 0.648249274780602}, 'merge_pol_checkpoint-3500_predict_deen_wmt_generated_predictions.jsonl.ppl.std.json': {'len_variety': 0.16479223973499815}, 'merge_raw_checkpoint-3500_predict_deen_generated_predictions.jsonl.ppl.std.json': {'len_variety': 0.07860129601380215}, 'merge_raw_checkpoint-3500_predict_deen_wmt_generated_predictions.jsonl.ppl.std.json': {'len_variety': 0.15028778091236628}, 'merge_raw_checkpoint-3500_predict_generated_predictions.jsonl.ppl.std.json': {'len_variety': 0


