In [1]:
import numpy as np
from tqdm.auto import tqdm
from transformers import AutoTokenizer

In [22]:
import time
from functools import wraps
def time_it(func):
    """Decorator that prints how long each call to ``func`` takes.

    After the wrapped call returns, the elapsed time is printed twice: once in
    seconds and once in milliseconds. The wrapped function's return value is
    passed through unchanged. If ``func`` raises, nothing is printed and the
    exception propagates to the caller.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that accepts the same arguments as ``func`` and carries its
        metadata (name, docstring) via ``functools.wraps``.
    """
    @wraps(func)
    def wrapper(*args, **kw):
        # perf_counter() is a monotonic, high-resolution clock meant for
        # measuring intervals; time.time() follows the wall clock and can
        # jump (even backwards) when the system time is adjusted.
        start_time = time.perf_counter()
        result = func(*args, **kw)
        elapsed = time.perf_counter() - start_time
        print("函数的运行时间(s)：{}".format(elapsed))
        print("函数的运行时间(ms)：{}".format(float(elapsed) * 1000.0))
        return result
    return wrapper

In [27]:

def get_tok_lens(model_name, raw_data, auth_token=None):
    """Tokenize every text with a model's tokenizer and report the mean length.

    Loads the tokenizer for ``model_name``, tokenizes each element of
    ``raw_data`` (with a tqdm progress bar), prints the tokenization time and
    the mean number of tokens per text, and returns both.

    Args:
        model_name: Model id passed to ``AutoTokenizer.from_pretrained``.
        raw_data: Iterable of strings to tokenize.
        auth_token: Optional Hugging Face Hub access token. When omitted it is
            read from the ``HF_TOKEN`` (or ``HUGGING_FACE_HUB_TOKEN``)
            environment variable; if neither is set, no token is passed and
            the library's own credential lookup applies.

    Returns:
        Tuple ``(model_name, tok_lens, run_time)``: the model id, the mean
        token count per text (NaN when ``raw_data`` is empty), and the seconds
        spent tokenizing. Loading the tokenizer is not part of the timing.
    """
    import os  # local import so this cell does not depend on another cell's imports

    # SECURITY: an access token used to be hard-coded on this line. Secrets
    # must not live in the notebook; the previously committed token should be
    # revoked and a fresh one supplied via the environment or `auth_token`.
    if auth_token is None:
        auth_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")

    # NOTE(review): trust_remote_code=True lets the model repository run its
    # own Python code while loading; only use it with repositories you trust.
    load_kwargs = {"trust_remote_code": True}
    if auth_token:
        # Same keyword the notebook originally used; only sent when a token exists.
        load_kwargs["use_auth_token"] = auth_token
    tokenizer = AutoTokenizer.from_pretrained(model_name, **load_kwargs)

    # Keep the lengths in their own name instead of rebinding `tokenizer`.
    start_time = time.perf_counter()
    token_counts = [len(tokenizer.tokenize(x)) for x in tqdm(raw_data)]
    run_time = time.perf_counter() - start_time
    print("函数的运行时间(s)：{}".format(run_time))

    # np.mean([]) emits a RuntimeWarning; return NaN directly for empty input.
    tok_lens = np.mean(token_counts) if token_counts else np.float64("nan")
    # The label below reads "average character count" although the value is the
    # mean token count, and "{:2f}" is width 2 with default precision (not
    # ".2f"). Both are kept verbatim so re-runs match the recorded outputs.
    print("{} 平均字符数: {:2f}".format(model_name, tok_lens))
    return model_name, tok_lens, run_time

In [38]:
def print_col(lst):
    """Print each number in ``lst`` on its own line with two decimal places.

    Args:
        lst: Iterable of numbers accepted by the ``.2f`` format spec.
    """
    # map() is lazy, so each value is formatted right before it is printed.
    for rendered in map("{:.2f}".format, lst):
        print(rendered)

In [30]:
# Read the corpus, one sample per line. The context manager closes the file
# handle (a bare open() left it open), and the explicit encoding avoids relying
# on the platform default — assumes the file is UTF-8; TODO confirm.
with open("sents_17w.txt", encoding="utf-8") as corpus_file:
    raw_data = corpus_file.readlines()
# readlines() keeps line endings, so each length includes the trailing newline.
chars = [len(x) for x in raw_data]
print("平均字数: {:2f}, 样本数量: {:_}".format(np.mean(chars), len(chars)))

平均字数: 59.381887, 样本数量: 179_608


In [31]:
# Model ids whose tokenizers are compared, in the order they are evaluated.
model_list = [
    "Qwen/Qwen-14B",
    "bigscience/bloom",
    "meta-llama/Llama-2-70b-hf",
    "baichuan-inc/Baichuan-7B",
    "tiiuae/falcon-180B",
    "internlm/internlm-20b",
    "FlagAlpha/Llama2-Chinese-13b-Chat",
    "IDEA-CCNL/Ziya-LLaMA-13B-v1",
]

In [32]:
# Run the tokenizer benchmark for every model and keep the two metrics in
# parallel lists that follow model_list order.
tok_lens_lst = []
run_times = []
for name in model_list:
    stats = get_tok_lens(name, raw_data)
    # stats is (model_name, mean token count, tokenization seconds)
    tok_lens_lst.append(stats[1])
    run_times.append(stats[2])



  0%|          | 0/179608 [00:00<?, ?it/s]

函数的运行时间(s)：14.164180040359497
Qwen/Qwen-14B 平均字符数: 41.459946


  0%|          | 0/179608 [00:00<?, ?it/s]

函数的运行时间(s)：33.18517804145813
bigscience/bloom 平均字符数: 36.955403


  0%|          | 0/179608 [00:00<?, ?it/s]

函数的运行时间(s)：22.894792079925537
meta-llama/Llama-2-70b-hf 平均字符数: 83.610479


  0%|          | 0/179608 [00:00<?, ?it/s]

函数的运行时间(s)：17.05532193183899
baichuan-inc/Baichuan-7B 平均字符数: 46.343676


  0%|          | 0/179608 [00:00<?, ?it/s]

函数的运行时间(s)：48.0962598323822
tiiuae/falcon-180B 平均字符数: 72.893217


  0%|          | 0/179608 [00:00<?, ?it/s]

函数的运行时间(s)：18.917821168899536
internlm/internlm-20b 平均字符数: 39.653200


Downloading (…)okenizer_config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

  0%|          | 0/179608 [00:00<?, ?it/s]

函数的运行时间(s)：22.449259042739868
FlagAlpha/Llama2-Chinese-13b-Chat 平均字符数: 83.610479


Downloading (…)okenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/589k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.99M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/476 [00:00<?, ?B/s]

  0%|          | 0/179608 [00:00<?, ?it/s]

函数的运行时间(s)：19.310914754867554
IDEA-CCNL/Ziya-LLaMA-13B-v1 平均字符数: 58.598108


In [39]:
# Mean token count per model, one value per line, in model_list order.
print_col(tok_lens_lst)

41.46
36.96
83.61
46.34
72.89
39.65
83.61
58.60


In [40]:
# Tokenization time in seconds per model, one value per line, in model_list order.
print_col(run_times)

14.16
33.19
22.89
17.06
48.10
18.92
22.45
19.31
