## Instalación e Importación de Librerías

In [1]:
!pip install -q transformers

In [2]:
from google.colab import userdata
from huggingface_hub import login
from transformers import AutoTokenizer

## Login con HuggingFace

In [3]:
# Fetch the Hugging Face access token stored under the 'HF_TOKEN' key via
# google.colab's userdata helper, then authenticate this session with it.
hf_token = userdata.get('HF_TOKEN')

# add_to_git_credential=True additionally asks huggingface_hub to register
# the token with the git credential helper.
login(token=hf_token, add_to_git_credential=True)

## Definición de Modelos y Tokenizers

In [6]:
# Hugging Face Hub repository ids of the three models whose tokenizers are
# loaded and compared further down in the notebook.
PHI3_MODEL_NAME, QWEN2_MODEL_NAME, STARCODER2_MODEL_NAME = (
    "microsoft/Phi-3-mini-4k-instruct",  # Phi-3 mini, instruct variant
    "Qwen/Qwen2-7B-Instruct",            # Qwen2 7B, instruct variant
    "bigcode/starcoder2-3b",             # StarCoder2 3B (used on the code sample)
)

In [7]:
# Load one tokenizer per model id, in the same order as before
# (Phi-3, then Qwen2, then StarCoder2). The generator is consumed by the
# tuple unpacking, so each from_pretrained call runs exactly once.
phi3_tokenizer, qwen2_tokenizer, starcoder2_tokenizer = (
    AutoTokenizer.from_pretrained(model_name)
    for model_name in (PHI3_MODEL_NAME, QWEN2_MODEL_NAME, STARCODER2_MODEL_NAME)
)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

## Texto para las Pruebas

In [15]:
# Plain natural-language sentence used to compare token counts across models.
text = "Probando a generar tokens a partir de texto y verificar el numero de tokens generados por modelo"

# Small source-code snippet; the leading and trailing newlines are part of
# the string and therefore part of what gets tokenized.
code = """
def helloWorld(user: str):
    print(f"Hello, {user}!")
"""

# Two-turn conversation (system + user) in the role/content format that
# apply_chat_template consumes later in the notebook.
messages = [
    dict(role="system", content="Eres un asistente muy polivalente"),
    dict(role="user", content="Cuenta un chiste divertido para una sala llena de amantes de los perros."),
]

## Tokens con los Modelos

In [None]:
# Encode the sample sentence into token ids with the Phi-3 tokenizer.
phi3_tokens = phi3_tokenizer.encode(text)

# Show, one per line: the raw ids, a divider, the per-id decoded strings
# (batch_decode is given the flat id list), a divider, and the token count.
print(
    phi3_tokens,
    "-"*50,
    phi3_tokenizer.batch_decode(phi3_tokens),
    "-"*50,
    len(phi3_tokens),
    sep="\n",
)

In [11]:
# Encode the sample sentence into token ids with the Qwen2 tokenizer.
qwen2_tokens = qwen2_tokenizer.encode(text)

# Show, one per line: the raw ids, a divider, the text piece each id decodes
# to (batch_decode on the flat id list yields one string per id), a divider,
# and the total number of tokens.
print(
    qwen2_tokens,
    "-"*50,
    qwen2_tokenizer.batch_decode(qwen2_tokens),
    "-"*50,
    len(qwen2_tokens),
    sep="\n",
)

[36980, 4883, 264, 94979, 11211, 264, 30532, 409, 32025, 379, 91819, 655, 20050, 409, 11211, 1766, 5553, 4154, 35942]
--------------------------------------------------
['Prob', 'ando', ' a', ' generar', ' tokens', ' a', ' partir', ' de', ' texto', ' y', ' verificar', ' el', ' numero', ' de', ' tokens', ' gener', 'ados', ' por', ' modelo']
--------------------------------------------------
19


In [14]:
# Encode the code snippet (not the prose sentence) with the StarCoder2 tokenizer.
starcoder2_tokens = starcoder2_tokenizer.encode(code)

# Show, one per line: the raw ids, a divider, the text piece each id decodes
# to (batch_decode on the flat id list yields one string per id), a divider,
# and the total number of tokens.
print(
    starcoder2_tokens,
    "-"*50,
    starcoder2_tokenizer.batch_decode(starcoder2_tokens),
    "-"*50,
    len(starcoder2_tokens),
    sep="\n",
)

[222, 610, 17966, 6719, 45, 514, 63, 615, 731, 303, 1489, 45, 107, 39, 8302, 49, 320, 514, 130, 16013, 222]
--------------------------------------------------
['\n', 'def', ' hello', 'World', '(', 'user', ':', ' str', '):', '\n   ', ' print', '(', 'f', '"', 'Hello', ',', ' {', 'user', '}', '!")', '\n']
--------------------------------------------------
21


## Tokens en los Prompts de los Modelos Qwen2 y Phi3

In [16]:
# Render the same conversation with each model's chat template.
# tokenize=False keeps the result as text instead of token ids, and
# add_generation_prompt=True appends the opening of the assistant turn.
phi3_chat_template = phi3_tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
qwen2_chat_template = qwen2_tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

In [17]:
# Print both rendered prompts, separated by a 50-dash divider line.
print(
    phi3_chat_template,
    "-"*50,
    qwen2_chat_template,
    sep="\n",
)

<|system|>
Eres un asistente muy polivalente<|end|>
<|user|>
Cuenta un chiste divertido para una sala llena de amantes de los perros.<|end|>
<|assistant|>

--------------------------------------------------
<|im_start|>system
Eres un asistente muy polivalente<|im_end|>
<|im_start|>user
Cuenta un chiste divertido para una sala llena de amantes de los perros.<|im_end|>
<|im_start|>assistant

