In [1]:
# !python -m pip install pip==24.0
# !pip install -q torch==2.0.0 --extra-index-url https://download.pytorch.org/whl/cu118 --user
# !pip install -q transformers fairseq --user

# Generation

In [2]:
from fairseq.data import Dictionary
from transformers import AutoConfig, AutoTokenizer

model_id = "Qwen/Qwen2-0.5B"
filename = None

config = AutoConfig.from_pretrained(model_id, gguf_file=filename, force_download=False)
tokenizer = AutoTokenizer.from_pretrained(model_id, gguf_file=filename, force_download=False)
vocab = tokenizer.get_vocab()

fairseq_dictionary = Dictionary()
fairseq_dictionary.symbols = []
for token in vocab.keys():
    fairseq_dictionary.add_symbol(token)

print(f"Number of tokens in Fairseq dictionary: {len(fairseq_dictionary)}")

Number of tokens in Fairseq dictionary: 151643


In [3]:
import os
import torch
from collections import OrderedDict
from transformers import Qwen2ForCausalLM

os.makedirs("weights", exist_ok=True)
weights_path = os.path.join("weights", os.path.split(model_id)[-1])
_ = Qwen2ForCausalLM.from_pretrained(model_id, gguf_file=filename)
torch.save(_.state_dict(), weights_path)

In [4]:
from qwen2 import Qwen2ForCausalLM

state_dict = torch.load(weights_path, weights_only=True)

model = Qwen2ForCausalLM(fairseq_dictionary, config)
model.from_state_dict(state_dict)

In [5]:
texts = [
    "What is attention mask in language model?",
    "Why language models are successful?",
    "Which paper is fundamental in nowadays NLP?",
]

tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tok_outs = tokenizer(texts, padding=True, return_tensors="pt")
input_ids = tok_outs.input_ids
attention_mask = tok_outs.attention_mask

generate_ids = model.generate(input_ids, attention_mask=attention_mask,
                              max_new_tokens=150, do_sample=True, top_k=50, top_p=0.95)
gen_texts = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)

for _ in gen_texts:
    print(_)
    print("_________________________________________________")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


What is attention mask in language model?lme is an example of generative model of language. When we create it with attention mechanism for each language token, language model outputs its corresponding target tokens as well as its attention weights. 
I wonder what is attention mask and what does it mean?

Attention mask means a special mask for attention mechanism. For example, the mask in language model:
```css
[[0]   [0]   [0]  [0]   [0]   [0]   [0]  [0]   [0]
 [0]   [0]   [0]  [0]   [0]   [0]   [0]  [0]   [0]
 [0]   [0]
_________________________________________________
Why language models are successful? languages are good tools for modeling languages. The problem is how to use them to do good tasks like language generation (e.g. translation, summarization, word embeddings, etc.).
The simplest examples of generating texts from natural language are to use the generated text of an English dictionary, a random translation from a language to English or even a machine translation from one

In [6]:
model(input_ids, attention_mask)

CausalLMOutputWithPast(loss=None, logits=tensor([[[ 7.4256,  4.8533,  3.4071,  ..., -4.8739, -4.8724, -4.8735],
         [ 1.0334,  2.2287,  1.7907,  ..., -5.4180, -5.4191, -5.4189],
         [ 7.9500,  5.3180,  4.9196,  ..., -4.9389, -4.9381, -4.9378],
         ...,
         [ 6.6885,  4.1731,  2.9295,  ..., -5.6208, -5.6225, -5.6211],
         [ 4.6720,  2.8864,  2.7626,  ..., -5.7757, -5.7758, -5.7749],
         [ 6.9488,  5.8876,  6.5815,  ..., -3.2889, -3.2888, -3.2876]],

        [[ 7.8377,  3.2649,  2.3366,  ..., -5.3130, -5.3133, -5.3130],
         [ 5.4114,  5.7092,  3.6707,  ..., -5.2595, -5.2586, -5.2577],
         [ 7.5345,  7.0438,  3.4638,  ..., -4.9855, -4.9854, -4.9849],
         ...,
         [ 7.0046,  5.6227,  7.7066,  ..., -3.6718, -3.6726, -3.6713],
         [ 7.2191,  5.7511,  7.5483,  ..., -3.6273, -3.6283, -3.6269],
         [ 7.0624,  5.7042,  6.8024,  ..., -3.6658, -3.6667, -3.6655]],

        [[ 4.6012,  2.8672,  1.2418,  ..., -4.7340, -4.7324, -4.7334],
    