In [1]:
# !python -m pip install pip==24.0
# !pip install -q torch==2.0.0 --extra-index-url https://download.pytorch.org/whl/cu118 --user
# !pip install -q transformers fairseq --user

# Generation

In [2]:
from omegaconf import OmegaConf
from model import HuggingFaceQwen2ForCausalLM

args = OmegaConf.create()
args.llm_config = "Qwen/Qwen2-0.5B"

model = HuggingFaceQwen2ForCausalLM(args)

2024-12-30 16:16:10 | INFO | speechgpt_logger | Loading model from Qwen/Qwen2-0.5B
2024-12-30 16:16:11 | INFO | speechgpt_logger | Initializing Qwen2Decoder with 24 layers.
2024-12-30 16:16:14 | INFO | speechgpt_logger | Qwen2Decoder initialized successfully.
2024-12-30 16:16:15 | INFO | speechgpt_logger | Model initialized successfully.
2024-12-30 16:16:15 | INFO | speechgpt_logger | Loading model weights.
2024-12-30 16:16:19 | INFO | speechgpt_logger | Loaded model weights.


In [3]:
from transformers import AutoTokenizer


texts = [
    "What is attention mask in language model?",
    "Why language models are successful?",
    "Which paper is fundamental in nowadays NLP?",
]

tokenizer = AutoTokenizer.from_pretrained(args.llm_config)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tok_outs = tokenizer(texts, padding=True, return_tensors="pt")
input_ids = tok_outs.input_ids
attention_mask = tok_outs.attention_mask

generate_ids = model.generate(input_ids, attention_mask=attention_mask,
                              max_new_tokens=150, do_sample=True, top_k=50, top_p=0.95)
gen_texts = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)

for _ in gen_texts:
    print(_)
    print("_________________________________________________")

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


What is attention mask in language model?in language model?

Attention mask is a computational device or filter that is used to reduce the dimensionality of a set of tokens in the context in which they appear in a sentence. It is often used in natural language processing (NLP) to improve the accuracy of the model in predicting future words or sentences. It is also known as an attention mask or attention mechanism. The idea is to capture the importance of the context in the passage and make the words more attentional.
_________________________________________________
Why language models are successful?
The concept of "Language Models" is quite controversial and complicated, but one possible explanation is to consider that the "true" language model, the word model, is a particular special kind of hidden Markov Model. Then the question why it is so successful is the following: why are there so many languages?

You should define what's special about language models and compare them to any 

In [4]:
model(input_ids, attention_mask)

CausalLMOutputWithPast(loss=None, logits=tensor([[[ 7.4256,  4.8533,  3.4071,  ..., -4.8739, -4.8724, -4.8735],
         [ 1.0334,  2.2287,  1.7907,  ..., -5.4180, -5.4191, -5.4189],
         [ 7.9500,  5.3180,  4.9196,  ..., -4.9389, -4.9381, -4.9378],
         ...,
         [ 6.6885,  4.1731,  2.9295,  ..., -5.6208, -5.6225, -5.6211],
         [ 4.6720,  2.8864,  2.7626,  ..., -5.7757, -5.7758, -5.7749],
         [ 6.9488,  5.8876,  6.5815,  ..., -3.2889, -3.2888, -3.2876]],

        [[ 7.8377,  3.2649,  2.3366,  ..., -5.3130, -5.3133, -5.3130],
         [ 5.4114,  5.7092,  3.6707,  ..., -5.2595, -5.2586, -5.2577],
         [ 7.5345,  7.0438,  3.4638,  ..., -4.9855, -4.9854, -4.9849],
         ...,
         [ 7.0046,  5.6227,  7.7066,  ..., -3.6718, -3.6726, -3.6713],
         [ 7.2191,  5.7511,  7.5483,  ..., -3.6273, -3.6283, -3.6269],
         [ 7.0624,  5.7042,  6.8024,  ..., -3.6658, -3.6667, -3.6655]],

        [[ 4.6012,  2.8672,  1.2418,  ..., -4.7340, -4.7324, -4.7334],
    