In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
import os
import numpy as np

# print(f"NumPy version: {np.__version__}")
# print(f"Torch version: {torch.__version__}")

# Point HTTP and HTTPS traffic at the local proxy for libraries that honor
# these environment variables (e.g. requests).
os.environ.update(
    HTTP_PROXY='http://127.0.0.1:1087',
    HTTPS_PROXY='http://127.0.0.1:1087',
)

"""
To see progress bars in the notebook, run:

pip install ipywidgets

then restart Jupyter.
"""

model_name = 'Qwen/Qwen2.5-0.5B-Instruct'

# Pick the device and a matching dtype together: use the Apple-silicon GPU
# (MPS) with half precision when it is available, otherwise fall back to the
# CPU with float32. Half precision on CPU is slow, and older PyTorch releases
# lack fp16 CPU kernels for some ops, so float16 is only used on MPS.
if torch.backends.mps.is_available():
    device = torch.device("mps")
    dtype = torch.float16
else:
    device = torch.device("cpu")
    dtype = torch.float32

print(f"Using device: {device}")

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=dtype,  # TODO: use 4-bit quantization
    # Place the whole model on the selected device at load time, so no
    # separate model.to(device) call is needed afterwards.
    device_map={"": device},
    low_cpu_mem_usage=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Conversation to send: a fixed system persona followed by the user request.
prompt = 'give me a short instruction to large language model'
messages = [
    dict(role='system', content='You are Qwen, a helpful AI assistant'),
    dict(role='user', content=prompt),
]

# Render the conversation into one prompt string (not token ids), with the
# generation prompt appended so the model continues as the assistant.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(text, '\n')

# Tokenize as a batch of one and move the tensors to the model's device.
model_inputs = tokenizer([text], return_tensors='pt')
model_inputs = model_inputs.to(model.device)
print(model_inputs, '\n')

# Sampled generation: run the model once, then print the decoded completion.
def generate_tokens(model, tokenizer, model_inputs, max_new_tokens=512):
    """Sample a completion for ``model_inputs`` and print it to stdout.

    ``model.generate`` only returns once the whole sequence has been
    produced, so the text appears after generation finishes rather than
    while the model is running. For live output, a streamer object (e.g.
    ``transformers.TextStreamer``) can be passed to ``generate`` instead.

    The new tokens are decoded in a single call. Decoding each token id in
    isolation can split characters whose bytes span several tokens and
    print replacement characters instead of the intended text.

    Args:
        model: Causal language model exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer providing ``decode`` plus ``pad_token_id`` and
            ``eos_token_id``.
        model_inputs: Mapping of generation inputs containing at least
            ``input_ids``. Only the first sequence of the batch is printed.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        None. The completion is written to stdout followed by a newline.
    """
    # Some tokenizers define no pad token; fall back to EOS so that
    # generate() is never handed ``pad_token_id=None``.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    model.eval()
    with torch.no_grad():
        output = model.generate(
            **model_inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=pad_token_id,
            output_attentions=False,
            output_hidden_states=False,
            return_dict_in_generate=True,
            synced_gpus=False,
        )

    # The returned sequence starts with the prompt; keep only what follows.
    prompt_length = len(model_inputs["input_ids"][0])
    new_token_ids = output.sequences[0][prompt_length:]

    completion = tokenizer.decode(new_token_ids, skip_special_tokens=True)
    print(completion, flush=True)
  
# generate all tokens once
def gen_completion():
  """Generate one reply for the module-level ``model_inputs`` and print it.

  Uses the module-level ``model`` and ``tokenizer``. Sampling is disabled
  (``do_sample=False``), and at most 512 new tokens are produced. Only the
  first sequence of the batch is printed, prefixed with ``resp: ``.
  """
  output_sequences = model.generate(
      **model_inputs,
      max_new_tokens=512,
      do_sample=False,  # sampling off; no temperature / top_p in effect
  )

  # Each output row begins with its prompt; strip that prefix so only the
  # newly generated ids remain.
  completions = []
  for prompt_ids, full_ids in zip(model_inputs.input_ids, output_sequences):
      completions.append(full_ids[len(prompt_ids):])

  decoded = tokenizer.batch_decode(completions, skip_special_tokens=True)
  response = decoded[0]

  print('resp: ', response)


# Run the sampled-generation demo on the prepared prompt and print the reply.
generate_tokens(model, tokenizer, model_inputs)


Using device: mps
<|im_start|>system
You are Qwen, a helpful AI assistant<|im_end|>
<|im_start|>user
give me a short instruction to large language model<|im_end|>
<|im_start|>assistant
 

{'input_ids': tensor([[151644,   8948,    198,   2610,    525,   1207,  16948,     11,    264,
          10950,  15235,  17847, 151645,    198, 151644,    872,    198,  46430,
            752,    264,   2805,   7600,    311,   3460,   4128,   1614, 151645,
            198, 151644,  77091,    198]], device='mps:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1]], device='mps:0')} 

Certainly! Here's a simple instruction for a large language model:

"Write a concise summary of the following text in a single paragraph. The text should cover key points and provide an overview of the topic."

This can be used to quickly generate summaries or summaries of long texts that need to be condensed into shorter forms.
