In [11]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import os
import numpy as np

# print(f"NumPy version: {np.__version__}")
# print(f"Torch version: {torch.__version__}")

# Send HTTP(S) traffic from the requests library through the local proxy.
os.environ.update(
    HTTP_PROXY='http://127.0.0.1:1087',
    HTTPS_PROXY='http://127.0.0.1:1087',
)

# Notebook tip: run `pip install ipywidgets`, then restart Jupyter, and a
# progress bar will be shown.

# Hugging Face model id used for both the model and the tokenizer below.
model_name = 'Qwen/Qwen2.5-0.5B-Instruct'

def get_device():
    """Choose the torch device to run on.

    Returns the ``mps`` device only when the MPS backend reports itself as
    available and the macOS major version is 14 or newer. In every other
    case the ``cpu`` device is returned; when MPS is available but macOS is
    older than 14, a notice is printed first.
    """
    if not torch.backends.mps.is_available():
        return torch.device("cpu")

    # Only needed on the MPS path, so imported lazily.
    import platform

    release = platform.mac_ver()[0]
    major = int(release.split('.')[0])
    if major >= 14:
        return torch.device("mps")

    print("MPS limited on macOS < 14.0. Falling back to CPU.")
    return torch.device("cpu")

# Disabled alternative: set a global default device instead of choosing one.
# torch.set_default_device('cpu')  # or 'mps' for Mac M1/M2

# Choose the device once; it is passed to the model loader below.
device=get_device()
print(f"Using device: {device}")

# Load the causal language model named by `model_name`.
# NOTE(review): float16 is requested even when `device` resolves to CPU
# (as in the recorded run) — confirm this is intended.
# NOTE(review): trust_remote_code=True presumably allows code shipped with the
# model repository to run locally — confirm it is still required for this model.
model=AutoModelForCausalLM.from_pretrained(
  model_name,
  torch_dtype=torch.float16, # todo: use 4bit quant
  # device_map='auto',
  # The single "" key presumably maps the whole model to `device` — TODO confirm.
  device_map={"": device},
  trust_remote_code=True,
  low_cpu_mem_usage=True
)

# Tokenizer matching the model above (same model id).
tokenizer=AutoTokenizer.from_pretrained(model_name)

# Chat-style conversation: one system message plus the user's prompt.
prompt='give me a short instruction to large language model'
messages=[
  {'role': 'system', 'content': 'You are Qwen, a helpful AI assistant'},
  {'role': 'user', 'content': prompt}
]

# Render the conversation to a single prompt string (tokenize=False keeps it
# as text). With add_generation_prompt=True the rendered text ends with the
# opening of an assistant turn, as the printed output shows.
text=tokenizer.apply_chat_template(
  messages,
  tokenize=False,
  add_generation_prompt=True
)

print(text, '\n')

# Tokenize the rendered prompt as a batch of one, request PyTorch tensors, and
# move them to the device the model lives on. The printed result contains
# `input_ids` and `attention_mask`.
model_inputs=tokenizer([text], return_tensors='pt').to(model.device)
print(model_inputs, '\n')

# token by token generation
def generate_tokens(model, tokenizer, model_inputs, max_new_tokens=512):
    """Generate a sampled reply and print it to stdout while it is produced.

    The previous implementation ran ``generate()`` to completion and only
    then printed the result, decoding every token id in isolation. That is
    not streaming, and it garbles characters whose bytes are spread over
    several tokens. A ``TextStreamer`` is used instead: it prints text as
    ``generate()`` emits tokens and decodes them together.

    Args:
        model: Causal LM exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer used to decode the generated ids; its
            ``pad_token_id`` (or, if unset, ``eos_token_id``) is passed to
            ``generate()``.
        model_inputs: Mapping of keyword inputs for ``generate()`` as
            produced by the tokenizer; values are assumed to be tensors
            whose first dimension is the batch. Only the first sequence is
            generated and printed, as before.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        None. The generated text, followed by a newline, goes to stdout.
    """
    # Function-scope import: transformers is already a dependency of this
    # file, but TextStreamer is not part of the file-level import list.
    from transformers import TextStreamer

    # Fall back to EOS when the tokenizer defines no pad token, rather than
    # handing None to generate().
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # skip_prompt drops the echoed input; skip_special_tokens is forwarded
    # to the tokenizer's decode call.
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # The streamer handles a single sequence, and the original code printed
    # only sequence 0, so restrict every input to the first batch row.
    first_sequence = {name: value[:1] for name, value in model_inputs.items()}

    model.eval()
    with torch.no_grad():
        model.generate(
            **first_sequence,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=pad_token_id,
            output_attentions=False,
            output_hidden_states=False,
            synced_gpus=False,
            streamer=streamer,
        )
  
# generate all tokens once
def gen_completion():
  """Generate the whole reply in a single call and print it.

  Reads the module-level ``model``, ``tokenizer`` and ``model_inputs``.
  Sampling is disabled (``do_sample=False``). Nothing is returned; the
  decoded reply is printed with a ``resp:`` prefix.
  """
  full_sequences = model.generate(**model_inputs, max_new_tokens=512, do_sample=False)

  # Each output row starts with its prompt ids; keep only what follows them.
  continuations = []
  for prompt_ids, sequence in zip(model_inputs.input_ids, full_sequences):
    continuations.append(sequence[len(prompt_ids):])

  # Decode the batch and print the reply for the first (only) prompt.
  decoded = tokenizer.batch_decode(continuations, skip_special_tokens=True)
  print('resp: ', decoded[0])


# generate_tokens(model, tokenizer, model_inputs)

def run_pipeline():
    """Ask the model for a chicken joke via a text-generation pipeline.

    Builds the pipeline from the module-level ``model`` and ``tokenizer``,
    sends a single user message, and prints the ``generated_text`` of the
    first result. Nothing is returned.
    """
    chat = [{'role': 'user', 'content': 'tell me a joke about chickens'}]

    pipe_options = {
        'model': model,
        'tokenizer': tokenizer,
        'return_full_text': False,
        'max_new_tokens': 500,
        'do_sample': False,
    }
    text_generator = pipeline('text-generation', **pipe_options)

    results = text_generator(chat)
    first_result = results[0]
    print(first_result['generated_text'])

# Cell entry point: run the pipeline-based demo, which prints the generated text.
run_pipeline()


MPS limited on macOS < 14.0. Falling back to CPU.
Using device: cpu


Device set to use cpu


<|im_start|>system
You are Qwen, a helpful AI assistant<|im_end|>
<|im_start|>user
give me a short instruction to large language model<|im_end|>
<|im_start|>assistant
 

{'input_ids': tensor([[151644,   8948,    198,   2610,    525,   1207,  16948,     11,    264,
          10950,  15235,  17847, 151645,    198, 151644,    872,    198,  46430,
            752,    264,   2805,   7600,    311,   3460,   4128,   1614, 151645,
            198, 151644,  77091,    198]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1]])} 

Sure! Here's a classic chicken joke for you:

Why did the chicken cross the road?

To get to the other side!

I hope that brings a smile to your face!
