In [None]:
import transformers
import torch

# model_id = "meta-llama/Llama-3.1-8B"
model_id = "meta-llama/Llama-3.1-8B-Instruct"

pipeline = transformers.pipeline(
    "text-generation", 
    model=model_id, 
    model_kwargs={"torch_dtype": torch.bfloat16}, 
    device="mps"
    # device_map="auto",
)

Downloading shards: 100%|██████████| 4/4 [06:27<00:00, 96.97s/it] 
Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00,  4.34it/s]
Device set to use mps


In [2]:
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Is my cat cute?"},
]

In [47]:
import time 

start_time = time.time()
output = pipeline(text_inputs=messages, max_new_tokens=100)
end_time = time.time()

output

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[{'generated_text': [{'role': 'system',
    'content': 'You are a helpful assistant.'},
   {'role': 'user', 'content': 'Is my cat cute?'},
   {'role': 'assistant',
    'content': "I'm sure your cat is adorable in your eyes. Cats are known for their unique personalities and characteristics, which can make them quite endearing to their owners. Since I don't have a visual of your cat, I'll have to rely on your description.\n\nCan you tell me a bit more about your cat? What's their breed, color, or any distinctive features they might have? I'd be happy to help you describe just how cute they are!"}]}]

In [None]:
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
generated_text = output[0]["generated_text"]

generated_text_str = str(generated_text) 
generated_tokens = tokenizer(generated_text_str, return_tensors="pt")
num_tokens = generated_tokens["input_ids"].shape[1]

total_time = end_time - start_time
tokens_per_sec = num_tokens / total_time if total_time > 0 else 0

print("Generated text:", generated_text)
print(f"Tokens generated: {num_tokens}")
print(f"Time taken: {total_time:.2f} seconds")
print(f"Tokens per second: {tokens_per_sec:.2f} t/s")


Generated text: [{'role': 'system', 'content': 'You are a helpful assistant.'}, {'role': 'user', 'content': 'Is my cat cute?'}, {'role': 'assistant', 'content': "I'm sure your cat is adorable in your eyes. Cats are known for their unique personalities and characteristics, which can make them quite endearing to their owners. Since I don't have a visual of your cat, I'll have to rely on your description.\n\nCan you tell me a bit more about your cat? What's their breed, color, or any distinctive features they might have? I'd be happy to help you describe just how cute they are!"}]
Tokens generated: 140
Time taken: 12.15 seconds
Tokens per second: 11.52 t/s


# Flann T5

In [16]:
import transformers

model_id = "google/flan-t5-base"
model = transformers.AutoModelForSeq2SeqLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to("mps")
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

In [22]:
prompt = """Summarize the following text:\n\n
    #Person1#: Have you considered upgrading your system?
    #Person2#: Yes, but I'm not sure what exactly I would need.
    #Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
    #Person2#: That would be a definite bonus.
    #Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
    #Person2#: How can we do that?
    #Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
    #Person2#: No.
    #Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
    #Person2#: That sounds great. Thanks.
    Summary:
"""
inputs = tokenizer(prompt, return_tensors='pt')
inputs


{'input_ids': tensor([[12198,  1635,  1737,     8,   826,  1499,    10,  1713,   345, 13515,
           536,  4663,    10,  2114,    25,  1702, 21066,    39,   358,    58,
          1713,   345, 13515,   357,  4663,    10,  2163,     6,    68,    27,
            31,    51,    59,   417,   125,  1776,    27,   133,   174,     5,
          1713,   345, 13515,   536,  4663,    10,   148,   228,  1099,  2651,
             3,     9,  3924,   478,    12,    39,   889,     5,    94,   133,
           995,    25,    12,   143,    95,    39,   293,  3971,   277,    11,
         11662,     7,    21,  3662,     5,  1713,   345, 13515,   357,  4663,
            10,   466,   133,    36,     3,     9,     3, 14339,  4023,     5,
          1713,   345, 13515,   536,  4663,    10,   148,   429,    92,   241,
            12,  5941,    39,  4214,   250,    34,    19,  1134, 21643,   230,
             5,  1713,   345, 13515,   357,  4663,    10,   571,    54,    62,
           103,    24,    58,  1713,  

In [23]:
model_output = model.generate(inputs['input_ids'].to("mps"), max_length=200)
model_output

tensor([[    0,  1713,   345, 13515,   536,  4663,    10,  2114,    25,  1702,
         21066,    39,   358,    58,  1713,   345, 13515,   357,  4663,    10,
          2163,     6,    68,    27,    31,    51,    59,   417,   125,  1776,
            27,   133,   174,     5,  1713,   345, 13515,   536,  4663,    10,
           148,   228,   617,     3,     9,  3924,   478,    12,    39,   889,
             5,  1713,   345, 13515,   357,  4663,    10,   148,   228,    92,
          5941,    39,  4214,     5,  1713,   345, 13515,   536,  4663,    10,
           148,   228,    92,   617,     3,     9,  3190,    18, 13103,  1262,
             5,     1]], device='mps:0')

In [24]:
output = tokenizer.decode(model_output[0], skip_special_tokens=True)
output

"#Person1#: Have you considered upgrading your system? #Person2#: Yes, but I'm not sure what exactly I would need. #Person1#: You could add a painting program to your software. #Person2#: You could also upgrade your hardware. #Person1#: You could also add a CD-ROM drive."

# Stable Diffusion

In [12]:
import torch
from diffusers import StableDiffusion3Pipeline

pipe = StableDiffusion3Pipeline.from_pretrained("stabilityai/stable-diffusion-3.5-large", torch_dtype=torch.bfloat16)
pipe = pipe.to("mps")

image = pipe(
    "A capybara holding a sign that reads Hello World",
    num_inference_steps=28,
    guidance_scale=3.5,
).images[0]
image.save("capybara.png")

Fetching 28 files: 100%|██████████| 28/28 [07:33<00:00, 16.18s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:16<00:00,  8.01s/it]s/it]
Loading pipeline components...: 100%|██████████| 9/9 [00:21<00:00,  2.37s/it]


KeyboardInterrupt: 

# Flux.1d

In [None]:
import torch
from diffusers import FluxPipeline

print('Loading model...')
pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)
pipe = pipe.to('mps')
# pipe.enable_model_cpu_offload() #save some VRAM by offloading the model to CPU. Remove this if you have enough GPU power
print('Model loaded.')

print('Generating...')
prompt = "A cat holding a sign that says hello world"
image = pipe(
    prompt,
    height=256,
    width=256,
    guidance_scale=3.5,
    num_inference_steps=5,
    max_sequence_length=512,
    generator=torch.Generator("mps").manual_seed(0)
).images[0]
image.save("flux-dev.png")


Loading model...


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  5.95it/s]it/s]
Loading pipeline components...: 100%|██████████| 7/7 [00:01<00:00,  4.48it/s]


KeyboardInterrupt: 