In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hugging Face Hub ID used for both the model and the tokenizer, kept in one
# place so the two loads below always refer to the same checkpoint.
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"

# Keyword options forwarded to the model loader.
model_load_options = dict(
    device_map="cuda",        # place the model on the GPU
    torch_dtype="auto",       # let the library choose the tensor dtype
    trust_remote_code=True,   # allow custom model code shipped with the checkpoint to run
)

# Load the causal language model.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **model_load_options)

# Load the tokenizer that belongs to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

  from .autonotebook import tqdm as notebook_tqdm
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading shards: 100%|██████████| 2/2 [13:52<00:00, 416.18s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.73s/it]


In [2]:
from transformers import pipeline

# Generation settings shared by every call made through the pipeline.
generation_options = {
    # Return only the newly generated text, not the prompt that produced it.
    "return_full_text": False,
    # Upper bound on how many tokens the model may generate per call.
    "max_new_tokens": 500,
    # No sampling: the model always picks the most probable next token.
    "do_sample": False,
}

# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
generator = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    **generation_options,
)

In [3]:
# Chat-style input: a list of turns, here a single user turn holding the query.
messages = [
    dict(role="user", content="Create a funny joke about computer science."),
]

# Run the pipeline on the conversation, then show the text of the first result.
output = generator(messages)
first_result = output[0]
print(first_result["generated_text"])

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48
You are not running the flash-attention implementation, expect numerical differences.


 Why do programmers always mix up Christmas and Halloween?


Because Oct 31 equals Dec 25!


In [7]:
# Raw prompt with the `<|assistant|>` marker appended by hand (it is encoded as a
# single token, as the per-token decode further down shows).
# NOTE: the text is kept byte-for-byte as-is, including the missing space after
# "mishap.", because the token IDs recorded in this notebook correspond to this
# exact string.
prompt = "Write an email apologizing to Sarah for tragic garden mishap.Explain how it happened.<|assistant|>"

# Tokenize the prompt into a PyTorch tensor of token IDs and move it to whatever
# device the model lives on, instead of repeating a hard-coded "cuda" string that
# could drift from the placement chosen when the model was loaded.
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
print(input_ids)

tensor([[14350,   385,  4876, 27746,  5281,   304, 19235,   363, 25305,   293,
         16423,   286,   728,   481, 29889,  9544,  7420,   920,   372,  9559,
         29889, 32001]], device='cuda:0')


In [16]:
# Let the model continue the tokenized prompt by at most 20 new tokens.
generated_output = model.generate(input_ids=input_ids, max_new_tokens=20)

# Take the first sequence of the result and turn its token IDs back into text;
# the decoded string contains the prompt followed by the continuation.
first_sequence = generated_output[0]
output = tokenizer.decode(first_sequence)
print(output)

Write an email apologizing to Sarah for tragic garden mishap.Explain how it happened.<|assistant|> Subject: Sincere Apologies for the Garden Incident


Dear Sarah,


In [14]:
# Decode every prompt token ID on its own to show how the tokenizer split the
# text into pieces. The loop variable is named `token_id` rather than `id` so
# the built-in `id()` is not shadowed in the notebook's global namespace for
# all cells that run afterwards.
for token_id in input_ids[0]:
    print(tokenizer.decode(token_id))

Write
an
email
apolog
izing
to
Sarah
for
trag
ic
garden
m
ish
ap
.
Exp
lain
how
it
happened
.
<|assistant|>


In [18]:
# Re-tokenize the decoded text held in `output` (prompt + generated continuation)
# to inspect the token IDs of the whole sequence. The tensor is moved to the
# model's own device rather than a hard-coded "cuda" string, so this cell follows
# whatever placement was chosen when the model was loaded.
generated_ids = tokenizer(output, return_tensors="pt").input_ids.to(model.device)
print(generated_ids)

tensor([[14350,   385,  4876, 27746,  5281,   304, 19235,   363, 25305,   293,
         16423,   286,   728,   481, 29889,  9544,  7420,   920,   372,  9559,
         29889, 32001,  3323,   622, 29901,   317,  3742,   406,  6225, 11763,
           363,   278, 19906,  9266,  1693,    13,    13,    13, 29928,   799,
         19235, 29892]], device='cuda:0')


In [21]:
# Decode one token ID in isolation to see which text it stands for.
# NOTE(review): the printed result is blank, which suggests ID 13 is a newline
# character -- consistent with the blank lines in the generated email; confirm.
token_id_to_inspect = 13
word = tokenizer.decode(token_id_to_inspect)
print(word)



