In [40]:
# Warning control
import warnings
import torch

# Install a global filter so that every Python warning raised from here on is ignored
warnings.filterwarnings('ignore')

## Loading the LLM

In [41]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [42]:
device = torch.accelerator.current_accelerator() \
    if torch.accelerator.is_available() \
    else torch.device('cpu')

In [43]:
# The tokenizer and the weights are both fetched from the same checkpoint.
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# trust_remote_code=False: use the official transformers implementation
# rather than code shipped with the checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=False,
    torch_dtype="auto",
    device_map="cpu",
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  3.11it/s]


In [44]:
# Options applied to every call of the text-generation pipeline built below.
generation_options = {
    "return_full_text": False,  # return only the continuation, not the prompt
    "max_new_tokens": 50,       # upper bound on the number of generated tokens
    "do_sample": False,         # no sampling, so no randomness in the output
}

# Bundle the model and its tokenizer into a "text-generation" pipeline object.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    **generation_options,
)

Device set to use cpu
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


## Generating a Text Response to a Prompt

In [45]:
prompt = "Write an email apologizing to Sarah for the tragic gardening mishap. Explain how it happened. "

# The pipeline returns a list of results; each result is a dict holding the generated text.
output = generator(prompt)
first_result = output[0]

print(first_result['generated_text'])

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.





Email to Sarah:

Subject: Sincere Apologies for the Gardening Mishap


Dear Sarah,


I hope this message finds you well. I am writing to express my deepest ap


In [46]:
# Display the module tree of the loaded causal LM (transformer body plus lm_head)
model

Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (norm): Phi3RMSNorm((3072,), eps=1e-05)
    (rotary_emb): Phi3RotaryEmbedding()
  )
  (lm_head): Linear(in_features=3072, out_features=32064, 

In [47]:
# Inspect the token-embedding layer at the input of the transformer body
model.model.embed_tokens

Embedding(32064, 3072, padding_idx=32000)

In [48]:
# Inspect the transformer body on its own, i.e. the model without the lm_head
model.model

Phi3Model(
  (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
  (layers): ModuleList(
    (0-31): 32 x Phi3DecoderLayer(
      (self_attn): Phi3Attention(
        (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
        (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
      )
      (mlp): Phi3MLP(
        (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
        (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
        (activation_fn): SiLU()
      )
      (input_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
      (post_attention_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
      (resid_attn_dropout): Dropout(p=0.0, inplace=False)
      (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
    )
  )
  (norm): Phi3RMSNorm((3072,), eps=1e-05)
  (rotary_emb): Phi3RotaryEmbedding()
)

In [49]:
# Inspect the first decoder layer of the stack
model.model.layers[0]

Phi3DecoderLayer(
  (self_attn): Phi3Attention(
    (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
    (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
  )
  (mlp): Phi3MLP(
    (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
    (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
    (activation_fn): SiLU()
  )
  (input_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
  (post_attention_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
  (resid_attn_dropout): Dropout(p=0.0, inplace=False)
  (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
)

## Generating a Single Token in Response to a Prompt

In [50]:
# Unfinished sentence; the cells below compute the single token the model predicts next
prompt = "The capital of France is"

In [51]:
# Encode the prompt as PyTorch tensors and keep only the token ids
encoded_prompt = tokenizer(prompt, return_tensors="pt")
input_ids = encoded_prompt.input_ids
input_ids

tensor([[ 450, 7483,  310, 3444,  338]])

In [52]:
# Get the output of the model before the lm_head.
# This is inference only, so gradient tracking is disabled: without it autograd
# would record the whole forward pass and keep intermediate activations alive.
with torch.no_grad():
    model_output = model.model(input_ids)

In [53]:
# Get the shape of the model's output before the lm_head
model_output[0].shape

torch.Size([1, 5, 3072])

In [54]:
# Get the output of the lm_head, applied to the hidden states of every position.
# Gradient tracking is disabled because nothing is trained here; otherwise the
# projection through the lm_head weights would be recorded by autograd.
with torch.no_grad():
    lm_head_output = model.lm_head(model_output[0])

In [55]:
# Shape of the lm_head output; the last dimension matches the lm_head's out_features
lm_head_output.shape

torch.Size([1, 5, 32064])

In [56]:
# Scores at the last position of the first sequence in the batch
last_position_scores = lm_head_output[0, -1]

# Pick the id with the highest score
token_id = last_position_scores.argmax(dim=-1)
token_id

tensor(3681)

In [57]:
# Turn the selected token id back into text
tokenizer.decode(token_id)

'Paris'