In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Load the model and its tokenizer from the same Hugging Face checkpoint.
# A single constant guarantees the two can never point at different repos.
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="cuda",    # place the weights on the GPU
    torch_dtype="auto",   # let the library pick the dtype for the weights
    # NOTE(review): trust_remote_code=True allows Python code shipped with the
    # checkpoint repository to run locally; keep it only for trusted repos.
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

Loading checkpoint shards: 100%|██████████| 2/2 [00:14<00:00,  7.03s/it]


In [7]:
# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
generator = pipeline(
    task="text-generation",
    tokenizer=tokenizer,
    model=model,
    max_new_tokens=50,      # cap on the number of newly generated tokens
    do_sample=False,        # no sampling, i.e. no randomness in decoding
    return_full_text=True,  # returned text includes the prompt itself
)

### Input and output of a trained transformer LLM

In [8]:
# Send one prompt through the pipeline.
prompt = "Write an email to Sarah for the tragic gardening mishap."
output = generator(prompt)

# The result is indexed as a list of dicts; show the text of the first entry.
print(output[0]["generated_text"])

Write an email to Sarah for the tragic gardening mishap.

Subject: Gardening Mishap - A Tale of Tragedy

Dear Sarah,

I hope this email finds you well. I am writing to share with you a rather unfortunate incident that occurred in my


In [9]:
# Print the module tree of the loaded model to inspect its architecture
# (embedding layer, stacked decoder layers, final norm and the lm_head).
print(model)

Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3RotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm()
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm()
      )
    )
    (norm): Phi3RMSNorm()
  )
  (lm_head): Linear(in_features=3072, out_features=32064, bias=False)
)

### Choosing a single token from the probability distribution (sampling/decoding)

In [19]:
prompt = "The capital of France is"

# Tokenize the input prompt and move the token ids to whatever device the
# model lives on (avoids hard-coding "cuda" in a second place).
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

# This is inference only: disable autograd so no computation graph is recorded
# and no activation memory is kept around for a backward pass that never runs.
with torch.no_grad():
    # Output of the model *before* the lm_head (the inner `model.model` stack).
    model_output = model.model(input_ids)

    # Feed the first element of that output through the lm_head.
    lm_head_output = model.lm_head(model_output[0])

In [20]:
# Dump the raw output object of the inner model. NOTE: this prints everything
# it carries (last_hidden_state and past_key_values), so the output is long.
print("model_output: ",model_output)

model_output:  BaseModelOutputWithPast(last_hidden_state=tensor([[[-0.3477,  1.0781,  0.1367,  ..., -0.2617,  0.7422,  0.1514],
         [-0.1318,  0.2910,  0.3496,  ...,  0.5547, -0.1338, -0.6016],
         [-0.5859,  1.0703,  1.5469,  ..., -0.4238,  0.2676,  0.4062],
         [-0.3926,  0.6680,  0.2246,  ..., -0.0562,  0.8164, -0.6797],
         [-0.6719,  0.6914,  0.5234,  ...,  0.2471,  0.2002, -0.2773]]],
       device='cuda:0', dtype=torch.bfloat16, grad_fn=<MulBackward0>), past_key_values=((tensor([[[[-2.9688e-01,  1.3770e-01, -7.8125e-02,  ..., -9.1797e-02,
           -2.6123e-02,  5.8350e-02],
          [ 8.6914e-02,  9.7656e-02, -1.3672e-01,  ..., -2.3926e-02,
           -1.0840e-01, -2.7100e-02],
          [ 4.0820e-01,  4.1504e-02, -1.9141e-01,  ..., -1.5137e-01,
           -2.1094e-01,  8.7891e-02],
          [ 2.5977e-01, -3.2227e-02, -3.8330e-02,  ..., -1.6602e-02,
           -1.2268e-02,  6.3782e-03],
          [ 9.8633e-02, -9.8633e-02, -7.1289e-02,  ..., -1.9336e-01,


In [21]:
# Show the tensor produced by the lm_head for the prompt.
print("lm_head_output: ",lm_head_output)

lm_head_output:  tensor([[[24.5000, 23.8750, 22.3750,  ..., 18.5000, 18.3750, 18.3750],
         [31.0000, 31.3750, 25.8750,  ..., 25.8750, 25.8750, 25.8750],
         [31.2500, 28.6250, 30.8750,  ..., 26.1250, 26.0000, 26.1250],
         [33.0000, 31.7500, 36.0000,  ..., 27.7500, 27.7500, 27.7500],
         [27.7500, 29.5000, 28.0000,  ..., 20.3750, 20.3750, 20.3750]]],
       device='cuda:0', dtype=torch.bfloat16, grad_fn=<UnsafeViewBackward0>)


In [17]:
# Dimensions of the lm_head output tensor.
lm_head_output.size()

torch.Size([1, 5, 32064])

In [22]:
# Take the lm_head scores at the last position of the first (only) sequence
# and pick the index with the highest score.
token_id = lm_head_output[0, -1, :].argmax(dim=-1)

# Turn the chosen token id back into text.
tokenizer.decode(token_id)

'Paris'

In [23]:
# Dimensions of the inner model's last hidden state (the first field of the output).
model_output.last_hidden_state.shape

torch.Size([1, 5, 3072])