In [1]:
%env PYTORCH_ENABLE_MPS_FALLBACK=1
# Enable PyTorch's CPU fallback for operators that are not implemented on the MPS backend.
# https://github.com/pytorch/pytorch/issues/77764
# https://stackoverflow.com/a/72416727
# Notice: This is not a permanent solution, it's a workaround for the time being.
# It must be run as the first line of the notebook (see the links above).
# Target device label; the "mps" alternative is kept commented out for reference.
# DEVICE = "mps"
DEVICE = "npu"

env: PYTORCH_ENABLE_MPS_FALLBACK=1


In [2]:
from applyllm.accelerators import (
    AcceleratorHelper,
    # DIR_MODE_MAP
)
from applyllm.utils import (
    time_func
)
import os, sys

# path for windows
from win_patch import (
    DirectorySetting,
    DIR_MODE_MAP
)
# TODO: rename the init_mps_torch to init_env_torch(dir_setting: DirectorySetting)
# Initialise the torch environment with the "win_local" directory setting from win_patch.
AcceleratorHelper.init_mps_torch(dir_setting=DIR_MODE_MAP["win_local"])

# Show the cache directory currently in effect.
# NOTE(review): presumably init_mps_torch sets XDG_CACHE_HOME; if it does not,
# this lookup raises KeyError - confirm.
print(os.environ['XDG_CACHE_HOME'])

OSError: [WinError 126] The specified module could not be found. Error loading "c:\Users\yingdingwang\Documents\VENV\gpt3.12\Lib\site-packages\torch\lib\aoti_custom_ops.dll" or one of its dependencies.

In [3]:
# !export PYTORCH_ENABLE_MPS_FALLBACK=1

In [None]:
from intel_npu_acceleration_library import NPUModelForCausalLM, int8
from intel_npu_acceleration_library.compiler import CompilerConfig
import torch

In [21]:
# from transformers import GPT2LMHeadModel

## print gpt2 model architecture
`wte` weight for token embedding [50257, 768] - lookup table for tokens:
* 50257 tokens in the GPT-2 vocabulary
* a 768-dimensional embedding for each token - the distributed representation that stands in for that token

`wpe` weight for position embedding [1024, 768] - lookup table for positions:
* GPT-2 has a maximum sequence length of 1024 tokens - up to 1024 positions that each token can attend to in the past
* the 768 parameters of each position's embedding are learned by optimization

```
transformer.h.0.ln_1.weight torch.Size([768])
transformer.h.0.ln_1.bias torch.Size([768])
transformer.h.0.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.0.attn.c_attn.bias torch.Size([2304])
transformer.h.0.attn.c_proj.weight torch.Size([768, 768])
transformer.h.0.attn.c_proj.bias torch.Size([768])
transformer.h.0.ln_2.weight torch.Size([768])
transformer.h.0.ln_2.bias torch.Size([768])
transformer.h.0.mlp.c_fc.weight torch.Size([768, 3072])
transformer.h.0.mlp.c_fc.bias torch.Size([3072])
transformer.h.0.mlp.c_proj.weight torch.Size([3072, 768])
transformer.h.0.mlp.c_proj.bias torch.Size([768])
```
These are the weights and biases of the first transformer block (`h.0`).



In [22]:
# Compiler configuration used when loading the model for the NPU: half precision.
# Alternative (previously assigned here and immediately overwritten):
# compiler_conf = CompilerConfig(dtype=int8)
compiler_conf = CompilerConfig(dtype=torch.float16)

In [None]:
# model_hf = GPT2LMHeadModel.from_pretrained("gpt2") # 124M parameters
# Load the pretrained "gpt2" checkpoint through the Intel NPU wrapper, passing the
# `compiler_conf` created in an earlier cell.
model_hf = NPUModelForCausalLM.from_pretrained("gpt2", config=compiler_conf) # 124M parameters
# Mapping of parameter name -> tensor for the loaded model.
sd_hf = model_hf.state_dict() # raw tensors

# List every parameter name together with the shape of its tensor.
for k, v in sd_hf.items():
    # v is the tensor (values)
    print(k, v.shape)

#### view weight tensor
Flatten the PyTorch weight tensor to 1D and slice out the first 20 float values.

In [None]:
# .view(-1) flattens the position-embedding tensor to 1D; [:20] then keeps the first 20 values.
sd_hf["transformer.wpe.weight"].view(-1)[:20]

#### Plot the weights values of position embeddings
There is visible structure in the GPT-2 position embeddings
* The Y axis is the position: each row corresponds to one of the 1024 positions in the GPT-2 input
* The X axis is the representation (the embedding channels) of that position

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Render the position-embedding table as a grayscale image:
# one row per position, one column per embedding channel.
fig, ax = plt.subplots()
ax.imshow(sd_hf["transformer.wpe.weight"], cmap="gray")
# Label the figure so it can be read on its own; the trailing ';' hides the repr echo.
ax.set(title="GPT-2 position embeddings (wpe)", xlabel="embedding channel", ylabel="position");

#### Plot columns of the position embedding weights
We look at the channel as a function of position from [0, 1023]

The channels respond more or less strongly to the position
* the green channel likes to fire for every position from about 200 to 800
* since the channel-vs-position curves are noisy/jagged, you can tell this model is not fully trained
* The more trained this model was, the more you would expect the channel-vs-position curves to smooth out
* At the beginning of the optimization, these curves are completely random noise, because the position embedding table is initialized completely at random
* The channel curves of the position embedding look roughly sinusoidal

In [None]:
# Plot three embedding channels as a function of position.
# The channels are drawn in the same order as before (150, 200, 250), so with
# matplotlib's default colour cycle channel 250 remains the green curve.
fig, ax = plt.subplots()
for channel in (150, 200, 250):
    ax.plot(sd_hf["transformer.wpe.weight"][:, channel], label=f"channel {channel}")
ax.set(title="GPT-2 position embedding channels", xlabel="position", ylabel="embedding value")
ax.legend();

In [None]:
# Plot the top-left 300x300 corner of the c_attn projection weights of block h.1
# (the second transformer block; h.0 is the first).
plt.imshow(sd_hf["transformer.h.1.attn.c_attn.weight"][:300,:300], cmap="gray")
# The weight matrix shows some visible structure.

# Set seed for transformer to get the generation fixed

With `set_seed(42)`
you should see the model always generate the same output:

```console
[{'generated_text': "Hello, I'm a language model, I'm writing a new language for you. But first, I'd like to tell you about the language itself"},
 {'generated_text': "Hello, I'm a language model, and I'm trying to be as expressive as possible. In order to be expressive, it is necessary to know"},
 {'generated_text': "Hello, I'm a language model, so I don't get much of a license anymore, but I'm probably more familiar with other languages on that"},
 {'generated_text': "Hello, I'm a language model, a functional model... It's not me, it's me!\n\nI won't bore you with how"},
 {'generated_text': "Hello, I'm a language model, not an object model.\n\nIn a nutshell, I need to give language model a set of properties that"}]
 ```

 Notice:
 * if I change the device from CPU to MPS, the results will be different.

In [None]:
# get a pipeline to use the model to sample the text
from transformers import pipeline, set_seed
# generator = pipeline('text-generation', model="gpt2", device_map=DEVICE) # use the default model

# Set the seed for reproducibility: with the same code and the same weights,
# a fixed seed gives the same output.
# set_seed(42)

# "Hello..." is the prefix; `max_length=30` caps the total sequence length
# (prompt + generated tokens), and 5 output sequences are returned,
# i.e. 5 different completions of the prefix.
#
# Messages seen when running the generation:
#   Truncation was not explicitly activated but `max_length` is provided a specific value,
#   please use `truncation=True` to explicitly truncate examples to max length.
#   Defaulting to 'longest_first' truncation strategy.
#   If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this
#   strategy more precisely by providing a specific strategy to `truncation`.
#   Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
# generator("Hello, I'm a language model,", max_length=30, truncation=True, num_return_sequences=5)

In [29]:
@time_func
def chat(prompt="Hello, I'm a language model,", max_length=30, num_return_sequences=5, device_map="cpu", seed=42):
    """Generate GPT-2 completions for ``prompt`` with an optional fixed seed.

    A new ``text-generation`` pipeline for the ``"gpt2"`` checkpoint is built on
    every call, so each call pays the pipeline construction cost again.

    Args:
        prompt: Text prefix the model continues.
        max_length: Forwarded to the pipeline call as ``max_length``.
        num_return_sequences: Number of completions requested from the pipeline.
        device_map: Forwarded to ``pipeline(..., device_map=...)``.
        seed: Value passed to ``set_seed`` right before generation so repeated
            calls sample the same completions. Defaults to 42 (the previously
            hard-coded value); pass ``None`` to leave the random state untouched.

    Returns:
        The result of the pipeline call.
        NOTE(review): the function is wrapped by ``time_func`` from
        ``applyllm.utils``, which presumably reports the elapsed time - confirm
        that it passes the return value through unchanged.
    """
    generator = pipeline('text-generation', model="gpt2", device_map=device_map)
    # Seed immediately before sampling so pipeline construction cannot disturb the random state.
    if seed is not None:
        set_seed(seed)
    return generator(prompt, max_length=max_length, truncation=True, num_return_sequences=num_return_sequences)

In [None]:
# Baseline run on CPU; the arguments repeat chat()'s defaults explicitly.
chat("Hello, I'm a language model,", max_length=30, num_return_sequences=5, device_map="cpu")

In [31]:
# Custom prompt used by the next generation run.
prompt = "what's your name?"

In [None]:
# chat(prompt=prompt, max_length=30, num_return_sequences=5, device_map="npu")
# Same sampling settings as the CPU run, but with the custom prompt and device_map="balanced".
# NOTE(review): "balanced" is presumably resolved by the Hugging Face / accelerate
# device-map logic - confirm it selects the intended device on this machine.
chat(prompt=prompt, max_length=30, num_return_sequences=5, device_map="balanced")