### Flash Attention

### Softmax Tiling

In [1]:
import numpy as np

def softmax(x, axis=None):
    """Compute a numerically stable softmax.

    Parameters
    ----------
    x : array_like
        Input scores.
    axis : int or None, optional
        Axis along which to normalize. The default ``None`` normalizes over
        every element of ``x`` (the original behavior); pass e.g. ``axis=-1``
        to get an independent softmax per row of a score matrix.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``x`` whose entries sum to 1 along
        ``axis`` (or over the whole array when ``axis`` is ``None``).
    """
    x = np.asarray(x)
    # Subtract the max before exponentiating so large scores do not overflow;
    # keepdims=True keeps the reduced axis so the result broadcasts against x.
    e_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e_x / np.sum(e_x, axis=axis, keepdims=True)

# Full (untiled) score vector; the tiling cells below split these same four
# values into two halves.
s = np.array([0.1, 0.2, 0.3, 0.4])

# Reference result: softmax over the whole vector in one pass.
print(softmax(s))


[0.21383822 0.23632778 0.26118259 0.28865141]


In [4]:
## Note: s1 and s2 are never stored in the real algorithm; they are intermediate
## attention scores computed on the fly.
## NOTE(review): the original note calls them the result of "QV"; attention
## scores are normally Q·Kᵀ — confirm.
s1, s2 = np.array([0.1, 0.2]), np.array([0.3, 0.4])

# Local maximum of each tile.
m1, m2 = s1.max(), s2.max()

print(m1, m2, sep="\n")

# Global maximum across both tiles.
m = max(m1, m2)

0.2
0.4


In [6]:
# Un-normalized exponentials of each tile, each shifted by its own local maximum.
fx1, fx2 = np.exp(s1 - m1), np.exp(s2 - m2)

# The two tiles print identical values here because each one is shifted by its
# own maximum rather than by the global one.
print(fx1, fx2, sep="\n")

[0.90483742 1.        ]
[0.90483742 1.        ]


In [19]:
# Rescale every tile from its local maximum to the global maximum m:
# exp(m_i - m) * exp(s_i - m_i) == exp(s_i - m).
fx = [np.exp(m_i - m) * np.exp(s_i - m_i) for s_i, m_i in ((s1, m1), (s2, m2))]
print(np.concatenate(fx))

[0.74081822 0.81873075 0.90483742 1.        ]


In [22]:
# Softmax denominator: each tile's local sum of exponentials, rescaled to the
# global maximum and then accumulated across tiles.
lx = sum(np.exp(m_i - m) * np.sum(np.exp(s_i - m_i)) for s_i, m_i in ((s1, m1), (s2, m2)))
print(lx)

3.464386391795659


In [24]:
# Join the rescaled tiles back into one vector and divide by the combined
# denominator to obtain the final softmax.
result = np.concatenate(fx) / lx
print(result)

[0.21383822 0.23632778 0.26118259 0.28865141]


In [25]:
# Cross-check: the untiled softmax printed here should match the tiled `result`.
print(softmax(s))

[0.21383822 0.23632778 0.26118259 0.28865141]


### Tiktoken

In [27]:
import tiktoken


# cl100k_base is the encoding used by text-embedding-ada-002.
embedding_encoding = "cl100k_base"
encoding = tiktoken.get_encoding(embedding_encoding)

text = "Deeplearning"

# Encode once and reuse the ids for both the count and the listing.
token_ids = encoding.encode(text)
print(len(token_ids))
print(token_ids)

4
[1951, 68, 698, 3256]


In [31]:
# Decode each token id on its own to show how the text was split into sub-words.
# The loop variable is `token_id` rather than `id`: a module-level loop variable
# stays in the notebook's global namespace, so `id` would shadow the built-in
# `id()` for every later cell.
for token_id in encoding.encode(text):
    print(encoding.decode([token_id]))


De
e
ple
arning


### 扩充中文词表的Tokenizer

In [1]:
import os
# Select the pure-Python protobuf backend. This must be set before any module
# that imports protobuf is loaded, which is why it precedes the imports below.
# NOTE(review): presumably needed because the generated sentencepiece_model_pb2
# is incompatible with the installed native protobuf runtime — confirm.
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"]="python"

from transformers import LlamaTokenizer
from transformers import LlamaTokenizer
from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model
import sentencepiece as spm



llama_tokenizer_dir = "./data/llama_tokenizer"
chinese_sp_model_file = "./data/chinese_sp_model/chinese_sp.model"

# Load the LLaMA tokenizer and parse its SentencePiece model into an editable
# ModelProto.
llama_tokenizer = LlamaTokenizer.from_pretrained(llama_tokenizer_dir)
llama_spm = sp_pb2_model.ModelProto()
llama_spm.ParseFromString(llama_tokenizer.sp_model.serialized_model_proto())

# Do the same for the Chinese SentencePiece model file.
chinese_sp_model = spm.SentencePieceProcessor()
chinese_sp_model.Load(chinese_sp_model_file)
chinese_spm = sp_pb2_model.ModelProto()
chinese_spm.ParseFromString(chinese_sp_model.serialized_model_proto())

# Report both vocabulary sizes, then the LLaMA special-token configuration.
print(len(llama_tokenizer), len(chinese_sp_model))
print(
    llama_tokenizer.all_special_tokens,
    llama_tokenizer.all_special_ids,
    llama_tokenizer.special_tokens_map,
    sep="\n",
)

## Add Chinese tokens to LLaMA tokenizer
# Pieces already present in the LLaMA model; used to skip duplicates.
llama_spm_tokens_set = {p.piece for p in llama_spm.pieces}
print(len(llama_spm_tokens_set))
print(f"Before:{len(llama_spm_tokens_set)}")

for p in chinese_spm.pieces:
    piece = p.piece
    if piece in llama_spm_tokens_set:
        continue
    # Pieces missing from the LLaMA vocabulary are appended with a score of 0.
    new_p = sp_pb2_model.ModelProto().SentencePiece()
    new_p.piece = piece
    new_p.score = 0
    llama_spm.pieces.append(new_p)

print(f"New model pieces: {len(llama_spm.pieces)}")

## Save
output_sp_dir = 'merged_tokenizer_sp'
output_hf_dir = 'merged_tokenizer_hf' # the path to save Chinese-LLaMA tokenizer
merged_model_path = f"{output_sp_dir}/chinese_llama.model"

# Write the merged SentencePiece model to disk.
os.makedirs(output_sp_dir, exist_ok=True)
with open(merged_model_path, 'wb') as f:
    f.write(llama_spm.SerializeToString())

# Wrap the merged model in a LlamaTokenizer and save it in Hugging Face format.
tokenizer = LlamaTokenizer(vocab_file=merged_model_path)
tokenizer.save_pretrained(output_hf_dir)
print(f"Chinese-LLaMA tokenizer has been saved to {output_hf_dir}")

# Reload both tokenizers from disk so the comparison uses what was actually saved.
llama_tokenizer = LlamaTokenizer.from_pretrained(llama_tokenizer_dir)
chinese_llama_tokenizer = LlamaTokenizer.from_pretrained(output_hf_dir)

# Special tokens of the merged tokenizer.
print(chinese_llama_tokenizer.all_special_tokens)
print(chinese_llama_tokenizer.all_special_ids)
print(chinese_llama_tokenizer.special_tokens_map)

# Mixed Chinese/English sample for comparing the two tokenizers.
text='''白日依山尽，黄河入海流。欲穷千里目，更上一层楼。
The primary use of LLaMA is research on large language models, including'''
print("Test text:\n",text)
# The original had a bare `print` here, which in Python 3 only references the
# function and prints nothing; call it so the intended blank line is emitted.
print()
print(f"Tokenized by LLaMA tokenizer:{llama_tokenizer.tokenize(text)}")
print(f"Tokenized by Chinese-LLaMA tokenizer:{chinese_llama_tokenizer.tokenize(text)}")

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


32000 20000
['<s>', '</s>', '<unk>']
[1, 2, 0]
{'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}
32000
Before:32000
New model pieces: 49953
Chinese-LLaMA tokenizer has been saved to merged_tokenizer_hf
['<s>', '</s>', '<unk>']
[1, 2, 0]
{'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}
Test text:
 白日依山尽，黄河入海流。欲穷千里目，更上一层楼。
The primary use of LLaMA is research on large language models, including
Tokenized by LLaMA tokenizer:['▁', '白', '日', '<0xE4>', '<0xBE>', '<0x9D>', '山', '<0xE5>', '<0xB0>', '<0xBD>', '，', '黄', '河', '入', '海', '流', '。', '<0xE6>', '<0xAC>', '<0xB2>', '<0xE7>', '<0xA9>', '<0xB7>', '千', '里', '目', '，', '更', '上', '一', '<0xE5>', '<0xB1>', '<0x82>', '<0xE6>', '<0xA5>', '<0xBC>', '。', '<0x0A>', 'The', '▁primary', '▁use', '▁of', '▁L', 'La', 'MA', '▁is', '▁research', '▁on', '▁large', '▁language', '▁models', ',', '▁including']
Tokenized by Chinese-LLaMA tokenizer:['▁白', '日', '依', '山', '尽', '，', '黄河', '入', '海', '流', '。', '欲', '穷', '千里', '目', '，', '更', '上

### 使用SentencePiece进行Tokenizer的训练

In [1]:
import sentencepiece as spm

# Train a BPE SentencePiece model with a 2000-piece vocabulary; the trainer is
# given the model prefix "m".
train_args = '--input=./data/mr_fujino/mr_fujino.txt --model_prefix=m --vocab_size=2000 --model_type=bpe'
spm.SentencePieceTrainer.train(train_args)

# Load the model file written by training.
sp = spm.SentencePieceProcessor()
sp.load('m.model')

# Round-trip a sample sentence: text -> pieces -> text.
# The decoded text need not be byte-identical to the input: in the recorded
# output the full-width punctuation comes back as ASCII ',' and '!'.
sample_text = '你好，世界！一个美好的世界'
encoded = sp.encode_as_pieces(sample_text)
print(encoded)
decoded = sp.decode_pieces(encoded)
print(decoded)

['▁', '你', '好', ',', '世界', '!', '一个', '美', '好', '的', '世界']
你好,世界!一个美好的世界


sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=./data/mr_fujino/mr_fujino.txt --model_prefix=m --vocab_size=2000 --model_type=bpe
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: ./data/mr_fujino/mr_fujino.txt
  input_format: 
  model_prefix: m
  model_type: BPE
  vocab_size: 2000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
