<a href="https://colab.research.google.com/github/zetavg/LLM-Research/blob/917fdd0/TW_Pythia_Embedding_Similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TW Pythia Embedding Similarity Checking

In [1]:
try:
    import transformers, accelerate, bitsandbytes
except:
    !pip install transformers accelerate bitsandbytes


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /usr/local/lib/python3.10/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /usr/local/lib/python3.10/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


In [2]:
# @title TW Pythia Model and Tokenizer 

import gc
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

low_gpu_ram_usage = True
if low_gpu_ram_usage:
    if 'model_without_zhtw_tokens_training' in globals():
        del model_without_zhtw_tokens_training
    gc.collect()
    with torch.no_grad():
        torch.cuda.empty_cache()

tokenizer = AutoTokenizer.from_pretrained('twlm/tw-pythia-6.9b-chat-v0_2')
model = AutoModelForCausalLM.from_pretrained(
    'twlm/tw-pythia-6.9b-chat-v0_2',
    torch_dtype=torch.float16,
    load_in_8bit=True,
    device_map={'': 'cuda'},
)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [3]:
# @title Function: get_embedding

if 'model' not in globals():
    model = None
if 'tokenizer' not in globals():
    tokenizer = None

def get_embedding(word, tokenizer=tokenizer, model=model):
    ids = tokenizer.encode(word, return_tensors="pt")[0].to('cuda')
    if ids.shape[0] > 1:
        raise ValueError(f"The word '{word}' is consisted with more than 1 token: {ids}.")
    input_embeddings = model.get_input_embeddings()(ids)
    return input_embeddings[0]

# Sample
try:
    get_embedding(' cat')
except Exception:
    pass

In [4]:
# @title Function: get_cosine_similarity

import torch
from torch.nn.functional import cosine_similarity

def get_cosine_similarity(tensor_1, tensor_2):
    similarity = cosine_similarity(tensor_1.unsqueeze(0), tensor_2.unsqueeze(0))
    return similarity.item()


# Sample
a = torch.tensor([   1,    0, -1   ], dtype=torch.float16).to('cuda')
b = torch.tensor([ 1.1,    0, -1   ], dtype=torch.float16).to('cuda')
c = torch.tensor([ 0.1, -0.1, -0.8 ], dtype=torch.float16).to('cuda')
d = torch.tensor([  -1,    0, 1    ], dtype=torch.float16).to('cuda')
print('Cosine similarity of a and b: ', get_cosine_similarity(a, b))
print('Cosine similarity of a and c: ', get_cosine_similarity(a, c))
print('Cosine similarity of a and d: ', get_cosine_similarity(a, d))

Cosine similarity of a and b:  0.9990234375
Cosine similarity of a and c:  0.783203125
Cosine similarity of a and d:  -1.0


In [5]:
# @title Try TW Pythia 6.9B

In [6]:
get_cosine_similarity(
    get_embedding(' cat'),
    get_embedding(' dog'),
)

0.1405029296875

In [7]:
get_cosine_similarity(
    get_embedding('貓'),
    get_embedding('狗'),
)

0.0838623046875

In [8]:
get_cosine_similarity(
    get_embedding(' cat'),
    get_embedding('貓'),
)

0.0888671875

In [9]:
get_cosine_similarity(
    get_embedding(' dog'),
    get_embedding('狗'),
)

0.099853515625

In [10]:
get_cosine_similarity(
    get_embedding(' walk'),
    get_embedding('走'),
)

0.0711669921875

In [11]:
get_cosine_similarity(
    get_embedding(' cat'),
    get_embedding(' car'),
)

0.053192138671875

In [12]:
get_cosine_similarity(
    get_embedding('貓'),
    get_embedding('車'),
)

0.032257080078125

In [13]:
get_cosine_similarity(
    get_embedding(' dog'),
    get_embedding(' cold'),
)

0.0163726806640625

In [14]:
get_cosine_similarity(
    get_embedding('狗'),
    get_embedding('冷'),
)

0.024444580078125

In [15]:
get_cosine_similarity(
    get_embedding('狗'),
    get_embedding('皇'),
)

0.006008148193359375

In [16]:
get_cosine_similarity(
    get_embedding('貓'),
    get_embedding('皇'),
)

0.0190277099609375

In [17]:
get_cosine_similarity(
    get_embedding('貓'),
    get_embedding('喵'),
)

0.0156097412109375

In [18]:
get_cosine_similarity(
    get_embedding('狗'),
    get_embedding('喵'),
)

-0.01073455810546875

In [19]:
get_cosine_similarity(
    get_embedding('狗'),
    get_embedding('汪'),
)

0.041290283203125

In [20]:
get_cosine_similarity(
    get_embedding(' man'),
    get_embedding(' women'),
)

0.10235595703125

In [21]:
get_cosine_similarity(
    get_embedding(' human'),
    get_embedding(' women'),
)

0.047454833984375

In [22]:
get_cosine_similarity(
    get_embedding(' human'),
    get_embedding(' man'),
)

0.040191650390625

In [23]:
get_cosine_similarity(
    get_embedding(' she'),
    get_embedding(' woman'),
)

0.083984375

In [24]:
get_cosine_similarity(
    get_embedding(' she'),
    get_embedding(' man'),
)

0.06591796875

In [25]:
get_cosine_similarity(
    get_embedding(' he'),
    get_embedding(' woman'),
)

0.0201568603515625

In [26]:
get_cosine_similarity(
    get_embedding(' he'),
    get_embedding(' man'),
)

0.11614990234375

In [1]:
# @title Try Original Pythia 6.9B

import gc
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

low_gpu_ram_usage = True
if low_gpu_ram_usage:
    if 'model' in globals():
        del model
    if 'model_without_zhtw_tokens_training' in globals():
        del model_without_zhtw_tokens_training
    gc.collect()
    with torch.no_grad():
        torch.cuda.empty_cache()

tokenizer = AutoTokenizer.from_pretrained('twlm/tw-pythia-6.9b-chat-v0_2')
model_without_zhtw_tokens_training = AutoModelForCausalLM.from_pretrained(
    'zetavg/pythia-6.9b',
    torch_dtype=torch.float16,
    load_in_8bit=True,
    device_map={'': 'cuda'},
)
model_without_zhtw_tokens_training.resize_token_embeddings(tokenizer.vocab_size)


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /usr/local/lib/python3.10/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /usr/local/lib/python3.10/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Embedding(58113, 4096)

In [4]:
get_cosine_similarity(
    get_embedding(' cat', model=model_without_zhtw_tokens_training),
    get_embedding(' dog', model=model_without_zhtw_tokens_training),
)

0.2183837890625

In [5]:
get_cosine_similarity(
    get_embedding('貓', model=model_without_zhtw_tokens_training),
    get_embedding('狗', model=model_without_zhtw_tokens_training),
)

-0.013519287109375

In [6]:
get_cosine_similarity(
    get_embedding(' cat', model=model_without_zhtw_tokens_training),
    get_embedding('貓', model=model_without_zhtw_tokens_training),
)

-0.016754150390625

In [7]:
get_cosine_similarity(
    get_embedding(' dog', model=model_without_zhtw_tokens_training),
    get_embedding('狗', model=model_without_zhtw_tokens_training),
)

-0.003566741943359375

In [8]:
get_cosine_similarity(
    get_embedding(' cat', model=model_without_zhtw_tokens_training),
    get_embedding(' car', model=model_without_zhtw_tokens_training),
)

0.09222412109375

In [9]:
get_cosine_similarity(
    get_embedding('貓', model=model_without_zhtw_tokens_training),
    get_embedding('車', model=model_without_zhtw_tokens_training),
)

0.0013141632080078125

In [10]:
get_cosine_similarity(
    get_embedding(' cold', model=model_without_zhtw_tokens_training),
    get_embedding(' dog', model=model_without_zhtw_tokens_training),
)

0.019439697265625

In [11]:
get_cosine_similarity(
    get_embedding('冷', model=model_without_zhtw_tokens_training),
    get_embedding('狗', model=model_without_zhtw_tokens_training),
)

0.010284423828125

In [12]:
get_cosine_similarity(
    get_embedding(' man', model=model_without_zhtw_tokens_training),
    get_embedding(' women', model=model_without_zhtw_tokens_training),
)

0.1494140625

In [13]:
get_cosine_similarity(
    get_embedding(' human', model=model_without_zhtw_tokens_training),
    get_embedding(' women', model=model_without_zhtw_tokens_training),
)

0.09619140625

In [14]:
get_cosine_similarity(
    get_embedding(' human', model=model_without_zhtw_tokens_training),
    get_embedding(' man', model=model_without_zhtw_tokens_training),
)

0.07476806640625

In [15]:
get_cosine_similarity(
    get_embedding(' she', model=model_without_zhtw_tokens_training),
    get_embedding(' women', model=model_without_zhtw_tokens_training),
)

0.1146240234375

In [16]:
get_cosine_similarity(
    get_embedding(' she', model=model_without_zhtw_tokens_training),
    get_embedding(' man', model=model_without_zhtw_tokens_training),
)

0.121337890625

In [17]:
get_cosine_similarity(
    get_embedding(' he', model=model_without_zhtw_tokens_training),
    get_embedding(' women', model=model_without_zhtw_tokens_training),
)

0.11053466796875

In [18]:
get_cosine_similarity(
    get_embedding(' he', model=model_without_zhtw_tokens_training),
    get_embedding(' man', model=model_without_zhtw_tokens_training),
)

0.1912841796875

In [1]:
# @title Try BLOOM-zh 

import gc
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

low_gpu_ram_usage = True
if low_gpu_ram_usage:
    if 'model' in globals():
        del model
    if 'model_without_zhtw_tokens_training' in globals():
        del model_without_zhtw_tokens_training
    gc.collect()
    with torch.no_grad():
        torch.cuda.empty_cache()

bloom_zh_tokenizer = AutoTokenizer.from_pretrained('ckip-joint/bloom-1b1-zh')
bloom_zh_model = AutoModelForCausalLM.from_pretrained(
    'ckip-joint/bloom-1b1-zh',
    torch_dtype=torch.float16,
    load_in_8bit=False,
    device_map={'': 'cuda'},
)

In [6]:
get_cosine_similarity(
    get_embedding(' cat', tokenizer=bloom_zh_tokenizer, model=bloom_zh_model),
    get_embedding(' dog', tokenizer=bloom_zh_tokenizer, model=bloom_zh_model),
)

0.1624755859375

In [7]:
get_cosine_similarity(
    get_embedding('貓', tokenizer=bloom_zh_tokenizer, model=bloom_zh_model),
    get_embedding('狗', tokenizer=bloom_zh_tokenizer, model=bloom_zh_model),
)

0.337890625

In [8]:
get_cosine_similarity(
    get_embedding(' cat', tokenizer=bloom_zh_tokenizer, model=bloom_zh_model),
    get_embedding('貓', tokenizer=bloom_zh_tokenizer, model=bloom_zh_model),
)

0.076416015625

In [9]:
get_cosine_similarity(
    get_embedding(' dog', tokenizer=bloom_zh_tokenizer, model=bloom_zh_model),
    get_embedding('狗', tokenizer=bloom_zh_tokenizer, model=bloom_zh_model),
)

0.4228515625

In [10]:
get_cosine_similarity(
    get_embedding(' cat', tokenizer=bloom_zh_tokenizer, model=bloom_zh_model),
    get_embedding(' car', tokenizer=bloom_zh_tokenizer, model=bloom_zh_model),
)

0.30224609375

In [11]:
get_cosine_similarity(
    get_embedding('貓', tokenizer=bloom_zh_tokenizer, model=bloom_zh_model),
    get_embedding('車', tokenizer=bloom_zh_tokenizer, model=bloom_zh_model),
)

0.351806640625

In [15]:
get_cosine_similarity(
    get_embedding(' dog', tokenizer=bloom_zh_tokenizer, model=bloom_zh_model),
    get_embedding(' cold', tokenizer=bloom_zh_tokenizer, model=bloom_zh_model),
)

0.138427734375

In [16]:
get_cosine_similarity(
    get_embedding('狗', tokenizer=bloom_zh_tokenizer, model=bloom_zh_model),
    get_embedding('冷', tokenizer=bloom_zh_tokenizer, model=bloom_zh_model),
)

0.138671875