In [1]:
from transformers import AutoTokenizer, AutoModel, CanineTokenizer, CanineModel
from sentence_transformers import SentenceTransformer, util

from my_util import get_chunks, get_topk_similarity, show_tokens, get_batched_embeddings

  from .autonotebook import tqdm as notebook_tqdm


## Test embedders

In [2]:
# Single retrieval question used to probe every embedder compared in this notebook.
default_query: str = "what are scope 1 emissions?"

In [3]:
# Load the chunks for one company through the project helper `get_chunks`.
# NOTE(review): `content` is later passed to `encode(...)` and as `sentences=`, so it is
# presumably a list of text chunks; `metadata` is not read in the visible cells — confirm.
content, metadata = get_chunks(company_name="novo_nordisk")

In [4]:
# Candidate sentence-embedding checkpoints to compare. Each one is instantiated in the
# listed order and stored under the shared local "cache" folder.
test_embeders = [
    SentenceTransformer(checkpoint, cache_folder="cache")
    for checkpoint in (
        "sentence-transformers/msmarco-distilbert-base-v4",
        "sentence-transformers/all-mpnet-base-v2",
        "sentence-transformers/multi-qa-mpnet-base-dot-v1",
        "sentence-transformers/multi-qa-distilbert-cos-v1",
        "sentence-transformers/all-distilroberta-v1",
        "sentence-transformers/msmarco-distilbert-dot-v5",
        "sentence-transformers/msmarco-distilbert-base-tas-b",
        "sentence-transformers/all-MiniLM-L6-v2",  # Default for RAG Redis
    )
]

In [5]:
# Benchmark loop: every embedder encodes the same query and the same corpus, then the
# ten best-scoring chunks are requested so rankings can be compared across models.
for test_embeder in test_embeders:
    # The module repr is the only label shown above each ranking in the output.
    print(test_embeder)

    embedded_query = test_embeder.encode(default_query)
    embedded_content = test_embeder.encode(content)

    # NOTE(review): every model is scored with is_cos_sim=True, including the checkpoints
    # named "...-dot-v1", "...-dot-v5" and "...-tas-b", which are presumably tuned for
    # dot-product scoring — confirm that cosine is intended for them.
    # With debug=True the helper appears to print the "Most similar pairs" table.
    get_topk_similarity(
        k=10, 
        encoded_query=embedded_query, 
        encoded_docs=embedded_content, 
        is_cos_sim=True, 
        debug=True,
    )

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)
Most similar pairs:
doc_idx	 score
260 	 0.5144
261 	 0.4494
19 	 0.4473
21 	 0.4090
20 	 0.3557
259 	 0.3184
306 	 0.3183
22 	 0.3165
258 	 0.2938
52 	 0.2917
SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)
Most similar pairs:
doc_idx	 score
260 	 0.3954
263 	 0.3939
306 	 0.3568
261 	 0.3469
265 	 0.3414
259 	 0.3407
262 	 0.3368
20 	 0.3340
55 	 0.2738
307 	 0.2691
SentenceT

<div style="background-color: honeydew">

## Why do embedders perform differently?
How the model recognizes words and builds its vocabulary:
- Tokenization methods?
- Training data for tokenizers?

How the model uses context to embed meaning:
- Encoder architecture and params?
- Training data for encoders?

</div>

## Focus on tokenizers

`Word tokens`: This approach was common with earlier methods like `Word2Vec` but is being used less and less in NLP.
* One challenge with word tokenization is that the tokenizer is unable to deal with new words that enter the dataset.
* It also results in a vocabulary that has a lot of tokens with minimal differences between them (e.g., apology, apologize, apologetic, apologist).

`Subword tokens`: This method contains full and partial words. 
* In addition to the vocabulary diversity, another benefit of this approach is its ability to represent new words by breaking the new token down into smaller characters, which tend to be a part of the vocabulary.
* When compared to character tokens, this method benefits from the ability to fit more text within the limited context length of a Transformer model.

`Character tokens`: This is another method that is able to deal successfully with new words because it has the raw letters to fall back on.

`Byte tokens`: One additional tokenization method breaks text down into the individual bytes that are used to represent Unicode characters. Byte-level variants of subword methods also exist, such as the byte-level `BPE (Byte-Pair Encoding)` that is widely used by GPT models.


In [6]:
# Probe sentence for the tokenizer comparison. It is assembled piece by piece so the
# leading newline and the trailing space before each inner line break stay visible
# and cannot be lost to whitespace-trimming editors.
test_sentence = (
    "\n"
    "In 2022, Scope 1 emissions decreased by 1% compared to 2021 due to an increase \n"
    "in usage of renewable energy sources from 7.4 to 7.3 (1,000 tonnes CO2), and \n"
    "production sites consumed 3,918 thousand cubic metres of water less than last year's 1,345,340.\n"
)

In [7]:
# Tokenizer belonging to one of the checkpoints listed in `test_embeders`; used here as
# the subword example (the saved output shows `##` continuation pieces).
subword_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/msmarco-distilbert-base-tas-b")

In [8]:
# Show how the subword tokenizer splits the probe sentence; per the saved output the
# helper prints the token count followed by one "id -> token" line per token.
show_tokens(sentence=test_sentence, tokenizer=subword_tokenizer)

Number of tokens: 68
tensor(101) -> [CLS]
tensor(1999) -> in
tensor(16798) -> 202
tensor(2475) -> ##2
tensor(1010) -> ,
tensor(9531) -> scope
tensor(1015) -> 1
tensor(11768) -> emissions
tensor(10548) -> decreased
tensor(2011) -> by
tensor(1015) -> 1
tensor(1003) -> %
tensor(4102) -> compared
tensor(2000) -> to
tensor(25682) -> 2021
tensor(2349) -> due
tensor(2000) -> to
tensor(2019) -> an
tensor(3623) -> increase
tensor(1999) -> in
tensor(8192) -> usage
tensor(1997) -> of
tensor(13918) -> renewable
tensor(2943) -> energy
tensor(4216) -> sources
tensor(2013) -> from
tensor(1021) -> 7
tensor(1012) -> .
tensor(1018) -> 4
tensor(2000) -> to
tensor(1021) -> 7
tensor(1012) -> .
tensor(1017) -> 3
tensor(1006) -> (
tensor(1015) -> 1
tensor(1010) -> ,
tensor(2199) -> 000
tensor(11000) -> tonnes
tensor(2522) -> co
tensor(2475) -> ##2
tensor(1007) -> )
tensor(1010) -> ,
tensor(1998) -> and
tensor(2537) -> production
tensor(4573) -> sites
tensor(10202) -> consumed
tensor(1017) -> 3
tensor(1010) -

<div style="background-color: honeydew">

### Questions
- Do the 1s in "1%" and "scope 1" get different embeddings, even though they share the same token ID?
- Is each number treated as a new word? Should numbers instead be treated as sequences of digits (characters)?

</div>

In [9]:
# NOTE(review): "google/byt5-small" is a ByT5 checkpoint, whose tokenizer works on UTF-8
# bytes rather than characters; the two coincide only for ASCII text such as the probe
# sentence — confirm the `char_` prefix is intentional.
char_tokenizer = AutoTokenizer.from_pretrained("google/byt5-small")

In [10]:
# Same probe sentence, now split by the ByT5 tokenizer: the saved output shows one token
# per character of this ASCII text, so the sequence is far longer than the subword one.
show_tokens(sentence=test_sentence, tokenizer=char_tokenizer)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Number of tokens: 256
tensor(13) -> 

tensor(76) -> I
tensor(113) -> n
tensor(35) ->  
tensor(53) -> 2
tensor(51) -> 0
tensor(53) -> 2
tensor(53) -> 2
tensor(47) -> ,
tensor(35) ->  
tensor(86) -> S
tensor(102) -> c
tensor(114) -> o
tensor(115) -> p
tensor(104) -> e
tensor(35) ->  
tensor(52) -> 1
tensor(35) ->  
tensor(104) -> e
tensor(112) -> m
tensor(108) -> i
tensor(118) -> s
tensor(118) -> s
tensor(108) -> i
tensor(114) -> o
tensor(113) -> n
tensor(118) -> s
tensor(35) ->  
tensor(103) -> d
tensor(104) -> e
tensor(102) -> c
tensor(117) -> r
tensor(104) -> e
tensor(100) -> a
tensor(118) -> s
tensor(104) -> e
tensor(103) -> d
tensor(35) ->  
tensor(101) -> b
tensor(124) -> y
tensor(35) ->  
tensor(52) -> 1
tensor(40) -> %
tensor(35) ->  
tensor(102) -> c
tensor(114) -> o
tensor(112) -> m
tensor(115) -> p
tensor(100) -> a
tensor(117) -> r
tensor(104) -> e
tensor(103) -> d
tensor(35) ->  
tensor(119) -> t
tensor(114) -> o
tensor(35) ->  
tensor(53) -> 2
tensor(51) -> 0
tensor(53) -> 2

## Do character tokenizers perform better?

In [11]:
# CANINE-C: load the tokenizer and the encoder from the same checkpoint id so the two
# always match.
tokenizer = CanineTokenizer.from_pretrained("google/canine-c")
model = CanineModel.from_pretrained("google/canine-c")

In [12]:
# Embed every document chunk with the CANINE tokenizer/encoder pair, 12 chunks per batch.
# padding="longest" is handed to the project helper (presumably each batch is padded to
# its longest chunk — confirm in my_util).
embedded_docs = get_batched_embeddings(
    model=model,
    tokenizer=tokenizer,
    sentences=content,
    batch_size=12,
    padding="longest",
)

  4%|████▊                                                                                                                                  | 1/28 [00:27<12:12, 27.13s/it]

Shape of embedded tokens: torch.Size([12, 1522, 768])


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28/28 [07:55<00:00, 16.98s/it]


In [13]:
# Embed the query with the same helper and settings as the documents; it is wrapped in a
# one-element list because the helper is called with a list of sentences.
embedded_query = get_batched_embeddings(
    model=model,
    tokenizer=tokenizer,
    sentences=[default_query],
    batch_size=12,
    padding="longest",
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  8.95it/s]

Shape of embedded tokens: torch.Size([1, 29, 768])





In [14]:
# Rank the chunks against the query using the CANINE embeddings, with the same k and
# cosine setting as the SentenceTransformer comparison so the tables are comparable.
get_topk_similarity(
    encoded_query=embedded_query,
    encoded_docs=embedded_docs,
    k=10,
    is_cos_sim=True,
    debug=True,
)

Most similar pairs:
doc_idx	 score
290 	 0.6896
297 	 0.6896
176 	 0.6860
288 	 0.6839
264 	 0.6829
291 	 0.6801
137 	 0.6748
308 	 0.6743
267 	 0.6715
207 	 0.6697
