In [1]:
import gensim.downloader

In [2]:
list(gensim.downloader.info()['models'].keys())

['fasttext-wiki-news-subwords-300',
 'conceptnet-numberbatch-17-06-300',
 'word2vec-ruscorpora-300',
 'word2vec-google-news-300',
 'glove-wiki-gigaword-50',
 'glove-wiki-gigaword-100',
 'glove-wiki-gigaword-200',
 'glove-wiki-gigaword-300',
 'glove-twitter-25',
 'glove-twitter-50',
 'glove-twitter-100',
 'glove-twitter-200',
 '__testing_word2vec-matrix-synopsis']

In [3]:
w2v_vectors = gensim.downloader.load('word2vec-google-news-300')

In [5]:
# Candidate words that are each scored against the reference word 'say'.
compare_words = ['say', 'says', 'said', 'tell', 'told', 'state', 'stated', 'tree']

# Word -> score returned by w2v_vectors.similarity('say', word).
# Despite the variable name these are similarity scores ('say' vs. itself is ~1.0).
distances = {word: w2v_vectors.similarity('say', word) for word in compare_words}

distances

{'say': 1.0000001,
 'says': 0.38469434,
 'said': 0.40412587,
 'tell': 0.6163529,
 'told': 0.34618792,
 'state': 0.17350683,
 'stated': 0.21777253,
 'tree': 0.09066129}

In [None]:
w2v_vectors['acorn']

In [None]:
# KeyedVectors is not a dict and offers no `.get`; `get_vector` is the explicit
# lookup method and returns the same vector as `w2v_vectors['acorn']`.
w2v_vectors.get_vector("acorn")

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
model_name = "sentence-transformers/gtr-t5-large"

In [None]:
model = SentenceTransformer(model_name)

In [None]:
test_sentence = "i walk the line"

In [None]:
output_values = model.encode(test_sentence, output_value=None)
output_values

In [None]:
{k: v.size() for k,v in output_values.items()}

In [None]:
# tokenize() takes a batch (list) of texts. A bare string would be iterated
# character by character, so wrap the single sentence in a list to get the
# (1, n_tokens) shape of the sentence itself.
model.tokenize([test_sentence])['input_ids'].size()

In [None]:
model.tokenizer(test_sentence).tokens()

In [125]:
import spacy
import numpy as np

In [126]:
# Pipeline created via spacy.blank("en"); the transformer component below is
# the only pipe added to it in this cell.
nlp = spacy.blank("en")
# Settings handed to the "transformer" factory in add_pipe below.
config = {
    "model": {
        "@architectures": "spacy-transformers.TransformerModel.v3",
        # Other checkpoint ids, currently disabled:
        # "id": "roberta-base",
        # "id": "sentence-transformers/gtr-t5-large",
        # NOTE(review): upstream spacy-transformers documents this argument as
        # `name`; confirm that `id` is accepted by the installed version.
        "id": "intfloat/e5-base",
        "tokenizer_config": {"use_fast": True},
        # Presumably forwarded to the underlying Hugging Face model so that it
        # also returns attentions and per-layer hidden states — TODO confirm.
        "transformer_config": {"output_attentions": True, "output_hidden_states": True},
        # Optional settings, currently disabled:
        # "mixed_precision": True,
        # "grad_scaler_config": {"init_scale": 32768}
    }
}
# Keep a handle on the created component so its model and outputs can be inspected directly.
trf = nlp.add_pipe("transformer", config=config)

In [127]:
trf.model.initialize()

<spacy_transformers.layers.transformer_model.TransformerModel at 0x159b00280>

In [128]:
trf

<spacy_transformers.pipeline_component.Transformer at 0x159ff6140>

In [129]:
# Run the pipeline on a sentence with contractions and abbreviations, then
# show the text of every resulting token.
doc = nlp("Don't go into the devil's H.Q.")
print([str(token) for token in doc])

['Do', "n't", 'go', 'into', 'the', 'devil', "'s", 'H.Q.']


In [130]:
trf_data = doc._.trf_data
trf_data

TransformerData(wordpieces=WordpieceBatch(strings=[['[CLS]', 'don', "'", 't', 'go', 'into', 'the', 'devil', "'", 's', 'h', '.', 'q', '.', '[SEP]']], input_ids=array([[ 101, 2123, 1005, 1056, 2175, 2046, 1996, 6548, 1005, 1055, 1044,
        1012, 1053, 1012,  102]], dtype=int32), attention_mask=array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]],
      dtype=float32), lengths=[15], token_type_ids=array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)), model_output=ModelOutput([('last_hidden_state', array([[[-0.6071745 ,  0.49781385, -0.48421204, ...,  0.8948137 ,
         -0.11013687,  1.658252  ],
        [-0.6261945 ,  0.58509636, -0.38874942, ...,  0.7306586 ,
          0.190015  ,  1.2604722 ],
        [-0.5573779 ,  0.77217454, -0.37049142, ...,  0.950989  ,
          0.21684366,  1.7303813 ],
        ...,
        [-0.6385277 ,  0.6695092 , -0.4624164 , ...,  0.8396056 ,
          0.13702023,  1.4202889 ],
        [-0.46596324,  0.39745423, -0.423974

In [131]:
print(list(trf_data.wordpieces.strings[0]))

['[CLS]', 'don', "'", 't', 'go', 'into', 'the', 'devil', "'", 's', 'h', '.', 'q', '.', '[SEP]']


In [132]:
len(trf_data.wordpieces.strings[0])

15

In [133]:
len(doc)

8

In [134]:
trf_data.align

Ragged(data=array([[ 1],
       [ 1],
       [ 2],
       [ 3],
       [ 4],
       [ 5],
       [ 6],
       [ 7],
       [ 8],
       [ 9],
       [10],
       [11],
       [12],
       [13]], dtype=int32), lengths=array([1, 3, 1, 1, 1, 1, 2, 4], dtype=int32), data_shape=(-1,), starts_ends=None)

In [135]:
trf_data.align.data.shape

(14, 1)

In [136]:
list(trf_data.align.dataXd)

[1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]

In [137]:
trf_data.align.lengths.shape

(8,)

In [138]:
np.sum(trf_data.align.lengths)

14

In [139]:
list(trf_data.align.lengths)

[1, 3, 1, 1, 1, 1, 2, 4]

In [140]:
len(list(trf_data.align.lengths))

8

In [141]:
len(list(doc))

8

In [142]:
trf_data.model_output.last_hidden_state.shape

(1, 15, 768)

In [143]:
trf_data.model_output.pooler_output.shape

(1, 768)

In [144]:
doc[0].has_vector

False

In [145]:
from thinc.types import Ragged

# Create the token alignment between lexical tokens (spaCy's default tokenizer)
# and transformer tokens (WordPiece, SentencePiece, etc.).

# Mapping from each lexical token index to the list of transformer token indices.

# Tokens of `doc` as a plain list, so they can be addressed by position.
lexical_tokens = list(doc)

# Marker tokens that frame the wordpiece sequence; only referenced by the
# disabled stripping code below.
start_marker_token = '[CLS]'
end_marker_token = '[SEP]'

# Wordpiece strings of the first (and here only) entry in the batch.
transformer_tokens = trf_data.wordpieces.strings[0]
# Stripping the markers is left disabled: the alignment indices address
# positions in the full wordpiece list (index 1 is the first real wordpiece,
# index 0 is '[CLS]'), so removing the markers would shift every lookup.
# if transformer_tokens[0] == start_marker_token:
#     transformer_tokens = transformer_tokens[1:]
# if transformer_tokens[-1] == end_marker_token:
#     transformer_tokens = transformer_tokens[:-1]


# Lexical token index -> aligned wordpiece indices. Starts empty and is
# rebuilt further down in this cell; the values stored there are numpy arrays.
lex2trf_idx: dict[int, np.ndarray] = {}

# trf_idx: int = 0
# for lex_idx in range(len(lexical_tokens)):
#     trf_token_length: int = trf_data.align.lengths[lex_idx]
#     lex2trf_idx[lex_idx] = trf_data.align.dataXd[trf_idx:trf_idx + trf_token_length]
#     trf_idx += trf_token_length


def get_trf_token_idxes(lex_idx_: int, alignment_data: Ragged):
    """Return the transformer-token indices aligned with one lexical token.

    ``alignment_data.dataXd`` holds the aligned indices of all lexical tokens
    back to back, and ``alignment_data.lengths[i]`` says how many of them
    belong to lexical token ``i``.

    Args:
        lex_idx_: Position of the lexical token.
        alignment_data: Alignment exposing ``lengths`` and ``dataXd``.

    Returns:
        The slice of ``alignment_data.dataXd`` that belongs to ``lex_idx_``.
    """
    per_token_counts = alignment_data.lengths
    # Everything owned by the preceding lexical tokens is skipped.
    offset: int = int(np.sum(per_token_counts[:lex_idx_]))
    stop: int = offset + per_token_counts[lex_idx_]
    return alignment_data.dataXd[offset:stop]

# Lexical token index -> indices of the wordpieces aligned with that token.
lex2trf_idx = {
    lex_idx: get_trf_token_idxes(lex_idx, trf_data.align)
    for lex_idx in range(len(lexical_tokens))
}

print(lex2trf_idx)


# The same mapping resolved to the tokens themselves. The keys are the spaCy
# Token objects taken from `lexical_tokens` (not plain strings), which is what
# the annotation now states; the values are the aligned wordpiece strings.
lex2trf_tok: dict[spacy.tokens.Token, list[str]] = {
    lexical_tokens[lex_idx]: [transformer_tokens[trf_idx] for trf_idx in trf_idxes]
    for lex_idx, trf_idxes in lex2trf_idx.items()
}
lex2trf_tok



{0: array([1], dtype=int32), 1: array([1, 2, 3], dtype=int32), 2: array([4], dtype=int32), 3: array([5], dtype=int32), 4: array([6], dtype=int32), 5: array([7], dtype=int32), 6: array([8, 9], dtype=int32), 7: array([10, 11, 12, 13], dtype=int32)}


{Do: ['don'],
 n't: ['don', "'", 't'],
 go: ['go'],
 into: ['into'],
 the: ['the'],
 devil: ['devil'],
 's: ["'", 's'],
 H.Q.: ['h', '.', 'q', '.']}

In [146]:
test_doc = nlp("Hello this is a tomato")
test_doc

Hello this is a tomato

In [147]:
def spay_doc_to_str_list(spacy_doc):
    """Return ``str(token)`` for every token of ``spacy_doc``, in order.

    The function name (including its spelling) is kept as-is because other
    cells call it under this name.
    """
    return list(map(str, spacy_doc))

spay_doc_to_str_list(test_doc)

['Hello', 'this', 'is', 'a', 'tomato']

In [148]:
from spacy.tokens import Doc

tmp_doc = Doc(nlp.vocab, words=["asdfasdf", "asdfasf", "käse"])

In [149]:
trf(tmp_doc)

asdfasdf asdfasf käse 

In [150]:
tmp_doc._.trf_data

TransformerData(wordpieces=WordpieceBatch(strings=[['[CLS]', 'as', '##df', '##as', '##df', 'as', '##df', '##as', '##f', 'ka', '##se', '[SEP]']], input_ids=array([[  101,  2004, 20952,  3022, 20952,  2004, 20952,  3022,  2546,
        10556,  3366,   102]], dtype=int32), attention_mask=array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]], dtype=float32), lengths=[12], token_type_ids=array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)), model_output=ModelOutput([('last_hidden_state', array([[[-0.9446683 ,  0.36191848, -0.21284342, ...,  0.6667042 ,
         -0.14674456,  0.3264749 ],
        [-0.9064548 ,  0.8075475 , -0.11093315, ...,  0.62682796,
          0.20289385,  0.443856  ],
        [-0.84973395,  0.6919283 ,  0.00776123, ...,  0.70527303,
          0.10297091,  0.35144413],
        ...,
        [-1.1138823 ,  0.2287293 , -0.23094183, ...,  0.841911  ,
          0.03005426, -0.3098214 ],
        [-0.62520885,  0.09287652, -0.24744985, ...,  0.7510296 ,
          0.104

In [151]:
spay_doc_to_str_list(tmp_doc)

['asdfasdf', 'asdfasf', 'käse']

In [152]:
len(doc._.trf_data.tensors)

4

In [153]:
doc._.trf_data.tensors[0]

array([[[-0.6071745 ,  0.49781385, -0.48421204, ...,  0.8948137 ,
         -0.11013687,  1.658252  ],
        [-0.6261945 ,  0.58509636, -0.38874942, ...,  0.7306586 ,
          0.190015  ,  1.2604722 ],
        [-0.5573779 ,  0.77217454, -0.37049142, ...,  0.950989  ,
          0.21684366,  1.7303813 ],
        ...,
        [-0.6385277 ,  0.6695092 , -0.4624164 , ...,  0.8396056 ,
          0.13702023,  1.4202889 ],
        [-0.46596324,  0.39745423, -0.4239746 , ...,  0.97151417,
         -0.05619294,  1.6306806 ],
        [-0.48803407,  0.78188735, -0.37230065, ...,  1.181527  ,
          0.328764  ,  1.8286595 ]]], dtype=float32)

In [154]:
import thinc

In [155]:
nlp = spacy.blank("en")
config = {
    "model": {
        "@architectures": "spacy-transformers.Tok2VecTransformer.v3",
        # "id": "roberta-base",
        # "id": "sentence-transformers/gtr-t5-large",
        "get_spans": "spacy-transformers.get_doc_spans.v1",
        "pooling": {"@layers": "reduce_mean.v1"},
        "id": "intfloat/e5-base",
        "tokenizer_config": {"use_fast": True},
        "transformer_config": {"output_attentions": True},
        # "mixed_precision": True,
        # "grad_scaler_config": {"init_scale": 32768}
    }
}
trf2vec = nlp.add_pipe("transformer", config=config)

In [156]:
trf2vec.model.initialize()

<thinc.model.Model at 0x159363ec0>

In [157]:
trf2vec("Hello this is a tomato")

TypeError: 'str' object is not callable