In [None]:
# %pip install transformers pandas scipy numpy

In [4]:
from transformers import BertModel, AutoTokenizer
import pandas as pd

In [5]:
model_name = "bert-base-cased"

In [6]:
model = BertModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

## Tokenizers

In [7]:
sentence = "When life gives you lemons,don't make lemonade."

In [8]:
tokens = tokenizer.tokenize(sentence)
tokens

['When',
 'life',
 'gives',
 'you',
 'lemon',
 '##s',
 ',',
 'don',
 "'",
 't',
 'make',
 'lemon',
 '##ade',
 '.']

In [9]:
vocab = tokenizer.vocab
vocab_df = pd.DataFrame({"token":vocab.keys(),"token_id":vocab.values()})
vocab_df = vocab_df.sort_values(by="token_id").set_index("token_id")
vocab_df

Unnamed: 0_level_0,token
token_id,Unnamed: 1_level_1
0,[PAD]
1,[unused1]
2,[unused2]
3,[unused3]
4,[unused4]
...,...
28991,##）
28992,##，
28993,##－
28994,##／


In [10]:
token_ids = tokenizer.encode(sentence)
token_ids

[101,
 1332,
 1297,
 3114,
 1128,
 22782,
 1116,
 117,
 1274,
 112,
 189,
 1294,
 22782,
 6397,
 119,
 102]

In [11]:
print(len(tokens),"-",len(token_ids))

14 - 16


In [12]:
vocab_df.iloc[101]

token    [CLS]
Name: 101, dtype: object

In [13]:
vocab_df.iloc[102]


token    [SEP]
Name: 102, dtype: object

In [14]:
list(zip(tokens, token_ids[1:-1]))

[('When', 1332),
 ('life', 1297),
 ('gives', 3114),
 ('you', 1128),
 ('lemon', 22782),
 ('##s', 1116),
 (',', 117),
 ('don', 1274),
 ("'", 112),
 ('t', 189),
 ('make', 1294),
 ('lemon', 22782),
 ('##ade', 6397),
 ('.', 119)]

In [15]:
tokenizer.decode(token_ids[1:-1])

"When life gives you lemons, don't make lemonade."

In [16]:
tokenizer_out = tokenizer(sentence)
tokenizer_out

{'input_ids': [101, 1332, 1297, 3114, 1128, 22782, 1116, 117, 1274, 112, 189, 1294, 22782, 6397, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [17]:
sentence2 = sentence.replace("don't","")

In [18]:
tokenizer_out2 = tokenizer([sentence,sentence2],padding = True)
# padding = True will add padding tokens to make the input the same length. used when no of sentences are even
tokenizer_out2

{'input_ids': [[101, 1332, 1297, 3114, 1128, 22782, 1116, 117, 1274, 112, 189, 1294, 22782, 6397, 119, 102], [101, 1332, 1297, 3114, 1128, 22782, 1116, 117, 1294, 22782, 6397, 119, 102, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]}

In [19]:
tokenizer.decode(tokenizer_out2["input_ids"][0])


"[CLS] When life gives you lemons, don't make lemonade. [SEP]"

In [20]:
tokenizer.decode(tokenizer_out2["input_ids"][1])

'[CLS] When life gives you lemons, make lemonade. [SEP] [PAD] [PAD] [PAD]'

## Word embeddings

In [None]:

from scipy.spatial.distance import cosine

In [43]:
text = "Tokenize me this please"
encoded_inputs = tokenizer(text, return_tensors='pt')

In [44]:
output = model(**encoded_inputs)
output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 6.3545e-01,  1.7615e-01,  6.1589e-01,  ..., -1.1664e-01,
           5.5535e-01, -2.4577e-01],
         [ 2.6871e-01,  1.0804e-01, -7.9291e-02,  ...,  8.8400e-02,
           8.2027e-01, -7.8417e-04],
         [ 5.9982e-02,  5.7306e-01, -2.6445e-01,  ...,  2.0249e-01,
          -8.9708e-01,  2.3878e-01],
         ...,
         [-1.6705e-01,  3.0571e-01,  4.3537e-01,  ...,  8.0311e-02,
           2.3093e-01,  1.8002e-01],
         [ 4.5541e-01,  1.2220e-01,  5.8000e-01,  ...,  3.0526e-01,
           5.1684e-01, -2.6089e-01],
         [ 7.1148e-01,  3.9665e-02,  3.6133e-01,  ..., -4.8748e-01,
           5.2402e-01, -8.0548e-01]]], grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-0.7571,  0.5197,  0.9999, -0.9967,  0.9707,  0.9701,  0.9926, -0.9949,
         -0.9859, -0.6731,  0.9896,  0.9993, -0.9988, -0.9999,  0.8604, -0.9850,
          0.9931, -0.5585, -1.0000, -0.5781, -0.6435, -0.9999,  0.1552,  0.976

In [45]:
last_hidden_state = output.last_hidden_state
pooler_output = output.pooler_output

In [46]:
last_hidden_state.shape

torch.Size([1, 8, 768])

In [47]:
pooler_output.shape

torch.Size([1, 768])

In [48]:
def predict(text):
    encoded_inputs = tokenizer(text, return_tensors='pt')
    return model(**encoded_inputs)[0]

In [49]:
sentence1 = "There was a fly drinking from the soup"
sentence2 = "To become a commercial pilot, he had to fly for 1500 hours"

tokens1 = tokenizer.tokenize(sentence1)
tokens2 = tokenizer.tokenize(sentence2)

out1 = predict(sentence1)
out2 = predict(sentence2) 

emb1 = out1[0:,tokens1.index("fly"),:].detach()
emb2 = out1[0:,tokens2.index("fly"),:].detach()

In [50]:
emb1.shape, emb2.shape

(torch.Size([1, 768]), torch.Size([1, 768]))

In [52]:
cosine(emb1.numpy().flatten(), emb2.numpy().flatten())


0.7663925588130951

## Masked Language Modeling

In [53]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
from scipy.special import softmax
import numpy as np

In [54]:
model = "bert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(model)
model = AutoModelForMaskedLM.from_pretrained(model)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [56]:
mask = tokenizer.mask_token
sentence = f"I want to {mask} pizza for tonight."

tokens = tokenizer.tokenize(sentence)
encoded_inputs = tokenizer(sentence, return_tensors='pt')
output = model(**encoded_inputs)
logits = output.logits.detach().numpy()[0]

mask_logits = logits[tokens.index(mask)+1]
confidence_score = softmax(mask_logits)

for i in np.argsort(confidence_score)[::-1][:5]:
    pred_token = tokenizer.decode(i)
    score = confidence_score[i]
    
    # print(f"{pred_token} - {score:.2f}")
    print(sentence.replace(mask,pred_token),"-",score)

I want to have pizza for tonight. - 0.257289
I want to get pizza for tonight. - 0.17849614
I want to eat pizza for tonight. - 0.15555531
I want to make pizza for tonight. - 0.11422412
I want to order pizza for tonight. - 0.09823137


## Semantic Search Index

In [60]:
%pip install datasets sentence-transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting sentence-transformers
  Downloading sentence_transformers-2.3.1-py3-none-any.whl (132 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.8/132.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
Collecting nltk (from sentence-transformers)
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hCollecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp311-cp311-macosx_11_0_arm64.whl (1.2 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m31m17.7 MB/s[0m eta [36m0:00:01[0m
Collecting click (from nltk->sentence-transformers)
  Downloading click-8.1.7-py3-none-any.whl (97 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [68]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
import torch

In [62]:
dataset = load_dataset("multi_news", split="test")
df = dataset.to_pandas().sample(2000, random_state=42)


Downloading data:   0%|          | 0.00/295M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/28.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/39.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/40.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/44972 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5622 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5622 [00:00<?, ? examples/s]

In [63]:
model = SentenceTransformer('all-MiniLM-L6-v2')
passage_embeddings = list(model.encode(df["summary"].to_list(),show_progress_bar=True))
passage_embeddings[0].shape

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/63 [00:00<?, ?it/s]

(384,)

In [64]:
query = "Find me some articles about technology and artificial intelligence"
query_embedding = model.encode(query)
query_embedding.shape

(384,)

In [67]:
similarities = util.cos_sim(query_embedding, passage_embeddings)
similarities.shape

  b = torch.tensor(b)


torch.Size([1, 2000])

In [71]:
top_indices = torch.topk(similarities.flatten(), 3).indices
top_indices

tensor([1266, 1834, 1612])

In [78]:
top_relevent_passages = [df.iloc[x.item()]["summary"][:200] +"..." for x in top_indices]
top_relevent_passages

['– Are you a "digital native" or a "digital immigrant," and does it make a difference? Research recently published in the Teaching and Teacher Education journal indicates the concept of so-called digit...',
 "– Using methods borrowed from Google, a group of researchers has analyzed all Wikipedia pages and determined that, at least on the English language version of the site, Frank Sinatra is the world's mo...",
 '– The "tech surge" to fix HealthCare.gov includes some names from the industry\'s biggest players. Among them, per a Health department blog post, is Michael Dickerson, on leave from his job as a site r...']

In [79]:
def find_relavant_news(query):
    query_embedding = model.encode(query)
    similarities = util.cos_sim(query_embedding, passage_embeddings)
    top_indices = torch.topk(similarities.flatten(), 3).indices
    top_relevent_passages = [df.iloc[x.item()]['summary'][:200] + "..." for x in top_indices]
    
    return top_relevent_passages

In [80]:
find_relavant_news("Natural disaster")

['– Hopes are fading for people still believed to be trapped under mud and debris after a massive landslide in Washington state. At least eight people have now been confirmed dead from the mudslide that...',
 '– A sad milestone out of Japan: Two weeks after the quake struck, its official death toll has broken the 10,000 mark—and that number is still on the rise, with more than 17,400 missing. Police estimat...',
 '– A Haitian Red Cross official estimated today that 45,000 to 50,000 people perished in the shattering earthquake Tuesday, as President Obama pledged US support of $100 million for what he said is lik...']