In [14]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

In [15]:
# Single source of truth for the checkpoint so the tokenizer and the model
# are always loaded from the same pretrained weights/vocabulary.
MODEL_NAME = 'bert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
# The model is only used for inference in this notebook; make evaluation mode
# (dropout disabled) explicit rather than relying on the loader's default.
model.eval()


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
# Input texts to embed and compare pairwise. Each entry is embedded
# independently and addressed by its position in this list downstream.
# NOTE(review): the last entry looks like a deliberate nonsense/control
# sentence — confirm intent.
paragraphs = [
    "Dog and cat are common house pets.",
    "Zoos have animals like zebras and wolves.",
    "I had a great day today.",
    "I really like animals a lot.",
    "Gibberish afadadf affafa afafa."
]

In [17]:
def mean_pool(last_hidden_state, attention_mask):
    """Average token embeddings per sequence, ignoring padded positions.

    Args:
        last_hidden_state: tensor indexed as (batch, token, hidden) holding
            the per-token outputs of the model.
        attention_mask: tensor indexed as (batch, token) with 1 for real
            tokens and 0 for padding, as returned by the tokenizer.

    Returns:
        Tensor indexed as (batch, hidden): the mean over real tokens only.
    """
    mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
    summed = (last_hidden_state * mask).sum(dim=1)
    # Guard against division by zero for a (theoretical) all-padding row.
    token_counts = mask.sum(dim=1).clamp(min=1)
    return summed / token_counts


# Tokenize every paragraph as ONE padded batch so the model runs a single
# forward pass instead of one pass per paragraph.
tokens = tokenizer(paragraphs, padding=True, truncation=True, return_tensors="pt")

with torch.no_grad():
    outputs = model(**tokens)

# Mask-aware mean pooling: padding added for batching must not leak into the
# sentence vector, so each row is averaged over its real tokens only. The
# result is a 2-D array with one row per paragraph, in input order.
paragraph_embeddings = mean_pool(
    outputs.last_hidden_state, tokens["attention_mask"]
).numpy()

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity of every paragraph embedding against every
# other; with a single argument the rows are compared with themselves, so
# entry [i][j] is the score between paragraph i and paragraph j.
similarity_matrix = cosine_similarity(paragraph_embeddings)


In [19]:
# Print the score for every ordered pair of distinct paragraphs
# (both (i, j) and (j, i) are reported; self-comparisons are skipped).
print("Similarity Matrix:")
for row, first_text in enumerate(paragraphs):
    for col, second_text in enumerate(paragraphs):
        if row == col:
            continue  # a paragraph compared with itself is not informative
        score = similarity_matrix[row][col]
        print("Similarity between:")
        print(f"  Paragraph {row + 1}: {first_text}")
        print(f"  Paragraph {col + 1}: {second_text}")
        print(f"  Score: {score:.4f}")
        print("----")


Similarity Matrix:
Similarity between:
  Paragraph 1: Dog and cat are common house pets.
  Paragraph 2: Zoos have animals like zebras and wolves.
  Score: 0.7456
----
Similarity between:
  Paragraph 1: Dog and cat are common house pets.
  Paragraph 3: I had a great day today.
  Score: 0.5256
----
Similarity between:
  Paragraph 1: Dog and cat are common house pets.
  Paragraph 4: I really like animals a lot.
  Score: 0.6068
----
Similarity between:
  Paragraph 1: Dog and cat are common house pets.
  Paragraph 5: Gibberish afadadf affafa afafa.
  Score: 0.4532
----
Similarity between:
  Paragraph 2: Zoos have animals like zebras and wolves.
  Paragraph 1: Dog and cat are common house pets.
  Score: 0.7456
----
Similarity between:
  Paragraph 2: Zoos have animals like zebras and wolves.
  Paragraph 3: I had a great day today.
  Score: 0.5157
----
Similarity between:
  Paragraph 2: Zoos have animals like zebras and wolves.
  Paragraph 4: I really like animals a lot.
  Score: 0.6322
----
S