<a href="https://colab.research.google.com/github/vrjayaprakash/Model/blob/main/SentenceTransformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer,util
model = SentenceTransformer('all-MiniLM-L6-v2')

In [3]:
sentences = ['This framework generates embeddings for each input sentence',
             'Sentences are passed as a list of string']

embeddings = model.encode(sentences)

for sentence, embedding in zip(sentences, embeddings):
  print("Sentence:", sentence)
  print("Embedding:", embedding)
  print("")

Sentence: This framework generates embeddings for each input sentence
Embedding: [-1.37173440e-02 -4.28515524e-02 -1.56286098e-02  1.40537424e-02
  3.95538136e-02  1.21796273e-01  2.94333436e-02 -3.17523964e-02
  3.54959816e-02 -7.93140158e-02  1.75878443e-02 -4.04369906e-02
  4.97259386e-02  2.54911911e-02 -7.18700886e-02  8.14969018e-02
  1.47071364e-03  4.79626842e-02 -4.50336263e-02 -9.92174745e-02
 -2.81770118e-02  6.45046607e-02  4.44670692e-02 -4.76217195e-02
 -3.52952518e-02  4.38671783e-02 -5.28566279e-02  4.33054753e-04
  1.01921476e-01  1.64072346e-02  3.26996781e-02 -3.45987007e-02
  1.21339252e-02  7.94871002e-02  4.58342349e-03  1.57778263e-02
 -9.68209095e-03  2.87626106e-02 -5.05806133e-02 -1.55793801e-02
 -2.87906900e-02 -9.62282531e-03  3.15556824e-02  2.27349177e-02
  8.71449634e-02 -3.85027453e-02 -8.84718448e-02 -8.75500590e-03
 -2.12342944e-02  2.08923575e-02 -9.02077258e-02 -5.25732450e-02
 -1.05638755e-02  2.88310610e-02 -1.61455031e-02  6.17836649e-03
 -1.23234

In [4]:
emb1 = model.encode("I am eating apple")
emb2 = model.encode("I like fruits")
cos_sim = util.cos_sim(emb1, emb2)
print("Cosine_similarity:", cos_sim)

Cosine_similarity: tensor([[0.5398]])


In [5]:
sentences = ['A man is eating food.',
             'A man is eating a peice of bread.',
             'The girl is carrying a baby.',
             'A man is riding a horse.',
             'A woman is playing violin.'
             'Two men pushed carts through the woods.',
             'A man is riding a white horse on an enclosed ground.',
             'A monkey is playing drums.',
             'Someone in a gorilla costume is playing a set of drums.'
             ]

embeddings = model.encode(sentences)

cos_sim = util.cos_sim(embeddings, embeddings)

cos_sim

tensor([[ 1.0000,  0.7850, -0.1050,  0.2474, -0.1003,  0.1707,  0.0476,  0.0630],
        [ 0.7850,  1.0000, -0.0715,  0.1678, -0.1159,  0.1367,  0.0418,  0.0069],
        [-0.1050, -0.0715,  1.0000, -0.1088, -0.0032, -0.0928,  0.0231,  0.0247],
        [ 0.2474,  0.1678, -0.1088,  1.0000, -0.0252,  0.7369,  0.0821,  0.1389],
        [-0.1003, -0.1159, -0.0032, -0.0252,  1.0000, -0.0067,  0.1577,  0.2015],
        [ 0.1707,  0.1367, -0.0928,  0.7369, -0.0067,  1.0000,  0.0495,  0.1191],
        [ 0.0476,  0.0418,  0.0231,  0.0821,  0.1577,  0.0495,  1.0000,  0.6433],
        [ 0.0630,  0.0069,  0.0247,  0.1389,  0.2015,  0.1191,  0.6433,  1.0000]])

In [6]:
all_sentence_combinations = []
for i in range(len(cos_sim)-1):
  for j in range(i+1, len(cos_sim)):
    all_sentence_combinations.append((cos_sim[i][j], i ,j))
all_sentence_combinations


[(tensor(0.7850), 0, 1),
 (tensor(-0.1050), 0, 2),
 (tensor(0.2474), 0, 3),
 (tensor(-0.1003), 0, 4),
 (tensor(0.1707), 0, 5),
 (tensor(0.0476), 0, 6),
 (tensor(0.0630), 0, 7),
 (tensor(-0.0715), 1, 2),
 (tensor(0.1678), 1, 3),
 (tensor(-0.1159), 1, 4),
 (tensor(0.1367), 1, 5),
 (tensor(0.0418), 1, 6),
 (tensor(0.0069), 1, 7),
 (tensor(-0.1088), 2, 3),
 (tensor(-0.0032), 2, 4),
 (tensor(-0.0928), 2, 5),
 (tensor(0.0231), 2, 6),
 (tensor(0.0247), 2, 7),
 (tensor(-0.0252), 3, 4),
 (tensor(0.7369), 3, 5),
 (tensor(0.0821), 3, 6),
 (tensor(0.1389), 3, 7),
 (tensor(-0.0067), 4, 5),
 (tensor(0.1577), 4, 6),
 (tensor(0.2015), 4, 7),
 (tensor(0.0495), 5, 6),
 (tensor(0.1191), 5, 7),
 (tensor(0.6433), 6, 7)]

In [7]:
all_sentence_combinations = sorted(all_sentence_combinations, key =lambda x : x[0], reverse = True)

print("Top-5 most similar pairs:")
for score, i, j in all_sentence_combinations[0:5]:
  print("{} \t {} \t {: .4f}".format(sentences[i], sentences[j], cos_sim[i][j]))


Top-5 most similar pairs:
A man is eating food. 	 A man is eating a peice of bread. 	  0.7850
A man is riding a horse. 	 A man is riding a white horse on an enclosed ground. 	  0.7369
A monkey is playing drums. 	 Someone in a gorilla costume is playing a set of drums. 	  0.6433
A man is eating food. 	 A man is riding a horse. 	  0.2474
A woman is playing violin.Two men pushed carts through the woods. 	 Someone in a gorilla costume is playing a set of drums. 	  0.2015


In [8]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('clips/mfaq')

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/778 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/464 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/294 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
question = "<Q>How many models can I host on HuggingFace?"
answer_1 = "<A>All plans come with unlimited private models and datasets."
answer_2 = "<A>AutoNLP is an automatic way to train and deploy state-of-the-art NLP models, seamlessly integrated with the Hugging Face ecosystem."
answer_3 = "<A>Based on how much training data and model variants are created, we send you a compute cost and payment link - as low as $10 per job."

query_embedding = model.encode(question)
corpus_embeddings = model.encode([answer_1, answer_2, answer_3])

print(util.semantic_search(query_embedding, corpus_embeddings))

[[{'corpus_id': 0, 'score': 0.564632773399353}, {'corpus_id': 2, 'score': 0.5142343640327454}, {'corpus_id': 1, 'score': 0.47300419211387634}]]


In [10]:
from transformers import pipeline

In [11]:
qa_model = pipeline("question-answering")
question = "How many models can I host on HuggingFace?"
context = "All plans come with unlimited private models and datasets."

qa_model(question = question, context = context)


No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


{'score': 0.701718270778656, 'start': 20, 'end': 29, 'answer': 'unlimited'}

In [12]:
from sklearn.cluster import KMeans
import numpy as np

In [13]:
embedder = SentenceTransformer('all-MiniLM-L6-v2')

In [14]:
corpus =  ['A man is eating food.',
          'A man is eating a piece of bread.',
          'Horse is eating grass.',
          'A man is eating pasta.',
          'A Woman is eating Biryani.',
          'The girl is carrying a baby.',
          'The baby is carried by the woman',
          'A man is riding a horse.',
          'A man is riding a white horse on an enclosed ground.',
          'A monkey is playing drums.',
          'Someone in a gorilla costume is playing a set of drums.',
          'A cheetah is running behind its prey.',
          'A cheetah chases prey on across a field.',
          'The cheetah is chasing a man who is riding the horse.',
          'man and women with their baby are watching cheetah in zoo'
          ]

corpus_embeddings = embedder.encode(corpus)

corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis = 1, keepdims = True)



In [15]:
corpus_embeddings[0]

array([ 3.32415663e-02,  4.40607499e-03, -6.27700286e-03,  4.83787432e-02,
       -1.38702288e-01, -3.36174220e-02,  1.01128325e-01, -5.43849692e-02,
       -4.32477221e-02, -3.99410948e-02,  7.78632425e-03, -1.27489083e-02,
       -6.68302402e-02, -1.73866116e-02,  4.74505909e-02, -5.77242747e-02,
        1.01888388e-01, -9.11660900e-04,  8.22614655e-02, -5.03415987e-02,
        6.77303076e-02,  4.08765301e-02, -3.58018763e-02, -1.00682445e-01,
       -6.69356575e-03, -5.31686135e-02,  1.00335166e-01, -5.46136349e-02,
       -2.28481777e-02,  1.38387177e-02,  7.48658255e-02, -6.17880449e-02,
        6.39215931e-02,  1.62387174e-02, -5.32299168e-02, -3.86083834e-02,
        3.15276086e-02, -8.11529905e-02, -3.31432223e-02, -5.38496650e-04,
       -3.96068720e-03, -1.52733978e-02, -9.86401457e-04,  9.57987681e-02,
       -5.42920157e-02,  1.84571967e-02, -1.07143618e-01,  1.38884932e-02,
        3.94072458e-02, -2.69243810e-02, -9.15989354e-02, -1.14195086e-02,
        3.38137336e-02, -

In [16]:
cluster_model = KMeans(n_clusters = 4)
cluster_model.fit(corpus_embeddings)
cluster_assignment = cluster_model.labels_

In [17]:
print(cluster_assignment)

[1 1 3 1 1 2 2 3 3 0 3 0 0 0 0]


In [18]:
clustered_sentences = {}
for sentence_id, cluster_id in enumerate(cluster_assignment):
  if cluster_id not in clustered_sentences:
    clustered_sentences[cluster_id] = []

  clustered_sentences[cluster_id].append(corpus[sentence_id])

clustered_sentences


{np.int32(1): ['A man is eating food.',
  'A man is eating a piece of bread.',
  'A man is eating pasta.',
  'A Woman is eating Biryani.'],
 np.int32(3): ['Horse is eating grass.',
  'A man is riding a horse.',
  'A man is riding a white horse on an enclosed ground.',
  'Someone in a gorilla costume is playing a set of drums.'],
 np.int32(2): ['The girl is carrying a baby.',
  'The baby is carried by the woman'],
 np.int32(0): ['A monkey is playing drums.',
  'A cheetah is running behind its prey.',
  'A cheetah chases prey on across a field.',
  'The cheetah is chasing a man who is riding the horse.',
  'man and women with their baby are watching cheetah in zoo']}