<a href="https://colab.research.google.com/github/vishal-burman/PyTorch-Architectures/blob/master/misc/ComparisonUSEvsMPNet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages imported below (`datasets`, `transformers`)
# into the runtime via shell commands.
! pip install datasets
! pip install transformers

In [3]:
from typing import List
import random
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity as cs
import torch
import torch.nn.functional as F
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import tensorflow_hub as hub
# Remove the column-width limit so long question texts are displayed in full
# when DataFrames are rendered.
pd.set_option('display.max_colwidth', None)

In [5]:
# Load the "quora" dataset. Later cells iterate dataset["train"] and read each
# sample's "questions" -> "text" pair and its "is_duplicate" flag.
dataset = load_dataset("quora")

Using custom data configuration default


Downloading and preparing dataset quora/default (download: 55.48 MiB, generated: 55.46 MiB, post-processed: Unknown size, total: 110.94 MiB) to /root/.cache/huggingface/datasets/quora/default/0.0.0/36ba4cd42107f051a158016f1bea6ae3f4685c5df843529108a54e42d86c1e04...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

0 examples [00:00, ? examples/s]

Dataset quora downloaded and prepared to /root/.cache/huggingface/datasets/quora/default/0.0.0/36ba4cd42107f051a158016f1bea6ae3f4685c5df843529108a54e42d86c1e04. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
# Build a class-balanced evaluation set of question pairs:
# PAIRS_PER_CLASS duplicates (label 1) and PAIRS_PER_CLASS non-duplicates
# (label 0), taken in dataset order from the "train" split.
PAIRS_PER_CLASS = 2000

pairs = []
count_p, count_n = 0, 0
for sample in dataset["train"]:
  texts = sample["questions"]["text"]
  if sample["is_duplicate"]:
    # Positive pair: keep it only while the positive quota is not full.
    if count_p < PAIRS_PER_CLASS:
      count_p += 1
      pairs.append((texts[0], texts[1], 1))
  elif count_n < PAIRS_PER_CLASS:
    # Negative pair: keep it only while the negative quota is not full.
    count_n += 1
    pairs.append((texts[0], texts[1], 0))

  # Stop only once BOTH quotas are met. The previous condition compared
  # count_p twice, so it could exit before enough negatives were collected.
  if count_p == PAIRS_PER_CLASS and count_n == PAIRS_PER_CLASS:
    break

# Fails if the split ran out before either quota was filled.
assert len(pairs) == 2 * PAIRS_PER_CLASS, f"True size: {len(pairs)}"
# Mix positives and negatives; no seed is set in the visible cells, so the
# resulting order differs between runs.
random.shuffle(pairs)

In [8]:
# Load the MPNet sentence encoder (tokenizer + weights) from the same
# checkpoint and put the model in evaluation mode.
mpnet_checkpoint = 'sentence-transformers/all-mpnet-base-v2'
tokenizer_mpnet = AutoTokenizer.from_pretrained(mpnet_checkpoint)
model_mpnet = AutoModel.from_pretrained(mpnet_checkpoint)
model_mpnet.eval()

# Count the elements of every parameter tensor that requires gradients.
params = 0
for weight in model_mpnet.parameters():
  if weight.requires_grad:
    params += weight.numel()
print(f"Trainable Parameters --> {params}")

# Load the Universal Sentence Encoder (v4) module from TF-Hub.
model_use = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

INFO:absl:Using /tmp/tfhub_modules to cache modules.
INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder/4'.


Trainable Parameters --> 109486464


INFO:absl:Downloaded https://tfhub.dev/google/universal-sentence-encoder/4, Total size: 987.47MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder/4'.


In [9]:
def mean_pooling(model_output: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Average token embeddings over the sequence axis, ignoring masked tokens.

    Args:
        model_output: Indexable model output; element ``[0]`` is used as the
            per-token embeddings.
        attention_mask: Mask with one entry per token; it is broadcast over
            the embedding dimension and used as the averaging weight.

    Returns:
        The mask-weighted sum of the token embeddings along dim 1 divided by
        the mask sum along dim 1 (clamped to at least 1e-9 so an all-zero
        mask does not divide by zero).
    """
    hidden_states = model_output[0]
    # Give the mask the same shape as the embeddings so it can weight them.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / token_counts

def get_similarity_mpnet(sentences: List[str]) -> List[float]:
  """Score the first sentence against every other sentence using MPNet.

  The sentences are tokenized as one padded/truncated batch, encoded without
  gradient tracking, mean-pooled and L2-normalised, so the dot products
  computed below are cosine similarities.

  Args:
    sentences: Texts to embed; ``sentences[0]`` is the reference.

  Returns:
    One similarity per sentence after the first, in input order.
  """
  batch = tokenizer_mpnet(sentences, padding=True, truncation=True, return_tensors='pt')
  with torch.no_grad():
    outputs = model_mpnet(**batch)

  pooled = mean_pooling(outputs, batch['attention_mask'])
  unit_vectors = F.normalize(pooled, p=2, dim=1)
  # Full pairwise matrix; row 0 holds the reference sentence vs. all sentences.
  pairwise = torch.mm(unit_vectors, unit_vectors.transpose(0, 1))
  reference_row = pairwise[0].numpy().tolist()
  # Drop the leading entry, which is the reference compared with itself.
  return reference_row[1:]

def get_similarity_use(sentences: List[str]) -> List[float]:
  """Score the first sentence against every other sentence using USE.

  Args:
    sentences: Texts to embed; ``sentences[0]`` is the reference.

  Returns:
    One cosine similarity per sentence after the first, in input order, as a
    plain Python list (matching the annotation and ``get_similarity_mpnet``).
  """
  embeddings = model_use(sentences).numpy()
  # cs() yields the pairwise similarity matrix; row 0 is the reference
  # sentence vs. all sentences, and entry 0 of that row is the self-match.
  similarity = cs(embeddings)[0][1:]
  # Previously the NumPy slice itself was returned, contradicting the
  # declared List[float] return type.
  return similarity.tolist()

In [10]:
# Score every (text_1, text_2, label) pair with both encoders and collect
# (text_1, text_2, label, mpnet_score, use_score) tuples.
pairs_scores = []
for idx, pair in enumerate(pairs):
  # Report progress every 100 pairs, skipping the very first iteration.
  if idx > 0 and idx % 100 == 0:
    print(f"Count --> {idx}/{len(pairs)}")
  # Everything except the trailing label is text to be embedded.
  sentences = list(pair[:-1])
  score_mpnet = get_similarity_mpnet(sentences)[0]
  score_use = get_similarity_use(sentences)[0]
  pairs_scores.append(pair + (score_mpnet, score_use))

Count --> 100/4000
Count --> 200/4000
Count --> 300/4000
Count --> 400/4000
Count --> 500/4000
Count --> 600/4000
Count --> 700/4000
Count --> 800/4000
Count --> 900/4000
Count --> 1000/4000
Count --> 1100/4000
Count --> 1200/4000
Count --> 1300/4000
Count --> 1400/4000
Count --> 1500/4000
Count --> 1600/4000
Count --> 1700/4000
Count --> 1800/4000
Count --> 1900/4000
Count --> 2000/4000
Count --> 2100/4000
Count --> 2200/4000
Count --> 2300/4000
Count --> 2400/4000
Count --> 2500/4000
Count --> 2600/4000
Count --> 2700/4000
Count --> 2800/4000
Count --> 2900/4000
Count --> 3000/4000
Count --> 3100/4000
Count --> 3200/4000
Count --> 3300/4000
Count --> 3400/4000
Count --> 3500/4000
Count --> 3600/4000
Count --> 3700/4000
Count --> 3800/4000
Count --> 3900/4000


In [11]:
# Tabulate the scored pairs: one row per pair, with the label and both
# model similarity scores.
score_columns = ["text_1", "text_2", "ground_truth", "mpnet_score", "use_score"]
df = pd.DataFrame(data=pairs_scores, columns=score_columns)
df.head()

Unnamed: 0,text_1,text_2,ground_truth,mpnet_score,use_score
0,"If I hacked into the NSA and Pentagon and dumped all their files, (1) how big would the manhunt for me be; (2) would it finally open the eyes of the public, and; (3) how would I be seen by the hacker circle?",We are maintaining our data as a flat file in server. What are the ways handled by hackers to open the flat file?,0,0.273686,0.420096
1,What are the best horror novel in 2016?,What are some of the best horror novels of 2014?,0,0.71115,0.922076
2,The Martian (2015 movie): Why couldn't Matt Damon grow more potatoes using the poop logic he used for the first time?,How plausible is the science in The Martian?,0,0.384449,0.516078
3,"Who is the better singer: Arijit Singh, Mohit Chauhan Or Sonu Nigam?",How many of you think KK is much better than Arijit Singh?,0,0.530873,0.587284
4,How's life going?,How is your life going?,0,0.942919,0.879934


In [12]:
# Turn each similarity score into a 0/1 prediction: 1 when the score is at
# least `threshold`, otherwise 0. The score columns are overwritten in place,
# so the raw similarities are no longer available in `df` afterwards.
threshold = 0.70
for score_column in ("mpnet_score", "use_score"):
  df[score_column] = df[score_column].apply(lambda score: int(score >= threshold))
df.head()

Unnamed: 0,text_1,text_2,ground_truth,mpnet_score,use_score
0,"If I hacked into the NSA and Pentagon and dumped all their files, (1) how big would the manhunt for me be; (2) would it finally open the eyes of the public, and; (3) how would I be seen by the hacker circle?",We are maintaining our data as a flat file in server. What are the ways handled by hackers to open the flat file?,0,0,0
1,What are the best horror novel in 2016?,What are some of the best horror novels of 2014?,0,1,1
2,The Martian (2015 movie): Why couldn't Matt Damon grow more potatoes using the poop logic he used for the first time?,How plausible is the science in The Martian?,0,0,0
3,"Who is the better singer: Arijit Singh, Mohit Chauhan Or Sonu Nigam?",How many of you think KK is much better than Arijit Singh?,0,0,0
4,How's life going?,How is your life going?,0,1,1


In [13]:
# Pull the label column and the two prediction columns out as arrays.
ground_truth = np.array(df["ground_truth"])
mpnet_score = np.array(df["mpnet_score"])
use_score = np.array(df["use_score"])

# Accuracy = number of rows where prediction equals label / number of rows.
n_rows = len(ground_truth)
acc_mpnet = np.sum(mpnet_score == ground_truth) / n_rows
acc_use = np.sum(use_score == ground_truth) / n_rows
print(f"Accuracy MPNet --> {acc_mpnet}")
print(f"Accuracy USE --> {acc_use}")

Accuracy MPNet --> 0.804
Accuracy USE --> 0.74825
