<a href="https://colab.research.google.com/github/zeeshansalim1234/Summer2021/blob/main/results_publication.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [798]:

!pip install sentence-transformers
!pip install tweepy
!pip install bert-extractive-summarizer
!pip install nltk
!pip install google-cloud-vision



In [799]:
import pandas as pd
import json
import os
from sentence_transformers import SentenceTransformer, util
from google.colab import drive
import tweepy
import re
import os,io
from google.cloud import vision
from google.cloud.vision_v1 import types
import pandas as pd
import nltk
import numpy as np


# Query

In [858]:
#sample_texts=["play the record","record the play","play the video"]
sample_texts=["Three years later, the coffin was still full of Jello.","The fish dreamed of escaping the fishbowl and into the toilet where he saw his friends go.","The person box was packed with jelly many dozens of moths later.","Jello is tasty"]
corpus=sample_texts[1:]
query=sample_texts[0]


In [865]:
print("Query: "+query+"\n")
for i in range(0,len(corpus)):
  print("Corpus["+str(i)+"]: "+corpus[i])

Query: Three years later, the coffin was still full of Jello.

Corpus[0]: The fish dreamed of escaping the fishbowl and into the toilet where he saw his friends go.
Corpus[1]: The person box was packed with jelly many dozens of moths later.
Corpus[2]: Jello is tasty
Corpus[3]: cancer


# SPECTRE (with cosine-similarity)

In [866]:
model = SentenceTransformer('allenai-specter')   # loading specter model

In [868]:
corpus_embeddings=model.encode(corpus, convert_to_tensor=True)

In [869]:

query_embedding = model.encode(query, convert_to_tensor=True)

In [870]:
search_hits = util.semantic_search(query_embedding, corpus_embeddings,top_k=10)
search_hits = search_hits[0]

In [871]:
print(str(search_hits))

[{'corpus_id': 1, 'score': 0.8889927864074707}, {'corpus_id': 2, 'score': 0.8098993301391602}, {'corpus_id': 0, 'score': 0.8050113320350647}, {'corpus_id': 3, 'score': 0.7104994058609009}]


In [872]:
print("Result SPECTRE:\n")
count=0
for hit in search_hits:
  print(str(1+count)+") "+corpus[hit['corpus_id']]+"("+str(hit['score'])+")\n")
  count+=1
  

Result SPECTRE:

1) The person box was packed with jelly many dozens of moths later.(0.8889927864074707)

2) Jello is tasty(0.8098993301391602)

3) The fish dreamed of escaping the fishbowl and into the toilet where he saw his friends go.(0.8050113320350647)

4) cancer(0.7104994058609009)



# Base-BERT (with cosine-similarity)


In [873]:
# BERT
sentences=sample_texts
BERT_model_name='sentence-transformers/bert-base-nli-mean-tokens'


In [874]:
from transformers import AutoTokenizer, AutoModel
import torch

In [875]:
tokenizer=AutoTokenizer.from_pretrained(BERT_model_name)
BERT_model=AutoModel.from_pretrained(BERT_model_name)


In [876]:
tokens={'input_ids': [],'attention_mask': []}

for sentence in sentences:
  new_tokens=tokenizer.encode_plus(sentence,max_length=128,truncation=True,padding='max_length',return_tensors='pt')
  tokens['input_ids'].append(new_tokens['input_ids'][0])
  tokens['attention_mask'].append(new_tokens['attention_mask'][0])

tokens['input_ids']=torch.stack(tokens['input_ids'])
tokens['attention_mask']=torch.stack(tokens['attention_mask'])
outputs=BERT_model(**tokens)
embeddings=outputs.last_hidden_state
attention=tokens['attention_mask']
attention.shape
mask=attention.unsqueeze(-1).expand(embeddings.shape).float()
mask_embeddings=embeddings*mask
summed=torch.sum(mask_embeddings,1)
counts=torch.clamp(mask.sum(1),min=1e-9)
mean_pooled=summed/counts

In [877]:
from sklearn.metrics.pairwise import cosine_similarity

mean_pooled=mean_pooled.detach().numpy()
result_bert=cosine_similarity([mean_pooled[0]],mean_pooled[1:])


In [878]:
print("Result BERT(Cosine):\n")
for i in range(0,len(result_bert[0])):
  print(str(i+1)+") "+corpus[i]+"("+str(result_bert[0][i])+")\n")

Result BERT(Cosine):

1) The fish dreamed of escaping the fishbowl and into the toilet where he saw his friends go.(0.33093363)

2) The person box was packed with jelly many dozens of moths later.(0.6258964)

3) Jello is tasty(0.40863195)

4) cancer(0.25527912)



# Base-BERT(with euclidean distance)

In [879]:
from sklearn.metrics.pairwise import euclidean_distances

result_bert=euclidean_distances([mean_pooled[0]],mean_pooled[1:])

print("Result BERT(Euclidean):\n")
for i in range(0,len(result_bert[0])):
  print(str(i+1)+") "+corpus[i]+"("+str(result_bert[0][i])+")\n")

Result BERT(Euclidean):

1) The fish dreamed of escaping the fishbowl and into the toilet where he saw his friends go.(18.581127)

2) The person box was packed with jelly many dozens of moths later.(13.716298)

3) Jello is tasty(17.838797)

4) cancer(20.03004)



# TinyBERT (with cosine-similarity)

In [880]:
# TinyBERT

sentences=sample_texts
TinyBERT_model_name='sentence-transformers/paraphrase-TinyBERT-L6-v2'

tokenizer=AutoTokenizer.from_pretrained(TinyBERT_model_name)
TinyBERT_model=AutoModel.from_pretrained(TinyBERT_model_name)

tokenizer=AutoTokenizer.from_pretrained(TinyBERT_model_name)
TinyBERT_model=AutoModel.from_pretrained(TinyBERT_model_name)

tokens={'input_ids': [],'attention_mask': []}

for sentence in sentences:
  new_tokens=tokenizer.encode_plus(sentence,max_length=128,truncation=True,padding='max_length',return_tensors='pt')
  tokens['input_ids'].append(new_tokens['input_ids'][0])
  tokens['attention_mask'].append(new_tokens['attention_mask'][0])

tokens['input_ids']=torch.stack(tokens['input_ids'])
tokens['attention_mask']=torch.stack(tokens['attention_mask'])
outputs=TinyBERT_model(**tokens)
embeddings=outputs.last_hidden_state
attention=tokens['attention_mask']
attention.shape
mask=attention.unsqueeze(-1).expand(embeddings.shape).float()
mask_embeddings=embeddings*mask
summed=torch.sum(mask_embeddings,1)
counts=torch.clamp(mask.sum(1),min=1e-9)
mean_pooled=summed/counts


mean_pooled=mean_pooled.detach().numpy()
result_tinybert=cosine_similarity([mean_pooled[0]],mean_pooled[1:])

In [881]:
print("Result TinyBERT:\n")
for i in range(0,len(result_tinybert[0])):
  print(str(i+1)+") "+corpus[i]+"("+str(result_tinybert[0][i])+")\n")

Result TinyBERT:

1) The fish dreamed of escaping the fishbowl and into the toilet where he saw his friends go.(0.16096786)

2) The person box was packed with jelly many dozens of moths later.(0.21702203)

3) Jello is tasty(0.5115617)

4) cancer(0.1285898)



# Count-Vectorizer(with euclidean distance)


In [882]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances

vectorizer=CountVectorizer()
features=vectorizer.fit_transform(sample_texts).todense()
vectorizer_results=[]
for f in features:
  vectorizer_results.append(euclidean_distances(features[0],f)[0][0])


In [883]:
print("Result CountVectorizer(Euclidean):\n")
for i in range(1,len(vectorizer_results)):
  print(str(i)+") "+sample_texts[i]+"("+str(vectorizer_results[i])+")")

Result CountVectorizer(Euclidean):

1) The fish dreamed of escaping the fishbowl and into the toilet where he saw his friends go.(5.0)
2) The person box was packed with jelly many dozens of moths later.(3.7416573867739413)
3) Jello is tasty(3.3166247903554)
4) cancer(3.3166247903554)


# STS

In [947]:
#@title Load the Universal Sentence Encoder's TF Hub module
from absl import logging

import tensorflow as tf

import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
"""
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
model = hub.load(module_url)
"""
model = SentenceTransformer('allenai-specter')   # loading specter model
print ("module %s loaded" % module_url)
def embed(input):
  return model(input)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [948]:
import pandas
import scipy
import math
import csv

sts_dataset = tf.keras.utils.get_file(
    fname="Stsbenchmark.tar.gz",
    origin="http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz",
    extract=True)
sts_dev = pandas.read_table(
    os.path.join(os.path.dirname(sts_dataset), "stsbenchmark", "sts-dev.csv"),
    error_bad_lines=False,
    skip_blank_lines=True,
    usecols=[4, 5, 6],
    names=["sim", "sent_1", "sent_2"])
sts_test = pandas.read_table(
    os.path.join(
        os.path.dirname(sts_dataset), "stsbenchmark", "sts-test.csv"),
    error_bad_lines=False,
    quoting=csv.QUOTE_NONE,
    skip_blank_lines=True,
    usecols=[4, 5, 6],
    names=["sim", "sent_1", "sent_2"])
# cleanup some NaN values in sts_dev
sts_dev = sts_dev[[isinstance(s, str) for s in sts_dev['sent_2']]]

In [None]:
sts_data = sts_dev #@param ["sts_dev", "sts_test"] {type:"raw"}

def run_sts_benchmark(batch):
  sts_encode1 = tf.nn.l2_normalize(embed(tf.constant(batch['sent_1'].tolist())), axis=1)
  sts_encode2 = tf.nn.l2_normalize(embed(tf.constant(batch['sent_2'].tolist())), axis=1)
  cosine_similarities = tf.reduce_sum(tf.multiply(sts_encode1, sts_encode2), axis=1)
  clip_cosine_similarities = tf.clip_by_value(cosine_similarities, -1.0, 1.0)
  scores = 1.0 - tf.acos(clip_cosine_similarities) / math.pi
  print(scores)
  """Returns the similarity scores"""
  return scores

count=0
dev_scores = sts_data['sim'].tolist()
scores = []
for batch in np.array_split(sts_data, 10):
  scores.extend(run_sts_benchmark(batch))
  count+=1
print(scores)
print(dev_scores)

pearson_correlation = scipy.stats.pearsonr(scores, dev_scores)
print('Pearson correlation coefficient = {0}\np-value = {1}'.format(
    pearson_correlation[0], pearson_correlation[1]))

In [949]:
sts_data = sts_dev #@param ["sts_dev", "sts_test"] {type:"raw"}

def run_sts_benchmark(batch):
  corpus_embeddings=model.encode(batch['sent_1'].tolist(), convert_to_tensor=True)
  print(corpus_embeddings)
  query_embedding=model.encode(batch['sent_2'].tolist(), convert_to_tensor=True)
  cosine_similarities = util.cos_sim(corpus_embeddings,query_embedding)
  
  """Returns the similarity scores"""
  return scores

count=0
dev_scores = sts_data['sim'].tolist()
scores = []
for batch in np.array_split(sts_data, 10):
  scores.extend(run_sts_benchmark(batch))
  count+=1
print(scores)
print(dev_scores)

pearson_correlation = scipy.stats.pearsonr(scores, dev_scores)
print('Pearson correlation coefficient = {0}\np-value = {1}'.format(
    pearson_correlation[0], pearson_correlation[1]))

tensor([[-0.7240,  0.4943,  0.3280,  ...,  0.7217,  0.2838,  0.9984],
        [-0.1791,  0.0862,  0.7585,  ...,  0.5637, -0.6136,  0.9263],
        [-0.5648,  0.2241, -0.0707,  ...,  0.0960,  0.0409,  0.0091],
        ...,
        [-0.3696, -0.0465,  0.0101,  ...,  0.1096, -0.4859,  0.2426],
        [-0.9151,  0.5695,  0.3251,  ...,  0.7222,  0.4850,  1.1786],
        [-0.2413,  0.2298,  0.0778,  ...,  0.3689, -0.1202,  0.4072]])
tensor([[-0.5603,  0.0867,  0.0911,  ..., -0.2811, -0.1409,  1.0207],
        [-0.5730,  0.4517,  0.5408,  ...,  0.6669, -0.2083,  0.6010],
        [-0.1315,  0.0501,  0.1877,  ...,  0.2441, -0.5864,  1.3464],
        ...,
        [-0.2551,  0.4869,  0.1325,  ..., -0.1725, -0.4059,  0.1329],
        [-0.6723,  0.3509,  0.3281,  ...,  0.6964, -0.3846,  0.7508],
        [-0.3340,  0.8520, -0.0315,  ...,  0.1320,  0.0378,  0.7145]])
tensor([[-0.1982,  0.6213, -0.1159,  ...,  0.5121, -0.4424,  0.7374],
        [-1.6208,  0.1651,  0.0928,  ...,  0.4666, -0.8313,  0

IndexError: ignored