<a href="https://colab.research.google.com/github/zeeshansalim1234/Summer2021/blob/main/results_publication.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [None]:

!pip install sentence-transformers
!pip install tweepy
!pip install bert-extractive-summarizer
!pip install nltk
!pip install google-cloud-vision



In [None]:
import pandas as pd
import json
import os
from sentence_transformers import SentenceTransformer, util
from google.colab import drive
import tweepy
import re
import os,io
from google.cloud import vision
from google.cloud.vision_v1 import types
import pandas as pd
import nltk
import numpy as np


# Query

In [None]:
# Sentence set under comparison. The first sentence is the query; the rest form the corpus.
# The commented-out assignments below are alternative sentence sets.
sample_texts=["I joined Aerodesign, Dinobytes and Ignite as a freshmen at university.","In my first year of engineering I enrolled at AeroDesign","Ignite is a great club","freshmen did not attend the orientation"]
#sample_texts=["certain diets may help reduce the risk of cancer.","obesity and mental issues can be developed by excess intake of junk food","ronaldo is joining manchester united","michael jackson died due to cancer"]
#sample_texts=["Three years later, the coffin was still full of Jello.","The fish dreamed of escaping the fishbowl and into the toilet where he saw his friends go.","The person box was packed with jelly many dozens of moths later.","Jello is tasty"]
# Head of the list -> query, tail of the list -> corpus (a new list).
query, *corpus = sample_texts


In [None]:
# Echo the query, then each corpus sentence with its zero-based index.
print(f"Query: {query}\n")
for position, text in enumerate(corpus):
  print(f"Corpus[{position}]: {text}")

Query: I joined Aerodesign, Dinobytes and Ignite as a freshmen at university.

Corpus[0]: In my first year of engineering I enrolled at AeroDesign
Corpus[1]: Ignite is a great club
Corpus[2]: freshmen did not attend the orientation



# SPECTER (with cosine-similarity)

In [None]:
# Load the 'allenai-specter' checkpoint through sentence-transformers and bind it to `model`.
model = SentenceTransformer('allenai-specter')   # loading specter model

In [None]:
# Encode every corpus sentence with `model`; tensor output is requested via convert_to_tensor=True.
corpus_embeddings=model.encode(corpus, convert_to_tensor=True)

In [None]:

# Encode the single query sentence the same way as the corpus.
query_embedding = model.encode(query, convert_to_tensor=True)

In [None]:
# Rank the corpus against the query (up to 10 hits) and keep the hit list of the first query.
search_hits = util.semantic_search(query_embedding, corpus_embeddings,top_k=10)[0]

In [None]:
# Show the raw hit records (corpus index and score); print() applies str() itself.
print(search_hits)

[{'corpus_id': 0, 'score': 0.8008255362510681}, {'corpus_id': 1, 'score': 0.7931039929389954}, {'corpus_id': 2, 'score': 0.7672163248062134}]


In [None]:
# List the hits in the order returned by the search, numbered from 1, with their scores.
print("Result SPECTRE:\n")
for rank, hit in enumerate(search_hits, start=1):
  print(f"{rank}) {corpus[hit['corpus_id']]}({hit['score']!s})\n")
  

Result SPECTRE:

1) In my first year of engineering I enrolled at AeroDesign(0.8008255362510681)

2) Ignite is a great club(0.7931039929389954)

3) freshmen did not attend the orientation(0.7672163248062134)



# Base-BERT (with cosine-similarity)


In [None]:
# BERT
# All sentences (query first, then corpus) are embedded together by the cells below.
sentences=sample_texts
# Hugging Face model id used for both the tokenizer and the encoder.
BERT_model_name='sentence-transformers/bert-base-nli-mean-tokens'


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

In [None]:
# Load the tokenizer and the encoder weights for the checkpoint named in BERT_model_name.
tokenizer=AutoTokenizer.from_pretrained(BERT_model_name)
BERT_model=AutoModel.from_pretrained(BERT_model_name)


In [None]:
# Tokenize all sentences in one batched call, padded/truncated to 128 tokens each.
encoded=tokenizer(sentences,max_length=128,truncation=True,padding='max_length',return_tensors='pt')
# Only the token ids and the attention mask are passed to the model.
tokens={'input_ids': encoded['input_ids'],'attention_mask': encoded['attention_mask']}

# Inference only: no_grad() avoids building an autograd graph for the forward pass.
with torch.no_grad():
  outputs=BERT_model(**tokens)
embeddings=outputs.last_hidden_state
attention=tokens['attention_mask']

# Mean pooling over real tokens: expand the mask across the hidden dimension so that
# padding positions contribute zero to the sum.
mask=attention.unsqueeze(-1).expand(embeddings.shape).float()
mask_embeddings=embeddings*mask
summed=torch.sum(mask_embeddings,1)
# The clamp keeps the divisor non-zero even if a row had no unmasked tokens.
counts=torch.clamp(mask.sum(1),min=1e-9)
mean_pooled=summed/counts

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Convert the pooled embeddings to NumPy, then compare the query row (index 0)
# with every remaining row. Note: `mean_pooled` is rebound from a tensor to an array here.
mean_pooled=mean_pooled.detach().numpy()
query_vector,corpus_vectors=mean_pooled[:1],mean_pooled[1:]
result_bert=cosine_similarity(query_vector,corpus_vectors)


In [None]:
# Print each corpus sentence (in corpus order, numbered from 1) with its cosine similarity to the query.
print("Result BERT(Cosine):\n")
for position, (text, score) in enumerate(zip(corpus, result_bert[0]), start=1):
  print(f"{position}) {text}({score!s})\n")

Result BERT(Cosine):

1) In my first year of engineering I enrolled at AeroDesign(0.7486016)

2) Ignite is a great club(0.3025604)

3) freshmen did not attend the orientation(0.3498309)



# Base-BERT (with Euclidean distance)

In [None]:
from sklearn.metrics.pairwise import euclidean_distances

# Euclidean distance from the query embedding (row 0) to each corpus embedding.
# `result_bert` is reused here and now holds distances (smaller = closer), not similarities.
query_vector,corpus_vectors=mean_pooled[:1],mean_pooled[1:]
result_bert=euclidean_distances(query_vector,corpus_vectors)

print("Result BERT(Euclidean):\n")
for position, (text, distance) in enumerate(zip(corpus, result_bert[0]), start=1):
  print(f"{position}) {text}({distance!s})\n")

Result BERT(Euclidean):

1) In my first year of engineering I enrolled at AeroDesign(11.476399)

2) Ignite is a great club(19.299307)

3) freshmen did not attend the orientation(18.519054)



# TinyBERT (with cosine-similarity)

In [None]:
# TinyBERT

from sklearn.metrics.pairwise import cosine_similarity

sentences=sample_texts
TinyBERT_model_name='sentence-transformers/paraphrase-TinyBERT-L6-v2'

# Load the tokenizer and encoder once (the original cell loaded each of them twice).
tokenizer=AutoTokenizer.from_pretrained(TinyBERT_model_name)
TinyBERT_model=AutoModel.from_pretrained(TinyBERT_model_name)

# Tokenize all sentences in one batched call, padded/truncated to 128 tokens each.
encoded=tokenizer(sentences,max_length=128,truncation=True,padding='max_length',return_tensors='pt')
# Only the token ids and the attention mask are passed to the model.
tokens={'input_ids': encoded['input_ids'],'attention_mask': encoded['attention_mask']}

# Inference only: no_grad() avoids building an autograd graph for the forward pass.
with torch.no_grad():
  outputs=TinyBERT_model(**tokens)
embeddings=outputs.last_hidden_state
attention=tokens['attention_mask']

# Mean pooling over real tokens: padding positions are zeroed out by the expanded mask.
mask=attention.unsqueeze(-1).expand(embeddings.shape).float()
mask_embeddings=embeddings*mask
summed=torch.sum(mask_embeddings,1)
# The clamp keeps the divisor non-zero even if a row had no unmasked tokens.
counts=torch.clamp(mask.sum(1),min=1e-9)
mean_pooled=summed/counts

# Cosine similarity between the query row (index 0) and every remaining row.
mean_pooled=mean_pooled.detach().numpy()
result_tinybert=cosine_similarity([mean_pooled[0]],mean_pooled[1:])

In [None]:
# Print each corpus sentence (in corpus order, numbered from 1) with its TinyBERT score.
print("Result TinyBERT:\n")
for position, (text, score) in enumerate(zip(corpus, result_tinybert[0]), start=1):
  print(f"{position}) {text}({score!s})\n")

Result TinyBERT:

1) In my first year of engineering I enrolled at AeroDesign(0.5468293)

2) Ignite is a great club(0.34808004)

3) freshmen did not attend the orientation(0.1719764)



# TinyBERT (with Manhattan distance)

In [None]:
# TinyBERT


from sklearn.metrics.pairwise import manhattan_distances

sentences=sample_texts
TinyBERT_model_name='sentence-transformers/paraphrase-TinyBERT-L6-v2'

# Load the tokenizer and encoder once (the original cell loaded each of them twice).
tokenizer=AutoTokenizer.from_pretrained(TinyBERT_model_name)
TinyBERT_model=AutoModel.from_pretrained(TinyBERT_model_name)

# Tokenize all sentences in one batched call, padded/truncated to 128 tokens each.
encoded=tokenizer(sentences,max_length=128,truncation=True,padding='max_length',return_tensors='pt')
# Only the token ids and the attention mask are passed to the model.
tokens={'input_ids': encoded['input_ids'],'attention_mask': encoded['attention_mask']}

# Inference only: no_grad() avoids building an autograd graph for the forward pass.
with torch.no_grad():
  outputs=TinyBERT_model(**tokens)
embeddings=outputs.last_hidden_state
attention=tokens['attention_mask']

# Mean pooling over real tokens: padding positions are zeroed out by the expanded mask.
mask=attention.unsqueeze(-1).expand(embeddings.shape).float()
mask_embeddings=embeddings*mask
summed=torch.sum(mask_embeddings,1)
# The clamp keeps the divisor non-zero even if a row had no unmasked tokens.
counts=torch.clamp(mask.sum(1),min=1e-9)
mean_pooled=summed/counts

# Manhattan (L1) distance from the query row (index 0) to every remaining row; smaller = closer.
mean_pooled=mean_pooled.detach().numpy()
result_tinybert=manhattan_distances([mean_pooled[0]],mean_pooled[1:])

In [None]:
# Print each corpus sentence (in corpus order, numbered from 1) with the value stored in result_tinybert.
print("Result TinyBERT:\n")
for position, (text, distance) in enumerate(zip(corpus, result_tinybert[0]), start=1):
  print(f"{position}) {text}({distance!s})\n")

Result TinyBERT:

1) In my first year of engineering I enrolled at AeroDesign(125.31879481545911)

2) Ignite is a great club(147.3852545104528)

3) freshmen did not attend the orientation(185.12669064272632)



# Count-Vectorizer (with Euclidean distance)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances

# Bag-of-words counts for all sentences (row 0 is the query).
vectorizer=CountVectorizer()
# toarray() yields a plain ndarray; todense() returns np.matrix, which recent
# scikit-learn versions refuse as input to euclidean_distances.
features=vectorizer.fit_transform(sample_texts).toarray()
# Distance from the query row to every row in one call; entry 0 is the query against itself.
vectorizer_results=list(euclidean_distances(features[:1],features)[0])


In [None]:
# Skip entry 0 (the query itself) and print each corpus sentence with its distance to the query.
print("Result CountVectorizer(Euclidean):\n")
for rank, (text, distance) in enumerate(zip(sample_texts[1:], vectorizer_results[1:]), start=1):
  print(f"{rank}) {text}({distance!s})")

Result CountVectorizer(Euclidean):

1) In my first year of engineering I enrolled at AeroDesign(3.7416573867739413)
2) Ignite is a great club(3.3166247903554)
3) freshmen did not attend the orientation(3.605551275463989)


# RoBERTa-Large (with cosine-similarity)

In [None]:
from transformers import RobertaTokenizer, RobertaModel
# Load the 'roberta-large' tokenizer and encoder.
# NOTE: this rebinds `tokenizer` and `BERT_model`, so from here on `BERT_model` holds the RoBERTa encoder.
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
BERT_model = RobertaModel.from_pretrained('roberta-large')


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Tokenize all sentences in one batched call, padded/truncated to 128 tokens each.
encoded=tokenizer(sentences,max_length=128,truncation=True,padding='max_length',return_tensors='pt')
# Only the token ids and the attention mask are passed to the model.
tokens={'input_ids': encoded['input_ids'],'attention_mask': encoded['attention_mask']}

# Inference only: no_grad() avoids building an autograd graph for the forward pass.
with torch.no_grad():
  outputs=BERT_model(**tokens)
embeddings=outputs.last_hidden_state
attention=tokens['attention_mask']

# Mean pooling over real tokens: padding positions are zeroed out by the expanded mask.
mask=attention.unsqueeze(-1).expand(embeddings.shape).float()
mask_embeddings=embeddings*mask
summed=torch.sum(mask_embeddings,1)
# The clamp keeps the divisor non-zero even if a row had no unmasked tokens.
counts=torch.clamp(mask.sum(1),min=1e-9)
mean_pooled=summed/counts

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Convert the pooled embeddings to NumPy, then compare the query row (index 0)
# with every remaining row. `result_bert` is reused for the RoBERTa scores.
mean_pooled=mean_pooled.detach().numpy()
query_vector,corpus_vectors=mean_pooled[:1],mean_pooled[1:]
result_bert=cosine_similarity(query_vector,corpus_vectors)


In [None]:
# Print each corpus sentence (in corpus order, numbered from 1) with its cosine similarity to the query.
print("Result Roberta(Cosine):\n")
for position, (text, score) in enumerate(zip(corpus, result_bert[0]), start=1):
  print(f"{position}) {text}({score!s})\n")

Result Roberta(Cosine):

1) In my first year of engineering I enrolled at AeroDesign(0.9964082)

2) Ignite is a great club(0.996409)

3) freshmen did not attend the orientation(0.9930457)



# Doc2Vec (with cosine-similarity)

In [None]:
import nltk
# Fetch the 'punkt' tokenizer data; word_tokenize is used in the next cell.
# NOTE(review): newer NLTK releases may additionally require 'punkt_tab' - confirm for the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
data =sample_texts
corpus=sample_texts[1:]

# One tagged document per sentence; the tag is the sentence's index in `data`, so '0' is the query.
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]
max_epochs = 100
vec_size = 20
alpha = 0.025

# `vector_size` / `epochs` replace the `size` / `iter` names, which gensim 4 no longer accepts.
# NOTE: this rebinds the notebook-wide name `model` to the Doc2Vec model.
model = Doc2Vec(vector_size=vec_size,
                alpha=alpha,
                min_alpha=0.00025,
                min_count=1,
                dm =1,
                epochs=max_epochs)

model.build_vocab(tagged_data)

# Train with a single call and let gensim decay the learning rate from `alpha` to `min_alpha`;
# the previous loop called train() repeatedly while editing alpha by hand.
model.train(tagged_data,
            total_examples=model.corpus_count,
            epochs=model.epochs)

# Document vectors are exposed as `dv` in gensim 4 and as `docvecs` in gensim 3.
doc_vectors = model.dv if hasattr(model, "dv") else model.docvecs

# Rank the other documents by similarity to the query document (tag '0').
similar_doc = doc_vectors.most_similar('0')
print(similar_doc)




[('1', 0.9940474033355713), ('2', 0.9930206537246704), ('3', 0.9928597211837769)]


# STS

In [None]:
#@title Load the Universal Sentence Encoder's TF Hub module
from absl import logging

import tensorflow as tf

import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

# The TF Hub Universal Sentence Encoder load is disabled in this cell (it was previously
# wrapped in a string literal); kept here as comments for reference:
#   module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
#   model = hub.load(module_url)
model = SentenceTransformer('allenai-specter')   # loading specter model

def embed(input):
  """Call the notebook-level `model` on `input` and return its result.

  `model` is looked up when the function is called, so the result depends on
  whichever object that name is bound to at call time.
  """
  return model(input)

In [None]:
import pandas
import scipy
import math
import csv

# Download and extract the STS Benchmark archive (cached by Keras after the first call).
sts_dataset = tf.keras.utils.get_file(
    fname="Stsbenchmark.tar.gz",
    origin="http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz",
    extract=True)
sts_dir = os.path.join(os.path.dirname(sts_dataset), "stsbenchmark")

# Options shared by both splits: keep the score and the two sentence columns.
# on_bad_lines="warn" skips malformed rows with a warning; it replaces
# error_bad_lines=False, which pandas 2.0 removed.
sts_read_options = dict(
    on_bad_lines="warn",
    skip_blank_lines=True,
    usecols=[4, 5, 6],
    names=["sim", "sent_1", "sent_2"])

sts_dev = pandas.read_table(
    os.path.join(sts_dir, "sts-dev.csv"),
    **sts_read_options)
# Only the test split is read with quoting disabled, as in the original cell.
sts_test = pandas.read_table(
    os.path.join(sts_dir, "sts-test.csv"),
    quoting=csv.QUOTE_NONE,
    **sts_read_options)
# cleanup some NaN values in sts_dev: keep only rows whose second sentence is a string
sts_dev = sts_dev[[isinstance(s, str) for s in sts_dev['sent_2']]]

In [None]:
# Load the Universal Sentence Encoder from TF Hub. This rebinds the notebook-wide name `model`
# (the one embed() calls), so the SentenceTransformer previously bound to it is replaced.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
model = hub.load(module_url)
# STS split that the benchmark loop below scores against.
sts_data = sts_dev #@param ["sts_dev", "sts_test"] {type:"raw"}

def run_sts_benchmark(batch):
  """Returns the similarity scores for one batch of sentence pairs.

  Each `sent_1` / `sent_2` pair in `batch` is embedded with embed(), L2-normalised,
  and scored as 1 - arccos(cosine) / pi.

  Args:
    batch: table with 'sent_1' and 'sent_2' columns of strings.

  Returns:
    A tensor with one score per row of `batch`.
  """
  sts_encode1 = tf.nn.l2_normalize(embed(tf.constant(batch['sent_1'].tolist())), axis=1)
  sts_encode2 = tf.nn.l2_normalize(embed(tf.constant(batch['sent_2'].tolist())), axis=1)
  # Row-wise dot product of unit vectors = cosine similarity of each pair.
  cosine_similarities = tf.reduce_sum(tf.multiply(sts_encode1, sts_encode2), axis=1)
  # Rounding can push a cosine marginally outside [-1, 1], where acos returns NaN.
  clip_cosine_similarities = tf.clip_by_value(cosine_similarities, -1.0, 1.0)
  scores = 1.0 - tf.acos(clip_cosine_similarities) / math.pi
  return scores

# Gold similarity labels of the selected STS split.
dev_scores = sts_data['sim'].tolist()
scores = []
# Score the split in ten chunks and collect the results as plain Python floats
# (previously the list held one tensor object per score).
for batch in np.array_split(sts_data, 10):
  scores.extend(np.asarray(run_sts_benchmark(batch)).tolist())
print(scores)
print(dev_scores)

# Agreement between predicted and gold similarities.
pearson_correlation = scipy.stats.pearsonr(scores, dev_scores)
print('Pearson correlation coefficient = {0}\np-value = {1}'.format(
    pearson_correlation[0], pearson_correlation[1]))

tf.Tensor(
[0.9302299  0.91503537 0.8858436  0.81598777 0.82499564 0.7876028
 0.842623   0.7653434  0.85355747 0.831003   0.79168063 0.69776666
 0.84689224 0.8538161  0.84618706 0.66369575 0.82812774 0.88605976
 0.86942405 0.68831384 0.7562237  0.69235843 0.79055065 0.6884094
 0.68400353 0.72296345 0.7797556  0.78858244 0.63529944 0.60642076
 0.7827172  0.82756484 0.8482493  0.8761115  0.85381556 0.80827314
 0.8993039  0.87050664 0.85451055 0.6994206  0.78373015 0.77094877
 0.6498592  0.80625653 0.71406555 0.62295604 0.6294185  0.5808517
 0.79910994 0.81647944 0.62979925 0.74218786 0.7761008  0.83404756
 0.6216673  0.6567863  0.62281734 0.6109237  0.75858635 0.5805177
 0.7166968  0.7928418  0.62633    0.8074719  0.76240706 0.61664796
 0.7904217  0.8163034  0.72792274 0.761857   0.7445427  0.70667005
 0.7072114  0.6327467  0.5465035  0.78934056 0.6994473  0.7767464
 0.62703764 0.7213806  0.7389682  0.5994146  0.5685327  0.7267419
 0.842376   0.59308857 0.6855235  0.74041915 0.76750153 0

In [None]:
sts_data = sts_dev #@param ["sts_dev", "sts_test"] {type:"raw"}

# The name `model` is rebound to the TF Hub encoder by an earlier cell, which has no
# .encode() method; load SPECTER under its own name so this cell does not depend on it.
specter_model = SentenceTransformer('allenai-specter')

def run_sts_benchmark(batch):
  """Returns the similarity scores for one batch of sentence pairs.

  Embeds the 'sent_1' and 'sent_2' columns of `batch` with SPECTER and returns the
  cosine similarity of each row's pair (one score per row).
  """
  sent_1_embeddings=specter_model.encode(batch['sent_1'].tolist(), convert_to_tensor=True)
  sent_2_embeddings=specter_model.encode(batch['sent_2'].tolist(), convert_to_tensor=True)
  # Row-wise similarity: pair i is compared only with pair i. The previous code took
  # row 0 of the full similarity matrix, i.e. the first sent_2 against every sent_1.
  scores = torch.nn.functional.cosine_similarity(sent_1_embeddings, sent_2_embeddings, dim=1)
  return scores

# Gold similarity labels of the selected STS split.
dev_scores = sts_data['sim'].tolist()
scores = []
# Score the split in ten chunks and collect the results as plain Python floats.
for batch in np.array_split(sts_data, 10):
  scores.extend(run_sts_benchmark(batch).cpu().tolist())
print(scores)
print(dev_scores)


# Agreement between predicted and gold similarities.
pearson_correlation = scipy.stats.pearsonr(scores, dev_scores)
print('Pearson correlation coefficient = {0}\np-value = {1}'.format(
    pearson_correlation[0], pearson_correlation[1]))

AttributeError: ignored