In [1]:
import os
from tqdm import tqdm
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertModel, BertConfig
from torch.nn import CosineSimilarity
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import gensim

In [2]:
query = 'A man is eating a food.'
corpus = ['A man is eating a piece of bread.',
          'The girl is carrying a baby.',
          'A man is riding a horse.',
          'A woman is playing violin.',
          'Two men pushed carts through the woods.',
          'A man is riding a white horse on an enclosed ground.',
          'A monkey is playing drums.',
          'A cheetah is running behind its prey.'
          ]

### Tf-idf

In [75]:
vect = TfidfVectorizer(ngram_range = (1,2), stop_words='english').fit(corpus)

In [76]:
query_vec = vect.transform([query])
corpus_vecs = vect.transform(corpus)

In [77]:
query_vec = query_vec.toarray()
corpus_vecs = corpus_vecs.toarray()

In [78]:
cos = CosineSimilarity(dim=1, eps=1e-6)
cos_results = cos(torch.tensor(query_vec), torch.tensor(corpus_vecs))

In [79]:
cos_results

tensor([0.6219, 0.0000, 0.1728, 0.0000, 0.0000, 0.1061, 0.0000, 0.0000],
       dtype=torch.float64)

In [80]:
corpus_sorted = [(x, float(y)) for y,x in sorted(zip(cos_results ,corpus), reverse = True)]

In [81]:
print('Text query: ', query)
print('Similarity (sorted): ')
for x, y in corpus_sorted:
    print('\t{}: \t{}'.format(x, y))

Text query:  A man is eating a food.
Similarity (sorted): 
	A man is eating a piece of bread.: 	0.6219211594482079
	A man is riding a horse.: 	0.17281739569158833
	A man is riding a white horse on an enclosed ground.: 	0.10610419209008183
	Two men pushed carts through the woods.: 	0.0
	The girl is carrying a baby.: 	0.0
	A woman is playing violin.: 	0.0
	A monkey is playing drums.: 	0.0
	A cheetah is running behind its prey.: 	0.0


### Word2vec

###### Download pretrained english: https://drive.google.com/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM&export=download

In [3]:
import gensim
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [12]:
query_tokenized = list(gensim.utils.tokenize(query, lowercase=True))

In [9]:
corpus_tokenized = [list(gensim.utils.tokenize(text, lowercase=True)) for text in corpus]

In [13]:
query_tokenized

['a', 'man', 'is', 'eating', 'a', 'food']

In [11]:
corpus_tokenized

[['a', 'man', 'is', 'eating', 'a', 'piece', 'of', 'bread'],
 ['the', 'girl', 'is', 'carrying', 'a', 'baby'],
 ['a', 'man', 'is', 'riding', 'a', 'horse'],
 ['a', 'woman', 'is', 'playing', 'violin'],
 ['two', 'men', 'pushed', 'carts', 'through', 'the', 'woods'],
 ['a',
  'man',
  'is',
  'riding',
  'a',
  'white',
  'horse',
  'on',
  'an',
  'enclosed',
  'ground'],
 ['a', 'monkey', 'is', 'playing', 'drums'],
 ['a', 'cheetah', 'is', 'running', 'behind', 'its', 'prey']]

In [24]:
vocab = query_tokenized
for t in corpus_tokenized:
    vocab += t
vocab = list(set(vocab))
print(vocab)

['riding', 'men', 'of', 'bread', 'ground', 'behind', 'man', 'carrying', 'horse', 'playing', 'piece', 'drums', 'its', 'pushed', 'white', 'on', 'running', 'two', 'prey', 'woman', 'monkey', 'enclosed', 'is', 'an', 'the', 'cheetah', 'baby', 'eating', 'a', 'violin', 'woods', 'carts', 'food', 'girl', 'through']


##### Out of vocab cases:

In [26]:
for word in vocab:
    if word not in model:
        print(word)

of
a


In [68]:
import copy
def get_sent_vect(tokens, model):
    vect = []
    for t in tokens:
        if t in model:
            if len(vect) == 0:
                vect = copy.deepcopy(model[t])
            else:
                vect += copy.deepcopy(model[t])
#     vect = vect / len(vect)
    return vect

In [69]:
query_vect = get_sent_vect(query_tokenized, model)

In [70]:
corpus_vect = [get_sent_vect(tokens, model) for tokens in corpus_tokenized]

In [71]:
cos = CosineSimilarity(dim=1, eps=1e-6)
cos_results = cos(torch.tensor([query_vect]), torch.tensor(corpus_vect))

In [72]:
cos_results

tensor([0.7300, 0.7581, 0.8013, 0.7095, 0.6489, 0.8533, 0.6782, 0.6619])

In [73]:
corpus_sorted = [(x, float(y)) for y,x in sorted(zip(cos_results ,corpus), reverse = True)]

In [74]:
print('Text query: ', query)
print('Similarity (sorted): ')
for x, y in corpus_sorted:
    print('\t{}: \t{}'.format(x, y))

Text query:  A man is eating a food.
Similarity (sorted): 
	A man is riding a white horse on an enclosed ground.: 	0.8532745242118835
	A man is riding a horse.: 	0.8012802004814148
	The girl is carrying a baby.: 	0.7581433653831482
	A man is eating a piece of bread.: 	0.7300212383270264
	A woman is playing violin.: 	0.709453821182251
	A monkey is playing drums.: 	0.678205668926239
	A cheetah is running behind its prey.: 	0.6619023084640503
	Two men pushed carts through the woods.: 	0.6489415764808655


### BERT

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [8]:
query_tokenized_ids = tokenizer.encode(query, return_tensors='pt')
inputs = tokenizer.batch_encode_plus(corpus, pad_to_max_length=True, return_tensors='pt')['input_ids']

In [12]:
tokenizer.convert_ids_to_tokens(query_tokenized_ids[0])

['[CLS]', 'a', 'man', 'is', 'eating', 'a', 'food', '.', '[SEP]']

In [13]:
last_hidden_states, output_pooled = model(inputs)
query_last_hidden_states, query_pooled = model(query_tokenized_ids)

In [14]:
output_pooled.shape

torch.Size([8, 768])

In [15]:
sent_query_cls = query_last_hidden_states[:, 0, :]
sents_corpus_cls = last_hidden_states[:, 0, :]

In [16]:
sent_query_cls.shape

torch.Size([1, 768])

In [17]:
sents_corpus_cls.shape

torch.Size([8, 768])

In [18]:
cos = CosineSimilarity(dim=1, eps=1e-6)
cos_results = cos(sent_query_cls, sents_corpus_cls)

In [19]:
cos_results

tensor([0.8860, 0.7309, 0.7749, 0.7390, 0.7404, 0.8676, 0.7890, 0.8669],
       grad_fn=<DivBackward0>)

In [20]:
corpus_sorted = [(x, float(y)) for y,x in sorted(zip(cos_results ,corpus), reverse = True)]

In [21]:
print('Text query: ', query)
print('Similarity (sorted): ')
for x, y in corpus_sorted:
    print('\t{}: \t{}'.format(x, y))

Text query:  A man is eating a food.
Similarity (sorted): 
	A man is eating a piece of bread.: 	0.8859956860542297
	A man is riding a white horse on an enclosed ground.: 	0.8675525188446045
	A cheetah is running behind its prey.: 	0.8668953776359558
	A monkey is playing drums.: 	0.7889615297317505
	A man is riding a horse.: 	0.7749326229095459
	Two men pushed carts through the woods.: 	0.7404251098632812
	A woman is playing violin.: 	0.7390106320381165
	The girl is carrying a baby.: 	0.7309496402740479
