In [1]:
from gensim.scripts.glove2word2vec import glove2word2vec

# file = '/home/stirunag/pre-trained_word_embeddings/glove/glove.6B.50d.txt'
# glove2word2vec(glove_input_file=file, word2vec_output_file="gensim_glove.6B.50d.txt")

###Finally, read the word2vec txt to a gensim model using KeyedVectors:

from gensim.models.keyedvectors import KeyedVectors

# glove_model = KeyedVectors.load_word2vec_format("gensim_glove.6B.50d.txt", binary=False)


glove_model = KeyedVectors.load_word2vec_format("/home/synoptica/google_wv/GoogleNews-vectors-negative300.bin", binary=True)

In [12]:
# euclidean distance between two vectors
import math
def l2_dist(v1, v2):
    sum = 0.0
    if len(v1) == len(v2):
        for i in range(len(v1)):
            delta = v1[i] - v2[i]
            sum += delta * delta
        return math.sqrt(sum)

In [13]:
import numpy as np
from scipy import spatial

index2word_set = set(glove_model.wv.index2word)

def avg_feature_vector(sentence, model, num_features, index2word_set):
    words = sentence.split()
    feature_vec = np.zeros((num_features, ), dtype='float32')
    n_words = 0
    for word in words:
        if word in index2word_set:
            n_words += 1
            feature_vec = np.add(feature_vec, model[word])
    if (n_words > 0):
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec



  after removing the cwd from sys.path.


In [14]:
# https://en.wikipedia.org/wiki/Sensor_fusion
# https://en.wikipedia.org/wiki/Inverse-variance_weighting

def WeInVar_feature_vector(sentence, model, num_features, index2word_set):
    words = sentence.split()
    feature_vec = np.zeros((num_features, ), dtype='float32')
    n_words = 0
    var_combined = 0.0
    for word in words:
        if word in index2word_set:
            n_words += 1
            feature_vec = np.add(feature_vec, np.divide(model[word], np.var(model[word])))
            var_combined = var_combined + np.divide(1, np.var(model[word]))
    if (n_words > 0):
        feature_vec = np.divide(feature_vec, var_combined+n_words)
    return feature_vec


In [15]:
def print_similarities(sentence_1, sentence_2):
    s1_afv = avg_feature_vector(sentence_1, model=glove_model,   num_features=300, index2word_set=index2word_set)
    s2_afv = avg_feature_vector(sentence_2, model=glove_model,num_features=300, index2word_set=index2word_set)
    
    print('consine: ' + str(1 - spatial.distance.cosine(s1_afv, s2_afv)))
    print('Euclidean: '+ str(1-l2_dist(s1_afv, s2_afv)))
    
    print('\n*** Proposed ****\n')
    
    s1_cfv = WeInVar_feature_vector(sentence_1, model=glove_model,num_features=300, index2word_set=index2word_set)
    s2_cfv = WeInVar_feature_vector(sentence_2, model=glove_model,num_features=300, index2word_set=index2word_set)
    
    print('consine: ' + str(1 - spatial.distance.cosine(s1_cfv, s2_cfv)))
    print('Euclidean: '+ str(1-l2_dist(s1_cfv, s2_cfv)))


In [16]:
# Test case 1
# The two sentences are completely equivalent, as they mean the same thing.  

sentence_1 = 'The bird is bathing in the sink.'
sentence_2 = 'Birdie is washing itself in the water basin.'

print_similarities(sentence_1, sentence_2)


consine: 0.6539125177578349
Euclidean: 0.056384929834607966

*** Proposed ****

consine: 0.9236403283615906
Euclidean: 0.6276778890216066


In [17]:
# Test case 2
# The two sentences are mostly equivalent, but some unimportant details differ.

sentence_1 = 'In May 2010, the troops attempted to invade Kabul.'
sentence_2 = 'The US army invaded Kabul on May 7th last year, 2010.'

print_similarities(sentence_1, sentence_2)


      

consine: 0.6682252848367785
Euclidean: -0.03880286941085709

*** Proposed ****

consine: 0.735840847325616
Euclidean: 0.25239685010484103


In [18]:
# Test 3
# The two sentences are roughly equivalent, but some important information differs/missing.

sentence_1 = 'John said he is considered a witness but not a suspect.'
sentence_2 = 'He is not a suspect anymore." John said.'

print_similarities(sentence_1, sentence_2)    

consine: 0.7894748904362346
Euclidean: 0.27037969500919634

*** Proposed ****

consine: 0.8878459569072047
Euclidean: 0.48346821967648423


In [19]:
# Test 4
# The two sentences are not equivalent, but share some details.

sentence_1 = 'They flew out of the nest in groups.'
sentence_2 = 'They flew into the nest together.'

print_similarities(sentence_1, sentence_2)           

consine: 0.9539640381008274
Euclidean: 0.5670408861243774

*** Proposed ****

consine: 0.9219473037366839
Euclidean: 0.59916840555105


In [20]:
# Test 5
# The two sentences are not equivalent, but are on the same topic.
    
sentence_1 = 'The woman is playing the violin.'
sentence_2 = 'The young lady enjoys listening to the guitar.'

print_similarities(sentence_1, sentence_2)

consine: 0.6929881319764487
Euclidean: 0.05788454002226906

*** Proposed ****

consine: 0.8759480898375229
Euclidean: 0.508549488160922


In [21]:
# Test 6
# The two sentences are on different topics.

sentence_1 = 'John went horse back riding at dawn with a whole group of friends.'
sentence_2 = 'Sunrise at dawn is a magnificent view to take in if you wake up early enough for it.'

print_similarities(sentence_1, sentence_2)      

consine: 0.653876231695043
Euclidean: 0.19453509021267557

*** Proposed ****

consine: 0.7143262707738096
Euclidean: 0.320769877087943
