In [18]:
from gensim.scripts.glove2word2vec import glove2word2vec

# file = '/home/stirunag/pre-trained_word_embeddings/glove/glove.6B.50d.txt'
# glove2word2vec(glove_input_file=file, word2vec_output_file="gensim_glove.6B.50d.txt")

###Finally, read the word2vec txt to a gensim model using KeyedVectors:

from gensim.models.keyedvectors import KeyedVectors

# glove_model = KeyedVectors.load_word2vec_format("gensim_glove.6B.50d.txt", binary=False)


# glove_model = KeyedVectors.load_word2vec_format("/home/synoptica/google_wv/GoogleNews-vectors-negative300.bin", binary=True)
# glove_model = KeyedVectors.load_word2vec_format("/home/stirunag/models/model_OTAR_200d-3mc-10it.bin", binary=True)
glove_model = KeyedVectors.load_word2vec_format("/home/stirunag/pre-trained_word_embeddings/PubMed-w2v.bin", binary=True)


In [19]:
# euclidean distance between two vectors
import math
def l2_dist(v1, v2):
    sum = 0.0
    if len(v1) == len(v2):
        for i in range(len(v1)):
            delta = v1[i] - v2[i]
            sum += delta * delta
        return math.sqrt(sum)

In [20]:
import numpy as np
from scipy import spatial

index2word_set = set(glove_model.wv.index2word)

def avg_feature_vector(sentence, model, num_features, index2word_set):
    words = sentence.split()
    feature_vec = np.zeros((num_features, ), dtype='float32')
    n_words = 0
    for word in words:
        if word in index2word_set:
            n_words += 1
            feature_vec = np.add(feature_vec, model[word])
    if (n_words > 0):
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec



  after removing the cwd from sys.path.


In [21]:
# https://en.wikipedia.org/wiki/Sensor_fusion
# https://en.wikipedia.org/wiki/Inverse-variance_weighting

def WeInVar_feature_vector(sentence, model, num_features, index2word_set):
    words = sentence.split()
    feature_vec = np.zeros((num_features, ), dtype='float32')
    n_words = 0
    var_combined = 0.0
    for word in words:
        if word in index2word_set:
            n_words += 1
            feature_vec = np.add(feature_vec, np.divide(model[word], np.var(model[word])))
            var_combined = var_combined + np.divide(1, np.var(model[word]))
    if (n_words > 0):
        feature_vec = np.divide(feature_vec, var_combined)
    return feature_vec


In [23]:
def print_similarities(sentence_1, sentence_2):
    s1_afv = avg_feature_vector(sentence_1, model=glove_model,   num_features=200, index2word_set=index2word_set)
    s2_afv = avg_feature_vector(sentence_2, model=glove_model,num_features=200, index2word_set=index2word_set)
    
    print('consine: ' + str(1 - spatial.distance.cosine(s1_afv, s2_afv)))
    print('Euclidean: '+ str(1-l2_dist(s1_afv, s2_afv)))
    
    print('\n*** Proposed ****\n')
    
    s1_cfv = WeInVar_feature_vector(sentence_1, model=glove_model,num_features=200, index2word_set=index2word_set)
    s2_cfv = WeInVar_feature_vector(sentence_2, model=glove_model,num_features=200, index2word_set=index2word_set)
    
    print('consine: ' + str(1 - spatial.distance.cosine(s1_cfv, s2_cfv)))
    print('Euclidean: '+ str(l2_dist(s1_cfv, s2_cfv)))


In [24]:
# Test case 1
# The two sentences are completely equivalent, as they mean the same thing.  

sentence_1 = 'Hydrolysis of β-lactam antibiotics by β-lactamases is the most common mechanism of resistance for this class of antibacterial agents in clinically important Gram-negative bacteria.'
sentence_2 = 'In Gram-negative organisms, the most common β-lactam resistance mechanism involves β-lactamase mediated hydrolysis resulting in subsequent inactivation of the antibiotic.'

print_similarities(sentence_1, sentence_2)


consine: 0.8062567371440514
Euclidean: 0.41394073634083695

*** Proposed ****

consine: 0.883587113170943
Euclidean: 0.39562342386087146


In [13]:
# Test case 2
# The two sentences are mostly equivalent, but some unimportant details differ.

sentence_1 = 'In May 2010, the troops attempted to invade Kabul.'
sentence_2 = 'The US army invaded Kabul on May 7th last year, 2010.'

print_similarities(sentence_1, sentence_2)


      

consine: 0.5039790158821237
Euclidean: -15.246293521639725

*** Proposed ****

consine: 0.5280778514742848
Euclidean: -3.0574552481511166


In [14]:
# Test 3
# The two sentences are roughly equivalent, but some important information differs/missing.

sentence_1 = 'We were able to confirm that the cancer tissues had reduced expression of miR-126 and miR-424,and increased expression of miR-15b, miR-16,miR-146a, miR-155, and miR-223.'
sentence_2 = 'A recent study showed that the expression of miR-126 and miR424 had reduced by the cancer tissues.'

print_similarities(sentence_1, sentence_2)    

consine: 0.9216786957209082
Euclidean: -9.207444581963573

*** Proposed ****

consine: 0.23665655201919966
Euclidean: -24.47056554001784


In [16]:
# Test 4
# The two sentences are not equivalent, but share some details.
sentence_1 = 'This article discusses the current data on using anti HER2 therapies to treat CNS metastasis as well as the newer anti-HER2 agents.'
sentence_2 = 'Breast cancers with HER2 amplification have a higher risk of CNS metastasis and poorer prognosis.'


print_similarities(sentence_1, sentence_2)           

consine: 0.7911708736132597
Euclidean: -15.65566962575403

*** Proposed ****

consine: 0.7404254659009778
Euclidean: -15.967624796314077


In [15]:
# Test 5
# The two sentences are not equivalent, but are on the same topic.

sentence_1 = 'Membrane proteins are proteins that interact with biological membranes.'
sentence_2 = 'Previous studies have demonstrated that membrane proteins are implicated in many diseases because they are positioned at the apex of signaling pathways that regulate cellular processes.'

print_similarities(sentence_1, sentence_2)

consine: 0.5671597595232845
Euclidean: -21.537127749201833

*** Proposed ****

consine: 0.530497981571845
Euclidean: -20.492187119457732


In [17]:
# Test 6
# The two sentences are on same topics.

sentence_1 = 'Here we show that both C/EBPα and NFI-A bind the region responsible for miR-223 upregulation upon RA treatment'
sentence_2 = 'Isoleucine could not interact with ligand fragment 44 (LF44),which contains amino group.'

print_similarities(sentence_1, sentence_2)      

consine: 0.4648352878144746
Euclidean: -18.469715874829976

*** Proposed ****

consine: 0.12280367033678063
Euclidean: -17.11616624141387
