In [1]:
import os

import numpy as np
import scipy.spatial.distance as distance
from gensim import corpora, models, similarities, matutils, utils
from gensim.models.doc2vec import TaggedDocument

In [2]:
# Parse the STS benchmark training split into a list of
# (similarity, document1, document2) triples, where each document is the
# list of word tokens produced by gensim's simple_tokenize.
documents = []

path = os.path.join('stsbenchmark', 'sts-train.csv')

# The encoding is stated explicitly so parsing does not depend on the
# platform's default locale (assumes the file is UTF-8 -- TODO confirm).
# The file is streamed line by line instead of being copied into a list first.
with open(path, 'r', encoding='utf-8') as f:
    for line in f:
        stripped = line.strip()

        # A blank line (e.g. a trailing newline at the end of the file)
        # carries no record; skip it rather than fail on parts[4] below.
        if not stripped:
            continue

        parts = stripped.split('\t')

        # Tab-separated columns: 4 = gold similarity score,
        # 5 and 6 = the two sentences being compared.
        sim = float(parts[4])
        doc1 = list(utils.simple_tokenize(parts[5]))
        doc2 = list(utils.simple_tokenize(parts[6]))

        documents.append((sim, doc1, doc2))

In [3]:
# Sanity check: show the first five parsed (similarity, doc1, doc2) triples.
preview = documents[:5]
for entry in preview:
    print(entry)

(5.0, ['A', 'plane', 'is', 'taking', 'off'], ['An', 'air', 'plane', 'is', 'taking', 'off'])
(3.8, ['A', 'man', 'is', 'playing', 'a', 'large', 'flute'], ['A', 'man', 'is', 'playing', 'a', 'flute'])
(3.8, ['A', 'man', 'is', 'spreading', 'shreded', 'cheese', 'on', 'a', 'pizza'], ['A', 'man', 'is', 'spreading', 'shredded', 'cheese', 'on', 'an', 'uncooked', 'pizza'])
(2.6, ['Three', 'men', 'are', 'playing', 'chess'], ['Two', 'men', 'are', 'playing', 'chess'])
(4.25, ['A', 'man', 'is', 'playing', 'the', 'cello'], ['A', 'man', 'seated', 'is', 'playing', 'the', 'cello'])


In [4]:
# Unzip the triples into three parallel tuples: the gold similarity scores
# and the two sentence columns.
sims, docs1, docs2 = zip(*documents)

docs = docs1 + docs2 # every sentence: first column followed by the second (a tuple, since zip yields tuples)

# Dictionary built over every sentence from both columns.
dictionary = corpora.Dictionary(docs)

In [5]:
# Show a summary of the dictionary (number of unique tokens and a few samples).
print(dictionary)

Dictionary(12955 unique tokens: ['A', 'is', 'off', 'plane', 'taking']...)


In [6]:
# vector representation of documents
corpus1 = [dictionary.doc2bow(doc) for doc in docs1]
corpus2 = [dictionary.doc2bow(doc) for doc in docs2]

common_corpus = corpus1 + corpus2

In [7]:
# corpus is a list of sparse vectors; show the first five of them
print(common_corpus[:5])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(0, 1), (1, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)], [(0, 1), (1, 1), (5, 1), (8, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1)], [(9, 1), (15, 1), (16, 1), (17, 1), (18, 1)], [(0, 1), (1, 1), (8, 1), (9, 1), (19, 1), (20, 1)]]


In [8]:
def predict_similarities(model, corpus1, corpus2):
    '''Return the cosine similarity of every aligned document pair.

    Each document is first transformed with ``model[document]``; then
    ``corpus1[i]`` is compared to ``corpus2[i]`` with ``matutils.cossim``.

    Args:
        model: object that maps a document to its transformed vector
            when indexed with that document.
        corpus1: sequence of documents.
        corpus2: sequence of documents, aligned with ``corpus1``.

    Returns:
        A list with one similarity value per pair, in input order
        (empty when both corpora are empty).

    Raises:
        ValueError: if the corpora differ in length. Without this check a
            longer ``corpus2`` would be silently truncated and a shorter
            one would fail with an uninformative IndexError.
    '''
    if len(corpus1) != len(corpus2):
        raise ValueError(
            'corpus1 and corpus2 must have the same length, got %d and %d'
            % (len(corpus1), len(corpus2)))

    return [matutils.cossim(model[doc1], model[doc2])
            for doc1, doc2 in zip(corpus1, corpus2)]

In [9]:
def correl(pred, real):
    '''Pearson correlation coefficient between pred and real.'''
    corr_matrix = np.corrcoef(pred, real)
    # The off-diagonal entry of the 2x2 matrix is the pred/real coefficient.
    return corr_matrix[0, 1]

### Term Frequency * Inverse Document Frequency

Tf-idf can be seen as a modified version of the term-document matrix that takes into account how common each word is across documents. Words that occur in fewer documents are likely to reveal more information than common words (such as stopwords), and therefore a higher weight is assigned to them. The tf-idf value for a word is $\textbf{term frequency} \times \textbf{inverse document frequency}$.

For word $i$ in document $j$, the term frequency $\text{tf}_{ij}$ can be read directly from the term-document matrix, and $\text{idf}_{i} = \log \left( \frac{N}{\text{df}_i} \right)$, where $N$ is the total number of documents and $\text{df}_i$ is the number of documents in which word $i$ occurs.

Combining these gives the tf-idf matrix, in which the weight for word $i$ in document $j$ is $w_{ij} = \text{tf}_{ij} \times \text{idf}_i$.

More information: Chapter 15

In [10]:
# Fit the TF-IDF model on the combined corpus (both sentence columns).
tfidf = models.TfidfModel(common_corpus, id2word=dictionary)

In [11]:
# Cosine similarity of each sentence pair in TF-IDF space, then the
# Pearson correlation of those predictions with the gold scores.
tfidf_pred = predict_similarities(tfidf, corpus1, corpus2)
correl(tfidf_pred, sims)

0.6228996261601334

### Latent Semantic Indexing

LSI is based on SVD which is one way to factorize a matrix. When SVD is applied to the term-document matrix it is factorized into $W \times \Sigma \times C^T$ where rows of $W$ represent words in latent space and columns are ordered w.r.t importance (singular values), $\Sigma$ is a diagonal matrix that contains the singular values indicating importance and $C^T$ is a matrix representing the documents.

Since the most important details are captured by the columns corresponding to the largest singular values, a dense vector representation can be obtained by truncating matrix $W$ from $|V| \times m$ to $|V| \times k$

More information: Chapter 16

In [12]:
# Fit LSI on the combined corpus; the number of latent dimensions is not
# passed, so the library default is used.
lsi = models.LsiModel(common_corpus, id2word=dictionary)

In [13]:
# Cosine similarity of each sentence pair in LSI space, then the
# Pearson correlation of those predictions with the gold scores.
lsi_pred = predict_similarities(lsi, corpus1, corpus2)
correl(lsi_pred, sims)

0.2650836598138713

### Random Projections

Intuition: in the RP model the original data is projected onto a $k$-dimensional subspace using a random matrix $R$:

$X^{RP}_{k \times N} = R_{k \times d} X_{d \times N}$

Johnson-Lindenstrauss lemma: distances are approximately preserved if points in vector space are projected onto a randomly selected subspace of suitably high dimension

Strictly speaking this is not a projection, since $R$ is not chosen to be orthogonal. However, in a high-dimensional space there are far more almost-orthogonal directions than orthogonal ones, and therefore random directions are sufficiently close to orthogonal.

Simple way to initialize $R$ is

$r _ { i j } = \sqrt { 3} \cdot \left\{ \begin{array} { l l } { + 1} & { \text{ with probability } \frac { 1} { 6} } \\ { 0} & { \text{ with probability } \frac { 2} { 3} } \\ { - 1} & { \text{ with probability } \frac { 1 } { 6} } \end{array} \right.$

More information: https://users.ics.aalto.fi/ella/publications/randproj_kdd.pdf

In [14]:
# Fit the random-projection model on the combined corpus.
# NOTE(review): the projection matrix is random and no seed is set in this
# notebook, so the score below presumably changes between runs -- confirm.
rp = models.RpModel(common_corpus, id2word=dictionary)

In [15]:
# Cosine similarity of each sentence pair in the projected space, then the
# Pearson correlation of those predictions with the gold scores.
rp_pred = predict_similarities(rp, corpus1, corpus2)
correl(rp_pred, sims)

0.47392675093096365

### Latent Dirichlet Allocation (LDA)

LDA is a probabilistic extension of LSA. More information: https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation

In [16]:
# Fit LDA on the combined corpus; the number of topics is not passed, so the
# library default is used.
# NOTE(review): no random seed is passed, so results presumably vary between
# runs -- consider fixing one (e.g. via `random_state`) and re-running.
lda = models.LdaModel(common_corpus, id2word=dictionary)

  diff = np.log(self.expElogbeta)


In [17]:
# Cosine similarity of each sentence pair's LDA representation, then the
# Pearson correlation of those predictions with the gold scores.
lda_pred = predict_similarities(lda, corpus1, corpus2)
correl(lda_pred, sims)

0.3248790289498413

### Hierarchical Dirichlet Process (HDP)

HDP is a non-parametric Bayesian method. More information: http://jmlr.csail.mit.edu/proceedings/papers/v15/wang11a/wang11a.pdf

In [18]:
# Fit HDP on the combined corpus with the library's default settings.
hdp = models.HdpModel(common_corpus, id2word=dictionary)

In [19]:
# Cosine similarity of each sentence pair's HDP representation, then the
# Pearson correlation of those predictions with the gold scores.
hdp_pred = predict_similarities(hdp, corpus1, corpus2)
correl(hdp_pred, sims)

0.03723236282081314

### LogEntropy model

Maps BoW representation into log entropy space. 

More information:

https://radimrehurek.com/gensim/models/logentropy_model.html

https://stats.stackexchange.com/questions/215418/difference-between-log-entropy-model-and-tf-idf-model

In [20]:
# Fit the log-entropy model on the combined corpus. Unlike the other models
# in this notebook, no id2word mapping is passed here.
logentropy = models.LogEntropyModel(common_corpus)

In [21]:
# Cosine similarity of each sentence pair in log-entropy space, then the
# Pearson correlation of those predictions with the gold scores.
logentropy_pred = predict_similarities(logentropy, corpus1, corpus2)
correl(logentropy_pred, sims)

0.6233722042921176

### Mean of word2vec

A document can be represented by the mean of the word2vec vectors of its words.

More information:
https://radimrehurek.com/gensim/models/word2vec.html

In [22]:
# Train Word2Vec on every tokenized sentence with 100-dimensional vectors.
# min_count=1 is presumably chosen so that no token is dropped for being
# rare, since the averaging cell looks up every word of every sentence.
# NOTE(review): `size` is the gensim 3.x parameter name; gensim 4 renamed it
# to `vector_size` -- confirm against the installed version.
word2vec = models.Word2Vec(docs, min_count=1, size=100)

In [23]:
# Represent each sentence by the average of its word vectors and score every
# sentence pair with the cosine similarity of the two averages.
word2vec_pred = []

for tokens_a, tokens_b in zip(docs1, docs2):
    mean_a = np.mean([word2vec.wv[token] for token in tokens_a], axis=0)
    mean_b = np.mean([word2vec.wv[token] for token in tokens_b], axis=0)

    # The averages are dense vectors, so scipy's cosine *distance* is used;
    # similarity = 1 - distance.
    word2vec_pred.append(1 - distance.cosine(mean_a, mean_b))

correl(word2vec_pred, sims)

0.06897800962297461

### Doc2Vec

Doc2Vec is a generalization of Word2Vec that makes it possible to transform whole documents into fixed-size vectors. The authors of the Doc2Vec paper claim that it overcomes two weaknesses of the BoW representation: the loss of word order and the disregard for word semantics.

More information:

https://cs.stanford.edu/~quocle/paragraph_vector.pdf

https://radimrehurek.com/gensim/models/doc2vec.html

https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb

In [24]:
# Wrap every sentence in a TaggedDocument whose single tag is the
# sentence's position in `docs`.
tagged_docs = [TaggedDocument(words, [tag]) for tag, words in enumerate(docs)]

In [25]:
# Show the first ten tagged documents.
tagged_docs[:10]

[TaggedDocument(words=['A', 'plane', 'is', 'taking', 'off'], tags=[0]),
 TaggedDocument(words=['A', 'man', 'is', 'playing', 'a', 'large', 'flute'], tags=[1]),
 TaggedDocument(words=['A', 'man', 'is', 'spreading', 'shreded', 'cheese', 'on', 'a', 'pizza'], tags=[2]),
 TaggedDocument(words=['Three', 'men', 'are', 'playing', 'chess'], tags=[3]),
 TaggedDocument(words=['A', 'man', 'is', 'playing', 'the', 'cello'], tags=[4]),
 TaggedDocument(words=['Some', 'men', 'are', 'fighting'], tags=[5]),
 TaggedDocument(words=['A', 'man', 'is', 'smoking'], tags=[6]),
 TaggedDocument(words=['The', 'man', 'is', 'playing', 'the', 'piano'], tags=[7]),
 TaggedDocument(words=['A', 'man', 'is', 'playing', 'on', 'a', 'guitar', 'and', 'singing'], tags=[8]),
 TaggedDocument(words=['A', 'person', 'is', 'throwing', 'a', 'cat', 'on', 'to', 'the', 'ceiling'], tags=[9])]

In [26]:
# Train Doc2Vec on all tagged sentences for 50 epochs; every other
# hyperparameter is left at the library default.
doc2vec = models.Doc2Vec(tagged_docs, epochs=50)

In [27]:
# Infer a vector for each sentence of a pair and score the pair with the
# cosine similarity of the two inferred vectors (1 - cosine distance).
doc2vec_pred = []

for tokens_a, tokens_b in zip(docs1, docs2):
    # Inference order (first sentence, then second) is kept as-is.
    inferred_a = doc2vec.infer_vector(tokens_a)
    inferred_b = doc2vec.infer_vector(tokens_b)

    doc2vec_pred.append(1 - distance.cosine(inferred_a, inferred_b))

correl(doc2vec_pred, sims)

0.18285143797097164