In [1]:
# Code Source from: https://www.kernix.com/blog/similarity-measure-of-textual-documents_p12

In [2]:
from sklearn.datasets import fetch_20newsgroups

In [3]:
from nltk import word_tokenize
from nltk import download
from nltk.corpus import stopwords

In [4]:
# English stop-word list used by the preprocessing step.
# The NLTK "stopwords" resource is not bundled with the package: on a fresh
# environment the lookup raises LookupError, so fetch it once and retry.
try:
    stop_words = stopwords.words('english')
except LookupError:
    download('stopwords')
    stop_words = stopwords.words('english')

In [5]:
# Quick check of the tokenizer on a short sample sentence; the cell output
# is the resulting list of tokens.
word_tokenize('there is no cow level')

['there', 'is', 'no', 'cow', 'level']

In [6]:
def preprocess(text):
    """Turn a raw text into a list of cleaned tokens.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens that
    are stop words (module-level ``stop_words``) or that are not purely
    alphabetic are dropped. Token order is preserved.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The remaining tokens, in their original order (possibly empty).
    """
    # Build a set once per call so each membership test is O(1) instead of a
    # linear scan of the stop-word collection for every token.
    stop_set = set(stop_words)
    # Single pass: both filters are independent, so combining them yields the
    # same tokens as filtering stop words first and non-alphabetic tokens next.
    return [
        word
        for word in word_tokenize(text.lower())
        if word.isalpha() and word not in stop_set
    ]

In [7]:
# Load the training split of the 20 Newsgroups dataset, asking scikit-learn
# to strip headers, footers and quoted replies from each post.
ng20 = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)

In [8]:
# Raw documents and their labels, followed by the tokenized version of
# every document (one token list per text, same order as `texts`).
texts = ng20.data
y = ng20.target
corpus = list(map(preprocess, texts))

In [9]:
def filter_docs(corpus, texts, labels, condition_on_doc):
    """
    Filter corpus, texts and labels with the predicate ``condition_on_doc``.

    A document ``doc`` of ``corpus`` is kept when ``condition_on_doc(doc)`` is
    truthy; the text and label at the same position are kept or dropped
    together with it.

    Parameters
    ----------
    corpus : iterable
        Documents to filter (e.g. lists of tokens).
    texts : iterable or None
        Raw texts aligned with ``corpus``; ``None`` is passed through as is.
    labels : iterable
        Labels aligned with ``corpus``.
    condition_on_doc : callable
        Takes a single document and returns whether it must be kept.

    Returns
    -------
    tuple
        ``(corpus, texts, labels)`` after filtering. ``corpus`` and ``labels``
        are new lists; ``texts`` is a new list, or ``None`` if it was ``None``.

    Side effects
    ------------
    Prints the number of documents removed.
    """
    # Evaluate the predicate exactly once per document. Re-evaluating it for
    # texts, labels and corpus separately triples the cost and could misalign
    # the three outputs if the predicate is not perfectly deterministic.
    keep_flags = [bool(condition_on_doc(doc)) for doc in corpus]

    if texts is not None:
        texts = [text for (text, keep) in zip(texts, keep_flags) if keep]

    labels = [label for (label, keep) in zip(labels, keep_flags) if keep]
    kept_corpus = [doc for (doc, keep) in zip(corpus, keep_flags) if keep]

    print("{} docs removed".format(len(keep_flags) - len(kept_corpus)))

    return (kept_corpus, texts, labels)

In [10]:
# Discard the documents that ended up with no token after preprocessing,
# together with their raw text and label.
corpus, texts, y = filter_docs(corpus, texts, y, lambda doc: len(doc) > 0)

320 docs removed


In [11]:
import numpy as np
from gensim import corpora
from gensim.models import TfidfModel
from gensim.models import LsiModel
from gensim.similarities import MatrixSimilarity

In [12]:
# measure the semantic relationships of documents.

In [13]:
# First method: Latent Semantic Indexing

In [None]:
# Similarity matrices, keyed first by dataset name and then by method name.
sims = {'ng20': {}}

# Bag-of-words encoding of every tokenized document.
dictionary = corpora.Dictionary(corpus)
corpus_gensim = [dictionary.doc2bow(tokens) for tokens in corpus]

# Re-weight the bag-of-words vectors with TF-IDF, then fit a 200-topic LSI model.
tfidf = TfidfModel(corpus_gensim)
corpus_tfidf = tfidf[corpus_gensim]
lsi = LsiModel(corpus_tfidf, id2word=dictionary, num_topics=200)

# Index the LSI representation of the corpus and query it with each document
# in turn: row i holds the similarities of document i to every indexed document.
lsi_index = MatrixSimilarity(lsi[corpus_tfidf])
sims['ng20']['LSI'] = np.array(
    [lsi_index[lsi[doc_tfidf]] for doc_tfidf in corpus_tfidf]
)

In [None]:
# Second method: centroid of the word vectors

from gensim.models import KeyedVectors

# Load the pretrained word2vec vectors from a binary word2vec-format file.
# The path is relative, so the file is expected next to the notebook
# (this notebook does not download it).
filename = 'GoogleNews-vectors-negative300.bin.gz'
word2vec_model = KeyedVectors.load_word2vec_format(
    filename,
    binary=True,
)

In [None]:
# Replace the word vectors by their unit-length versions (in place).
# `init_sims(replace=True)` is deprecated since gensim 4.0, where
# `unit_normalize_all()` is the supported equivalent; fall back to the old
# call when running on gensim 3.x.
if hasattr(word2vec_model, 'unit_normalize_all'):
    word2vec_model.unit_normalize_all()
else:
    word2vec_model.init_sims(replace=True)

In [None]:
def document_vector(word2vec_model, doc):
    """Return the centroid (mean) of the word vectors of a document.

    Words that are not in the model vocabulary are ignored.

    Parameters
    ----------
    word2vec_model : object
        Word-vector model. It must expose its vocabulary as ``key_to_index``
        (gensim >= 4) or ``vocab`` (gensim 3.x), and return a 2-D array of
        vectors when indexed with a list of words.
    doc : iterable of str
        Tokens of the document.

    Returns
    -------
    numpy.ndarray
        Mean of the vectors of the in-vocabulary words (mean over axis 0).

    Raises
    ------
    ValueError
        If no word of ``doc`` is in the vocabulary, since the mean of zero
        vectors is undefined. Filter such documents out beforehand.
    """
    # gensim 4 removed `KeyedVectors.vocab` in favour of `key_to_index`;
    # support both so the notebook runs on either major version.
    vocab = getattr(word2vec_model, 'key_to_index', None)
    if vocab is None:
        vocab = word2vec_model.vocab

    # remove out-of-vocabulary words
    known_words = [word for word in doc if word in vocab]
    if not known_words:
        raise ValueError(
            "document has no word in the word2vec vocabulary; "
            "cannot compute its vector")
    return np.mean(word2vec_model[known_words], axis=0)

In [None]:
# We cannot compute a mean vector for a document in which no word is in the word2vec vocabulary
# (the model was not trained on this corpus, so some words may be missing from its vocabulary),
# so we need to filter our corpus to remove those documents:

def has_vector_representation(word2vec_model, doc):
    """Check if at least one word of the document is in the word2vec
    vocabulary.

    Parameters
    ----------
    word2vec_model : object
        Word-vector model exposing its vocabulary as ``key_to_index``
        (gensim >= 4) or ``vocab`` (gensim 3.x).
    doc : iterable of str
        Tokens of the document.

    Returns
    -------
    bool
        True if some word of ``doc`` is in the vocabulary; False otherwise,
        including when ``doc`` is empty.
    """
    # gensim 4 removed `KeyedVectors.vocab` in favour of `key_to_index`;
    # support both so the notebook runs on either major version.
    vocab = getattr(word2vec_model, 'key_to_index', None)
    if vocab is None:
        vocab = word2vec_model.vocab
    return any(word in vocab for word in doc)

# Keep only the documents (with their text and label) that contain at least
# one word known to the word2vec model.
corpus, texts, y = filter_docs(
    corpus,
    texts,
    y,
    lambda doc: has_vector_representation(word2vec_model, doc),
)

# NOTE(review): `snippets` and `snippets_labels` are not defined by any
# earlier cell of this notebook, so the unguarded call raises NameError on a
# fresh kernel. Run the filtering only when that dataset has been loaded.
if 'snippets' in globals() and 'snippets_labels' in globals():
    snippets, _, snippets_labels = filter_docs(
        snippets, None, snippets_labels,
        lambda doc: has_vector_representation(word2vec_model, doc))
else:
    print("snippets dataset not loaded; skipping its word2vec filtering")