In [3]:
import glob
import operator
import os
import pickle
import tempfile
from collections import defaultdict

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
# from nltk.cluster import KMeansClusterer, euclidean_distance
# from gensim import corpora, models, utils
from numpy import array
from gensim import corpora, models
from gensim import similarities


In [4]:
# Nine short document titles used as the toy corpus for the index below.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

In [206]:
# Persist the corpus as one document per line, then read it back so that
# `adocuments` holds the lines exactly as a file-based caller would see them
# (each entry keeps its trailing newline).
doc_file = 'tmp.txt'
with open(doc_file, 'w') as f:
    f.writelines("%s\n" % text for text in documents)

with open(doc_file, 'r') as f:
    adocuments = f.readlines()

# Parse the document

In [250]:
def parse_doc(document):
    '''Tokenize and normalise a single document.

    The text is split into runs of word characters (which drops whitespace
    and punctuation), lower-cased, filtered against NLTK's English
    stop-word list and finally reduced to Lancaster stems.

    Parameters
    ----------
    document : str
        Raw text of one document (or of a query).

    Returns
    -------
    list of str
        Stemmed tokens in their original order; empty when nothing
        survives the filtering.
    '''

    tokenizer = RegexpTokenizer(r'\w+')
    # A set gives constant-time membership tests; the plain list returned by
    # NLTK would be scanned linearly once per token.
    stop = set(stopwords.words('english'))
    stemmer = LancasterStemmer()

    # Lower-case before the stop-word test because the stop list is lower-case.
    lowered = (token.lower() for token in tokenizer.tokenize(document))

    return [stemmer.stem(token) for token in lowered if token not in stop]
    
    
class IndexCreator(object):
    '''Build a positional inverted index over a collection of documents.

    Documents are taken either from a text file (one document per line) or
    from an in-memory list. After ``create_index()`` the index maps

        stem -> {document number -> [positions]}

    where a position is the offset of the stem inside the *parsed* token
    list of that document (i.e. after stop-word removal), not inside the
    raw text.
    '''

    def __init__(self, doc_file=None, docs=None):
        '''
        Parameters
        ----------
        doc_file : str, optional
            Path of a text file holding one document per line. When given,
            its contents take precedence over ``docs``.
        docs : list of str, optional
            Documents already held in memory.
        '''

        self.doc_file = doc_file
        self.documents = docs
        self.processed = None
        self._word_index = defaultdict(lambda: defaultdict(list))

    def _read_doc_file(self):
        '''Load ``self.documents`` from ``self.doc_file`` when a file was given.'''

        if self.doc_file is None:
            return

        with open(self.doc_file, 'r') as f:
            documents = f.readlines()

        if self.documents is not None:
            print("Overwrite document with that read from the file")

        self.documents = documents

    def _parse_collection(self):
        '''Run ``parse_doc`` over every document and keep the token lists.'''
        self.processed = [parse_doc(document) for document in self.documents]

    def _build_index(self):
        '''Build the inverted index from ``self.processed``.

        The index is rebuilt from scratch on every call, so calling
        ``create_index()`` more than once does not duplicate positions.
        '''

        word_index = defaultdict(lambda: defaultdict(list))
        for doc_id, tokens in enumerate(self.processed):
            for position, word in enumerate(tokens):
                word_index[word][doc_id].append(position)
        self._word_index = word_index

    def create_index(self):
        '''Master function to create the index: read, parse, then index.

        Prints a message and returns without building anything when no
        documents are available.
        '''

        self._read_doc_file()
        if self.documents is None:
            print("Failed to read documents")
            return
        self._parse_collection()
        self._build_index()

    @property
    def word_index(self):
        '''The inverted index as a plain ``dict`` (shallow copy of the outer map).'''
        return dict(self._word_index)

    def dump_index(self, file_name='word_index.p'):
        '''Pickle ``(word_index, documents)`` to ``file_name``.

        The file is opened in binary mode: pickle streams are bytes, and a
        text-mode handle fails on Python 3.
        '''
        with open(file_name, 'wb') as f:
            pickle.dump((dict(self._word_index), self.documents), f)

# Query by Words, rank by docs containing most words

In [296]:
# Example query, run through the same parser that is applied to the documents
# (`parse_doc`; the previously used name `word_parser` is not defined in this notebook).
Query = 'system human'
parsed = parse_doc(Query)

class WordQuerier(object):
    '''Rank documents by how many query terms they contain.

    The querier works on an inverted index of the form
    ``stem -> {document number -> [positions]}`` together with the list of
    documents the numbers refer to. Both can be passed in directly or
    loaded from a pickle file holding the tuple ``(word_index, documents)``.
    '''

    def __init__(self, index_file=None, word_index=None, documents=None):
        '''
        Parameters
        ----------
        index_file : str, optional
            Path of a pickle containing ``(word_index, documents)``. When
            given, its contents replace ``word_index`` and ``documents``.
        word_index : dict, optional
            Inverted index already held in memory.
        documents : list, optional
            Documents addressed by the document numbers in ``word_index``.
        '''

        self.index_file = index_file
        self.word_index = word_index
        self.documents = documents
        self.doc_freq = None

        self._read_index_file()

    def _read_index_file(self):
        '''Load index and documents from ``self.index_file`` when one was given.'''

        if self.index_file is None:
            return

        # Binary mode: pickle streams are bytes (text mode fails on Python 3).
        # NOTE: unpickling can execute arbitrary code; only load index files
        # from a trusted source.
        with open(self.index_file, 'rb') as f:
            word_index, documents = pickle.load(f)

        if self.word_index is not None:
            print("Overwrite word index with that read from the file")

        self.word_index = word_index
        self.documents = documents

    def query_words(self, words):
        '''Return the documents matching ``words``, best match first.

        The query is parsed with ``parse_doc``; every parsed term found in
        the index adds one to the score of each document containing it.
        Terms missing from the index are ignored. Documents are returned in
        descending score order; ties are broken by ascending document number
        so the result does not depend on dict iteration order.

        Parameters
        ----------
        words : str
            Free-text query.

        Returns
        -------
        list
            Entries of ``self.documents`` that contain at least one query
            term. The per-document scores stay available in
            ``self.doc_freq``.
        '''

        parsed = parse_doc(words)

        self.doc_freq = defaultdict(int)
        for word in parsed:
            if word not in self.word_index:
                continue
            # The keys of the posting map are the numbers of the documents
            # that contain this term.
            for doc_id in self.word_index[word]:
                self.doc_freq[doc_id] += 1

        ranked = sorted(self.doc_freq.items(),
                        key=lambda item: (-item[1], item[0]))
        return [self.documents[doc_id] for doc_id, _ in ranked]

In [290]:
# Build the inverted index for the corpus and pickle it to 'tmp.p'.
# Both `doc_file` and `docs` are supplied here, so the lines read from the
# file replace `adocuments` and the "Overwrite document ..." notice is printed.
a = IndexCreator(doc_file='tmp.txt', docs=adocuments)
a.create_index()
a.dump_index('tmp.p')

Overwrite document with that read from the file


In [297]:
# Load the pickled index and rank the documents for a three-word query.
# Query terms whose stem is absent from the index are skipped by query_words.
# `c` receives the matching documents, highest number of matched terms first.
words = 'system human search'
b = WordQuerier(index_file='tmp.p')
c = b.query_words(words)

# Rank by similarity of tfidf

1. Get idf of each term in corpus, and tf for each document
1. For a new query, get all terms in it
1. If there are no duplicate terms in the query, its tfidf vector is proportional to the idf of the terms it contains
1. Compute the TFIDF vector for each doc only using terms in the query
1. Compute similarity

In [8]:
# Fit a TF-IDF model on the bag-of-words corpus.
# NOTE(review): `corpus` is only assigned further down, in the Similarity
# section — confirm the intended cell order so this runs on a fresh kernel.
tfidf = models.TfidfModel(corpus)

2017-07-28 11:13:47,055 : INFO : collecting document frequencies
2017-07-28 11:13:47,057 : INFO : PROGRESS: processing document #0
2017-07-28 11:13:47,061 : INFO : calculating IDF weights for 9 documents and 11 features (28 matrix non-zeros)


In [10]:
# Apply the TF-IDF model to every document and print each weighted vector
# as a list of (term id, weight) pairs.
corpus_tfidf = tfidf[corpus]
for i in corpus_tfidf:
    # Single-argument call form: same output under Python 2 and Python 3.
    print(i)

[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(1, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.44424552527467476), (6, 0.3244870206138555), (7, 0.3244870206138555)]
[(0, 0.5710059809418182), (6, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)]
[(2, 0.49182558987264147), (6, 0.7184811607083769), (8, 0.49182558987264147)]
[(3, 0.6282580468670046), (4, 0.6282580468670046), (7, 0.45889394536615247)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(5, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]


# Similarity
https://radimrehurek.com/gensim/tut3.html

In [29]:
# Load the previously saved dictionary and Matrix Market corpus from the
# system temp directory.
# NOTE(review): both files must already exist; the cells that write them are
# not part of the visible notebook — confirm they are still present.
TEMP_FOLDER = tempfile.gettempdir()
dictionary = corpora.Dictionary.load(os.path.join(TEMP_FOLDER, 'deerwester.dict'))
corpus = corpora.MmCorpus(os.path.join(TEMP_FOLDER, 'deerwester.mm'))

2017-07-28 11:47:01,660 : INFO : loading Dictionary object from /var/folders/pv/r00j5j0n5g99lmw66v7rhlqc0000gn/T/deerwester.dict
2017-07-28 11:47:01,662 : INFO : loaded /var/folders/pv/r00j5j0n5g99lmw66v7rhlqc0000gn/T/deerwester.dict
2017-07-28 11:47:01,664 : INFO : loaded corpus index from /var/folders/pv/r00j5j0n5g99lmw66v7rhlqc0000gn/T/deerwester.mm.index
2017-07-28 11:47:01,666 : INFO : initializing corpus reader from /var/folders/pv/r00j5j0n5g99lmw66v7rhlqc0000gn/T/deerwester.mm
2017-07-28 11:47:01,668 : INFO : accepted corpus with 9 documents, 12 features, 28 non-zero entries


In [30]:
# Fit a 2-topic LSI model on the corpus; `dictionary` maps term ids back to
# words so the logged topics are readable.
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)

2017-07-28 11:47:21,126 : INFO : using serial LSI version on this node
2017-07-28 11:47:21,128 : INFO : updating model with new documents
2017-07-28 11:47:21,130 : INFO : preparing a new chunk of documents
2017-07-28 11:47:21,132 : INFO : using 100 extra samples and 2 power iterations
2017-07-28 11:47:21,133 : INFO : 1st phase: constructing (12, 102) action matrix
2017-07-28 11:47:21,134 : INFO : orthonormalizing (12, 102) action matrix
2017-07-28 11:47:21,137 : INFO : 2nd phase: running dense svd on (12, 9) matrix
2017-07-28 11:47:21,139 : INFO : computing the final decomposition
2017-07-28 11:47:21,141 : INFO : keeping 2 factors (discarding 43.156% of energy spectrum)
2017-07-28 11:47:21,142 : INFO : processed documents up to #9
2017-07-28 11:47:21,144 : INFO : topic #0(3.341): 0.644*"system" + 0.404*"user" + 0.301*"eps" + 0.265*"time" + 0.265*"response" + 0.240*"computer" + 0.221*"human" + 0.206*"survey" + 0.198*"interface" + 0.036*"graph"
2017-07-28 11:47:21,145 : INFO : topic #1(2

In [31]:
# Project a new query into the LSI space.
doc = "Human computer interaction"
# Bag-of-words form of the query: (term id, count) pairs for known terms.
vec_bow = dictionary.doc2bow(doc.lower().split())
# Coordinates of the query in the 2-topic LSI space.
vec_lsi = lsi[vec_bow]
# Explicit formatting gives the same "<lsi> <bow>" line under Python 2 and 3.
print("%s %s" % (vec_lsi, vec_bow))

[(0, 0.46182100453271624), (1, 0.070027665279001478)] [(1, 1), (2, 1)]


In [32]:
# Build a similarity index over the corpus after projecting it into LSI space.
index = similarities.MatrixSimilarity(lsi[corpus])

2017-07-28 12:00:03,458 : INFO : creating matrix with 9 documents and 2 features


In [35]:
# Persist the similarity index under TEMP_FOLDER and load it straight back,
# so later cells work with the reloaded object.
index.save(os.path.join(TEMP_FOLDER, 'deerwester.index'))
index = similarities.MatrixSimilarity.load(os.path.join(TEMP_FOLDER, 'deerwester.index'))

2017-07-28 12:00:50,739 : INFO : saving MatrixSimilarity object under /var/folders/pv/r00j5j0n5g99lmw66v7rhlqc0000gn/T/deerwester.index, separately None
2017-07-28 12:00:50,742 : INFO : saved /var/folders/pv/r00j5j0n5g99lmw66v7rhlqc0000gn/T/deerwester.index
2017-07-28 12:00:50,744 : INFO : loading MatrixSimilarity object from /var/folders/pv/r00j5j0n5g99lmw66v7rhlqc0000gn/T/deerwester.index
2017-07-28 12:00:50,746 : INFO : loaded /var/folders/pv/r00j5j0n5g99lmw66v7rhlqc0000gn/T/deerwester.index


In [37]:
# Similarity of the query to every indexed document, shown as
# (document number, similarity) pairs in corpus order.
sims = index[vec_lsi]
# Single-argument call form: same output under Python 2 and Python 3.
print(list(enumerate(sims)))

[(0, 0.99809301), (1, 0.93748635), (2, 0.99844527), (3, 0.9865886), (4, 0.90755945), (5, -0.12416792), (6, -0.10639259), (7, -0.098794632), (8, 0.050041769)]


In [41]:
# Display the tokenised corpus for reference.
# NOTE(review): `texts` is not assigned anywhere in the visible notebook —
# confirm the cell that creates it still exists.
texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]