In [1]:
import pandas as pd
from collections import defaultdict
from nltk.tokenize import TweetTokenizer
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from enum import Enum
import math
import pickle
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim import corpora
from gensim.corpora import Dictionary
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

from collections import OrderedDict



In [2]:
NUM_TOPICS = 100
WORD2VEC_FILE="C:\\Users\\mowonibi\\Downloads\\crawl-300d-2M.vec"

In [3]:
class QueryTermMode(Enum): 
    """ 
    Enum Class :
    Enumeration of functions to to use for the mode in combining the results of each query term
    """
    OR = set.union # all documents containing any of the query terms should be returned
    AND = set.intersection #only documents containing all of the query terms to be returned

class Helper:
    """
    Class containing any helper function to be used in the application
    """
    
    tknzr = TweetTokenizer()
    en_stop = set(nltk.corpus.stopwords.words('english'))
    @staticmethod
    def custom_tokenizer( sentence):
        """
        lemmatize lower case tokens of sentence tokenized using nltk tweet tokenizer
        Returns list of tokens
        """
        return [WordNetLemmatizer().lemmatize(token.lower() ) for token in Helper.tknzr.tokenize(sentence) if token not in Helper.en_stop and len(token) > 1]
    
    @staticmethod
    def simple_spliter_tokenizer(sentence, splitter="/"):
        """
        simple splitter based on user-defined regex expressions
        Returns list of tokens
        """
        return [token.lower()  for token in sentence.split(splitter)]

In [22]:
class RankFunction :
    
    @staticmethod
    def avg_embedding_similarity(inverted_index, doc_set =None):
        """
        Computes the BM25 scores of document given a query strinng and the query field
        Returns dictionary of document ids and BM25 scores
        """

        def avg_embedding_similarity_ranking(field_name, query_terms_as_string):
            docScore  =defaultdict(float)
            tokenized_text = inverted_index.field_names[field_name](query_terms_as_string)
            query_emb =inverted_index.get_avg_doc_embedding(tokenized_text)
            docs_wth_term =set()
            for word in tokenized_text:
                docs_wth_term =set.union(docs_wth_term, inverted_index.term_frequency[field_name][word].keys())
            #print(len(docs_wth_term))
            #print(query_emb)
            for doc_id in docs_wth_term:
                #print(doc_id)
                d_scor = 1- cosine_similarity(query_emb.reshape( 1,300), inverted_index.doc_avg_embeddings[field_name][doc_id].reshape( 1,300))[0]
                docScore [doc_id] = d_scor
                #print(doc_id, d_scor)
            return docScore
        return avg_embedding_similarity_ranking
    
    
    
    @staticmethod
    def lda_similarity(inverted_index, doc_set =None):
        """
        Computes the BM25 scores of document given a query strinng and the query field
        Returns dictionary of document ids and BM25 scores
        """
        def lda_similarity_ranking(field_name, query_terms_as_string):
            docScore  =defaultdict(float)
            tokenized_text = inverted_index.field_names[field_name](query_terms_as_string)
            #query_emb =inverted_index.get_avg_doc_embedding(tokenized_text)
            new_doc_bow = inverted_index.field_dict[field_name].doc2bow(tokenized_text)
            ldamodel=inverted_index.field_model[field_name]
            doc_topic = np.zeros((1,100))
            topic_tuple_list =ldamodel.get_document_topics(new_doc_bow)
            #print(topic_tuple_list)
            for top_ind,val in topic_tuple_list:
                doc_topic[0,top_ind]=val  
            docs_wth_term =set()
            for word in tokenized_text:
                docs_wth_term =set.union(docs_wth_term, inverted_index.term_frequency[field_name][word].keys())
            
            for doc_id in docs_wth_term:
                
                new_doc_bowx = inverted_index.field_dict[field_name].doc2bow(inverted_index.document_tokens[field_name][doc_id])
                doc_topicx = np.zeros((1,100))
                for top_ind,val in ldamodel.get_document_topics(new_doc_bowx):
                    doc_topicx[0,top_ind]=val
                
                
                d_scor = 1- cosine_similarity(doc_topicx,doc_topic )[0]
                docScore [doc_id] = d_scor
                #print(doc_id, d_scor)
            return docScore
        return lda_similarity_ranking
    
    
    
    @staticmethod
    def BM25_score(inverted_index, doc_set =None, PARAM_K1=1.2, PARAM_B=0.75, EPSILON=0.25):
        """
        Computes the BM25 scores of document given a query strinng and the query field
        Returns dictionary of document ids and BM25 scores
        """
        #https://github.com/nhirakawa/BM25/blob/master/src/rank.py - consider this implementation
        #https://www.elastic.co/blog/practical-bm25-part-2-the-bm25-algorithm-and-its-variables
        #https://www.quora.com/How-does-BM25-work
        #https://github.com/nhirakawa/BM25/blob/master/src/query.py
        
        def bm25_ranking(field_name, query_terms_as_string):
            docScore  =defaultdict(float)
            tokenized_text = inverted_index.field_names[field_name](query_terms_as_string)
            for word in tokenized_text:
                docs_wth_term =inverted_index.term_frequency[field_name][word]
                for doc, qtf in docs_wth_term.items():
                    #print(self.doc_index.avg_doc_len[field_name], self.doc_index.document_length[field_name][doc])
                    idf = inverted_index.idf[field_name][word] #if self.doc_index.idf[field_name][word] >= 0 else EPSILON * self.doc_index.average_idf[field_name]
                    docScore[doc] += (idf * qtf * (PARAM_K1 + 1)
                      / (qtf + PARAM_K1 * (1 - PARAM_B + PARAM_B * inverted_index.document_length[field_name][doc] / inverted_index.avg_doc_len[field_name])))
            return docScore
        return bm25_ranking

In [5]:
class Inverted_Index:
    """indexer class for managing the index i.e. adding index, removing index etc"""
    
    def __init__(self, file_location="products/products.csv",field_names={"category" :Helper.simple_spliter_tokenizer ,"title":Helper.custom_tokenizer,"description":Helper.custom_tokenizer}):
        self.file_location = file_location #file to index
        self.field_names  = field_names #This contains the set of field names used in the index and their tokenizer
        self.term_frequency =defaultdict( lambda: defaultdict(lambda: defaultdict(int)) ) # maps/dict of dict/maps of field -> term ->docs-> freq of occurence in doc
        self.document_tokens = defaultdict( lambda: defaultdict(list) ) #field  ->docs ->terms
        self.idf =defaultdict (lambda: dict() )#idf of field-> term-> idf
        self.inverted_index_term_proximity = defaultdict( lambda:defaultdict(lambda: defaultdict(list))) #field ->term ->doc ->list of positions in document
        self.corpus =None #pandas dataframe of the products 
        self.document_length=defaultdict( lambda:defaultdict(lambda: defaultdict(int))) #field -> document_id -> length of document
        self.num_of_doc=0 #total number of doc /products
        self.avg_doc_len = dict() #field -> average document length
        self.avg_idf = dict() #field -> avg idf
        self.field_dict=dict()
        self.field_corpus=dict()
        self.field_model=dict()
        self.term_embeddings= dict() #term-> embedding  #defaultdict(lambda: np.zeros((300,))) 
        self.doc_avg_embeddings=defaultdict(lambda: defaultdict(lambda: np.zeros((300,)))) # field ->doc ->embedding
    
    def index_texts(self, file_location=None, include_doc2vec=False, index_doc_embedding=False):
        """indexs the files in the text"""
        sum_of_doc_len=defaultdict(int) #field ->sum of document lenght  which can be used to compute average document length lateron
        field_doc_tokens=defaultdict(lambda: defaultdict(list)) #field->doc->tokens
        if file_location is None:
            file_location = self.file_location
        self.corpus = pd.read_csv(file_location)
        
        for index, row in self.corpus.iterrows(): #loop rows of products
            self.num_of_doc += 1
            #if (self.num_of_doc >20):
            #    break
            for field_name, tokenizer  in self.field_names.items():   #loops each field
                tokenized_text = tokenizer(row[field_name])
                if include_doc2vec :
                    field_doc_tokens[field_name][index]=tokenized_text
                current_doc_len= len(tokenized_text)
                self.document_length[field_name][index] =current_doc_len
                sum_of_doc_len[field_name] +=current_doc_len
                self.document_tokens[field_name][index]=tokenized_text

                for k,word in enumerate(tokenized_text):     #loops each tokenized text
                    self.inverted_index_term_proximity[field_name][word][index].append( k)
                    self.term_frequency[field_name] [word][index] += 1

        sum_of_idf=defaultdict(float) #field -> sum of the idf
        #computes the idf of each term in the corpus
        for field_name, field_tf in self.term_frequency.items(): 
            for word, tf in field_tf.items():
                self.idf[field_name][word] = math.log(self.num_of_doc - len(tf) + 0.5) - math.log(len(tf) + 0.5)
                sum_of_idf[field_name]+=self.idf[field_name][word] #aggregate/sums the idf by summing to the aggregator
        
        for field_name in self.field_names:
            self.avg_doc_len[field_name] = sum_of_doc_len[field_name] / self.num_of_doc
            self.avg_idf[field_name] = sum_of_idf[field_name] / self.num_of_doc
        
        if include_doc2vec :
            self._index_doc2vec(  field_doc_tokens)
        
        if index_doc_embedding:
            self.index_doc_avg_embedding()
    
    def _index_doc2vec(self,  field_doc_tokens):
        #field_corpus =dict()
        #field_dict =dict()
        for field_name, _  in self.field_names.items():
            dictionary = corpora.Dictionary([list(self.term_frequency[field_name].keys())])
            t_documents = [dictionary.doc2bow(text) for  text in  field_doc_tokens[field_name].values()]  #create bog of words (multi-set)
            #model = gensim.models.LdaMulticore(t_documents, num_topics = NUM_TOPICS, id2word=dictionary, passes=30, workers=3)
            #model.save(field_name +'_model100.gensim')
            #dictionary.save(field_name +'_dictionary.gensim')
            dictionary = Dictionary.load(field_name_+'dictionary.gensim')
            model = LdaModel.load(field_name_ +"model100.gensim", mmap='r')
            self.field_dict[field_name]=dictionary
            self.field_corpus[field_name]=t_documents
            self.field_model[field_name]=model
            
    def index_doc_avg_embedding (self):
        self.load_term_embedding()
        for field_name  in self.field_names.keys():
            for doc_id  in self.document_tokens[field_name].keys():
                self.doc_avg_embeddings[field_name][doc_id] = self.get_avg_doc_embedding(self.document_tokens[field_name][doc_id]) 
                   
                
    def get_avg_doc_embedding (self, tokens):
        embedding_accum = np.zeros((300,))
        d_len=0
        for tkn in tokens:
            if (tkn in self.term_embeddings.keys()):
                embedding_accum +=self.term_embeddings[tkn]
                d_len+=1
        return embedding_accum/d_len

    def  load_term_embedding(self):
        term_set= set()
        for field_name in self.field_names.keys():
            term_set = set.union(term_set,self.term_frequency[field_name].keys() )
        #print(term_set)
        with open(WORD2VEC_FILE, encoding='utf8') as f:
            next(f)
            for l in f:
                w = l.split(' ')
                if w[0] in term_set:
                    self.term_embeddings[w[0]] = np.array([float(x) for x in w[1:301]]) 

In [6]:
class Searcher:
    """class for searching document based on a given index"""
    def __init__(self, doc_index ):
        self.doc_index = doc_index
        
    def field_term_search(self, field_name, query_tokenized_terms_list, max_separation_between_terms = None, q_mode=QueryTermMode.OR) : #max_separation_between_terms removes effect of q_mode
        """
        search list of tokens, in a specific field, given query mode and max separation between terms in a given document
        Returns list of set of doucment/product id
        """
        
        result_set =None
        prev_result_set=None
        if max_separation_between_terms is None :
            for q_term in query_tokenized_terms_list:
                result_set =set(self.doc_index.inverted_index_term_proximity[field_name][q_term].keys()) if result_set is None else q_mode(result_set, set(self.doc_index.inverted_index_term_proximity[field_name][q_term].keys()))
        else :
            for q_term in tokenized_terms_list:
                if result_set is None :
                    prev_result_set =self.doc_index.inverted_index_term_proximity[field_name][q_term]
                    result_set = set(self.doc_index.inverted_index_term_proximity[field_name][q_term].keys())
                else :
                    temp_result_set = set()
                    curr_result_set = self.doc_index.inverted_index_term_proximity[field_name][q_term]
                    for doc_id in QueryTermMode.AND(result_set, set(curr_result_set.keys())):
                        positions_current_term =curr_result_set[doc_id]
                        positions_previous_term =prev_result_set[doc_id]
                        for tc_position in positions_current_term :
                            for tp_position in positions_previous_term:
                                tp_tc =tc_position - tp_position
                                if (tp_tc>0 and tp_tc<=max_separation_between_terms ):
                                    temp_result_set.add(doc_id)
                                    break
                    result_set =temp_result_set
                    prev_result_set = curr_result_set             
        return result_set

    
    def term_search(self, query_terms_as_string, q_mode=QueryTermMode.OR):
        """
        search for query string in all fields
        Returns set of document/product ids 
        """
        result_set=None
        for field_name, tokenizer  in self.doc_index.field_names.items():   
            tokenized_text = tokenizer(query_terms_as_string)
            result_set   = self.field_term_search (field_name, tokenized_text, q_mode=q_mode) if result_set is None else QueryTermMode.OR(result_set, self.field_term_search (field_name, tokenized_text, q_mode=q_mode))   
        
        return result_set
    
    def ranked_search(self, field_name, query_terms_as_string, length, ranking_function):
        """Returns the `size` most relevant documents based on the `query`"""
        scores = ranking_function(field_name, query_terms_as_string)
        scores=sorted(scores,  key = lambda x: (-scores[x], x))
        #indexes= list(scores.keys())
        return scores[:length]
      

In [7]:
%%time
nltk.download('stopwords')
inv_ind = Inverted_Index()
inv_ind.index_texts(include_doc2vec=True, index_doc_embedding=False)

  diff = np.log(self.expElogbeta)


Wall time: 45min


In [10]:
inv_ind.field_dict['title'].save("title" +'_dictionary.gensim')
#dictionary.save(field_name +'_dictionary.gensim')

In [12]:
%%time
sso =Searcher(inv_ind)
ssss = sso.term_search("android iphone ipad", QueryTermMode.AND)
print(len(ssss))

54
Wall time: 1 ms


In [28]:
ranking_alg = RankFunction.lda_similarity(inv_ind)
ranking_alg2 = RankFunction.BM25_score(inv_ind)

In [29]:
sso.ranked_search("description", "android iphone ipad", 10,ranking_alg)


[2354, 2364, 2366, 2369, 2374, 2384, 2385, 2388, 2395, 2398]

In [30]:
sso.ranked_search("description", "android iphone ipad", 10,ranking_alg2)


[75814, 75831, 93734, 51592, 63858, 105597, 56691, 13010, 32735, 123506]

In [31]:
inv_ind.corpus.description.values[75814:75815]

array(["Part of the connected lifestyle range from Netatmo, the Weather Station for Smartphones has an indoor and outdoor module so you can effectively adapt you lifestyle according to the environment. Indoor module Place this module anywhere inside the home to maximise your family's comfort. Using the app you can access: indoor temperature, relative humidity, indoor air quality, C02 readings and even a sound meter so you can live in a healthier home. Outdoor module This module means that you can plan your outdoor activities in accordance to real-time weather readings. With the app, you can access data to: outdoor temperature, outdoor relative humidity, outdoor air quality, barometric pressure and the weather. Analyse your data The Netatmo web app will display all data in the form of graphs. This not only allows you to observe the cycles and forecast variations around you, but you can also get a more accurate idea of your environment over time. Design With its simple, ergonomic and use

In [None]:
BM25_score