In [3]:
import re

import jieba
from IPython.core.display import HTML

class Searcheriindex():
    """Tiny inverted-index search engine over a list of titles.

    Every title is segmented with jieba and each resulting term is mapped to
    the set of document ids (positions in ``title_list``) that contain it.
    Queries are translated into a Python set expression (``&``, ``|``, ``-``
    over ``self.word_match(...)`` calls) which is then evaluated.
    """

    # Query tokens that conv_query turns into set operators.
    _AND_TOKENS = ("and", "AND", "+")
    _OR_TOKENS = ("or", "OR")
    _NOT_TOKENS = ("not", "NOT", "-")
    _OPERATOR_TOKENS = _AND_TOKENS + _OR_TOKENS + _NOT_TOKENS
    # Tokens that are query syntax rather than search terms (never highlighted).
    _SYNTAX_TOKENS = ("(", ")", " ") + _OPERATOR_TOKENS

    def __init__(self,title_file, encoding=None):
        """Build the index from a text file of titles.

        Args:
            title_file: path of the file to read.
            encoding: text encoding passed to ``open``; ``None`` keeps the
                platform default (the previous behaviour).
        """
        self.index = dict()
        self.max_id = 0
        self.title_list = []

        with open(title_file, 'r', encoding=encoding) as f:
            titles = f.read()

        # NOTE(review): split() breaks on *any* whitespace, so a title that
        # contains a space is indexed as several documents. Confirm the file
        # format; use splitlines() if it is one title per line.
        for title in titles.split():
            self.add_doc(title)

    def add_doc(self,doc):
        """Add one document to the index.

        Args:
            doc: the text to index.

        Returns:
            The id assigned to the new document.
        """
        doc_id = self.max_id
        self.title_list.append(doc)
        for term in jieba.cut_for_search(doc):
            self.index.setdefault(term, set()).add(doc_id)
        self.max_id += 1
        return doc_id

    def word_match(self,word):
        """Return the ids of documents containing every jieba token of ``word``.

        The result is always a fresh set, so callers can never mutate a
        posting set stored in the index. An empty ``word`` yields an empty set.
        """
        result = None
        for term in jieba.cut(word):
            posting = self.index.get(term, set())
            result = set(posting) if result is None else result & posting
        return result if result is not None else set()

    def conv_query(self,query):
        """Translate a user query into a Python set expression (as a string).

        ``and``/``AND``/``+`` become ``&``, ``or``/``OR`` become ``|`` and
        ``not``/``NOT``/``-`` become ``-``. Consecutive non-syntax tokens are
        glued back into one term and wrapped in ``self.word_match(...)``; two
        terms separated only by a space are joined with an implicit ``&``.

        Args:
            query: the raw query string.

        Returns:
            The expression source, meant to be evaluated with ``self`` bound.
        """
        query_new_parts = []
        all_parts = list(jieba.cut(query))
        count_parts = len(all_parts)
        # A token directly followed by one of these closes the current term.
        # Operators are included so "A+B" / "三星or华为" do not emit the
        # operator ahead of its left operand.
        term_enders = (" ", ")") + self._OPERATOR_TOKENS
        # After "term<space>", these tokens mean no implicit AND is inserted.
        no_implicit_and = ("(", ")", " ") + self._OPERATOR_TOKENS
        cache = ""
        idx = 0
        while idx < count_parts:
            part = all_parts[idx]
            if part == "(" or part == ")":
                query_new_parts.append(part)
            elif part == " ":
                query_new_parts.append(" ")
            elif part in self._AND_TOKENS:
                query_new_parts.append("&")
            elif part in self._OR_TOKENS:
                query_new_parts.append("|")
            elif part in self._NOT_TOKENS:
                query_new_parts.append("-")
            elif (idx + 1 < count_parts
                  and all_parts[idx+1] not in term_enders):
                # The term continues with the next token; keep accumulating.
                cache += part
            else:
                # End of a term: always flush the accumulated prefix with it,
                # so it can never leak into the following term.
                # {!r} quotes the term safely even if it contains ' or \.
                piece = "self.word_match({!r})".format(cache + part)
                cache = ""
                if (idx + 2 < count_parts
                        and all_parts[idx+1] == " "
                        and all_parts[idx+2] not in no_implicit_and):
                    piece += " & "
                query_new_parts.append(piece)
            idx += 1
        query_new = " ".join(query_new_parts)
        return query_new

    def highlighter(self,doc,word):
        """Wrap every occurrence of the query's terms in ``doc`` in a red span.

        All terms are replaced in a single pass, so a term can never match
        inside markup that was inserted for another term.

        Args:
            doc: the text to decorate.
            word: the raw query; operators, parentheses and whitespace in it
                are ignored.

        Returns:
            ``doc`` with the matches wrapped in ``<span style="color:red">``.
        """
        terms = {
            part for part in jieba.cut(word)
            if part.strip() and part not in self._SYNTAX_TOKENS
        }
        if not terms:
            return doc
        # Longest alternatives first so a longer term wins over its prefix.
        ordered = sorted(terms, key=lambda t: (-len(t), t))
        pattern = re.compile("|".join(re.escape(t) for t in ordered))
        return pattern.sub(
            lambda m: '<span style="color:red">{}</span>'.format(m.group(0)),
            doc)

    def search(self,query):
        """Run ``query`` and return the matching titles with hits highlighted.

        Args:
            query: the raw query string.

        Returns:
            Highlighted titles, ordered by document id; ``[]`` for an empty
            or blank query.

        Raises:
            SyntaxError, TypeError: if the query is malformed (for example
                unbalanced parentheses or a dangling operator).
        """
        query_new = self.conv_query(query)
        if not query_new.strip():
            return []
        # NOTE(security): the expression is evaluated with eval(). It is built
        # only from fixed operator strings and repr()-quoted terms, and is run
        # without builtins and with nothing but `self` in scope.
        matches = eval(query_new, {"__builtins__": {}}, {"self": self})
        return [self.highlighter(self.title_list[did], query)
                for did in sorted(matches)]

In [4]:
# Build the baseline searcher; the constructor reads the title file and indexes every title.
searcher = Searcheriindex("tiles.txt")

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\FJC\AppData\Local\Temp\jieba.cache
Loading model cost 0.704 seconds.
Prefix dict has been built succesfully.


In [5]:
# For each demo query show jieba's segmentation, the generated set
# expression, and then every matching title with its hits highlighted.
for query in ("苹果芯片 and (三星 or 华为)", "3 - 0"):
    print("|".join(jieba.cut(query)))
    print(searcher.conv_query(query))
    for title in searcher.search(query):
        display(HTML(title))

苹果|芯片| |and| |(|三星| |or| |华为|)
self.word_match('苹果芯片')   &   ( self.word_match('三星')   |   self.word_match('华为') )


3| |-| |0
self.word_match('3')   -   self.word_match('0')


In [18]:
import string

class SearcherIIndexVII(Searcheriindex):
    """Searcher that keeps runs of ASCII letters, digits, '-', ':' and '.' whole.

    Text is first cut into runs of "word-like" characters (for example
    ``3-0`` or ``10:30``) and runs of everything else. Word-like runs become
    a single term; the other runs are segmented by jieba. Each resulting
    token is indexed and matched as an individual term.
    """

    # Query tokens that conv_query turns into set operators.
    _OP_AND = ("and", "AND", "+")
    _OP_OR = ("or", "OR")
    _OP_NOT = ("not", "NOT", "-")
    _OPS = _OP_AND + _OP_OR + _OP_NOT

    def _tokenize(self, text, cut):
        """Split ``text`` into tokens; shared by parse_doc and parse_query.

        Args:
            text: the string to split.
            cut: the jieba segmentation function applied to runs that are
                not word-like.

        Returns:
            The token list. A single ``' '`` token is emitted after a run
            that is ended by a space; leading and repeated spaces emit
            nothing.
        """
        tokens = []

        def emit(run, is_word):
            # Word-like runs stay whole; everything else goes through jieba.
            if is_word:
                tokens.append(run)
            else:
                tokens.extend(cut(run))

        run = ''
        run_is_word = None  # None means no run is currently open
        for c in text:
            if c == ' ':
                if run_is_word is not None:
                    emit(run, run_is_word)
                    tokens.append(' ')
                run = ''
                run_is_word = None
                continue
            is_word = (c in string.ascii_letters
                       or c.isdigit()
                       or c in ('-', ':', '.'))
            if run_is_word is not None and is_word == run_is_word:
                run += c
            else:
                # Character class changed (or first character): close the
                # previous run, if any, and start a new one.
                if run_is_word is not None:
                    emit(run, run_is_word)
                run = c
                run_is_word = is_word
        if run:
            emit(run, run_is_word)
        return tokens

    def parse_doc(self, doc):
        """Tokenize a document for indexing (jieba search-mode segmentation)."""
        return self._tokenize(doc, jieba.cut_for_search)

    def parse_query(self, doc):
        """Tokenize a query string (jieba default-mode segmentation)."""
        return self._tokenize(doc, jieba.cut)

    def add_doc(self, doc):
        """Add a new document to the index.

        Args:
            doc: the document (text) to make searchable.

        Returns:
            The id of the newly added document.
        """
        doc_id = self.max_id
        self.title_list.append(doc)
        for term in self.parse_doc(doc):
            # Create or extend the posting set of each term.
            self.index.setdefault(term, set()).add(doc_id)
        self.max_id += 1
        return doc_id

    def dumpIndex(self):
        """Print the whole term -> posting-set mapping (debugging aid)."""
        print(self.index)

    def conv_query(self,query):
        """Translate a user query into a Python set expression (as a string).

        ``and``/``AND``/``+`` become ``&``, ``or``/``OR`` become ``|`` and
        ``not``/``NOT``/``-`` become ``-``. Every other token becomes a
        ``self.term_match(...)`` call; tokens that follow each other directly,
        or are separated only by a space, are joined with ``&``.

        Args:
            query: the raw query string.

        Returns:
            The expression source, meant to be evaluated with ``self`` bound.
        """
        query_new_parts = []
        all_parts = list(self.parse_query(query))
        count_parts = len(all_parts)
        # A token directly followed by one of these is the last of its run.
        # Operators are included so "A+B" does not emit "&" ahead of "A".
        term_enders = (" ", ")") + self._OPS
        # After "term<space>", these tokens mean no implicit AND is inserted.
        no_implicit_and = ("(", ")") + self._OPS
        cache = ""
        idx = 0
        while idx < count_parts:
            part = all_parts[idx]
            if part == "(" or part == ")":
                query_new_parts.append(part)
            elif part == " ":
                query_new_parts.append(" ")
            elif part in self._OP_AND:
                query_new_parts.append("&")
            elif part in self._OP_OR:
                query_new_parts.append("|")
            elif part in self._OP_NOT:
                query_new_parts.append("-")
            elif (idx + 1 < count_parts
                  and all_parts[idx+1] not in term_enders):
                # More tokens of the same run follow: AND them together.
                # {!r} quotes the term safely even if it contains ' or \.
                cache += "self.term_match({!r}) & ".format(part)
            elif (idx + 2 < count_parts
                  and all_parts[idx+1] == " "
                  and all_parts[idx+2] not in no_implicit_and):
                # "term<space>term": implicit AND. Flush the pending run here
                # so the terms are emitted in query order.
                query_new_parts.append(
                    cache + "self.term_match({!r}) & ".format(part))
                cache = ""
                idx += 2  # skip the separating space
                continue
            else:
                query_new_parts.append(
                    cache + "self.term_match({!r})".format(part))
                cache = ""
            idx += 1
        query_new = " ".join(query_new_parts)
        return query_new

    def term_match(self,term):
        """Return the posting set of ``term`` (an empty set if it is unknown)."""
        return self.index.get(term, set())


In [24]:
# Build the run-aware searcher, then show how the query is tokenized,
# which set expression it becomes, and finally the highlighted hits.
searcherII = SearcherIIndexVII("tiles.txt")
query = "3-0 and (中国美国)"
print(searcherII.parse_query(query))
print(searcherII.conv_query(query))
result = searcherII.search(query)
if not result:
    print("No result")
for doc in result:
    display(HTML(doc))


['3-0', ' ', 'and', ' ', '(', '中国', '美国', ')']
self.term_match('3-0')   &   ( self.term_match('中国') & self.term_match('美国') )
