这一节,我们以搜索引擎为背景,实现一个面向对象的搜索引擎,来巩固面向对象的思想

一个搜索引擎由搜索器,索引器,检索器和用户接口组成

#### 定义SearchEngineBase基类

In [1]:
class SearchEngineBase(object):
    """Abstract base class shared by the search engines in this file.

    ``add_corpus`` implements the common file loading step. Subclasses
    provide ``process_corpus`` (how a document is stored) and ``search``
    (how a query is answered).
    """

    def __init__(self):
        pass

    def add_corpus(self, file_path):
        """Read a file and pass its content to ``process_corpus``.

        Args:
            file_path: Path of the text file to load. The path itself is
                used as the document id.
        """
        # Decode explicitly as UTF-8 so the indexed text does not depend on
        # the platform's default locale encoding.
        with open(file_path, 'r', encoding='utf-8') as fin:
            text = fin.read()
        self.process_corpus(file_path, text)

    def process_corpus(self, id, text):
        """Process ``text`` and store the result under ``id``.

        Must be overridden by subclasses.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError('process_corpus not implement')

    def search(self, query):
        """Answer ``query``.

        Must be overridden by subclasses.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError('search not implement')
    
def main(search_engine, file_paths=None):
    """Load a corpus into ``search_engine`` and run an interactive query loop.

    Each line read from standard input is passed to ``search_engine.search``;
    the number of results and then each result are printed. The loop ends
    when the line ``quit`` is entered or when standard input is exhausted.

    Args:
        search_engine: Object providing ``add_corpus(file_path)`` and
            ``search(query)``; ``search`` must return a sized iterable.
        file_paths: Optional iterable of files to load. Defaults to the five
            sample files under ``../resources``.
    """
    if file_paths is None:
        file_paths = [
            '../resources/1.txt',
            '../resources/2.txt',
            '../resources/3.txt',
            '../resources/4.txt',
            '../resources/5.txt',
        ]

    for file_path in file_paths:
        search_engine.add_corpus(file_path)

    while True:
        try:
            query = input()
        except EOFError:
            # stdin ended without an explicit 'quit' (e.g. piped input):
            # stop cleanly instead of crashing with a traceback.
            break
        if query == 'quit':
            break
        results = search_engine.search(query)
        print('found {} result(s):'.format(len(results)))
        for result in results:
            print(result)

In [2]:
class SimpleEngine(SearchEngineBase):
    """Naive engine that stores raw document text and matches by substring."""

    def __init__(self):
        super().__init__()
        # Maps document id -> the document's full, unmodified text.
        self.__id_to_texts = {}

    def process_corpus(self, id, text):
        """Store ``text`` unchanged under ``id``."""
        self.__id_to_texts[id] = text

    def search(self, query):
        """Return the ids of all documents whose text contains ``query``."""
        return [
            doc_id
            for doc_id, body in self.__id_to_texts.items()
            if query in body
        ]
    
# Build the substring-matching engine and start the interactive query loop
# (enter 'quit' to leave the loop).
search_engine = SimpleEngine()
main(search_engine)

simple
found 0 result(s):
quit


### Bag of Words 和 Inverted Index

In [3]:
import re

In [4]:
class BOWEngine(SearchEngineBase):
    """Bag-of-words engine: each document is reduced to its set of words.

    A document matches a query when every word of the query occurs in the
    document's word set, regardless of order or position.
    """

    def __init__(self):
        super().__init__()
        # Maps document id -> set of lowercase words found in the document.
        self.__id_to_words = {}

    def process_corpus(self, id, text):
        """Store the word set of ``text`` under ``id``."""
        self.__id_to_words[id] = self.parse_text_to_words(text)

    def search(self, query):
        """Return the ids of documents containing every word of ``query``."""
        wanted = self.parse_text_to_words(query)
        return [
            doc_id
            for doc_id, vocabulary in self.__id_to_words.items()
            if self.query_match(wanted, vocabulary)
        ]

    @staticmethod
    def query_match(query_words, words):
        """Return True when every item of ``query_words`` is in ``words``."""
        return all(term in words for term in query_words)

    @staticmethod
    def parse_text_to_words(text):
        """Split ``text`` into a set of lowercase words.

        Every character that is neither a word character nor a space
        (punctuation, newlines, ...) is replaced by a space before splitting.
        """
        cleaned = re.sub(r'[^\w ]', ' ', text).lower()
        # Consecutive spaces yield empty strings when splitting; drop them.
        return {token for token in cleaned.split(' ') if token}

In [5]:
# Build the bag-of-words engine and start the interactive query loop
# (enter 'quit' to leave the loop).
search_engine=BOWEngine()
main(search_engine)

i have a dream
found 3 result(s):
../resources/1.txt
../resources/2.txt
../resources/3.txt
quit


In [None]:
#没更新完这部分