# 小型搜索引擎

In [72]:
import re
import hashlib
from dataclasses import dataclass, field


class BloomFilter:
    ''' Bloom filter used to remember which URLs have already been noted.

    The bits are stored in a list of integers ("words"); each word holds
    ``BIT_MAX`` bits and the list has ``LENGTH`` words, giving
    ``BIT_MAX * LENGTH`` addressable bits. Two hash functions are used, so a
    URL counts as visited only when both of its bits are set. As with any
    Bloom filter, false positives are possible but false negatives are not.

    Both hashes are derived from cryptographic digests rather than the
    built-in ``hash()``, because ``hash()`` of a string is salted per process
    and a filter restored through ``bloom_filter`` would otherwise not match.

    Args:
        bloom_filter: Optional existing list of words to reuse. When truthy
            it is used as-is (not copied) and defines ``LENGTH``.
        length: Requested number of bits when no list is supplied; rounded
            down to a whole number of ``BIT_MAX``-bit words.

    Note:
        A filter with zero words (the default arguments) can be constructed,
        but hashing a URL against it raises ``ZeroDivisionError``.
    '''

    def __init__(self, bloom_filter=None, length=0):
        # Number of bits stored in each list element.
        self.BIT_MAX = 10

        if bloom_filter:
            self._bloom_filter = bloom_filter
            self.LENGTH = len(bloom_filter)
        else:
            self.LENGTH = length // self.BIT_MAX
            self._bloom_filter = [0] * self.LENGTH

    def note_url(self, url):
        ''' Record ``url`` as visited by setting both of its bits. '''
        self._set_hash_1_in_bloom_filter(url)
        self._set_hash_2_in_bloom_filter(url)

    def is_visited(self, url):
        ''' Return True when both bits for ``url`` are set, else False. '''
        return (self._get_hash_1_in_bloom_filter(url)
                and self._get_hash_2_in_bloom_filter(url))

    def _hash_1(self, url):
        ''' First bit position for ``url`` (MD5 based). '''
        digest = hashlib.md5(url.encode('utf-8')).hexdigest()
        return int(digest, 16) % (self.BIT_MAX * self.LENGTH)

    def _hash_2(self, url):
        ''' Second bit position for ``url`` (SHA-1 based). '''
        digest = hashlib.sha1(url.encode('utf-8')).hexdigest()
        return int(digest, 16) % (self.BIT_MAX * self.LENGTH)

    def _test_bit(self, position):
        ''' Return whether the bit at absolute ``position`` is set. '''
        index, bit = divmod(position, self.BIT_MAX)
        # Mask with 1 so higher bits in the same word do not count as a hit.
        return bool((self._bloom_filter[index] >> bit) & 1)

    def _set_bit(self, position):
        ''' Set the bit at absolute ``position``. '''
        index, bit = divmod(position, self.BIT_MAX)
        # OR instead of += so setting the same bit twice cannot carry over.
        self._bloom_filter[index] |= 1 << bit

    def _get_hash_1_in_bloom_filter(self, url):
        return self._test_bit(self._hash_1(url))

    def _get_hash_2_in_bloom_filter(self, url):
        return self._test_bit(self._hash_2(url))

    def _set_hash_1_in_bloom_filter(self, url):
        self._set_bit(self._hash_1(url))

    def _set_hash_2_in_bloom_filter(self, url):
        self._set_bit(self._hash_2(url))

        
@dataclass
class Link:
    ''' A page to crawl, plus helpers that turn its HTML into a set of words. '''
    # Page identifier; it is embedded into the simulated body via str().
    id: int
    # Address of the page.
    url: str
    # HTML text of the page. Empty until get() has been called, so that
    # filter_words() on an un-fetched link yields no words instead of failing.
    body: str = field(default='', repr=False, init=False)

    def get(self):
        ''' Fetch the page.

        Real code would download the page here and store its HTML text;
        this demo fabricates a small body from the link id instead.
        '''
        self.body = f'<p>test{self.id}</p><p>t</p>'
        self.body += '<script>balabala</script>'

    def filter_words(self):
        ''' Strip the HTML in ``body`` and return the set of words it contains. '''
        text = Link._moved_scrpit(self.body)
        text = Link._moved_tag(text)
        return Link._split(text)

    def get_links(self):
        ''' Extract outgoing links from the page (not implemented; returns []). '''
        # TODO: extract outgoing links
        return []

    @staticmethod
    def _moved_scrpit(body):
        ''' Remove <script> and <style> elements, whose content users never see. '''
        # Non-greedy so text between two separate elements is kept; DOTALL so
        # multi-line elements match; [^>]* so tags with attributes match.
        return re.sub(
            r'<(script|style)\b[^>]*>.*?</\1\s*>',
            '',
            body,
            flags=re.IGNORECASE | re.DOTALL,
        )

    @staticmethod
    def _moved_tag(body):
        ''' Replace every HTML tag (attributes included) with a tab separator. '''
        return re.sub(r'</?\w+[^>]*>', '\t', body)

    @staticmethod
    def _split(body):
        ''' Tokenise tab-separated text into a set of non-empty words. '''
        return {word for word in body.split('\t') if word}

@dataclass
class Index:
    ''' One row of the inverted index: a word id and the URLs containing it.

    The serialised form is ``"<word_id>\\t<url>,<url>,..."``.
    '''
    # Numeric id of the word this row belongs to.
    word_id: int
    # URLs recorded for the word, in insertion order.
    urls: list = field(default_factory=list)

    def load(self, string):
        ''' Populate this object from a serialised row.

        Args:
            string: Row text as produced by to_inverted_index(); a trailing
                line break is ignored.

        Raises:
            ValueError: If the word id part is not an integer.
        '''
        word_id, _, urls = string.rstrip('\r\n').partition('\t')
        # Keep word_id an int so it matches the declared field type.
        self.word_id = int(word_id)
        # ''.split(',') would give [''], which later serialises as ",url".
        self.urls = urls.split(',') if urls else []

    def to_inverted_index(self):
        ''' Serialise this row as ``"<word_id>\\t<comma separated urls>"``. '''
        return '%s\t%s' % (self.word_id, ','.join(self.urls))

    
# Crawler state. These four variables could be persisted to files.
# URLs waiting to be crawled.
links = [f'www.test.com/{n}' for n in range(10)]
# Bloom filter used for de-duplication.
bloom_filter = 0
# Raw data of the pages crawled so far. It should not really live in memory;
# because it is large it should be written to a file.
doc_raw = ''
# Page id -> page URL.
doc_ids = dict()

# Number of bits in the Bloom filter.
BLOOM_LENGTH = 10 * 1024
# URL id counter: the next id that new_url_id() will hand out.
_url_id = 0


def new_url_id():
    ''' Return the next URL id (0, 1, 2, ...) and advance the module counter. '''
    global _url_id
    allocated, _url_id = _url_id, _url_id + 1
    return allocated


# Initialise the Bloom filter with BLOOM_LENGTH bits.
bloom = BloomFilter(length=BLOOM_LENGTH)    
# Pre-mark one URL as already noted so the crawl loop's de-duplication
# check has a hit to demonstrate.
bloom.note_url('www.test.com/0')

# Inverted index: one "<word_id>\t<url>,<url>,..." row per word.
# It is seeded with a sample row for word id 0.
inverted_index = ['0\twww.1.com,www.2.com']
# Word -> word id.
word_ids = {}
# Word id -> row number of that word inside inverted_index.
# The seeded row belongs to word id 0, so the key must be 0. Mapping id 1 to
# it would merge the first crawled word into the sample row's URLs.
word_id_position = {0: 0}
# Word id counter: the next id that new_word_id() will hand out.
_word_id = 1


def new_word_id():
    ''' Return the next word id (1, 2, 3, ...) and advance the module counter. '''
    global _word_id
    allocated, _word_id = _word_id, _word_id + 1
    return allocated


# Simulate crawling the queued URLs.
# `links` doubles as the work queue: links discovered while crawling are
# appended to it, and the for-loop picks them up because it walks the list
# by position.
for url in links:
    id_ = new_url_id()
    link = Link(id=id_, url=url)

    print(link)
    # De-duplication: ask the Bloom filter once and reuse the answer.
    not_crawled = not bloom.is_visited(link.url)
    print('- 未爬取？-->', not_crawled)
    if not_crawled:
        # Simulated fetch.
        link.get()
        # Extract outgoing links from the fetched HTML and queue them.
        _links = link.get_links()
        links.extend(_links)
        # Remember that this URL has been crawled.
        bloom.note_url(url)
        # Record page id -> URL.
        doc_ids[id_] = url
        # Record the raw body.
        doc_raw += str(id_) + '\t' + link.body + '\n\n\n'

        # Words left after stripping the HTML.
        words = link.filter_words()
        print('- 过滤、分词 -->', words)
        # Build the index.
        for word in words:
            # Membership test rather than `.get(word, False) == False`,
            # which would treat a word whose id is 0 as unknown.
            if word not in word_ids:
                word_ids[word] = new_word_id()
            word_id = word_ids[word]

            row = word_id_position.get(word_id)
            if row is not None:
                # Known word: load its row, append this page, write it back.
                index = Index(word_id=word_id)
                index.load(inverted_index[row])
                index.urls.append(link.url)
                inverted_index[row] = index.to_inverted_index()
            else:
                # New word: append a fresh row and remember where it lives.
                row = len(inverted_index)
                index = Index(word_id=word_id)
                index.urls.append(link.url)
                inverted_index.append(index.to_inverted_index())
                word_id_position[word_id] = row
    print('-'*20)


# Query

test_word = 'test1'
# Look up the word id for the word (raises KeyError if the word was never indexed).
test_word_id = word_ids[test_word]
# Look up the row of that word id inside the inverted index.
test_row = word_id_position[test_word_id]
# Read the row directly and take the part after the tab: the comma separated
# entries recorded for the word (the crawl loop above stores page URLs there).
_, test_urls = inverted_index[test_row].split('\t')
test_urls = test_urls.split(',')
print(test_urls)
# The original design maps a list of page ids to URLs at this point before
# returning to the user; here the rows already hold URLs.

Link(id=0, url='www.test.com/0')
- 未爬取？--> False
--------------------
Link(id=1, url='www.test.com/1')
- 未爬取？--> True
- 过滤、分词 --> {'t', 'test1'}
--------------------
Link(id=2, url='www.test.com/2')
- 未爬取？--> True
- 过滤、分词 --> {'t', 'test2'}
--------------------
Link(id=3, url='www.test.com/3')
- 未爬取？--> True
- 过滤、分词 --> {'test3', 't'}
--------------------
Link(id=4, url='www.test.com/4')
- 未爬取？--> True
- 过滤、分词 --> {'t', 'test4'}
--------------------
Link(id=5, url='www.test.com/5')
- 未爬取？--> True
- 过滤、分词 --> {'t', 'test5'}
--------------------
Link(id=6, url='www.test.com/6')
- 未爬取？--> True
- 过滤、分词 --> {'t', 'test6'}
--------------------
Link(id=7, url='www.test.com/7')
- 未爬取？--> True
- 过滤、分词 --> {'t', 'test7'}
--------------------
Link(id=8, url='www.test.com/8')
- 未爬取？--> True
- 过滤、分词 --> {'t', 'test8'}
--------------------
Link(id=9, url='www.test.com/9')
- 未爬取？--> True
- 过滤、分词 --> {'t', 'test9'}
--------------------
['www.test.com/1']


# 参考资料

- [极客时间-数据结构与算法之美-剖析搜索引擎背后的经典数据结构和算法](https://time.geekbang.org/column/article/79433)