In [52]:
from urllib import request
from bs4 import BeautifulSoup
import re
import jieba
from collections import defaultdict
import pymysql

In [55]:
class crawler:
    def __init__(self, dbname):
        self.word_url = defaultdict(set)
        self.connect = pymysql.connect(host="localhost", user="root", password="root", charset="utf8mb4", db="searchengine")
        self.cursor = self.connect.cursor()
        
    def __del__(self):
        self.connect.close()
    
    def dbcommit(self):
        self.connect.commit()
    
    def addtoindex(self,url,soup):            
        urlid = self.getentryid('urllist','url',url)
        
        text = self.gettextonly(soup)
        words = self.separatewords(text)
        for i,word in enumerate(words):
            # self.word_url[word].add(url)
            wordid = self.getentryid('wordlist','word',word)
            self.cursor.execute('insert into wordlocation(urlid,wordid,location) values (%s,%s,%s)', (urlid, wordid, i))
    
    def getentryid(self, table, field, value, createnew=True):
        self.cursor.execute("select id from %s" % table +" where %s" % field + " = %s", (value))
        res = self.cursor.fetchone()
        if res == None:
            self.cursor.execute("insert into %s" % table +" (%s " % field +") values (%s)", (value))
            self.dbcommit()
            return self.cursor.lastrowid
        else:
            return res[0]
    
    def gettextonly(self, soup):
        # 直接用下面代码处理有问题
        if soup.body:
            return soup.body.get_text().strip()
        else:
            return soup.get_text().strip()
        '''
        v = soup.string
        if v == None:
            c = soup.contents
            resulttext = ''
            for t in c:
                subtext = self.gettextonly(t)
                resulttext += subtext + '\n'
            return resulttext
        else:
            return v.strip()
        return v
        '''
    
    def separatewords(self,words):
        pattern = r'[a-zA-Z0-9\’!\"#$%&\'()*+,（,）,-\.\/\:;<=>?@，。?★、…【】《》？“”‘’！\[\]^_`{|}~\\\s]+'
        return [word.strip() for word in jieba.lcut(words) if not re.search(pattern, str(word))]
    
    def isindexed(self, url):
#         self.cursor.execute('select * from urllist where url = %s',url)
#         find = self.cursor.fetchone()
#         if find != None:
#             return find[0]
        return False
    def addlinkref(self, urlFrom, urlTo, linkText):
        self.cursor.execute('insert into link (fromid, toid) values(%s,%s)', (self.getentryid('urllist','url',urlFrom), self.getentryid('urllist','url',urlTo)))
        self.dbcommit()
        
    def crawl(self, pages, depth=4):
        for i in range(depth):
            newpages = set()
            for page in pages:
                try:
                    c = request.urlopen(page)
                except:
                    print("Can't open %s" % page)
                    continue
                # 获取当前页面的html内容
                soup = BeautifulSoup(c.read(),'html')
                self.addtoindex(page, soup)
                
                links = soup('a')
                for link in links:
                    if 'href' in dict(link.attrs):
                        url = request.urljoin(page, link['href'])
                        newpages.add(url)
                        linkText = self.gettextonly(link)
                        self.addlinkref(page, url, linkText)
                    
            pages = newpages
            
    def createindextables(self):
        pass

In [56]:
pagelist = ['http://m.judarhr.com']
crawler = crawler('')
crawler.crawl(pagelist)

Can't open javascript:;
Can't open http://www.beian.miit.gov.cn/
Can't open javascript:void(0);
Can't open http://www.beian.miit.gov.cn/
Can't open javascript:void(0)


In [72]:
from collections import defaultdict
class searcher:
    def __init__(self):
        self.connect = pymysql.connect(host="localhost", user="root", password="root", charset="utf8mb4", db="searchengine")
        self.cursor = self.connect.cursor()
        
    def __del__(self):
        self.connect.close()
        
    def getmatches(self, q):
        wordids = []
        self.cursor.execute('select id from wordlist where word = %s',q)
        wordrow = self.cursor.fetchone()
        wordid = wordrow[0]
        wordids.append(wordid)
        
        self.cursor.execute('select urlid,location from wordlocation where wordid = %s',wordid)
        cur = self.cursor.fetchall()
        # [(urlid,location)]
        rows = [row for row in cur]
            
        return rows,wordids
    
    def normalizescores(self,scores,smallisbetter=False):
        vsmall = 0.0001
        if smallisbetter:
            minscore = min(scores.values())
            return dict([(url, minscore / max(float(score),vsmall)) for url,score in scores.items()])
        else:
            maxscore = max(scores.values())
            if maxscore == 0:
                maxscore = vsmall
            return dict([(url,float(score) / maxscore) for url,score in scores.items()])
    
    def getscoredlist(self, rows, wordids):

        # result = self.frequencyscore(rows)
        # result = self.locationscore(rows)
        result = self.inboundlinkscore(rows)
        return result
    
    # 按关键词单词频率
    def frequencyscore(self, rows):
        counts = dict([(row[0],0) for row in rows])
        for row in rows:
            counts[row[0]] += 1
        return self.normalizescores(counts)
    # 按关键词在文档中的位置前后
    # 位置靠前代表越重要
    def locationscore(self, rows):
        locations = dict([(row[0],float('inf')) for row in rows])
        for row in rows:
            location_sum = sum(row[1:])
            if location_sum < locations[row[0]]:
                locations[row[0]] = location_sum
        return self.normalizescores(locations,smallisbetter=True)
    
    # 按链接的入度关系数量
    def inboundlinkscore(self, rows):
        uniqueurlids = set([row[0] for row in rows])
        res = {}
        for urlid in uniqueurlids:
            self.cursor.execute('select count(*) from link where toid = %s', urlid)
            result = self.cursor.fetchone()
            res[urlid] = result[0]
        return self.normalizescores(res)
        
    
    def geturltext(self, id):
        self.cursor.execute('select url from urllist where id = %s', id)
        id = self.cursor.fetchone()[0]
        return id
    
    def query(self,q):
        rows,wordids = self.getmatches(q)
        scores = self.getscoredlist(rows, wordids)
        result = sorted([(score,self.geturltext(urlid)) for urlid,score in scores.items()],key=lambda x:x[0],reverse=True)
        return result
            
    def show(self,q):
        words = jieba.lcut(q)
        result = set()
        for word in words:
            self.cursor.execute('select id from wordlist where word = %s',(word))
            wordid = self.cursor.fetchone()[0]
            self.cursor.execute('select urlid from wordlocation where wordid = %s',(wordid))
            urlids = set(self.cursor.fetchall())
            for urlid in urlids:
                result.add(urlid[0])
        
        for id in result:
            self.cursor.execute('select * from urllist where id = %s',(id))
            url = self.cursor.fetchone()
            print(url)
        
s = searcher()
# rows, wordids = s.getmatches('在线')
# print(rows, wordids)
# print()
# s.getscoredlist(rows, wordids)
s.query('招聘会')

[(1.0, 'http://www.judarhr.com/index/index/contact.html'),
 (0.8947368421052632, 'http://www.judarhr.com/index/index/policy.html'),
 (0.8245614035087719, 'http://www.judarhr.com/index/index/index.html'),
 (0.8245614035087719, 'http://www.judarhr.com/index/index/recurit.html'),
 (0.2631578947368421, 'http://www.judarhr.com/index/index/policy.html?page=2'),
 (0.22807017543859648,
  'http://www.judarhr.com/index/index/recurit.html?page=2'),
 (0.22807017543859648,
  'http://www.judarhr.com/index/index/policy.html?page=22'),
 (0.22807017543859648,
  'http://www.judarhr.com/index/index/policy.html?page=23'),
 (0.21052631578947367,
  'http://www.judarhr.com/index/index/policy.html?page=5'),
 (0.21052631578947367,
  'http://www.judarhr.com/index/index/policy.html?page=6'),
 (0.21052631578947367,
  'http://www.judarhr.com/index/index/policy.html?page=7'),
 (0.19298245614035087,
  'http://www.judarhr.com/index/index/recurit.html?page=25'),
 (0.19298245614035087,
  'http://www.judarhr.com/index/i