From 8c836a1a7a36f710b6d185dbddb521030afbf635 Mon Sep 17 00:00:00 2001
From: xurenlu
Date: Sun, 13 Dec 2009 21:19:01 +0800
Subject: [PATCH] a stable edition, version 0.6.10

---
 hyer/document.py |   27 +++++++++++++++++------
 hyer/misc.py     |   56 ++++++++++++++++++++++++++++--------------------
 hyer/spider.py   |    5 +++++
 hyerctl.py       |    2 ++
 webcrawler.py    |   29 +++++++++++++++++--------
 5 files changed, 81 insertions(+), 38 deletions(-)

diff --git a/hyer/document.py b/hyer/document.py
index b6d1b09..281b547 100644
--- a/hyer/document.py
+++ b/hyer/document.py
@@ -52,7 +52,7 @@ def __init__(self,content,uri=""):
         self["content"]=content
         self.get_charset(self["content"])
         self.scan_links(self["content"])
-        self.parse_document_type(self["body"])
+        #self.parse_document_type(self["body"])
 
     def scan_links(self,content):
         """return all links in the html content
@@ -118,13 +118,19 @@ def get_base_meta(self,head):
     def get_charset(self,data):
         """Big upgrade: use chardet to detect the current document's encoding, and convert to UTF-8 automatically"""
         charset=chardet.detect(data)["encoding"]
-        if charset=="ascii":
+        if charset==None:
+            return
+
+        charset=charset.upper()
+        if charset=="ASCII":
             self["charset"]="UTF-8"
             pass
-        elif charset!="utf-8":
-            self["charset"]="UTF-8"
-            self["content"]=data.decode(charset).encode("UTF-8")
-
+        elif charset!="UTF-8":
+            if charset in ["UTF-8","UTF8","UTF-16","GBK","GB2312","GB18030"]:
+                self["content"]=data.decode(charset,"ignore").encode("UTF-8")
+                self["charset"]="UTF-8"
+            else:
+                self["charset"]=charset
     def get_charset_meta(self,head):
         """get the charset of document from the HTML head segment """
         try:
@@ -169,6 +175,15 @@ def textualize(self,body):
         """return the text without html tags"""
         text=body
         self["text"]=self._html2text(text)
+    def is_text_page(self,html):
+        """return whether the page is mostly text"""
+        html_len=len(self["content"])
+        r=re.compile("<a[^>]*>.*?</a>",re.M|re.I|re.S)
+        unlinked_text=r.sub("",self["content"])
+        unlinked_text=self._html2text(unlinked_text)
+        words=len(unlinked_text)
+        return float(words)/html_len
+
     def parse_document_type(self,html):
         """return the document type:hub,text,pic
         hub:document with many links ,little text and pics
diff --git a/hyer/misc.py b/hyer/misc.py
index ee2aa33..5dc85c8 100644
--- a/hyer/misc.py
+++ b/hyer/misc.py
@@ -2,8 +2,20 @@
 from __future__ import with_statement
 import subprocess
 import os
+import string
+from random import Random
 #generic helpers for invoking external programs;
-def exec(args,html):
+
+def cmd25(args,html):
+    """the variant that works on Python 2.5"""
+    cmd=string.join(args)
+    (stdin,stdout)=os.popen2(cmd)
+    print >>stdin,html
+    out=string.join(stdout.readlines(),"\n")
+    stdin.close()
+    stdout.close()
+    return out
+def cmd(args,html):
     with os.tmpfile() as temp:
         with open(os.devnull,"w" ) as null:
             print >>temp,html
@@ -15,30 +27,28 @@ def exec(args,html):
                 stdout=subprocess.PIPE
                 ).communicate()[0]
     return html
-
-#call the external tidy to fix up the html
+def cmdmy(args,html):
+    """os.popen on 2.5 and subprocess on 2.6 both have problems, so I had to write my own"""
+    cmd=string.join(args)
+    filen="/tmp/tmp.hyer.%d" % Random().randint(100000,1000000)
+    fileno=open(filen,"w+")
+    print >>fileno,html
+    fileno.close()
+    cmd=cmd % filen
+    pp=os.popen(cmd,"r")
+    lines=pp.readlines()
+    ret=string.join(lines,"\n")
+    os.unlink(filen)
+    return ret
 def tidy(html):
-    with os.tmpfile() as temp:
-        with open(os.devnull,"w" ) as null:
-            print >>temp,html
-            temp.seek(0)
-            html=subprocess.Popen(
-                ["tidy", "-utf8","-asxhtml"],
-                stdin=temp,
-                stderr=null,
-                stdout=subprocess.PIPE
-                ).communicate()[0]
-    return html
-#call the PHP keyword-extraction script to get keywords;
-def gettags(data):
-    cmd=["/usr/bin/php", "./bin/get_keywords.php"]
-    #cmd=["ls","/"]
-    with os.tmpfile() as temp:
-        print >>temp,data
-        temp.seek(0)
-        with open(os.devnull,"w") as null:
-            return subprocess.Popen(cmd, stdin=temp, stderr=null, stdout=subprocess.PIPE ).communicate()[0]
+    """call the external tidy to fix up the html"""
+    args=["/usr/bin/tidy", "-utf8","-asxhtml"," %s"," 2>/dev/null"]
+    return cmdmy(args,html)
+def gettags(data):
+    """call the PHP keyword-extraction script to get keywords;"""
+    cmd=["/usr/bin/php", "./bin/get_keywords.php","-f %s","2>/dev/null"]
+    return cmdmy(cmd,data)
 #begin="<body>"
 #return html[html.find(begin)+len(begin):html.rfind("</body>")].strip()
diff --git a/hyer/spider.py b/hyer/spider.py
index e568406..36542f7 100644
--- a/hyer/spider.py
+++ b/hyer/spider.py
@@ -120,10 +120,14 @@ def run_loop(self):
         '''fetch tasks and finish them, and exit when there are no tasks'''
         go=True
+        k=0
         while(go):
             gc.disable()
             go=self.run_single_fetch()
             gc.enable()
+            k=k+1
+            #if k > 10:
+            #    go=False
     def run_single_fetch(self):
         ''' fetch an url,parse it,save the links,documents ....
@@ -157,6 +161,7 @@
             return True
         except Exception,er:
             return True
+        print "url downloaded:",url
         self.site_holder_monster.visited(uri)
         if content==None:
             self.logger.error("error occurred when fetching an url %s:response is None" % url)
diff --git a/hyerctl.py b/hyerctl.py
index 5cb8ba5..5e22786 100755
--- a/hyerctl.py
+++ b/hyerctl.py
@@ -6,6 +6,8 @@
 sys.path.append('/usr/lib/python2.6/dist-packages/')
 sys.path.append("/var/lib/python-support/python2.5/")
 sys.path.append("/var/lib/python-support/python2.6/")
+sys.path.append("/usr/share/pyshared/")
+sys.path.append("/usr/lib/pymodules/python2.6/")
 import stackless,sys, os,atexit
 import sys,getopt
 import json
diff --git a/webcrawler.py b/webcrawler.py
index 8f023c2..83214d9 100755
--- a/webcrawler.py
+++ b/webcrawler.py
@@ -1,6 +1,10 @@
-#!/usr/local/bin/python.stackless -m profile
+#!/usr/bin/python
 # -*- coding: utf-8 -*-
+#!/usr/local/bin/python.stackless -m profile
 #================================================
+from __future__ import with_statement
+import subprocess
+import os
 import sys
 sys.path.append('/usr/lib/python2.5/site-packages/')
 sys.path.append('/usr/lib/python2.6/dist-packages/')
@@ -18,6 +22,7 @@
 import signal, os,time,re
 import imp
 import shutil
+import chardet
 
 import hyer.document
 import hyer.browser
@@ -142,15 +147,16 @@ def usage():
 conf={
         "db_path":"./tmp/",
-        #"feed":"http://www.xinhuanet.com/newscenter/index.htm",
-        "feed":"http://localhost/htests/",
+        "feed":"http://www.xinhuanet.com/newscenter/index.htm",
+        #"feed":"http://localhost/htests/",
         "max_in_minute":60,
         "agent":"Mozilla/Firefox",
-        #"same_domain_regexps":[re.compile("http://www.xinhuanet.com/")],
-        "same_domain_regexps":[re.compile("http://localhost/htests/")],
+        "same_domain_regexps":[re.compile("http://www.xinhuanet.com/")],
+        #"same_domain_regexps":[re.compile("http://localhost/htests/")],
         "url_db":hyer.urldb.Urldb_mysql({"host":"localhost","user":"root","pass":"","db":"hyer"}),
         "task":"profiletest",
-        "leave_domain":False
+        "leave_domain":False,
+        "document":hyer.document.SimpleHTMLDocument
 }
 spider=hyer.spider.spider(conf)
@@ -160,22 +166,27 @@
         "pass":"",
         "db":"hyer",
         "table":"xinhuall",
-        "fields":["url","body"]
+        "fields":["url","content","tags","charset"]
 }
 wdb=hyer.dbwriter.MySQLWriter(writerconf)
 
 def handle_new_doc(doc):
+    print "handle new doc:"
+    if doc["charset"]!="UTF-8":print "[notice] charset not utf8:",doc["charset"]
     doc["url"]=doc["URI"]
-    doc["body"]=doc["body"].decode("GBK").encode("UTF-8")
+    doc["tags"]=hyer.misc.gettags(doc["content"])
     wdb.run(doc)
 
 def start():
     spider=hyer.spider.spider(conf)
-    #hyer.event.add_event("new_document",handle_new_doc)
+    hyer.event.add_event("new_document",handle_new_doc)
     spider.run_loop()
+start()
+sys.exit(0)
+
 import cProfile
 cProfile.run("start()", "prof.txt")
 import pstats
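
Note on the get_charset change in hyer/document.py above: the crawler now
normalizes every fetched document to UTF-8 by asking chardet to guess the
encoding, rather than trusting the page's declared charset. A minimal
standalone sketch of that flow (assumes chardet is installed; the helper
name to_utf8 is illustrative and not part of hyer):

    import chardet

    def to_utf8(data):
        """normalize a byte string to UTF-8, mirroring get_charset's logic"""
        guess = chardet.detect(data)["encoding"]
        if guess is None:
            # chardet could not identify the encoding; leave the bytes alone
            return data
        guess = guess.upper()
        if guess in ("ASCII", "UTF-8", "UTF8"):
            # ASCII is a subset of UTF-8, so nothing needs transcoding
            return data
        if guess in ("UTF-16", "GBK", "GB2312", "GB18030"):
            # "ignore" drops undecodable bytes rather than raising
            # UnicodeDecodeError in the middle of a crawl
            return data.decode(guess, "ignore").encode("UTF-8")
        return data

Decoding with "ignore" trades a little fidelity for robustness: a page whose
encoding chardet misjudges loses a few bytes instead of aborting the fetch.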