Skip to content
This repository has been archived by the owner on Sep 5, 2022. It is now read-only.

Commit

Permalink
a stable edition, version 0.6.10
Browse files Browse the repository at this point in the history
  • Loading branch information
xurenlu committed Dec 13, 2009
1 parent 1545d6e commit 8c836a1
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 38 deletions.
27 changes: 21 additions & 6 deletions hyer/document.py
Expand Up @@ -52,7 +52,7 @@ def __init__(self,content,uri=""):
self["content"]=content
self.get_charset(self["content"])
self.scan_links(self["content"])
self.parse_document_type(self["body"])
#self.parse_document_type(self["body"])

def scan_links(self,content):
"""return all links in the html content
Expand Down Expand Up @@ -118,13 +118,19 @@ def get_base_meta(self,head):
def get_charset(self,data):
    """Detect the encoding of ``data`` with chardet and normalise it to UTF-8.

    data -- the raw document content handed to ``chardet.detect``.

    Side effects on the document:
      * ``self["charset"]`` is set to "UTF-8" when the data is ASCII or
        UTF-8 already, or when it could be re-encoded; otherwise it is set
        to the upper-cased name reported by chardet.
      * ``self["content"]`` is replaced by a UTF-8 re-encoding of ``data``
        when the detected encoding is one of the convertible ones listed
        below.

    When chardet cannot name an encoding, nothing is changed.
    """
    charset=chardet.detect(data)["encoding"]
    if charset is None:
        # Detection failed: leave content and charset untouched.
        return

    charset=charset.upper()
    if charset in ("ASCII","UTF-8"):
        # ASCII is a subset of UTF-8, so neither case needs re-encoding.
        # Recording the charset for UTF-8 input matters: previously it was
        # left unset, although consumers read doc["charset"].
        self["charset"]="UTF-8"
    elif charset in ("UTF8","UTF-16","GBK","GB2312","GB18030"):
        # "ignore" drops undecodable bytes instead of aborting the document.
        self["content"]=data.decode(charset,"ignore").encode("UTF-8")
        self["charset"]="UTF-8"
    else:
        # Unknown/unsupported encoding: keep the bytes, record what was seen.
        self["charset"]=charset
def get_charset_meta(self,head):
"""get the charset of document from the HTML head segment """
try:
Expand Down Expand Up @@ -169,6 +175,15 @@ def textualize(self,body):
"""return the text without html tags"""
text=body
self["text"]=self._html2text(text)
def is_text_page(self,html):
    """Return the ratio of non-link text length to total content length.

    Every ``<a ...>...</a>`` element is removed from ``self["content"]``,
    the remainder is converted with ``self._html2text`` and the length of
    that text is divided by the length of the original content.

    html -- not used; the measurement is always taken on self["content"].

    Returns a float. Empty content yields 0.0 instead of raising
    ZeroDivisionError.
    """
    content=self["content"]
    html_len=len(content)
    if html_len==0:
        return 0.0
    anchor_re=re.compile("<a[^>]*>.*?</a>",re.M|re.I|re.S)
    unlinked_text=self._html2text(anchor_re.sub("",content))
    # Force true division: under Python 2, int/int would truncate the ratio.
    return float(len(unlinked_text))/html_len

def parse_document_type(self,html):
"""return the document type:hub,text,pic
hub:document with many links ,little text and pics
Expand Down
56 changes: 33 additions & 23 deletions hyer/misc.py
Expand Up @@ -2,8 +2,20 @@
from __future__ import with_statement
import subprocess
import os
import string
from random import Random
# Generic helper functions for invoking external programs.
def exec(args,html):

def cmd25(args,html):
    """Run a shell command, feed ``html`` to its stdin and return its stdout.

    args -- list of command-line fragments; they are joined with single
            spaces and executed through the shell.
    html -- text written to the command's standard input, followed by a
            newline.

    Returns the command's standard output as one string. The pipe is opened
    in universal-newline mode, so line endings in the output are normalised
    to "\\n".

    ``communicate`` closes stdin before collecting the output, so a filter
    that reads its input up to EOF can terminate; reading stdout while stdin
    was still open could block forever.
    """
    command=" ".join(args)
    proc=subprocess.Popen(
        command,
        shell=True,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        universal_newlines=True
    )
    # Return the output verbatim; re-joining readlines() output with "\n"
    # would double every line break.
    return proc.communicate("%s\n" % (html,))[0]
def cmd(args,html):
with os.tmpfile() as temp:
with open(os.devnull,"w" ) as null:
print >>temp,html
Expand All @@ -15,30 +27,28 @@ def exec(args,html):
stdout=subprocess.PIPE
).communicate()[0]
return html

#调用外部tidy来修正html
def cmdmy(args,html):
    """Run a shell command on a temporary file holding ``html``; return stdout.

    args -- list of command-line fragments. They are joined with single
            spaces into a template that must contain exactly one ``%s``,
            which is replaced by the temporary file's path.
    html -- text written to the temporary file, followed by a newline.

    Returns the command's standard output as one string.

    The temporary file is created with ``tempfile.mkstemp`` (unique name, no
    race on a guessable path) and is always removed, even when writing or
    running the command fails.
    """
    import tempfile

    cmd=" ".join(args)
    fd,path=tempfile.mkstemp(prefix="tmp.hyer.")
    try:
        handle=os.fdopen(fd,"w")
        try:
            handle.write("%s\n" % (html,))
        finally:
            handle.close()
        pipe=os.popen(cmd % path,"r")
        try:
            # Return the output verbatim; re-joining readlines() output with
            # "\n" would double every line break.
            return pipe.read()
        finally:
            pipe.close()
    finally:
        os.unlink(path)
def tidy(html):
    """Repair ``html`` with the external tidy program and return the result.

    The command line (/usr/bin/tidy with -utf8 and -asxhtml, stderr sent to
    /dev/null) is handed to ``cmdmy``, which fills the ``%s`` placeholder.
    """
    tidy_cmd=["/usr/bin/tidy"]
    tidy_cmd.extend(["-utf8","-asxhtml"," %s"," 2>/dev/null"])
    return cmdmy(tidy_cmd,html)

def gettags(data):
    """Return the keywords for ``data`` as produced by the PHP keyword script.

    ``cmdmy`` runs /usr/bin/php on ./bin/get_keywords.php, filling the
    ``%s`` placeholder; stderr is sent to /dev/null.
    """
    interpreter="/usr/bin/php"
    script="./bin/get_keywords.php"
    php_cmd=[interpreter,script,"-f %s","2>/dev/null"]
    return cmdmy(php_cmd,data)
#begin="<body>"
#return html[html.find(begin)+len(begin):html.rfind("</body>")].strip()

Expand Down
5 changes: 5 additions & 0 deletions hyer/spider.py
Expand Up @@ -120,10 +120,14 @@ def run_loop(self):
'''fetch tasks and finish it
and exit when there is no taks,'''
go=True
k=0
while(go):
gc.disable()
go=self.run_single_fetch()
gc.enable()
k=k+1
#if k > 10:
# go=False

def run_single_fetch(self):
''' fetch an url,parse it,save the links,documents ....
Expand Down Expand Up @@ -157,6 +161,7 @@ def run_single_fetch(self):
return True
except Exception,er:
return True
print "url downloaded:",url
self.site_holder_monster.visited(uri)
if content==None:
self.logger.error("error occured when fetching an url %s:response is None" % url)
Expand Down
2 changes: 2 additions & 0 deletions hyerctl.py
Expand Up @@ -6,6 +6,8 @@
sys.path.append('/usr/lib/python2.6/dist-packages/')
sys.path.append("/var/lib/python-support/python2.5/")
sys.path.append("/var/lib/python-support/python2.6/")
sys.path.append("/usr/share/pyshared/")
sys.path.append("/usr/lib/pymodules/python2.6/")
import stackless,sys, os,atexit
import sys,getopt
import json
Expand Down
29 changes: 20 additions & 9 deletions webcrawler.py
@@ -1,6 +1,10 @@
#!/usr/local/bin/python.stackless -m profile
#!/usr/bin/python
# -*- coding: utf-8 -*-
#!/usr/local/bin/python.stackless -m profile
#================================================
from __future__ import with_statement
import subprocess
import os
import sys
sys.path.append('/usr/lib/python2.5/site-packages/')
sys.path.append('/usr/lib/python2.6/dist-packages/')
Expand All @@ -18,6 +22,7 @@
import signal, os,time,re
import imp
import shutil
import chardet

import hyer.document
import hyer.browser
Expand Down Expand Up @@ -142,15 +147,16 @@ def usage():

conf={
"db_path":"./tmp/",
#"feed":"http://www.xinhuanet.com/newscenter/index.htm",
"feed":"http://localhost/htests/",
"feed":"http://www.xinhuanet.com/newscenter/index.htm",
#"feed":"http://localhost/htests/",
"max_in_minute":60,
"agent":"Mozilla/Firefox",
#"same_domain_regexps":[re.compile("http://www.xinhuanet.com/")],
"same_domain_regexps":[re.compile("http://localhost/htests/")],
"same_domain_regexps":[re.compile("http://www.xinhuanet.com/")],
#"same_domain_regexps":[re.compile("http://localhost/htests/")],
"url_db":hyer.urldb.Urldb_mysql({"host":"localhost","user":"root","pass":"","db":"hyer"}),
"task":"profiletest",
"leave_domain":False
"leave_domain":False,
"document":hyer.document.SimpleHTMLDocument
}
spider=hyer.spider.spider(conf)

Expand All @@ -160,22 +166,27 @@ def usage():
"pass":"",
"db":"hyer",
"table":"xinhuall",
"fields":["url","body"]
"fields":["url","content","tags","charset"]
}
wdb=hyer.dbwriter.MySQLWriter(writerconf)

def handle_new_doc(doc):
print "handle new doc:"
if doc["charset"]!="UTF-8":print "[notice] charset not utf8:",doc["charset"]
doc["url"]=doc["URI"]
doc["body"]=doc["body"].decode("GBK").encode("UTF-8")
doc["tags"]=hyer.misc.gettags(doc["content"])
wdb.run(doc)


def start():
    """Build a spider from ``conf``, register the document handler, and crawl."""
    crawler=hyer.spider.spider(conf)
    # Every "new_document" event is routed to handle_new_doc.
    hyer.event.add_event("new_document",handle_new_doc)
    crawler.run_loop()


start()
sys.exit(0)

import cProfile
cProfile.run("start()", "prof.txt")
import pstats
Expand Down

0 comments on commit 8c836a1

Please sign in to comment.