From 8c836a1a7a36f710b6d185dbddb521030afbf635 Mon Sep 17 00:00:00 2001
From: xurenlu
Date: Sun, 13 Dec 2009 21:19:01 +0800
Subject: [PATCH] a stable edition, version 0.6.10

---
 hyer/document.py |   27 +++++++++++++++++------
 hyer/misc.py     |   56 ++++++++++++++++++++++++++++--------------------
 hyer/spider.py   |    5 +++++
 hyerctl.py       |    2 ++
 webcrawler.py    |   29 +++++++++++++++++--------
 5 files changed, 81 insertions(+), 38 deletions(-)

diff --git a/hyer/document.py b/hyer/document.py
index b6d1b09..281b547 100644
--- a/hyer/document.py
+++ b/hyer/document.py
@@ -52,7 +52,7 @@ def __init__(self,content,uri=""):
         self["content"]=content
         self.get_charset(self["content"])
         self.scan_links(self["content"])
-        self.parse_document_type(self["body"])
+        #self.parse_document_type(self["body"])
 
     def scan_links(self,content):
         """return all links in the html content
@@ -118,13 +118,19 @@ def get_base_meta(self,head):
     def get_charset(self,data):
         """Big upgrade: use chardet to detect the current document's encoding, and convert to UTF-8 automatically"""
         charset=chardet.detect(data)["encoding"]
-        if charset=="ascii":
+        if charset==None:
+            return
+
+        charset=charset.upper()
+        if charset=="ASCII":
             self["charset"]="UTF-8"
             pass
-        elif charset!="utf-8":
-            self["charset"]="UTF-8"
-            self["content"]=data.decode(charset).encode("UTF-8")
-
+        elif charset!="UTF-8":
+            if charset in ["UTF-8","UTF8","UTF-16","GBK","GB2312","GB18030"]:
+                self["content"]=data.decode(charset,"ignore").encode("UTF-8")
+                self["charset"]="UTF-8"
+            else:
+                self["charset"]=charset
     def get_charset_meta(self,head):
         """get the charset of document from the HTML head segment """
         try:
@@ -169,6 +175,15 @@ def textualize(self,body):
         """return the text without html tags"""
         text=body
         self["text"]=self._html2text(text)
+    def is_text_page(self,html):
+        """return whether the page is mostly text"""
+        html_len=len(self["content"])
+        r=re.compile("<a[^>]*>.*?</a>",re.M|re.I|re.S)
+        unlinked_text=r.sub("",self["content"])
+        unlinked_text=self._html2text(unlinked_text)
+        words=len(unlinked_text)
+        return float(words)/html_len
+
     def parse_document_type(self,html):
         """return the document type:hub,text,pic
         hub:document with many links ,little text and pics
diff --git a/hyer/misc.py b/hyer/misc.py
index ee2aa33..5dc85c8 100644
--- a/hyer/misc.py
+++ b/hyer/misc.py
@@ -2,8 +2,20 @@
 from __future__ import with_statement
 import subprocess
 import os
+import string
+from random import Random
 #generic helpers for invoking external programs;
-def exec(args,html):
+
+def cmd25(args,html):
+    """the variant that works on Python 2.5"""
+    cmd=string.join(args)
+    (stdin,stdout)=os.popen2(cmd)
+    print >>stdin,html
+    out=string.join(stdout.readlines(),"\n")
+    stdin.close()
+    stdout.close()
+    return out
+def cmd(args,html):
     with os.tmpfile() as temp:
         with open(os.devnull,"w" ) as null:
             print >>temp,html
@@ -15,30 +27,28 @@ def exec(args,html):
                 stdout=subprocess.PIPE
                 ).communicate()[0]
     return html
-
-#call the external tidy to fix up the html
+def cmdmy(args,html):
+    """os.popen on 2.5 and subprocess on 2.6 both have problems, so I had to write my own"""
+    cmd=string.join(args)
+    filen="/tmp/tmp.hyer.%d" % Random().randint(100000,1000000)
+    fileno=open(filen,"w+")
+    print >>fileno,html
+    fileno.close()
+    cmd=cmd % filen
+    pp=os.popen(cmd,"r")
+    lines=pp.readlines()
+    ret=string.join(lines,"\n")
+    os.unlink(filen)
+    return ret
 def tidy(html):
-    with os.tmpfile() as temp:
-        with open(os.devnull,"w" ) as null:
-            print >>temp,html
-            temp.seek(0)
-            html=subprocess.Popen(
-                ["tidy", "-utf8","-asxhtml"],
-                stdin=temp,
-                stderr=null,
-                stdout=subprocess.PIPE
-                ).communicate()[0]
-    return html
-#call the PHP keyword-extraction script to get keywords;
-def gettags(data):
-    cmd=["/usr/bin/php", "./bin/get_keywords.php"]
-    #cmd=["ls","/"]
-    with os.tmpfile() as temp:
-        print >>temp,data
-        temp.seek(0)
-        with open(os.devnull,"w") as null:
-            return subprocess.Popen(cmd, stdin=temp, stderr=null, stdout=subprocess.PIPE ).communicate()[0]
+    """call the external tidy to fix up the html"""
+    args=["/usr/bin/tidy", "-utf8","-asxhtml"," %s"," 2>/dev/null"]
+    return cmdmy(args,html)
+def gettags(data):
+    """call the PHP keyword-extraction script to get keywords;"""
+    cmd=["/usr/bin/php", "./bin/get_keywords.php","-f %s","2>/dev/null"]
+    return cmdmy(cmd,data)
 #begin="<body>"
 #return html[html.find(begin)+len(begin):html.rfind("</body>")].strip()
diff --git a/hyer/spider.py b/hyer/spider.py
index e568406..36542f7 100644
--- a/hyer/spider.py
+++ b/hyer/spider.py
@@ -120,10 +120,14 @@ def run_loop(self):
         '''fetch tasks and finish them, and exit when there are no tasks'''
         go=True
+        k=0
         while(go):
             gc.disable()
             go=self.run_single_fetch()
             gc.enable()
+            k=k+1
+            #if k > 10:
+            #    go=False
     def run_single_fetch(self):
         ''' fetch an url,parse it,save the links,documents ....
@@ -157,6 +161,7 @@
             return True
         except Exception,er:
             return True
+        print "url downloaded:",url
         self.site_holder_monster.visited(uri)
         if content==None:
             self.logger.error("error occurred when fetching an url %s:response is None" % url)
diff --git a/hyerctl.py b/hyerctl.py
index 5cb8ba5..5e22786 100755
--- a/hyerctl.py
+++ b/hyerctl.py
@@ -6,6 +6,8 @@
 sys.path.append('/usr/lib/python2.6/dist-packages/')
 sys.path.append("/var/lib/python-support/python2.5/")
 sys.path.append("/var/lib/python-support/python2.6/")
+sys.path.append("/usr/share/pyshared/")
+sys.path.append("/usr/lib/pymodules/python2.6/")
 import stackless,sys, os,atexit
 import sys,getopt
 import json
diff --git a/webcrawler.py b/webcrawler.py
index 8f023c2..83214d9 100755
--- a/webcrawler.py
+++ b/webcrawler.py
@@ -1,6 +1,10 @@
-#!/usr/local/bin/python.stackless -m profile
+#!/usr/bin/python
 # -*- coding: utf-8 -*-
+#!/usr/local/bin/python.stackless -m profile
 #================================================
+from __future__ import with_statement
+import subprocess
+import os
 import sys
 sys.path.append('/usr/lib/python2.5/site-packages/')
 sys.path.append('/usr/lib/python2.6/dist-packages/')
@@ -18,6 +22,7 @@
 import signal, os,time,re
 import imp
 import shutil
+import chardet
 
 import hyer.document
 import hyer.browser
@@ -142,15 +147,16 @@ def usage():
 conf={
         "db_path":"./tmp/",
-        #"feed":"http://www.xinhuanet.com/newscenter/index.htm",
-        "feed":"http://localhost/htests/",
+        "feed":"http://www.xinhuanet.com/newscenter/index.htm",
+        #"feed":"http://localhost/htests/",
         "max_in_minute":60,
         "agent":"Mozilla/Firefox",
-        #"same_domain_regexps":[re.compile("http://www.xinhuanet.com/")],
-        "same_domain_regexps":[re.compile("http://localhost/htests/")],
+        "same_domain_regexps":[re.compile("http://www.xinhuanet.com/")],
+        #"same_domain_regexps":[re.compile("http://localhost/htests/")],
         "url_db":hyer.urldb.Urldb_mysql({"host":"localhost","user":"root","pass":"","db":"hyer"}),
         "task":"profiletest",
-        "leave_domain":False
+        "leave_domain":False,
+        "document":hyer.document.SimpleHTMLDocument
 }
 spider=hyer.spider.spider(conf)
@@ -160,22 +166,27 @@
         "pass":"",
         "db":"hyer",
         "table":"xinhuall",
-        "fields":["url","body"]
+        "fields":["url","content","tags","charset"]
 }
 wdb=hyer.dbwriter.MySQLWriter(writerconf)
 
 def handle_new_doc(doc):
+    print "handle new doc:"
+    if doc["charset"]!="UTF-8":print "[notice] charset not utf8:",doc["charset"]
     doc["url"]=doc["URI"]
-    doc["body"]=doc["body"].decode("GBK").encode("UTF-8")
+    doc["tags"]=hyer.misc.gettags(doc["content"])
     wdb.run(doc)
 
 def start():
     spider=hyer.spider.spider(conf)
-    #hyer.event.add_event("new_document",handle_new_doc)
+    hyer.event.add_event("new_document",handle_new_doc)
     spider.run_loop()
+start()
+sys.exit(0)
+
 import cProfile
 cProfile.run("start()", "prof.txt")
 import pstats
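
Note on the get_charset change in hyer/document.py above: the crawler now
normalizes every fetched document to UTF-8 by asking chardet to guess the
encoding, rather than trusting the page's declared charset. A minimal
standalone sketch of that flow (assumes chardet is installed; the helper
name to_utf8 is illustrative and not part of hyer):

    import chardet

    def to_utf8(data):
        """normalize a byte string to UTF-8, mirroring get_charset's logic"""
        guess = chardet.detect(data)["encoding"]
        if guess is None:
            # chardet could not identify the encoding; leave the bytes alone
            return data
        guess = guess.upper()
        if guess in ("ASCII", "UTF-8", "UTF8"):
            # ASCII is a subset of UTF-8, so nothing needs transcoding
            return data
        if guess in ("UTF-16", "GBK", "GB2312", "GB18030"):
            # "ignore" drops undecodable bytes rather than raising
            # UnicodeDecodeError in the middle of a crawl
            return data.decode(guess, "ignore").encode("UTF-8")
        return data

Decoding with "ignore" trades a little fidelity for robustness: a page whose
encoding chardet misjudges loses a few bytes instead of aborting the fetch.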