Permalink
Browse files

Fixed titles not being found when the <title> tag appears late in the page; bumped version to 4.

  • Loading branch information...
1 parent 6d80491 commit c23158fb3e161d5546193a855055d6d40b8c61a2 @ydoovv ydoovv committed Dec 5, 2010
Showing with 17 additions and 6 deletions.
  1. +1 −1 app.yaml
  2. +16 −5 freejump.py
View
@@ -1,5 +1,5 @@
application: digest-favs
-version: 3
+version: 4
runtime: python
api_version: 1
View
@@ -4,6 +4,7 @@
from urllib2 import urlopen
from urllib import quote as urlquote
import re
+import logging
# Free-jump from the shortened URL to the original long URL
# Many shortened URL services like bit.ly, j.mp are bloody damn
@@ -26,15 +27,25 @@ def get_actual_url(self, short_url):
def get_title(self, url):
"""Get the title of the url."""
- f = urlopen(url)
- for i in range(4): # no 'title' tag in the beginning 4k of a page? WTF
+ try:
+ f = urlopen(url)
+ except Exception, reason:
+ logging.error(str(reason))
+ return str(reason)
+
+ html_head = ''
+ while True:
res = f.read(1024)
- title_raw = re.findall('<title>.*</title>', res, flags=re.DOTALL)
- if title_raw: break
+ html_head += res
+ if not res or html_head.find('</head>') != -1: break
- if i == 3: return "No title found in your web page..."
+ title_raw = re.findall('<title>.*</title>', html_head, flags=re.DOTALL)
title = title_raw[0][7:-8].strip() # strip 'title' tag & '\n', '\t'...
+ if not title:
+ logging.error('NO_TITLE: %s' % url)
+ return 'No title found in your web page...'
+
for enc in ("utf-8", "gbk", "big5"):
try:
t = title.decode(enc)

0 comments on commit c23158f

Please sign in to comment.