|
1 | 1 | #!/usr/bin/env python |
2 | 2 | from BeautifulSoup import NavigableString |
3 | 3 | from page_parser import parse |
| 4 | +import logging |
4 | 5 | import re |
5 | 6 |
|
6 | 7 | REGEXES = { 'unlikelyCandidatesRe': re.compile('combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor',re.I), |
@@ -32,28 +33,32 @@ def __init__(self, input, **options): |
32 | 33 | self.options = defaultdict(lambda: None) |
33 | 34 | for k, v in options.items(): |
34 | 35 | self.options[k] = v |
35 | | - self.make_html() |
36 | 36 |
|
37 | 37 | def make_html(self): |
38 | 38 | self.html = parse(self.input, self.options['url']) |
39 | 39 |
|
40 | | - def content(self, remove_unlikely_candidates = True): |
41 | | - def remove(tag): [i.extract() for i in self.html.findAll(tag)] |
42 | | - remove('script') |
43 | | - remove('style') |
44 | | - |
45 | | - if remove_unlikely_candidates: self.remove_unlikely_candidates() |
46 | | - self.transform_misused_divs_into_paragraphs() |
47 | | - candidates = self.score_paragraphs(self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)) |
48 | | - best_candidate = self.select_best_candidate(candidates) |
49 | | - article = self.get_article(candidates, best_candidate) |
50 | | - |
51 | | - cleaned_article = self.sanitize(article, candidates) |
52 | | - if remove_unlikely_candidates and len(cleaned_article or '') < (self.options['retry_length'] or self.RETRY_LENGTH): |
| 40 | + def content(self): |
| 41 | + ruthless = True |
| 42 | + while True: |
53 | 43 | self.make_html() |
54 | | - return self.content(False) |
55 | | - else: |
56 | | - return cleaned_article |
| 44 | + [i.extract() for i in self.tags(self.html, 'script', 'style')] |
| 45 | + |
| 46 | + if ruthless: self.remove_unlikely_candidates() |
| 47 | + self.transform_misused_divs_into_paragraphs() |
| 48 | + candidates = self.score_paragraphs(self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)) |
| 49 | + best_candidate = self.select_best_candidate(candidates) |
| 50 | + if ruthless and best_candidate is None: |
| 51 | + ruthless = False |
| 52 | + continue |
| 53 | + article = self.get_article(candidates, best_candidate) |
| 54 | + |
| 55 | + cleaned_article = self.sanitize(article, candidates) |
| 56 | + of_acceptable_length = len(cleaned_article or '') >= (self.options['retry_length'] or self.RETRY_LENGTH) |
| 57 | + if ruthless and not of_acceptable_length: |
| 58 | + ruthless = False |
| 59 | + continue # try again |
| 60 | + else: |
| 61 | + return cleaned_article |
57 | 62 |
|
58 | 63 | def get_article(self, candidates, best_candidate): |
59 | 64 | # Now that we have the top candidate, look through its siblings for content that might also be related. |
@@ -87,18 +92,13 @@ def get_article(self, candidates, best_candidate): |
87 | 92 |
|
88 | 93 | def select_best_candidate(self, candidates): |
89 | 94 | sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True) |
90 | | - |
91 | 95 | self.debug("Top 5 canidates:") |
92 | 96 | for candidate in sorted_candidates[:5]: |
93 | 97 | elem = candidate['elem'] |
94 | | - self.debug("Candidate %s with score %s" % ( |
95 | | - describe(elem), candidate['content_score'])) |
| 98 | + self.debug("Candidate %s with score %s" % (describe(elem), candidate['content_score'])) |
96 | 99 |
|
97 | 100 | best_candidate = sorted_candidates[0] if len(sorted_candidates) > 1 else { 'elem': self.html.find("body"), 'content_score': 0 } |
98 | | - elem = best_candidate['elem'] |
99 | | - self.debug("Best candidate %s#%s.%s with score %s" % ( |
100 | | - elem.name, elem.get('id',''), elem.get('class',''), best_candidate['content_score'])) |
101 | | - |
| 101 | + self.debug("Best candidate %s with score %s" % (describe(best_candidate['elem']), best_candidate['content_score'])) |
102 | 102 | return best_candidate |
103 | 103 |
|
104 | 104 | def get_link_density(self, elem): |
@@ -173,9 +173,9 @@ def score_node(self, elem): |
173 | 173 | content_score -= 5 |
174 | 174 | return { 'content_score': content_score, 'elem': elem } |
175 | 175 |
|
176 | | - def debug(self, str): |
| 176 | + def debug(self, *a): |
177 | 177 | if self.options['debug']: |
178 | | - print(str) |
| 178 | + logging.debug(*a) |
179 | 179 |
|
180 | 180 | def remove_unlikely_candidates(self): |
181 | 181 | for elem in self.html.findAll(): |
|
0 commit comments