Skip to content

Commit

Permalink
Updated generic title parsing, tests
Browse files: browse the repository at this point in the history
  • Loading branch information
bobheadxi committed Oct 23, 2017
1 parent 36cb5dd commit 117c081
Show file tree
Hide file tree
Showing 4 changed files with 8 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ def parse_generic_item(response, children):
title = utils.extract_element(response.xpath("//title/text()"), 0)
titles = title.split('|')
if len(titles) == 2:
title = utils.strip_content(titles[0])
site_title = utils.strip_content(titles[1])
title = titles[0].strip()
site_title = titles[1].strip()
desc = utils.extract_element(response.xpath("//meta[@name='description']/@content"), 0)
raw_content = utils.strip_content(response.body)

Expand Down
2 changes: 1 addition & 1 deletion sleuth_crawler/tests/test_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,6 @@ def test_parse_generic_item(self, fake_parser):
"""
Test crawler's redirect to generic_page_parser as default parser
"""
response = mock_response()
response = mock_response(file_name='/test_data/ubc.txt')
self.spider.parse_generic_item(response)
self.assertTrue(fake_parser.called)
6 changes: 3 additions & 3 deletions sleuth_crawler/tests/test_data/ubc.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,14 @@
<!--[[if (gt IE 9)|(gt IEMobile 7)]><!--><html lang="en"><!--<![endif]-->
<head>
<meta charset="utf-8">
<title>The University of British Columbia</title>
<title>Homepage | The University of British Columbia</title>
<meta name="viewport" content="width=device-width">
<meta name="description" content="The University of British Columbia is a global centre for research and teaching, consistently ranked among the top 20 public universities in the world.">
<meta property="fb:pages" content="16761458703">
<!-- Stylesheets -->
<link href="//cdn.ubc.ca/clf/7.0.5/css/ubc-clf-full-bw.min.css" rel="stylesheet">
<link rel="stylesheet" type="text/css" href="//cloud.typography.com/6804272/781004/css/fonts.css" />
<link href='/_assets/css/style.min.css?0' rel='stylesheet'>
<link href='/_assets/css/style.min.css?0' rel='stylesheet'>
<!--[if lte IE 7]>
<link href="https://cdn.ubc.ca/clf/7.0.5/css/font-awesome-ie7.css" rel="stylesheet">
<![endif]-->
Expand Down Expand Up @@ -424,7 +424,7 @@
<!-- Placed javascript at the end for faster loading -->
<script src="//ajax.googleapis.com/ajax/libs/jquery/1.9.1/jquery.min.js"></script>
<script src="//cdn.ubc.ca/clf/7.0.5/js/ubc-clf.min.js"></script>
<script src='/_assets/js/script.min.js?1' ></script>
<script src='/_assets/js/script.min.js?1' ></script>
<script type="text/javascript" id="lightning_bolt" src="//cdn-akamai.mookie1.com/LB/LightningBolt.js"></script>
</body>
</html>
2 changes: 2 additions & 0 deletions sleuth_crawler/tests/test_generic_page_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ def test_parse_generic_item(self):
self.assertTrue(len(item['children']) > 0)
self.assertEqual(item['description'], "The University of British Columbia is a global centre for research and teaching, consistently ranked among the top 20 public universities in the world.")
self.assertEqual(item['children'], children)
self.assertEqual(item['title'], "Homepage")
self.assertEqual(item['site_title'], "The University of British Columbia")

# Check that there are no HTML tags, no blank lines, no JavaScript
html_regexp = re.compile(r'<[^>]*?>')
Expand Down

0 comments on commit 117c081

Please sign in to comment.