Updated generic title parsing, tests

ubclaunchpad · Oct 23, 2017 · 117c081
1 parent 36cb5dd
commit 117c081
Show file tree

Hide file tree

Showing 4 changed files with 8 additions and 6 deletions.
diff --git a/sleuth_crawler/scraper/scraper/spiders/parsers/generic_page_parser.py b/sleuth_crawler/scraper/scraper/spiders/parsers/generic_page_parser.py
@@ -10,8 +10,8 @@ def parse_generic_item(response, children):
     title = utils.extract_element(response.xpath("//title/text()"), 0)
     titles = title.split('|')
     if len(titles) == 2:
-        title = utils.strip_content(titles[0])
-        site_title = utils.strip_content(titles[1])
+        title = titles[0].strip()
+        site_title = titles[1].strip()
     desc = utils.extract_element(response.xpath("//meta[@name='description']/@content"), 0)
     raw_content = utils.strip_content(response.body)
 

diff --git a/sleuth_crawler/tests/test_crawler.py b/sleuth_crawler/tests/test_crawler.py
@@ -42,6 +42,6 @@ def test_parse_generic_item(self, fake_parser):
         """
         Test crawler's redirect to generic_page_parser as default parser
         """
-        response = mock_response()
+        response = mock_response(file_name='/test_data/ubc.txt')
         self.spider.parse_generic_item(response)
         self.assertTrue(fake_parser.called)
diff --git a/sleuth_crawler/tests/test_data/ubc.txt b/sleuth_crawler/tests/test_data/ubc.txt
@@ -6,14 +6,14 @@
 <!--[[if (gt IE 9)|(gt IEMobile 7)]><!--><html lang="en"><!--<![endif]-->
 <head>
 <meta charset="utf-8">
-<title>The University of British Columbia</title>
+<title>Homepage | The University of British Columbia</title>
 <meta name="viewport" content="width=device-width">
 <meta name="description" content="The University of British Columbia is a global centre for research and teaching, consistently ranked among the top 20 public universities in the world.">
 <meta property="fb:pages" content="16761458703">
 <!-- Stylesheets -->
 <link href="//cdn.ubc.ca/clf/7.0.5/css/ubc-clf-full-bw.min.css" rel="stylesheet">
 <link rel="stylesheet" type="text/css" href="//cloud.typography.com/6804272/781004/css/fonts.css" />
-<link href='/_assets/css/style.min.css?0' rel='stylesheet'>
+<link href='/_assets/css/style.min.css?0' rel='stylesheet'>
 <!--[if lte IE 7]>
 <link href="https://cdn.ubc.ca/clf/7.0.5/css/font-awesome-ie7.css" rel="stylesheet">
 <![endif]-->
@@ -424,7 +424,7 @@
     <!-- Placed javascript at the end for faster loading -->
     <script src="//ajax.googleapis.com/ajax/libs/jquery/1.9.1/jquery.min.js"></script>
     <script src="//cdn.ubc.ca/clf/7.0.5/js/ubc-clf.min.js"></script>
-    <script src='/_assets/js/script.min.js?1' ></script>
+    <script src='/_assets/js/script.min.js?1' ></script>
 	<script type="text/javascript" id="lightning_bolt" src="//cdn-akamai.mookie1.com/LB/LightningBolt.js"></script>
 </body>
 </html>
diff --git a/sleuth_crawler/tests/test_generic_page_parser.py b/sleuth_crawler/tests/test_generic_page_parser.py
@@ -23,6 +23,8 @@ def test_parse_generic_item(self):
         self.assertTrue(len(item['children']) > 0)
         self.assertEqual(item['description'], "The University of British Columbia is a global centre for research and teaching, consistently ranked among the top 20 public universities in the world.")
         self.assertEqual(item['children'], children)
+        self.assertEqual(item['title'], "Homepage")
+        self.assertEqual(item['site_title'], "The University of British Columbia")
 
         # Check that there are no HTML tags, no blank lines, no JavaScript
         html_regexp = re.compile(r'<[^>]*?>')