redditPost search, refactored spiders, crawl script
bobheadxi committed Nov 25, 2017
1 parent fbf10ff commit fdc53b9
Showing 8 changed files with 118 additions and 74 deletions.
4 changes: 1 addition & 3 deletions README.md
@@ -63,11 +63,9 @@ $ bash scripts/populate.sh
For live data, you can currently run the `BroadCrawler`, which scrapes a few thousand pages and pipelines them into the appropriate cores based on their type.

```Shell
$ bash cd sleuth_crawler/scraper && scrapy crawl broad_crawler
$ bash sleuth_crawler/run_crawlers.sh
```

At the moment the crawler never really stops on its own, so you will likely have to force-quit it once you have enough data entries.
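
If you would rather have the crawl stop by itself, Scrapy's built-in close-spider extension can bound it. This is a suggested tweak rather than part of this commit; a minimal sketch for `scraper/settings.py`:

```Python
# Suggested settings (not in this commit): let Scrapy stop the crawl itself.
# Would go in sleuth_crawler/scraper/scraper/settings.py.
CLOSESPIDER_PAGECOUNT = 5000  # close each spider after roughly 5000 pages
CLOSESPIDER_TIMEOUT = 600     # or after 10 minutes, whichever comes first
```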

To empty a core, go to:
```
http://localhost:8983/solr/[CORE_NAME_HERE]/update?stream.body=<delete><query>*:*</query></delete>&commit=true
```
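
The same wipe can also be scripted. A minimal sketch (not part of this commit), assuming a local Solr instance on the default port and the `requests` package; `CORE_NAME` is a placeholder:

```Python
# Clears every document from a core, equivalent to the URL above.
import requests

CORE_NAME = 'genericPage'  # placeholder: any of the Solr core names
requests.get(
    'http://localhost:8983/solr/{}/update'.format(CORE_NAME),
    params={
        'stream.body': '<delete><query>*:*</query></delete>',
        'commit': 'true',
    },
)
```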
21 changes: 18 additions & 3 deletions sleuth_backend/views/views_utils.py
@@ -42,7 +42,7 @@ def build_search_query(core, query_str, base_kwargs):
'''
kwargs = base_kwargs.copy()

if core == "genericPage":
if core == 'genericPage':
fields = {
'id': 1,
'name': 8,
@@ -58,9 +58,9 @@ def build_search_query(core, query_str, base_kwargs):
.for_fields(fields)
query = query.select_or(terms_query)
kwargs['default_field'] = 'content'
kwargs['highlight_fields'] = 'content'
kwargs['highlight_fields'] = 'content,description'

elif core == "courseItem":
elif core == 'courseItem':
fields = {
'id': 1,
'name': 9,
@@ -75,6 +75,21 @@ def build_search_query(core, query_str, base_kwargs):
kwargs['default_field'] = 'name'
kwargs['highlight_fields'] = 'description'

elif core == 'redditPost':
fields = {
'id': 1,
'name': 7,
'description': 10,
'comments': 6,
}
query = Query(query_str) \
.fuzz(1)
terms_query = Query(query_str, as_phrase=False, escape=True, sanitize=True) \
.for_fields(fields)
query = query.select_or(terms_query)
kwargs['default_field'] = 'name'
kwargs['highlight_fields'] = 'description,comments'

else:
query = Query(query_str)

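For reference, the new `redditPost` branch follows the same composition pattern as the other cores: a fuzzed query OR'd with a boosted per-field terms query. A minimal standalone sketch (not part of this commit; the `Query` import path is an assumption):

```Python
# Mirrors the redditPost branch above, using only the Query methods shown in this diff.
from sleuth_backend.solr.query import Query  # assumed import path

fields = {'id': 1, 'name': 7, 'description': 10, 'comments': 6}

query = Query('housing advice').fuzz(1)
terms_query = Query('housing advice', as_phrase=False, escape=True, sanitize=True) \
    .for_fields(fields)
query = query.select_or(terms_query)  # final query passed on alongside the kwargs
```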
6 changes: 6 additions & 0 deletions sleuth_crawler/run_crawlers.sh
@@ -0,0 +1,6 @@
#!/bin/bash
# Runs all crawlers

echo 'Spinning up crawlers...'
cd sleuth_crawler/scraper
python crawl.py & read -t 10 ; kill $!
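
The one-liner above backgrounds `crawl.py`, waits up to ten seconds for input (`read -t 10`), and then kills the crawler (`kill $!`). For illustration only (not part of this commit), a rough Python equivalent, simplified to a fixed ten-second window:

```Python
# Roughly what run_crawlers.sh does, with the input escape hatch replaced
# by a plain ten-second sleep.
import subprocess
import time

proc = subprocess.Popen(['python', 'crawl.py'], cwd='sleuth_crawler/scraper')
time.sleep(10)
proc.terminate()  # equivalent of `kill $!`
proc.wait()
```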
23 changes: 23 additions & 0 deletions sleuth_crawler/scraper/crawl.py
@@ -0,0 +1,23 @@
import sys
import os.path
PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__))
sys.path = sys.path + [os.path.join(PROJECT_ROOT, '../../..'), os.path.join(PROJECT_ROOT, '../..')]

from scrapy.crawler import CrawlerProcess, CrawlerRunner
from scrapy.utils.project import get_project_settings
from sleuth_crawler.scraper.scraper.settings import CUSTOM_URLS
from sleuth_crawler.scraper.scraper.spiders.parsers.course_parser import parse_subjects

'''
This script runs all our spiders.
Crawls specific item types, such as CourseItem, before starting the broad_crawler.
'''

def run():
process = CrawlerProcess(get_project_settings())
process.crawl('broad_crawler')
process.crawl('custom_crawler', start_urls=CUSTOM_URLS['courseItem'], parser=parse_subjects)
process.start()

if __name__ == "__main__":
run()
8 changes: 6 additions & 2 deletions sleuth_crawler/scraper/scraper/settings.py
@@ -23,12 +23,16 @@
# List approved starting URLs to be crawled by BroadCrawler
# Place specific domains before www.ubc.ca
PARENT_URLS = [
'https://courses.students.ubc.ca/cs/main?pname=subjarea&tname=subjareas&req=0',
'https://www.ubyssey.ca',
'https://reddit.com/r/ubc',
'https://www.ubc.ca',
]

# Specific starting URLs to be crawled by a CustomCrawler
CUSTOM_URLS = {
'courseItem': ['https://courses.students.ubc.ca/cs/main?pname=subjarea&tname=subjareas&req=0'],
}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'courses_scraper (+http://www.yourdomain.com)'

@@ -38,7 +42,7 @@
DEPTH_PRIORITY = 50

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False

# Enable SSL Handshakes
DOWNLOADER_CLIENTCONTEXTFACTORY = 'scraper.customcontext.CustomContextFactory'
82 changes: 41 additions & 41 deletions sleuth_crawler/scraper/scraper/spiders/broad_crawler.py
@@ -1,36 +1,38 @@
import scrapy
from sleuth_crawler.scraper.scraper.spiders.parsers import generic_page_parser, course_parser, reddit_parser
from sleuth_crawler.scraper.scraper.spiders.parsers import generic_page_parser, reddit_parser
from scrapy.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor
from sleuth_crawler.scraper.scraper.settings import PARENT_URLS

class BroadCrawler(CrawlSpider):
"""
'''
Spider that broad crawls starting at list of predefined URLs
"""
name = "broad_crawler"

# Root URLs of special page types are stored here
ROOTS = {
'courses':'courses.students.ubc.ca',
'reddit':'www.reddit.com'
}
'''
name = 'broad_crawler'

# These are the links that the crawler starts crawling at
start_urls = PARENT_URLS

# Rules for what links are followed are defined here
GENERIC_LINK_EXTRACTOR = LinkExtractor(
allow=(r'ubc', r'university', r'ubyssey', r'prof', r'student'),
deny=(r'accounts\.google', r'intent', r'lang=')
)
allowed_terms = [r'(ubc)', r'(university)', r'(ubyssey)', r'(prof)', r'(student)', r'(faculty)']
denied_terms = [r'(accounts\.google)', r'(intent)', r'(lang=)']
GENERIC_LINK_EXTRACTOR = LinkExtractor(allow=allowed_terms, deny=denied_terms, deny_domains=['www.reddit.com'])

rules = (
# redditPost
Rule(
GENERIC_LINK_EXTRACTOR,
follow=True,
process_request='process_req',
callback='parse_generic_item'
LinkExtractor(
allow_domains='www.reddit.com',
allow=allowed_terms, deny=denied_terms,
),
follow=True, process_request='process_reddit_request'
),

# genericPage
Rule(
GENERIC_LINK_EXTRACTOR,
follow=True, callback='parse_generic_item'
)
)

# Specifies the pipeline that handles data returned from the parsers
@@ -40,36 +42,33 @@ class BroadCrawler(CrawlSpider):
}
}

def process_req(self, req):
def process_reddit_request(self, req):
'''
Work on requests to identify those that qualify for special treatment
Processes requests generated by the reddit.com link extractor
'''
# Parse courseItem
if self.ROOTS['courses'] in req.url:
# This is the root Course page that starts the course parser
if 'cs/main?pname=subjarea&tname=subjareas&req=0' in req.url:
return req.replace(
callback=course_parser.parse_subjects,
priority=100
)
else:
return

# Parse redditPost
if self.ROOTS['reddit'] in req.url:
if 'reddit.com' in req.url:
req.replace(priority=100)
if 'comments' in req.url:
return req.replace(
callback=self.parse_reddit_post,
priority=100
)
req.replace(callback='parse_reddit_post')
else:
return req.replace(
callback=self.no_parse,
priority=80
)
req.replace(callback='no_parse')
else:
req.replace(callback='parse_generic_item')

return req

def parse_start_urls(self, resp):
'''
Parses the start_urls
'''
if 'reddit.com' in resp.url:
return self.parse_reddit_post(resp)
return self.parse_generic_item(resp)

###############
## CALLBACKS ##
###############

def parse_generic_item(self, response):
'''
Points to generic_page_parser (the default parser for this crawler)
@@ -89,6 +88,7 @@ def no_parse(self, response):
Visit the page without parsing it - this allows the URLs of this page to
be extracted and visited if there are any relevant links
'''
# TODO: does this work?
return

def _get_links(self, response):
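
The refactor above splits link extraction in two: a reddit-only extractor (`allow_domains='www.reddit.com'`) routed through `process_reddit_request`, and the generic extractor, which now sets `deny_domains=['www.reddit.com']`. A quick standalone check of that split (not part of this commit):

```Python
# Demonstrates the allow_domains / deny_domains split used by the two rules above.
from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor

allowed_terms = [r'(ubc)', r'(student)']
body = b'''<html><body>
  <a href="https://www.reddit.com/r/UBC/comments/abc/student_housing/">reddit</a>
  <a href="https://www.ubc.ca/students/">ubc</a>
</body></html>'''
response = HtmlResponse(url='https://www.ubc.ca', body=body, encoding='utf-8')

reddit_only = LinkExtractor(allow_domains='www.reddit.com', allow=allowed_terms)
generic = LinkExtractor(allow=allowed_terms, deny_domains=['www.reddit.com'])

print([l.url for l in reddit_only.extract_links(response)])  # only the reddit link
print([l.url for l in generic.extract_links(response)])      # only the ubc link
```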
23 changes: 23 additions & 0 deletions sleuth_crawler/scraper/scraper/spiders/custom_crawler.py
@@ -0,0 +1,23 @@
import scrapy
from scrapy.spiders import Spider

class CustomCrawler(Spider):
'''
Spider that crawls specific domains for specific item types
that don't link to genericItem
'''
name = 'custom_crawler'

# Specifies the pipeline that handles data returned from the parsers
custom_settings = {
'ITEM_PIPELINES': {
'scraper.pipelines.SolrPipeline': 400,
}
}

def __init__(self, start_urls, parser):
'''
Takes a list of starting urls and a callback for each link visited
'''
self.start_urls = start_urls
self.parse = parser
25 changes: 0 additions & 25 deletions sleuth_crawler/tests/test_crawler.py
@@ -13,31 +13,6 @@ class TestBroadCralwer(TestCase):
def setUp(self):
self.spider = BroadCrawler()

def test_request_filtering(self):
'''
Test filtering normal requests
'''
# Direct non-matching requests to default parser (GenericPage)
req_in = scrapy.Request('https://www.ubc.ca')
req = self.spider.process_req(req_in)
self.assertTrue(req)
self.assertFalse(req.callback)

def test_request_filtering_course(self):
'''
Test filtering course requests
'''
# Redirect to parse_subjects if parent courses page
req_pass = scrapy.Request('https://courses.students.ubc.ca/cs/main?pname=subjarea&tname=subjareas&req=0')
req = self.spider.process_req(req_pass)
self.assertEqual(req.callback.__name__, course_parser.parse_subjects.__name__)
self.assertEqual(req.priority, 100)

# Discard request if children courses page
req_discard = scrapy.Request('https://courses.students.ubc.ca/cs/main?pname=subjarea&tname=subjareas&req=1&dept=ASTR')
req = self.spider.process_req(req_discard)
self.assertFalse(req)

@patch('sleuth_crawler.scraper.scraper.spiders.parsers.generic_page_parser.parse_generic_item')
def test_parse_generic_item(self, fake_parser):
'''
