redditPost search, refactored spiders, crawl script
bobheadxi committed Nov 25, 2017
1 parent fbf10ff commit fdc53b9
Showing 8 changed files with 118 additions and 74 deletions.
4 changes: 1 addition & 3 deletions README.md
@@ -63,11 +63,9 @@ $ bash scripts/populate.sh
For live data, you can currently run the `BroadCrawler`, which scrapes a few thousand pages and pipelines them into the appropriate cores based on their type.

```Shell
$ bash cd sleuth_crawler/scraper && scrapy crawl broad_crawler
$ bash sleuth_crawler/run_crawlers.sh
```

At the moment the crawler never really stops on its own, so you will likely have to force-quit it once you have enough data entries.
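
If you would rather have the crawl stop by itself, Scrapy's built-in close-spider extension can bound it. This is a suggested tweak rather than part of this commit; a minimal sketch for `scraper/settings.py`:

```Python
# Suggested settings (not in this commit): let Scrapy stop the crawl itself.
# Would go in sleuth_crawler/scraper/scraper/settings.py.
CLOSESPIDER_PAGECOUNT = 5000  # close each spider after roughly 5000 pages
CLOSESPIDER_TIMEOUT = 600     # or after 10 minutes, whichever comes first
```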

To empty a core, go to:
```
http://localhost:8983/solr/[CORE_NAME_HERE]/update?stream.body=<delete><query>*:*</query></delete>&commit=true
```
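
The same wipe can also be scripted. A minimal sketch (not part of this commit), assuming a local Solr instance on the default port and the `requests` package; `CORE_NAME` is a placeholder:

```Python
# Clears every document from a core, equivalent to the URL above.
import requests

CORE_NAME = 'genericPage'  # placeholder: any of the Solr core names
requests.get(
    'http://localhost:8983/solr/{}/update'.format(CORE_NAME),
    params={
        'stream.body': '<delete><query>*:*</query></delete>',
        'commit': 'true',
    },
)
```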
21 changes: 18 additions & 3 deletions sleuth_backend/views/views_utils.py
@@ -42,7 +42,7 @@ def build_search_query(core, query_str, base_kwargs):
'''
kwargs = base_kwargs.copy()

if core == "genericPage":
if core == 'genericPage':
fields = {
'id': 1,
'name': 8,
@@ -58,9 +58,9 @@ def build_search_query(core, query_str, base_kwargs):
.for_fields(fields)
query = query.select_or(terms_query)
kwargs['default_field'] = 'content'
kwargs['highlight_fields'] = 'content'
kwargs['highlight_fields'] = 'content,description'

elif core == "courseItem":
elif core == 'courseItem':
fields = {
'id': 1,
'name': 9,
@@ -75,6 +75,21 @@ def build_search_query(core, query_str, base_kwargs):
kwargs['default_field'] = 'name'
kwargs['highlight_fields'] = 'description'

elif core == 'redditPost':
fields = {
'id': 1,
'name': 7,
'description': 10,
'comments': 6,
}
query = Query(query_str) \
.fuzz(1)
terms_query = Query(query_str, as_phrase=False, escape=True, sanitize=True) \
.for_fields(fields)
query = query.select_or(terms_query)
kwargs['default_field'] = 'name'
kwargs['highlight_fields'] = 'description,comments'

else:
query = Query(query_str)

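For reference, the new `redditPost` branch follows the same composition pattern as the other cores: a fuzzed query OR'd with a boosted per-field terms query. A minimal standalone sketch (not part of this commit; the `Query` import path is an assumption):

```Python
# Mirrors the redditPost branch above, using only the Query methods shown in this diff.
from sleuth_backend.solr.query import Query  # assumed import path

fields = {'id': 1, 'name': 7, 'description': 10, 'comments': 6}

query = Query('housing advice').fuzz(1)
terms_query = Query('housing advice', as_phrase=False, escape=True, sanitize=True) \
    .for_fields(fields)
query = query.select_or(terms_query)  # final query passed on alongside the kwargs
```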
6 changes: 6 additions & 0 deletions sleuth_crawler/run_crawlers.sh
@@ -0,0 +1,6 @@
#!/bin/bash
# Runs all crawlers

echo 'Spinning up crawlers...'
cd sleuth_crawler/scraper
python crawl.py & read -t 10 ; kill $!
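
The one-liner above backgrounds `crawl.py`, waits up to ten seconds for input (`read -t 10`), and then kills the crawler (`kill $!`). For illustration only (not part of this commit), a rough Python equivalent, simplified to a fixed ten-second window:

```Python
# Roughly what run_crawlers.sh does, with the input escape hatch replaced
# by a plain ten-second sleep.
import subprocess
import time

proc = subprocess.Popen(['python', 'crawl.py'], cwd='sleuth_crawler/scraper')
time.sleep(10)
proc.terminate()  # equivalent of `kill $!`
proc.wait()
```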
23 changes: 23 additions & 0 deletions sleuth_crawler/scraper/crawl.py
@@ -0,0 +1,23 @@
import sys
import os.path
PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__))
sys.path = sys.path + [os.path.join(PROJECT_ROOT, '../../..'), os.path.join(PROJECT_ROOT, '../..')]

from scrapy.crawler import CrawlerProcess, CrawlerRunner
from scrapy.utils.project import get_project_settings
from sleuth_crawler.scraper.scraper.settings import CUSTOM_URLS
from sleuth_crawler.scraper.scraper.spiders.parsers.course_parser import parse_subjects

'''
This script runs all our spiders.
Crawls specific item types, such as CourseItem, before starting the broad_crawler.
'''

def run():
process = CrawlerProcess(get_project_settings())
process.crawl('broad_crawler')
process.crawl('custom_crawler', start_urls=CUSTOM_URLS['courseItem'], parser=parse_subjects)
process.start()

if __name__ == "__main__":
run()
8 changes: 6 additions & 2 deletions sleuth_crawler/scraper/scraper/settings.py
@@ -23,12 +23,16 @@
# List approved starting URLs to be crawled by BroadCrawler
# Place specific domains before www.ubc.ca
PARENT_URLS = [
'https://courses.students.ubc.ca/cs/main?pname=subjarea&tname=subjareas&req=0',
'https://www.ubyssey.ca',
'https://reddit.com/r/ubc',
'https://www.ubc.ca',
]

# Specific starting URLs to be crawled by a CustomCrawler
CUSTOM_URLS = {
'courseItem': ['https://courses.students.ubc.ca/cs/main?pname=subjarea&tname=subjareas&req=0'],
}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'courses_scraper (+http://www.yourdomain.com)'

@@ -38,7 +42,7 @@
DEPTH_PRIORITY = 50

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False

# Enable SSL Handshakes
DOWNLOADER_CLIENTCONTEXTFACTORY = 'scraper.customcontext.CustomContextFactory'
82 changes: 41 additions & 41 deletions sleuth_crawler/scraper/scraper/spiders/broad_crawler.py
@@ -1,36 +1,38 @@
import scrapy
from sleuth_crawler.scraper.scraper.spiders.parsers import generic_page_parser, course_parser, reddit_parser
from sleuth_crawler.scraper.scraper.spiders.parsers import generic_page_parser, reddit_parser
from scrapy.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor
from sleuth_crawler.scraper.scraper.settings import PARENT_URLS

class BroadCrawler(CrawlSpider):
"""
'''
Spider that broad crawls starting at list of predefined URLs
"""
name = "broad_crawler"

# Root URLs of special page types are stored here
ROOTS = {
'courses':'courses.students.ubc.ca',
'reddit':'www.reddit.com'
}
'''
name = 'broad_crawler'

# These are the links that the crawler starts crawling at
start_urls = PARENT_URLS

# Rules for what links are followed are defined here
GENERIC_LINK_EXTRACTOR = LinkExtractor(
allow=(r'ubc', r'university', r'ubyssey', r'prof', r'student'),
deny=(r'accounts\.google', r'intent', r'lang=')
)
allowed_terms = [r'(ubc)', r'(university)', r'(ubyssey)', r'(prof)', r'(student)', r'(faculty)']
denied_terms = [r'(accounts\.google)', r'(intent)', r'(lang=)']
GENERIC_LINK_EXTRACTOR = LinkExtractor(allow=allowed_terms, deny=denied_terms, deny_domains=['www.reddit.com'])

rules = (
# redditPost
Rule(
GENERIC_LINK_EXTRACTOR,
follow=True,
process_request='process_req',
callback='parse_generic_item'
LinkExtractor(
allow_domains='www.reddit.com',
allow=allowed_terms, deny=denied_terms,
),
follow=True, process_request='process_reddit_request'
),

# genericPage
Rule(
GENERIC_LINK_EXTRACTOR,
follow=True, callback='parse_generic_item'
)
)

# Specifies the pipeline that handles data returned from the parsers
@@ -40,36 +42,33 @@ class BroadCrawler(CrawlSpider):
}
}

def process_req(self, req):
def process_reddit_request(self, req):
'''
Work on requests to identify those that qualify for special treatment
Processes requests generated by the reddit.com link extractor
'''
# Parse courseItem
if self.ROOTS['courses'] in req.url:
# This is the root Course page that starts the course parser
if 'cs/main?pname=subjarea&tname=subjareas&req=0' in req.url:
return req.replace(
callback=course_parser.parse_subjects,
priority=100
)
else:
return

# Parse redditPost
if self.ROOTS['reddit'] in req.url:
if 'reddit.com' in req.url:
req.replace(priority=100)
if 'comments' in req.url:
return req.replace(
callback=self.parse_reddit_post,
priority=100
)
req.replace(callback='parse_reddit_post')
else:
return req.replace(
callback=self.no_parse,
priority=80
)
req.replace(callback='no_parse')
else:
req.replace(callback='parse_generic_item')

return req

def parse_start_urls(self, resp):
'''
Parses the start_urls
'''
if 'reddit.com' in resp.url:
return self.parse_reddit_post(resp)
return self.parse_generic_item(resp)

###############
## CALLBACKS ##
###############

def parse_generic_item(self, response):
'''
Points to generic_page_parser (the default parser for this crawler)
@@ -89,6 +88,7 @@ def no_parse(self, response):
Visit the page without parsing it - this allows the URLs of this page to
be extracted and visited if there are any relevant links
'''
# TODO: does this work?
return

def _get_links(self, response):
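
The refactor above splits link extraction in two: a reddit-only extractor (`allow_domains='www.reddit.com'`) routed through `process_reddit_request`, and the generic extractor, which now sets `deny_domains=['www.reddit.com']`. A quick standalone check of that split (not part of this commit):

```Python
# Demonstrates the allow_domains / deny_domains split used by the two rules above.
from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor

allowed_terms = [r'(ubc)', r'(student)']
body = b'''<html><body>
  <a href="https://www.reddit.com/r/UBC/comments/abc/student_housing/">reddit</a>
  <a href="https://www.ubc.ca/students/">ubc</a>
</body></html>'''
response = HtmlResponse(url='https://www.ubc.ca', body=body, encoding='utf-8')

reddit_only = LinkExtractor(allow_domains='www.reddit.com', allow=allowed_terms)
generic = LinkExtractor(allow=allowed_terms, deny_domains=['www.reddit.com'])

print([l.url for l in reddit_only.extract_links(response)])  # only the reddit link
print([l.url for l in generic.extract_links(response)])      # only the ubc link
```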
23 changes: 23 additions & 0 deletions sleuth_crawler/scraper/scraper/spiders/custom_crawler.py
@@ -0,0 +1,23 @@
import scrapy
from scrapy.spiders import Spider

class CustomCrawler(Spider):
'''
Spider that crawls specific domains for specific item types
that don't link to genericItem
'''
name = 'custom_crawler'

# Specifies the pipeline that handles data returned from the parsers
custom_settings = {
'ITEM_PIPELINES': {
'scraper.pipelines.SolrPipeline': 400,
}
}

def __init__(self, start_urls, parser):
'''
Takes a list of starting urls and a callback for each link visited
'''
self.start_urls = start_urls
self.parse = parser
25 changes: 0 additions & 25 deletions sleuth_crawler/tests/test_crawler.py
@@ -13,31 +13,6 @@ class TestBroadCralwer(TestCase):
def setUp(self):
self.spider = BroadCrawler()

def test_request_filtering(self):
'''
Test filtering normal requests
'''
# Direct non-matching requests to default parser (GenericPage)
req_in = scrapy.Request('https://www.ubc.ca')
req = self.spider.process_req(req_in)
self.assertTrue(req)
self.assertFalse(req.callback)

def test_request_filtering_course(self):
'''
Test filtering course requests
'''
# Redirect to parse_subjects if parent courses page
req_pass = scrapy.Request('https://courses.students.ubc.ca/cs/main?pname=subjarea&tname=subjareas&req=0')
req = self.spider.process_req(req_pass)
self.assertEqual(req.callback.__name__, course_parser.parse_subjects.__name__)
self.assertEqual(req.priority, 100)

# Discard request if children courses page
req_discard = scrapy.Request('https://courses.students.ubc.ca/cs/main?pname=subjarea&tname=subjareas&req=1&dept=ASTR')
req = self.spider.process_req(req_discard)
self.assertFalse(req)

@patch('sleuth_crawler.scraper.scraper.spiders.parsers.generic_page_parser.parse_generic_item')
def test_parse_generic_item(self, fake_parser):
'''
