Merge 49195d7 into 800c610
bobheadxi committed Nov 25, 2017
2 parents 800c610 + 49195d7 commit cc5f5c2
Showing 18 changed files with 374 additions and 90 deletions.
4 changes: 1 addition & 3 deletions README.md
@@ -63,11 +63,9 @@ $ bash scripts/populate.sh
For live data, you can currently run the `BroadCrawler`, which scrapes a few thousand pages and pipelines them into the appropriate cores based on their type.

```Shell
$ bash cd sleuth_crawler/scraper && scrapy crawl broad_crawler
$ bash sleuth_crawler/run_crawlers.sh
```

At the moment the crawler never stops on its own, so you will likely have to force-quit it once you have enough data entries.

To empty a core, go to:
```
http://localhost:8983/solr/[CORE_NAME_HERE]/update?stream.body=<delete><query>*:*</query></delete>&commit=true
3 changes: 2 additions & 1 deletion docker-compose.yml
@@ -13,7 +13,8 @@ services:
entrypoint: [
"/opt/docker-solr/scripts/start-solr.sh",
"genericPage",
"courseItem"
"courseItem",
"redditPost",
]

web:
21 changes: 20 additions & 1 deletion sleuth_backend/solr/models.py
@@ -69,8 +69,27 @@ class CourseItem(SolrDocument):
"updatedAt": "",
"description": "",
"subjectId": "",
"subjectData": []
"subjectName": "",
"faculty": "",
}

def __init__(self, **kwargs):
super(CourseItem, self).__init__(self.doc, **kwargs)

class RedditPost(SolrDocument):
'''
Represents a Reddit post
'''
doc = {
"id": "",
"type": "redditPost",
"name": "",
"updatedAt": "",
"description": "",
"comments": [],
"subreddit": "",
"links": [],
}

def __init__(self, **kwargs):
super(RedditPost, self).__init__(self.doc, **kwargs)
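As a point of reference, here is a minimal sketch of constructing the new RedditPost document, mirroring how the crawler pipeline below builds one — the field values, date string, and connection object are purely illustrative:

```python
# Illustrative only: values and solr_connection are made up for this sketch.
from sleuth_backend.solr.models import RedditPost

post = RedditPost(
    id='https://reddit.com/r/ubc/comments/abc123',  # the post URL doubles as the Solr id, as in pipelines.py
    type='redditPost',
    name='Course registration megathread',
    updatedAt='2017-11-25T00:00:00Z',               # assumed format; the pipeline generates this via __make_date()
    description='Post your registration questions here.',
    comments=['First!', 'When does SSC open?'],
    subreddit='ubc',
    links=['https://courses.students.ubc.ca'],
)
# post.save_to_solr(solr_connection)  # assuming an open SolrConnection, as in pipelines.py
```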
3 changes: 2 additions & 1 deletion sleuth_backend/tests/test_models.py
@@ -28,7 +28,8 @@ def create_page(self, t):
"updatedAt": "1234",
"description": "testdescription",
"subjectId": "test",
"subjectData": []
"subjectName": "ubc",
"faculty": "engineering"
}
return (args, CourseItem(**args))

11 changes: 10 additions & 1 deletion sleuth_backend/tests/test_views.py
@@ -61,7 +61,7 @@ def test_apis_without_params(self, mock_query):
@patch('sleuth_backend.solr.connection.SolrConnection.core_names')
@patch('sleuth_backend.solr.connection.SolrConnection.query')
def test_apis_with_valid_request(self, mock_query, mock_cores):
# search
# genericPage search
mock_query.return_value = {
"type": "genericPage",
"response": {
@@ -113,6 +113,15 @@ def test_apis_with_valid_request(self, mock_query, mock_cores):
{'data': [{'type': 'courseItem', 'response': {'numFound': 1, 'start': 0, 'docs': [{'id': 'www.cool.com', 'description': 'Nice one dude', 'updatedAt': '', 'name': '', 'content': ''}]}, 'highlighting': {'www.cool.com': {'content': ['Nice one dude']}}}, {'type': 'courseItem', 'response': {'numFound': 1, 'start': 0, 'docs': [
{'id': 'www.cool.com', 'description': 'Nice one dude', 'updatedAt': '', 'name': '', 'content': ''}]}, 'highlighting': {'www.cool.com': {'content': ['Nice one dude']}}}], 'request': {'query': 'somequery', 'types': ['courseItem', 'courseItem'], 'return_fields': ['id', 'updatedAt', 'name', 'description'], 'state': ''}}
)

# redditPost search
mock_query.return_value['type'] = 'redditPost'
mock_query.return_value['highlighting']['www.cool.com'] = {'content': ['Nice']}
params = { 'q': 'somequery', 'type': 'redditPost' }
mock_request = MockRequest('GET', get=MockGet(params))
result = search(mock_request)
self.assertEqual(result.status_code, 200)
mock_response = mock_query.return_value

# getdocument
params = {
23 changes: 18 additions & 5 deletions sleuth_backend/views/views_utils.py
@@ -42,7 +42,7 @@ def build_search_query(core, query_str, base_kwargs):
'''
kwargs = base_kwargs.copy()

if core == "genericPage":
if core == 'genericPage':
fields = {
'id': 1,
'name': 8,
@@ -58,23 +58,36 @@ def build_search_query(core, query_str, base_kwargs):
.for_fields(fields)
query = query.select_or(terms_query)
kwargs['default_field'] = 'content'
kwargs['highlight_fields'] = 'content'
kwargs['highlight_fields'] = 'content,description'

elif core == "courseItem":
elif core == 'courseItem':
fields = {
'id': 1,
'name': 9,
'description': 8,
'subjectData': 5,
}
query = Query(query_str) \
.fuzz(2)
query = Query(query_str).fuzz(2)
terms_query = Query(query_str, as_phrase=False, escape=True, sanitize=True) \
.for_fields(fields)
query = query.select_or(terms_query)
kwargs['default_field'] = 'name'
kwargs['highlight_fields'] = 'description'

elif core == 'redditPost':
fields = {
'id': 1,
'name': 7,
'description': 10,
'comments': 6,
}
query = Query(query_str).fuzz(1)
terms_query = Query(query_str, as_phrase=False, escape=True, sanitize=True) \
.for_fields(fields)
query = query.select_or(terms_query)
kwargs['default_field'] = 'name'
kwargs['highlight_fields'] = 'description,comments'

else:
query = Query(query_str)

6 changes: 6 additions & 0 deletions sleuth_crawler/run_crawlers.sh
@@ -0,0 +1,6 @@
#!/bin/bash
# Runs all crawlers

echo 'Spinning up crawlers...'
cd sleuth_crawler/scraper
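# Run the crawler in the background, wait up to 10 seconds (or until Enter is pressed), then stop it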
python crawl.py & read -t 10 ; kill $!
23 changes: 23 additions & 0 deletions sleuth_crawler/scraper/crawl.py
@@ -0,0 +1,23 @@
import sys
import os.path
PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__))
sys.path = sys.path + [os.path.join(PROJECT_ROOT, '../../..'), os.path.join(PROJECT_ROOT, '../..')]

from scrapy.crawler import CrawlerProcess, CrawlerRunner
from scrapy.utils.project import get_project_settings
from sleuth_crawler.scraper.scraper.settings import CUSTOM_URLS
from sleuth_crawler.scraper.scraper.spiders.parsers.course_parser import parse_subjects

'''
This script runs all our spiders.
Crawl specific item types, such as CourseItem, before starting the broad_crawler
'''

def run():
process = CrawlerProcess(get_project_settings())
process.crawl('broad_crawler')
process.crawl('custom_crawler', start_urls=CUSTOM_URLS['courseItem'], parser=parse_subjects)
process.start()

if __name__ == "__main__":
run()
21 changes: 17 additions & 4 deletions sleuth_crawler/scraper/scraper/items.py
@@ -8,9 +8,9 @@
import scrapy

class ScrapyGenericPage(scrapy.Item):
"""
'''
Stores generic page data and url
"""
'''
url = scrapy.Field()
title = scrapy.Field()
site_title = scrapy.Field()
@@ -19,11 +19,24 @@ class ScrapyCourseItem(scrapy.Item):
links = scrapy.Field()

class ScrapyCourseItem(scrapy.Item):
"""
'''
Stores data about a course and associated subject
SubItems: ScrapySectionItem, ScrapySubjectItem
"""
'''
subject = scrapy.Field()
url = scrapy.Field()
name = scrapy.Field()
description = scrapy.Field()

class ScrapyRedditPost(scrapy.Item):
'''
Stores data about a reddit post and its comments section
'''
url = scrapy.Field()
title = scrapy.Field()
subreddit = scrapy.Field()
post_content = scrapy.Field()
comments = scrapy.Field()
links = scrapy.Field()
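As a rough illustration of how a spider callback might populate the new item — the actual reddit parser is not part of this diff, so the selectors and values below are assumptions:

```python
# Hypothetical parser sketch: selectors and the subreddit value are made up for illustration.
from sleuth_crawler.scraper.scraper.items import ScrapyRedditPost

def parse_reddit_post(response):
    return ScrapyRedditPost(
        url=response.url,
        title=response.css('title::text').extract_first(),
        subreddit='ubc',  # assumed fixed for this crawl
        post_content=' '.join(response.css('div.usertext-body ::text').extract()),
        comments=response.css('div.comment ::text').extract(),
        links=response.css('a::attr(href)').extract(),
    )
```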

48 changes: 32 additions & 16 deletions sleuth_crawler/scraper/scraper/pipelines.py
@@ -26,22 +26,24 @@ def close_spider(self, spider=None):
self.solr_connection.optimize()

def process_item(self, item, spider=None):
"""
'''
Match item type to predefined Schemas
https://github.com/ubclaunchpad/sleuth/wiki/Schemas
"""
'''
if isinstance(item, ScrapyGenericPage):
self.__process_generic_page(item)
if isinstance(item, ScrapyCourseItem):
elif isinstance(item, ScrapyCourseItem):
self.__process_course_item(item)
elif isinstance(item, ScrapyRedditPost):
self.__process_reddit_post(item)

return item

def __process_generic_page(self, item):
"""
'''
Convert Scrapy item to Solr GenericPage and commit it to database
Schema specified by sleuth_backend.solr.models.GenericPage
"""
'''
solr_doc = GenericPage(
id=item["url"],
type="genericPage",
@@ -55,20 +57,34 @@ def __process_generic_page(self, item):
solr_doc.save_to_solr(self.solr_connection)

def __process_course_item(self, item):
"""
'''
Convert Scrapy item to Solr CourseItem and commit it to database
Schema specified by sleuth_backend.solr.models.CourseItem
"""
subject = item["subject"]
subject_data = [subject["name"], subject["faculty"]]
'''
subject = item['subject']
solr_doc = CourseItem(
id=item["url"],
type="courseItem",
name=item["name"],
id=item['url'],
type='courseItem',
name=item['name'],
updatedAt=self.__make_date(),
description=item["description"],
subjectId=subject["url"],
subjectData=subject_data
description=item['description'],
subjectId=subject['url'],
subjectName=subject['name'],
faculty=subject['faculty']
)
solr_doc.save_to_solr(self.solr_connection)

def __process_reddit_post(self, item):
'''
Convert Scrapy item to Solr RedditPost and commit it to database
'''
solr_doc = RedditPost(
id=item['url'],
type='redditPost',
name=item['title'],
updatedAt=self.__make_date(),
description=item['post_content'],
comments=item['comments'],
links=item['links'],
)
solr_doc.save_to_solr(self.solr_connection)

9 changes: 7 additions & 2 deletions sleuth_crawler/scraper/scraper/settings.py
@@ -23,11 +23,16 @@
# List approved starting URLs to be crawled by BroadCrawler
# Place specific domains before www.ubc.ca
PARENT_URLS = [
'https://courses.students.ubc.ca/cs/main?pname=subjarea&tname=subjareas&req=0',
'https://www.ubyssey.ca',
'https://reddit.com/r/ubc',
'https://www.ubc.ca',
]

# Specific starting URLs to be crawled by a CustomCrawler
CUSTOM_URLS = {
'courseItem': ['https://courses.students.ubc.ca/cs/main?pname=subjarea&tname=subjareas&req=0'],
}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'courses_scraper (+http://www.yourdomain.com)'

@@ -37,7 +42,7 @@
DEPTH_PRIORITY = 50

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False

# Enable SSL Handshakes
DOWNLOADER_CLIENTCONTEXTFACTORY = 'scraper.customcontext.CustomContextFactory'