
Commit

Merge dcef616 into 6d2152e
bobheadxi committed Nov 18, 2017
2 parents 6d2152e + dcef616 commit 15add00
Showing 13 changed files with 179 additions and 55 deletions.
92 changes: 64 additions & 28 deletions sleuth_backend/solr/connection.py
@@ -3,46 +3,52 @@
import requests

class SolrConnection(object):
"""
'''
Connection to Solr database
"""
'''

# The number of documents held in a core's insert queue before
# the queued documents are automatically inserted into Solr.
QUEUE_THRESHOLD = 50

def __init__(self, url):
"""
'''
Creates a SolrConnection from the given base Solr url of the form
'http://<solrhostname>:<port>/solr'.
"""
'''
self.url = url
self.solr = pysolr.Solr(url, timeout=10)
self.solr_admin = pysolr.SolrCoreAdmin(url + '/admin/cores')
self.cores = {}
self.queues = {}

for core_name in self.fetch_core_names():
self.cores[core_name] = pysolr.Solr(self.url + '/' + core_name)
self.queues[core_name] = list()

def fetch_core_names(self):
"""
'''
Makes a request to Solr and returns an array of strings where each
string is the name of a core in the response from Solr.
"""
'''
status_response = self.solr_admin.status()
status = json.loads(status_response)
return [core_name for core_name in status['status']]

def core_names(self):
"""
'''
Returns a list of known valid cores in the Solr instance without
making a request to Solr; the list excludes cores used for testing.
"""
'''
valid_cores = list(self.cores.keys())
if 'test' in valid_cores:
valid_cores.remove('test')
return valid_cores

def fetch_core_schema(self, name):
"""
'''
Returns the schema of the core with the given name as a dictionary.
"""
'''
response = self._get_url("{}/{}/schema".format(self.url, name), {})

if 'schema' not in response:
@@ -52,18 +58,49 @@ def fetch_core_schema(self, name):
return response['schema']

def insert_document(self, core, doc):
"""
Attempts to insert the given document (a dict) into the solr core with
the given name and returns the response from Solr. All values in 'doc'
must be strings.
"""
'''
Queues a document for insertion into the specified core and returns None.
Once the queue for that core reaches QUEUE_THRESHOLD documents, this
function inserts all of the queued documents into Solr and returns the
response from Solr.
All values in 'doc' must be strings.
'''
if core not in self.cores:
raise ValueError("A core for the document type {} was not found".format(core))
return self.cores[core].add([doc])

self.queues[core].append(doc)

if len(self.queues[core]) >= self.QUEUE_THRESHOLD:
docs = list(self.queues[core].copy())
del self.queues[core][:]
return self.insert_documents(core, docs)

return None

def insert_documents(self, core_name, docs):
'''
Inserts the given list of documents into the specified core and returns the Solr response.
'''
if core_name not in self.cores:
raise ValueError('No Solr core with the name "{}" was found'.format(core_name))
print('Inserting '+str(len(docs))+' items into core '+core_name)
return self.cores[core_name].add(docs)

def insert_queued(self):
'''
Inserts all queued documents across all cores. Returns a dict mapping
each core name to the Solr response for that core.
'''
response = {}
for core in self.cores:
docs = list(self.queues[core].copy())
del self.queues[core][:]
response[core] = self.insert_documents(core, docs)
return response

def query(self, core, query, sort="", start="", rows="", default_field="",
search_fields="", return_fields="", highlight_fields="", omit_header=True):
"""
'''
Returns the response body from Solr corresponding to the given query.
See https://lucene.apache.org/solr/guide/6_6/common-query-parameters.html
and https://lucene.apache.org/solr/guide/6_6/highlighting.html
@@ -87,7 +124,7 @@ def query(self, core, query, sort="", start="", rows="", default_field="",
highlight_fields (str): Specifies a list of fields to highlight.
omit_header (bool): Whether or not Solr should include a header with
metadata about the query in its response.
"""
'''
params = {
"q": query,
"wt": "json",
@@ -112,23 +149,22 @@ def query(self, core, query, sort="", start="", rows="", default_field="",
return self._get_url("{}/{}/select".format(self.url, core), params)

def optimize(self, core_name=None):
"""
Performs defragmentation of all/specified core(s) in Solr database.
Optionally accepts ``core_name``. Default is ``None``.
"""
'''
Performs defragmentation of the specified core in the Solr database.
If no core is specified, defragments all cores.
'''
if core_name:
try:
self.cores[core_name].optimize()
except KeyError as e:
raise KeyError('No Solr core with the name "{}" was found'.format(core_name))
if core_name not in self.cores:
raise ValueError('No Solr core with the name "{}" was found'.format(core_name))
self.cores[core_name].optimize()
else:
for core in self.cores:
self.cores[core].optimize()

def _get_url(self, url, params):
"""
'''
Makes a GET request to the given url with the given parameters and
returns the parsed JSON response.
"""
'''
response = requests.get(url, params=pysolr.safe_urlencode(params))
return response.json()
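
Taken together, the new queue-based API can be exercised roughly as follows. This is a minimal sketch, assuming a reachable Solr instance at a placeholder URL that exposes a 'genericPage' core; it is illustrative and not code from this repository.

from sleuth_backend.solr.connection import SolrConnection

# Placeholder URL; point this at a real Solr instance.
conn = SolrConnection('http://localhost:8983/solr')

# Documents are only queued at this point; nothing is sent to Solr yet.
for i in range(SolrConnection.QUEUE_THRESHOLD - 1):
    conn.insert_document('genericPage', {'id': 'doc{}'.format(i)})

# The insert that brings the queue up to QUEUE_THRESHOLD flushes the
# whole queue for that core and returns Solr's response.
response = conn.insert_document('genericPage', {'id': 'last'})

# Any leftover queued documents (e.g. at the end of a crawl) can be
# flushed explicitly, and the cores defragmented afterwards.
conn.insert_queued()
conn.optimize()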
2 changes: 1 addition & 1 deletion sleuth_backend/solr/models.py
@@ -31,7 +31,7 @@ def save_to_solr(self, solr_connection):
Submits the document to the given Solr connection. This method should not
be overridden.
"""
solr_connection.insert_document(self.type(), self.doc)
solr_connection.insert_document(self.type(), self.doc.copy())

def type(self):
"""
38 changes: 37 additions & 1 deletion sleuth_backend/tests/test_connection.py
@@ -90,7 +90,43 @@ def test_insert_document(self, admin_mock, solr_mock):
solr_connection.cores["genericPage"] = MockSolr()
doc = {"id": "testid"}
response = solr_connection.insert_document("genericPage", doc)
self.assertEqual([doc], response)
self.assertEqual(None, response)

with self.assertRaises(ValueError):
solr_connection.insert_document("blah", doc)

# Test auto insertion past QUEUE_THRESHOLD
response_docs = []
for _ in range(solr_connection.QUEUE_THRESHOLD - 2):
response = solr_connection.insert_document("genericPage", doc)
response = solr_connection.insert_document("genericPage", doc)
response_docs += solr_connection.QUEUE_THRESHOLD * [doc]
self.assertEqual(response_docs, response)

@patch('pysolr.Solr')
@patch('pysolr.SolrCoreAdmin')
def test_insert_documents(self, admin_mock, solr_mock):
solr_connection = self.create_instance(admin_mock, solr_mock)
solr_connection.cores["genericPage"] = MockSolr()
doc = {"id": "testid"}

with self.assertRaises(ValueError):
solr_connection.insert_documents("blah", [doc])

response = solr_connection.insert_documents("genericPage", [doc,doc])
self.assertEqual([doc,doc], response)

@patch('pysolr.Solr')
@patch('pysolr.SolrCoreAdmin')
def test_insert_queued(self, admin_mock, solr_mock):
solr_connection = self.create_instance(admin_mock, solr_mock)
solr_connection.cores["genericPage"] = MockSolr()
doc = {"id": "testid"}

solr_connection.insert_document("genericPage", doc)
solr_connection.insert_document("core1", doc)
response = solr_connection.insert_queued()
self.assertEqual({"genericPage":[doc],"core1":[doc]}, response)

@patch('pysolr.Solr')
@patch('pysolr.SolrCoreAdmin')
6 changes: 4 additions & 2 deletions sleuth_backend/tests/test_query.py
@@ -33,7 +33,8 @@ def test_for_fields(self):
str(query)
)
not_dict = "not clean"
self.failUnlessRaises(ValueError, query.for_fields, not_dict)
with self.assertRaises(ValueError):
query.for_fields(not_dict)

def test_boost_importance(self):
"""
@@ -80,7 +81,8 @@
'''
query = Query("hello bruno").fuzz(2)
self.assertEqual('"hello bruno"~2', str(query))
self.failUnlessRaises(ValueError, query.fuzz, 7)
with self.assertRaises(ValueError):
query.fuzz(7)

def test_sanitation(self):
'''
8 changes: 5 additions & 3 deletions sleuth_crawler/scraper/scraper/pipelines.py
@@ -17,10 +17,12 @@ def __init__(self, solr_connection=SOLR):
self.solr_connection = solr_connection

def close_spider(self, spider=None):
"""
'''
Defragment Solr database after spider completes task
"""
print("Scraper: Optimizing cores and closing spider")
'''
print("Closing scraper: Emptying all queued documents")
self.solr_connection.insert_queued()
print("Closing scraper: Optimizing all cores")
self.solr_connection.optimize()

def process_item(self, item, spider=None):
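
For reference, Scrapy only calls a pipeline's close_spider hook if the pipeline is registered in the project settings. A hedged sketch of that registration follows; the module path and class name 'SolrPipeline' are assumptions about this repository's layout, not taken from the diff.

# In the Scrapy project's settings module:
ITEM_PIPELINES = {
    'scraper.pipelines.SolrPipeline': 300,  # hypothetical path/class name
}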
3 changes: 2 additions & 1 deletion sleuth_crawler/scraper/scraper/settings.py
@@ -24,7 +24,8 @@
# Place specific domains before www.ubc.ca
PARENT_URLS = [
'https://courses.students.ubc.ca/cs/main?pname=subjarea&tname=subjareas&req=0',
'http://www.ubc.ca',
'https://www.ubyssey.ca',
'https://www.ubc.ca',
]

# Crawl responsibly by identifying yourself (and your website) on the user-agent
@@ -26,6 +26,7 @@ def parse_subjects(response):
yield scrapy.Request(
next_url,
callback=parse_course,
priority=100,
meta={'data':subject}
)

@@ -51,6 +52,7 @@ def parse_course(response):
yield scrapy.Request(
next_url,
callback=parse_course_details,
priority=100,
meta={'data':course}
)

@@ -4,19 +4,40 @@
from sleuth_crawler.scraper.scraper.items import ScrapyGenericPage

def parse_generic_item(response, links):
"""
'''
Scrape generic page
"""
site_title = ""
title = utils.extract_element(response.xpath("//title/text()"), 0)
'''
title = utils.extract_element(response.xpath("//title/text()"), 0).strip()
titles = re.split(r'\| | - ', title)
if len(titles) >= 2:

# Use OpenGraph title data if available
if len(response.xpath('//meta[@property="og:site_name"]')) > 0 and \
len(response.xpath('//meta[@property="og:title"]')) > 0:
title = utils.extract_element(
response.xpath('//meta[@property="og:title"]/@content'), 0
)
site_title = utils.extract_element(
response.xpath('//meta[@property="og:site_name"]/@content'), 0
)
elif len(titles) >= 2:
title = titles[0].strip()
site_titles = []
for i in range(max(1, len(titles)-2), len(titles)):
site_titles.append(titles[i].strip())
site_title = " - ".join(site_titles)
desc = utils.extract_element(response.xpath("//meta[@name='description']/@content"), 0)
site_title = ' - '.join(site_titles)
else:
site_title = ''

# Use OpenGraph description if available
if len(response.xpath('//meta[@property="og:description"]')) > 0:
desc = utils.extract_element(
response.xpath('//meta[@property="og:description"]/@content'), 0
)
else:
desc = utils.extract_element(
response.xpath('//meta[@name="description"]/@content'), 0
)

raw_content = utils.strip_content(response.body)

return ScrapyGenericPage(
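
To illustrate the OpenGraph fallback added to the generic page parser, here is a small standalone sketch that applies the same XPath checks to an inline HTML snippet with scrapy's Selector. The sample markup is invented for the example, and the snippet mirrors, rather than imports, the parser logic above.

from scrapy.selector import Selector

html = '''
<html><head>
<title>Admissions | The University of British Columbia</title>
<meta property="og:site_name" content="UBC"/>
<meta property="og:title" content="Admissions"/>
<meta property="og:description" content="How to apply to UBC."/>
</head><body></body></html>
'''

sel = Selector(text=html)
site_title = ''

# Prefer OpenGraph metadata when both og:site_name and og:title are present.
if sel.xpath('//meta[@property="og:site_name"]') and \
   sel.xpath('//meta[@property="og:title"]'):
    title = sel.xpath('//meta[@property="og:title"]/@content').extract_first()
    site_title = sel.xpath('//meta[@property="og:site_name"]/@content').extract_first()
else:
    # Fall back to the raw <title> text when no OpenGraph tags exist.
    title = sel.xpath('//title/text()').extract_first().strip()

print(title, '-', site_title)  # Admissions - UBC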
5 changes: 3 additions & 2 deletions sleuth_crawler/scraper/scraper/spiders/parsers/utils.py
@@ -10,11 +10,12 @@ def strip_content(data):
for script in soup(["script", "style"]):
script.decompose()
data = soup.get_text()
# strip extraneous line breaks and sort into list
# strip extraneous line breaks, discard very short lines,
# return the remaining lines as a list
lines = []
for line in data.splitlines():
line = line.strip()
if line:
if line and len(line) > 5:
lines.append(line)
return lines
except Exception:
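
The new length filter in strip_content drops navigation-style fragments such as single words while keeping real sentences. Below is a small standalone sketch of the same filtering, with sample HTML invented for the example; it mirrors the logic shown above rather than importing the crawler's module.

from bs4 import BeautifulSoup

html = """<html><body>
<script>var tracking = true;</script>
<p>Home</p>
<p>Welcome to the Sleuth crawler test page</p>
</body></html>"""

soup = BeautifulSoup(html, 'html.parser')
for script in soup(['script', 'style']):
    script.decompose()  # remove script/style blocks entirely

lines = []
for line in soup.get_text().splitlines():
    line = line.strip()
    if line and len(line) > 5:  # discard very short lines such as 'Home'
        lines.append(line)

print(lines)  # ['Welcome to the Sleuth crawler test page']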
18 changes: 10 additions & 8 deletions sleuth_crawler/tests/test_course_parser.py
@@ -5,15 +5,15 @@
import re

class TestCourseParser(TestCase):
"""
'''
Test GenericCourseParser
parsers.course_parser
"""
'''

def test_parse_subjects(self):
"""
'''
Test subjects parsing
"""
'''
response = mock_response('/test_data/subjects.txt', 'https://courses.students.ubc.ca/cs/main?pname=subjarea&tname=subjareas&req=0')
output = list(parser.parse_subjects(response))
expected_subjects = [
@@ -30,12 +30,13 @@ def test_parse_subjects(self):
]
self.assertEquals(output[0].callback.__name__, parser.parse_course.__name__)
self.assertEquals(output[0].meta['data'],expected_subjects[0])
self.assertEquals(output[0].priority, 100)
self.assertEquals(output[1].meta['data'],expected_subjects[1])

def test_parse_course(self):
"""
'''
Test courses parsing
"""
'''
response = mock_response(
'/test_data/courses.txt',
'https://courses.students.ubc.ca/cs/main?pname=subjarea&tname=subjareas&req=1&dept=ASTR'
@@ -56,12 +57,13 @@
]
self.assertEquals(output[0].callback.__name__, parser.parse_course_details.__name__)
self.assertEquals(output[0].meta['data'],expected_courses[0])
self.assertEquals(output[0].priority, 100)
self.assertEquals(output[1].meta['data'],expected_courses[1])

def test_parse_course_details(self):
"""
'''
Test course details parsing
"""
'''
response = mock_response('/test_data/course_details.txt', 'https://courses.students.ubc.ca/cs/main?pname=subjarea&tname=subjareas&req=3&dept=ASTR&course=200')
response.meta['data'] = ScrapyCourseItem(subject="",url="",name="")
output = parser.parse_course_details(response)
