Skip to content

Commit

Permalink
Add mongodb item handler
Browse files Browse the repository at this point in the history
  • Loading branch information
vladcalin committed Nov 26, 2017
1 parent 4a9b098 commit 6bc85c2
Show file tree
Hide file tree
Showing 5 changed files with 82 additions and 6 deletions.
11 changes: 10 additions & 1 deletion crawlster/config.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os

from .validators import ValidationError, validate_isinstance, one_of
from .validators import ValidationError, validate_isinstance, one_of, is_url
from .exceptions import ConfigurationError, OptionNotDefinedError, \
MissingValueError

Expand Down Expand Up @@ -102,6 +102,15 @@ def __init__(self, choices, default=None, required=False,
super(ChoiceOption, self).__init__(validators, default, required)


class UrlOption(ConfigOption):
    """Config option whose value is checked with the ``is_url`` validator.

    ``is_url`` always runs first; any ``extra_validators`` supplied by the
    caller are appended after it.
    """

    def __init__(self, default=None, required=False,
                 extra_validators=None):
        # Treat a missing (or empty) extra validator list as "no extras".
        additional = extra_validators if extra_validators else []
        super(UrlOption, self).__init__(
            [is_url] + additional, default, required)


#: The core options used by the framework core
CORE_OPTIONS = {
'core.start_step': StringOption(required=True),
Expand Down
51 changes: 51 additions & 0 deletions crawlster/handlers/db/mongo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
try:
import pymongo
except ImportError:
msg = """MongoDB item handler requires PyMongo. Run
pip install pymongo
and then try again.
"""
raise ImportError(msg)

from crawlster.handlers.base import BaseItemHandler
from crawlster.config import UrlOption, StringOption


class MongodbItemHandler(BaseItemHandler):
    """Mongodb item handler

    Writes the submitted items directly to a MongoDB database.

    Configuration options:

    - ``mongodb.url`` - required, mongodb connection url.
    - ``mongodb.database`` - the database to write to. Defaults to
      ``crawlster``
    - ``mongodb.collection`` - the collection to write to. Defaults to
      ``crawlster_items``
    """
    config_options = {
        'mongodb.url': UrlOption(required=True),
        'mongodb.database': StringOption(default='crawlster'),
        'mongodb.collection': StringOption(default='crawlster_items')
    }

    def __init__(self):
        super(MongodbItemHandler, self).__init__()
        # Both stay None until initialize() has run.
        self._conn = None
        self._coll = None

    def initialize(self):
        """Create the MongoDB client and select the target collection.

        The connection url, database name and collection name are read
        from ``self.config``.
        """
        self._conn = pymongo.MongoClient(self.config.get('mongodb.url'))
        db = self._conn[self.config.get('mongodb.database')]
        self._coll = db[self.config.get('mongodb.collection')]

    def handle(self, item):
        """Persist one item to the configured collection.

        ``insert_one`` adds a generated ``_id`` key to the document it is
        given when none is present. A shallow copy is inserted so that the
        caller's item (which may be passed on to other handlers) is left
        unmodified.

        NOTE(review): assumes ``item`` is a mapping, as ``insert_one``
        requires - confirm against the callers.
        """
        self._coll.insert_one(dict(item))

    def finalize(self):
        """Close the connection, if one was opened.

        Does nothing when ``initialize`` never assigned a client, so that
        finalizing an uninitialized handler does not raise
        ``AttributeError``.
        """
        if self._conn is not None:
            self._conn.close()
10 changes: 10 additions & 0 deletions crawlster/validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
They are used mainly in the config options definitions.
"""
import urllib.parse


class ValidationError(Exception):
Expand Down Expand Up @@ -32,3 +33,12 @@ def actual_validator(value):
))

return actual_validator


def is_url(value):
    """Validate that the value represents a valid URL.

    A value is accepted when ``urllib.parse.urlparse`` yields both a
    non-empty scheme and a non-empty network location.

    Returns ``True`` for a valid URL.

    Raises ``ValidationError`` for anything else, including values that
    ``urlparse`` itself rejects.
    """
    try:
        result = urllib.parse.urlparse(value)
    except (ValueError, TypeError, AttributeError):
        # urlparse raises ValueError for malformed netlocs (for example
        # unbalanced IPv6 brackets) and TypeError/AttributeError for
        # non-string input; report all of them as a validation failure.
        raise ValidationError('Invalid URL: {}'.format(value))
    if result.scheme and result.netloc:
        return True
    raise ValidationError('Invalid URL: {}'.format(value))
8 changes: 6 additions & 2 deletions examples/python_org.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from crawlster.config import Configuration
from crawlster.handlers.jsonl import JsonLinesHandler
from crawlster.handlers.log_handler import LogItemHandler
from crawlster.handlers.db.mongo import MongodbItemHandler


class PythonOrgCrawler(Crawlster):
Expand All @@ -22,10 +23,13 @@ class PythonOrgCrawler(Crawlster):
'core.start_urls': [
'https://docs.python.org/3/library/index.html'],
'log.level': 'debug',
'pool.workers': 3
'pool.workers': 3,

'mongodb.url': 'mongodb://localhost:27017'
})

item_handler = [LogItemHandler(), JsonLinesHandler('pymodules.jsonl')]
item_handler = [LogItemHandler(),
MongodbItemHandler()]

def step_start(self, url):
data = self.http.get(url)
Expand Down
8 changes: 5 additions & 3 deletions tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from crawlster.config import Configuration, ConfigurationError
from crawlster.exceptions import MissingValueError
from crawlster.validators import validate_isinstance, \
ValidationError, one_of
ValidationError, one_of, is_url


@pytest.mark.parametrize('config_opts, exc_type', [
Expand Down Expand Up @@ -44,8 +44,10 @@ def test_config_good_init(config_opts):
(one_of(1, 2, 3), 3, False),
(one_of(1, 2, 3), 4, True),
(one_of('item1', 'item2', 'item3'), 'item1', False),
(one_of('item1', 'item2', 'item3'), 'item5', True)
(one_of('item1', 'item2', 'item3'), 'item5', True),
(is_url, 'http://localhost:2222/test', False),
(is_url, 'http://localhost:2222', False),
(is_url, 'this_is_invalid', True),
])
def test_validators(validator, value, fails):
"""Validators raise exception only on invalid values"""
Expand Down

0 comments on commit 6bc85c2

Please sign in to comment.