Refactoring, improved http structure and implementation
vladcalin committed Nov 26, 2017
1 parent 45e9541 commit 7d31aed
Showing 13 changed files with 194 additions and 60 deletions.
2 changes: 1 addition & 1 deletion crawlster/core.py
@@ -11,7 +11,7 @@
from crawlster.helpers import UrlsHelper, RegexHelper
from crawlster.exceptions import get_full_error_msg
from crawlster.helpers.queue import QueueHelper
from crawlster.helpers.request import RequestsHelper
from crawlster.helpers.http.requests import RequestsHelper
from crawlster.helpers.stats import StatsHelper


2 changes: 1 addition & 1 deletion crawlster/helpers/__init__.py
@@ -1,7 +1,7 @@
from .regex import RegexHelper
from .urls import UrlsHelper
from .extract import ExtractHelper
from .request import RequestsHelper
from .http.requests import RequestsHelper
from .stats import StatsHelper
from .log import LoggingHelper
from .base import BaseHelper
68 changes: 51 additions & 17 deletions crawlster/helpers/extract.py
@@ -8,36 +8,70 @@
from crawlster.helpers.base import BaseHelper


class ExtractHelper(BaseHelper):
name = 'extract'
class Content(object):
"""Content wrapper that provides common data extraction methods"""

def __init__(self):
super(ExtractHelper, self).__init__()
def __init__(self, raw_data):
"""Wraps some text or bytes to be processed"""
if isinstance(raw_data, bytes):
raw_data = raw_data.decode()
self._data = raw_data
self._parsed_data = None

def css(self, text, selector, attr=None, content=None):
@property
def parsed_data(self):
"""Access the underlying bs4.BeautifulSoup4 instance
This property is provided for more advanced usage.
"""
if not self._parsed_data:
self._parsed_data = BeautifulSoup(self._data, 'html.parser')
return self._parsed_data

def css(self, pattern, get_attr=None, get_text=False):
"""Extracts data using css selector
Returns a list of elements (as strings) with the extracted data
Args:
text (str):
data to search in
selector:
pattern (str):
the CSS selector
attr (str or None):
get_attr (str or None):
if present, returns a list of the attributes of the extracted
items
content (bool):
get_text (bool):
Whether to return only the text content of the element
Returns:
If get_attr and get_text are not specified, returns a list of
strings with the matches.
If get_attr is specified, returns a list
with the values of the specified attribute, if present. Elements
that match the query pattern and do not have that attribute are
ignored.
If get_text is specified, returns a list with the text of the
matched elements.
"""
items = BeautifulSoup(text, 'html.parser').select(selector)
if attr:
return [i[attr] for i in items if attr in i.attrs]
elif content:
items = self.parsed_data.select(pattern)
if get_attr:
return [item[get_attr] for item in items if get_attr in item.attrs]
elif get_text:
return [i.text for i in items]
else:
return [str(i) for i in items]

def parse_bs4(self, text):
"""Parses data using BeautifulSoup4"""
return BeautifulSoup(text, 'html.parser')

class ExtractHelper(BaseHelper):
name = 'extract'

def __init__(self):
super(ExtractHelper, self).__init__()

def css(self, text, selector, attr=None, content=None):
"""Extracts data using css selector.
See :py:meth:``Content.css`` for more info.
"""
return Content(text).css(selector, get_attr=attr, get_text=content)
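
A quick usage sketch of the new Content API (the HTML input is made up for illustration; in a crawler, Content is reached through the extract helper):

from crawlster.helpers.extract import Content, ExtractHelper

# Sketch: exercising Content directly with illustrative HTML.
html = '<div><a href="/a">first</a> <a class="x" href="/b">second</a></div>'
page = Content(html)

page.css('a', get_attr='href')   # ['/a', '/b']
page.css('a.x', get_text=True)   # ['second']
page.css('a.x')                  # ['<a class="x" href="/b">second</a>']

# The helper facade delegates to Content, keeping the old call style:
ExtractHelper().css(html, 'a', attr='href')   # ['/a', '/b']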
Empty file.
@@ -5,13 +5,14 @@
class HttpRequest(object):
"""Class representing a http request"""

def __init__(self, url, method, query_params=None, data=None, headers=None):
def __init__(self, url, method='GET', data=None, query_params=None,
headers=None):
"""Initializes a generic HTTP request
Args:
url (str):
The url of the request. Supported schemes: http and https
method (str):
The HTTP verb
query_params (dict):
Mapping of query parameters
@@ -22,19 +23,19 @@ def __init__(self, url, method, query_params=None, data=None, headers=None):
"""
self.url = self.validate_url(url)
self.method = self.validate_method(method)
self.query_params = self.validate_query_params(query_params)
self.query_params = self.validate_query_params(query_params or {})
self.data = self.validate_data(data)

self.headers = self.get_default_headers()
self.headers.update(self.validate_headers(headers))
self.headers.update(self.validate_headers(headers) or {})

def validate_url(self, url):
"""Validates that the url is provided and has the proper scheme"""
if not url:
raise ValueError('url is required')
parsed = urllib.parse.urlparse(url)
if parsed.scheme not in ('http', 'https'):
raise ValueError('Invalid schema: {}'.format(parsed.schema))
raise ValueError('Invalid schema: {}'.format(parsed.scheme))
return url

def validate_method(self, method):
@@ -62,11 +63,15 @@ def validate_query_params(self, query_params):
def get_default_headers(self):
return {}

@property
def content_type(self):
return self.headers.get('Content-Type', 'application/octet-stream')


class GetRequest(HttpRequest):
"""A HTTP GET request"""

def __init__(self, url, query_params, headers):
def __init__(self, url, query_params=None, headers=None):
super(GetRequest, self).__init__(url=url, method='GET',
query_params=query_params,
headers=headers, data=None)
@@ -75,13 +80,13 @@ def __init__(self, url, query_params, headers):
class PostRequest(HttpRequest):
"""A HTTP POST request"""

def __init__(self, url, data, query_params, headers):
def __init__(self, url, data=None, query_params=None, headers=None):
super(PostRequest, self).__init__(url=url, data=data,
query_params=query_params,
headers=headers, method='POST')


class XhrRequest(PostRequest):
class XhrRequest(HttpRequest):
"""A XHR Post request"""

def get_default_headers(self):
@@ -90,8 +95,12 @@ def get_default_headers(self):

class JsonRequest(HttpRequest):
"""A generic JSON request.
The data must be an object that can be safely encoded as JSON.
Examples:
JsonRequest('http://example.com', 'POST', data={'hello': 'world'})
"""

def get_default_headers(self):
@@ -106,7 +115,7 @@ def validate_data(self, data):
"""Validates the data by converting it to json"""
if data:
try:
return json.dumps(data)
return json.dumps(data, separators=(',', ':'))
except ValueError:
raise ValueError(
'Unable to encode as JSON the request data: {}'.format(
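
A hedged sketch of the reworked request classes; the expected values below come from the code above and the tests in this commit (URLs are placeholders):

from crawlster.helpers.http.request import (
    HttpRequest, GetRequest, XhrRequest, JsonRequest)

# method now defaults to 'GET' on the base class.
req = GetRequest('http://example.com', query_params={'q': 'python'})
req.method          # 'GET'
req.content_type    # 'application/octet-stream' (fallback when no header is set)

# XhrRequest now derives from HttpRequest, so it defaults to GET too.
xhr = XhrRequest('http://example.com')
xhr.headers['X-Requested-With']   # 'XMLHttpRequest'

# JSON bodies are encoded with compact separators.
json_req = JsonRequest('http://example.com', 'POST', data={'hello': 'world'})
json_req.content_type   # 'application/json'
json_req.data           # '{"hello":"world"}'

HttpRequest('ftp://example.com')   # raises ValueError (scheme must be http/https)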
@@ -3,12 +3,13 @@
import requests.exceptions

from crawlster.helpers.base import BaseHelper
from crawlster.helpers.request.request import (
from crawlster.helpers.http.request import (
HttpRequest, GetRequest, PostRequest)
from crawlster.helpers.request.response import HttpResponse
from crawlster.helpers.http.response import HttpResponse


class RequestsHelper(BaseHelper):
"""Helper for making HTTP requests using the requests library"""
name = 'http'

STAT_DOWNLOAD = 'http.download'
@@ -29,11 +30,11 @@ def open(self, http_request: HttpRequest):
Args:
http_request (HttpRequest):
The crawlster.helpers.request.request.HttpRequest instance
The crawlster.helpers.http.request.HttpRequest instance
with the required info for making the request
Returns:
crawlster.helpers.request.response.HttpResponse
crawlster.helpers.http.response.HttpResponse
"""
self.crawler.stats.incr(self.STAT_REQUESTS)

@@ -50,7 +51,7 @@ def open(self, http_request: HttpRequest):
by=self._compute_resp_size(http_resp))
self.crawler.stats.incr(self.STAT_UPLOAD,
by=self._compute_req_size(http_request))
return
return http_resp
except requests.exceptions.RequestException as e:
self.crawler.stats.add(self.STAT_HTTP_ERRORS, e)
self.crawler.log.error(str(e))
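
Inside a crawler the helper is exposed under its name, 'http'. A hedged sketch of a step method using it (the step name and logging call are illustrative):

# Sketch: fetching a page through the helper from a crawler step.
def step_fetch(self, url):
    response = self.http.open(GetRequest(url))
    if response is None:   # open() logs RequestExceptions instead of raising
        return
    if response.is_success():
        self.log.info(response.body_str[:80])

The examples/python_org.py diff below uses the shorter self.http.get(url) form for the same round trip.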
51 changes: 51 additions & 0 deletions crawlster/helpers/http/response.py
@@ -0,0 +1,51 @@
class HttpResponse(object):
"""Class representing a http response"""

def __init__(self, request, status_code, headers, body):
"""Initializes the http response object
Args:
request (HttpRequest):
The request that produces this response
status_code (int):
The status code as a number
headers (dict):
The response headers
body (bytes or str):
The body of the response; str bodies are encoded to bytes
"""
self.request = request
self.status_code = status_code
self.headers = headers
if isinstance(body, str):
body = body.encode()
if not isinstance(body, bytes):
raise TypeError(
'body must be in bytes, not {}'.format(type(body).__name__))
self.body = body

@property
def body_str(self):
"""Returns the decoded content of the request, if possible.
May raise UnicodeDecodeError if the body is not a valid UTF-8
sequence.
"""
return self.body.decode()

@property
def body_bytes(self):
return self.body

@property
def server(self):
"""Returns the server header if available"""
return self.headers.get('Server')

@property
def content_type(self):
"""Returns the response content type if available"""
return self.headers.get('Content-Type')

def is_success(self):
return self.status_code < 400
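
A small sketch of the new response object with made-up values (request would normally be the HttpRequest that produced the response):

from crawlster.helpers.http.response import HttpResponse

resp = HttpResponse(request=None, status_code=200,
                    headers={'Content-Type': 'text/html', 'Server': 'nginx'},
                    body=b'<html></html>')
resp.is_success()    # True, since status_code < 400
resp.body_str        # '<html></html>'
resp.content_type    # 'text/html'
resp.server          # 'nginx'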
20 changes: 0 additions & 20 deletions crawlster/helpers/request/response.py

This file was deleted.

6 changes: 3 additions & 3 deletions examples/python_org.py
@@ -32,7 +32,7 @@ def step_start(self, url):
if not data:
return
self.urls.mark_seen(url)
hrefs = self.extract.css(data.content, 'a', attr='href')
hrefs = self.extract.css(data.body, 'a', attr='href')
self.log.warning(hrefs)
full_links = self.urls.multi_join(url, hrefs)
self.log.warning(full_links)
@@ -46,9 +46,9 @@ def process_page(self, url):
return
resp = self.http.get(url)
self.urls.mark_seen(url)
if not self.looks_like_module_page(resp.content):
if not self.looks_like_module_page(resp.body):
return
module_name = self.extract.css(resp.content,
module_name = self.extract.css(resp.body,
'h1 a.reference.internal code span',
content=True)
for res in module_name:
1 change: 1 addition & 0 deletions requirements/dev.txt
@@ -1,4 +1,5 @@
-r test.txt
-e .
bumpversion
twine
sphinx
31 changes: 30 additions & 1 deletion tests/helpers/requests/test_http_request.py
@@ -1,8 +1,37 @@
import pytest

from crawlster.helpers.request.request import HttpRequest
from crawlster.helpers.http.request import (
HttpRequest, GetRequest, PostRequest, XhrRequest, JsonRequest)


def test_request_invalid_method():
with pytest.raises(ValueError):
HttpRequest(method='invalid', url="http://example.com")


def test_request_invalid_url():
with pytest.raises(ValueError):
HttpRequest('invalid_url')


# noinspection PyTypeChecker
def test_request_invalid_headers():
with pytest.raises(TypeError):
HttpRequest('http://localhost', headers='invalid type')


@pytest.mark.parametrize('obj_type, init_args, expected_attrs', [
(GetRequest, ('http://localhost',), {'method': 'GET'}),
(PostRequest, ('http://localhost',), {'method': 'POST'}),
(XhrRequest, ('http://localhost',),
{'method': 'GET', 'headers': {'X-Requested-With': 'XMLHttpRequest'}}),
(JsonRequest, ('http://localhost', 'POST', {'hello': 'world'}),
{
'content_type': 'application/json',
'data': '{"hello":"world"}'
})
])
def test_request_initialisation(obj_type, init_args, expected_attrs):
instance = obj_type(*init_args)
for k, v in expected_attrs.items():
assert getattr(instance, k) == v
