Documentation for version 1.6
tijme committed Jun 28, 2017
1 parent b5087b0 commit 7ebdebb
Showing 10 changed files with 199 additions and 23 deletions.
3 changes: 2 additions & 1 deletion README.rst
@@ -97,7 +97,8 @@ You can also use the `kitchen sink <https://tijme.github.io/not-your-average-web
print("Crawler started.")
def cb_crawler_after_finish(queue):
print("Crawler finished, found " + str(queue.count_finished) + " requests.")
print("Crawler finished.")
print("Found " + str(queue.count_finished) + " requests.")
def cb_request_before_start(queue, queue_item):
print("Starting: {}".format(queue_item.request.url))
2 changes: 2 additions & 0 deletions docs/source/_templates/sidebar.html
@@ -2,6 +2,7 @@ <h1>Introduction</h1>
<ul>
<li><a class="reference" href="index.html">Home</a></li>
<li><a class="reference" href="installation.html">Installation</a></li>
<li><a class="reference" href="installation.html">Changelog</a></li>
<li><a class="reference" href="getting_started.html">Getting started</a></li>
<li><a class="reference" href="kitchen_sink.html">Kitchen sink</a></li>
</ul>
@@ -11,6 +12,7 @@ <h1>Options</h1>
<li><a class="reference" href="options_crawling_scope.html">Crawling scope</a></li>
<li><a class="reference" href="options_crawling_identity.html">Crawling identity</a></li>
<li><a class="reference" href="options_performance.html">Performance</a></li>
<li><a class="reference" href="options_misc.html">Misc</a></li>
</ul>
<h1>API</h1>
<ul>
3 changes: 2 additions & 1 deletion docs/source/getting_started.rst
@@ -21,7 +21,8 @@ Minimal implementation
print("Crawler started.")
def cb_crawler_after_finish(queue):
print("Crawler finished, found " + str(queue.count_finished) + " requests.")
print("Crawler finished.")
print("Found " + str(queue.count_finished) + " requests.")
def cb_request_before_start(queue, queue_item):
print("Starting: {}".format(queue_item.request.url))
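The hunk above defines the callbacks but does not show how they are attached to the crawler; that happens through the ``Options`` object. A minimal wiring sketch, assuming the ``options.callbacks`` attribute names used in nyawc's example code (they may differ in your version):

.. code:: python
from nyawc.Options import Options
from nyawc.Crawler import Crawler
from nyawc.http.Request import Request
options = Options()
# Attribute names below are assumed from nyawc's example code, not from this diff.
options.callbacks.crawler_before_start = cb_crawler_before_start
options.callbacks.crawler_after_finish = cb_crawler_after_finish
options.callbacks.request_before_start = cb_request_before_start
crawler = Crawler(options)
crawler.start_with(Request("https://finnwea.com/"))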
27 changes: 23 additions & 4 deletions docs/source/kitchen_sink.rst
@@ -12,12 +12,14 @@ The English phrase "Everything but the kitchen sink" means "almost anything one
from nyawc.Crawler import Crawler
from nyawc.CrawlerActions import CrawlerActions
from nyawc.http.Request import Request
from requests.auth import HTTPBasicAuth
def cb_crawler_before_start():
print("Crawler started.")
def cb_crawler_after_finish(queue):
print("Crawler finished. Found " + str(queue.count_finished) + " requests.")
print("Crawler finished.")
print("Found " + str(queue.count_finished) + " requests.")
for queue_item in queue.get_all(QueueItem.STATUS_FINISHED).values():
print("[" + queue_item.request.method + "] " + queue_item.request.url + " (PostData: " + str(queue_item.request.data) + ")")
@@ -58,20 +60,37 @@ The English phrase "Everything but the kitchen sink" means "almost anything one
# Scope options
options.scope.protocol_must_match = False # Only crawl pages with the same protocol as the startpoint (e.g. only https). Default is False.
options.scope.subdomain_must_match = False # Only crawl pages with the same subdomain as the startpoint. If the startpoint is not a subdomain, no subdomains will be crawled. Default is True.
options.scope.subdomain_must_match = True # Only crawl pages with the same subdomain as the startpoint. If the startpoint is not a subdomain, no subdomains will be crawled. Default is True.
options.scope.hostname_must_match = True # Only crawl pages with the same hostname as the startpoint (e.g. only `finnwea`). Default is True.
options.scope.tld_must_match = True # Only crawl pages with the same tld as the startpoint (e.g. only `.com`). Default is True.
options.scope.max_depth = None # The maximum search depth. 0 only crawls the start request. 1 will also crawl all the requests found on the start request. 2 goes one level deeper, and so on. Default is None (unlimited).
# Identity options
options.identity.auth = HTTPBasicAuth('user', 'pass') # Or any other authentication (http://docs.python-requests.org/en/master/user/authentication/). Default is None.
options.identity.cookies.set(name='tasty_cookie', value='yum', domain='finnwea.com', path='/cookies')
options.identity.cookies.set(name='gross_cookie', value='blech', domain='finnwea.com', path='/elsewhere')
options.identity.headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36" # The user agent to make requests with. Default is Chrome.
}
options.identity.proxies = {
# No authentication
# 'http': 'http://host:port',
# 'https': 'http://host:port',
# Basic authentication
# 'http': 'http://user:pass@host:port',
# 'https': 'https://user:pass@host:port',
# SOCKS
# 'http': 'socks5://user:pass@host:port',
# 'https': 'socks5://user:pass@host:port'
}
options.identity.headers.update({
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
})
# Performance options
options.performance.max_threads = 10 # The maximum amount of simultaneous threads to use for crawling. Default is 8.
# Misc options
options.misc.debug = False # If debug is enabled extra information will be logged to the console. Default is False.
crawler = Crawler(options)
crawler.start_with(Request("https://finnwea.com/"))
52 changes: 52 additions & 0 deletions docs/source/migration.rst
@@ -0,0 +1,52 @@
Migration
=========

.. contents:: Table of Contents
:depth: 2
:local:

From 1.5 to 1.6
---------------

**Headers have default values and are case insensitive**

From now on the ``headers`` identity option has default values and is a case-insensitive dict. When changing headers, the ``.update()`` method should be used so the default headers remain.

.. code:: python
# Old
options.identity.headers = {
"User-Agent": "MyCustomUserAgent"
}
# New
options.identity.headers.update({
"User-Agent": "MyCustomUserAgent"
})
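In practice this means both that lookups ignore case and that the built-in defaults survive an ``.update()``. A small sketch, assuming the ``CaseInsensitiveDict`` returned by ``requests.utils.default_headers()``, which the new ``Options`` code uses:

.. code:: python
from nyawc.Options import Options
options = Options()
options.identity.headers.update({"user-agent": "MyCustomUserAgent"})
print(options.identity.headers["User-Agent"])         # "MyCustomUserAgent" -- lookup ignores case
print("Accept-Encoding" in options.identity.headers)  # True -- the other default headers remain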
**New default user agent**

The default user agent for the crawler has changed. In version 1.5 it was a fake Chrome user agent; from 1.6 onwards it is built from the versions you use, e.g. ``nyawc/1.6.0 CPython/3.6.1 Windows/10``.

The Chrome user agent from version 1.5 can still be faked by using the code below.

.. code:: python
options.identity.headers.update({
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
})
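To check which default user agent your own installation would send, something like the following should work (the exact string depends on your nyawc, Python and OS versions):

.. code:: python
from nyawc.Options import Options
print(Options().identity.headers["User-Agent"])  # e.g. "nyawc/1.6.0 CPython/3.6.1 Windows/10"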
From 1.4 to 1.5
---------------

**Renamed the domain must match scope option**

As of version 1.5, the ``domain_must_match`` option is called ``hostname_must_match``.

.. code:: python
# Old
Options().scope.domain_must_match = True/False
# New
Options().scope.hostname_must_match = True/False
71 changes: 65 additions & 6 deletions docs/source/options_crawling_identity.rst
@@ -18,18 +18,49 @@ How to use identity options
options = Options()
options.identity.auth = HTTPBasicAuth('user', 'pass')
options.identity.cookies.set(name='tasty_cookie', value='yum', domain='finnwea.com', path='/cookies')
options.identity.cookies.set(name='gross_cookie', value='blech', domain='finnwea.com', path='/elsewhere')
options.identity.headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
}
options.identity.proxies = {
# No authentication
# 'http': 'http://host:port',
# 'https': 'http://host:port',
# Basic authentication
# 'http': 'http://user:pass@host:port',
# 'https': 'https://user:pass@host:port',
# SOCKS
'http': 'socks5://user:pass@host:port',
'https': 'socks5://user:pass@host:port'
}
options.identity.headers.update({
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
})
crawler = Crawler(options)
crawler.start_with(Request("https://finnwea.com/"))
Available identity options
--------------------------

**Authentication**

Set the authentication for the crawler. Please check `python-requests <http://docs.python-requests.org/en/master/user/authentication/>`__ authentication for all the options. Default is None (no authentication).

You can find examples of different types of authentication below.

.. code:: python
from requests.auth import HTTPBasicAuth
options.identity.auth = HTTPBasicAuth('user', 'pass')
from requests.auth import HTTPDigestAuth
options.identity.auth = HTTPDigestAuth('user', 'pass')
from requests_oauthlib import OAuth1
options.identity.auth = OAuth1('YOUR_APP_KEY', 'YOUR_APP_SECRET', 'USER_OAUTH_TOKEN', 'USER_OAUTH_TOKEN_SECRET')
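Because ``options.identity.auth`` appears to be handed straight to python-requests (as the link above suggests), any requests-compatible auth object should also work, including a custom ``AuthBase`` subclass. A hedged sketch; the bearer-token header below is purely illustrative and not part of nyawc:

.. code:: python
from requests.auth import AuthBase

class TokenAuth(AuthBase):
    """Attach a hypothetical bearer token to every crawled request."""

    def __init__(self, token):
        self.token = token

    def __call__(self, request):
        request.headers["Authorization"] = "Bearer " + self.token
        return request

options.identity.auth = TokenAuth("my-secret-token")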
**Cookies**

Set custom cookies for the crawler. Please check `python-requests <http://docs.python-requests.org/en/master/user/quickstart/#cookies>`__ cookie jar for all the cookie options.
@@ -38,12 +69,40 @@ Set custom cookies for the crawler. Please check `python-requests <http://docs.p
options.identity.cookies.set(name='tasty_cookie', value='yum', domain='finnwea.com', path='/cookies')
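Since ``options.identity.cookies`` is a ``requests.cookies.RequestsCookieJar`` (see ``nyawc/Options.py`` further down in this commit), the usual jar operations should be available as well; a small sketch:

.. code:: python
import requests

jar = requests.cookies.RequestsCookieJar()
jar.set(name='session_id', value='abc123', domain='finnwea.com', path='/')
options.identity.cookies = jar  # swap in a prepared jar all at once
print(options.identity.cookies.get('session_id', domain='finnwea.com', path='/'))  # "abc123"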
**Proxy**

Set a proxy for the crawler. Please check `python-requests <http://docs.python-requests.org/en/master/user/advanced/#proxies>`__ proxies for all the proxy options. Default is None (no proxy).

You can find examples of different types of proxies below.

.. code:: python
# Without authentication
options.identity.proxies = {
'http': 'http://host:port',
'https': 'http://host:port'
}
# With basic authentication
options.identity.proxies = {
'http': 'http://user:pass@host:port',
'https': 'https://user:pass@host:port'
}
# With SOCKS
options.identity.proxies = {
'http': 'socks5://user:pass@host:port',
'https': 'socks5://user:pass@host:port'
}
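One practical note, as an assumption about the underlying python-requests stack rather than about nyawc itself: SOCKS proxy URLs only work when the optional SOCKS dependency is installed (commonly ``pip install requests[socks]``), and the ``socks5h`` scheme resolves DNS through the proxy:

.. code:: python
# With SOCKS, resolving hostnames on the proxy side (socks5h is a python-requests convention)
options.identity.proxies = {
    'http': 'socks5h://user:pass@host:port',
    'https': 'socks5h://user:pass@host:port'
}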
**Headers**

Set custom headers for the crawler (as {key: value} object). For example, you can set a new user agent by using ``User-Agent`` as key, as shown below.
Set custom headers for the crawler (as {key: value} CaseInsensitiveDict). For example, you can set a new user agent by using ``User-Agent`` as key, as shown below.

Please note that you should use the ``.update()`` method so the default headers remain the same.

.. code:: python
options.identity.headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36" # The user agent to make requests with. Default is Chrome.
}
options.identity.headers.update({
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36" # The user agent to make requests with.
})
33 changes: 33 additions & 0 deletions docs/source/options_misc.rst
@@ -0,0 +1,33 @@
Misc
====

.. contents:: Table of Contents
:depth: 2
:local:

How to use misc options
------------------------------

.. code:: python
# misc_example.py
from nyawc.Options import Options
from nyawc.Crawler import Crawler
from nyawc.http.Request import Request
options = Options()
options.misc.debug = False
crawler = Crawler(options)
crawler.start_with(Request("https://finnwea.com/"))
Available misc options
----------------------

**Debug**

If debug is enabled extra information will be logged to the console. Default is False.

``options.misc.debug = True``
4 changes: 2 additions & 2 deletions example.py
@@ -104,8 +104,8 @@ def cb_form_after_autofill(queue_item, elements, form_data):
# Performance options
options.performance.max_threads = 10 # The maximum amount of simultaneous threads to use for crawling. Default is 8.

# Debug option
options.debug = False # If debug is enabled extra information will be logged to the console. Default is False.
# Misc options
options.misc.debug = False # If debug is enabled extra information will be logged to the console. Default is False.

crawler = Crawler(options)
crawler.start_with(Request("https://finnwea.com/"))
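For existing scripts the change above is a one-line rename; a minimal before/after sketch:

.. code:: python
# Old (nyawc 1.5 and earlier)
options.debug = True
# New (nyawc 1.6)
options.misc.debug = True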
2 changes: 1 addition & 1 deletion nyawc/CrawlerThread.py
@@ -85,7 +85,7 @@ def run(self):
except Exception as e:
new_status = QueueItem.STATUS_ERRORED

if self.__options.debug:
if self.__options.misc.debug:
print("Setting status of '{}' to '{}' because of an HTTP error.".format(self.__queue_item.request.url, QueueItem.STATUS_ERRORED))
print(e)

25 changes: 17 additions & 8 deletions nyawc/Options.py
@@ -35,7 +35,7 @@ class Options:
callbacks (:class:`nyawc.Options.OptionsCallbacks`): Can be used to define crawling callbacks.
performance (:class:`nyawc.Options.OptionsPerformance`): Can be used to define performance options.
identity (:class:`nyawc.Options.OptionsIdentity`): Can be used to define the identity/footprint options.
debug (bool): If debug is enabled extra information will be logged to the console. Default is False.
misc (:class:`nyawc.Options.OptionsMisc`): Can be used to define the other options.
"""

@@ -46,7 +46,7 @@ def __init__(self):
self.callbacks = OptionsCallbacks()
self.performance = OptionsPerformance()
self.identity = OptionsIdentity()
self.debug = False
self.misc = OptionsMisc()

class OptionsScope:
"""The OptionsScope class contains the scope options.
@@ -195,11 +195,20 @@ def __init__(self):
self.auth = None
self.cookies = requests.cookies.RequestsCookieJar()
self.headers = requests.utils.default_headers()
self.headers.update({
"User-Agent": user_agent("nyawc", semver.read())
})
self.proxies = {
}
self.headers.update({"User-Agent": user_agent("nyawc", semver.read())})
self.proxies = None

semver.close()

class OptionsMisc:
"""The OptionsMisc class contains all kind of misc options.
Attributes:
debug (bool): If debug is enabled extra information will be logged to the console. Default is False.
"""

def __init__(self):
"""Constructs an OptionsMisc instance."""

self.debug = False
