Documentation for version 1.6
tijme committed Jun 28, 2017
1 parent b5087b0 commit 7ebdebb
Showing 10 changed files with 199 additions and 23 deletions.
3 changes: 2 additions & 1 deletion README.rst
@@ -97,7 +97,8 @@ You can also use the `kitchen sink <https://tijme.github.io/not-your-average-web
print("Crawler started.")
def cb_crawler_after_finish(queue):
print("Crawler finished, found " + str(queue.count_finished) + " requests.")
print("Crawler finished.")
print("Found " + str(queue.count_finished) + " requests.")
def cb_request_before_start(queue, queue_item):
print("Starting: {}".format(queue_item.request.url))
2 changes: 2 additions & 0 deletions docs/source/_templates/sidebar.html
@@ -2,6 +2,7 @@ <h1>Introduction</h1>
<ul>
<li><a class="reference" href="index.html">Home</a></li>
<li><a class="reference" href="installation.html">Installation</a></li>
<li><a class="reference" href="installation.html">Changelog</a></li>
<li><a class="reference" href="getting_started.html">Getting started</a></li>
<li><a class="reference" href="kitchen_sink.html">Kitchen sink</a></li>
</ul>
@@ -11,6 +12,7 @@ <h1>Options</h1>
<li><a class="reference" href="options_crawling_scope.html">Crawling scope</a></li>
<li><a class="reference" href="options_crawling_identity.html">Crawling identity</a></li>
<li><a class="reference" href="options_performance.html">Performance</a></li>
<li><a class="reference" href="options_misc.html">Misc</a></li>
</ul>
<h1>API</h1>
<ul>
3 changes: 2 additions & 1 deletion docs/source/getting_started.rst
@@ -21,7 +21,8 @@ Minimal implementation
print("Crawler started.")
def cb_crawler_after_finish(queue):
print("Crawler finished, found " + str(queue.count_finished) + " requests.")
print("Crawler finished.")
print("Found " + str(queue.count_finished) + " requests.")
def cb_request_before_start(queue, queue_item):
print("Starting: {}".format(queue_item.request.url))
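The hunk above defines the callbacks but does not show how they are attached to the crawler; that happens through the ``Options`` object. A minimal wiring sketch, assuming the ``options.callbacks`` attribute names used in nyawc's example code (they may differ in your version):

.. code:: python
from nyawc.Options import Options
from nyawc.Crawler import Crawler
from nyawc.http.Request import Request
options = Options()
# Attribute names below are assumed from nyawc's example code, not from this diff.
options.callbacks.crawler_before_start = cb_crawler_before_start
options.callbacks.crawler_after_finish = cb_crawler_after_finish
options.callbacks.request_before_start = cb_request_before_start
crawler = Crawler(options)
crawler.start_with(Request("https://finnwea.com/"))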
27 changes: 23 additions & 4 deletions docs/source/kitchen_sink.rst
@@ -12,12 +12,14 @@ The English phrase "Everything but the kitchen sink" means "almost anything one
from nyawc.Crawler import Crawler
from nyawc.CrawlerActions import CrawlerActions
from nyawc.http.Request import Request
from requests.auth import HTTPBasicAuth
def cb_crawler_before_start():
print("Crawler started.")
def cb_crawler_after_finish(queue):
print("Crawler finished. Found " + str(queue.count_finished) + " requests.")
print("Crawler finished.")
print("Found " + str(queue.count_finished) + " requests.")
for queue_item in queue.get_all(QueueItem.STATUS_FINISHED).values():
print("[" + queue_item.request.method + "] " + queue_item.request.url + " (PostData: " + str(queue_item.request.data) + ")")
@@ -58,20 +60,37 @@ The English phrase "Everything but the kitchen sink" means "almost anything one
# Scope options
options.scope.protocol_must_match = False # Only crawl pages with the same protocol as the startpoint (e.g. only https). Default is False.
options.scope.subdomain_must_match = False # Only crawl pages with the same subdomain as the startpoint. If the startpoint is not a subdomain, no subdomains will be crawled. Default is True.
options.scope.subdomain_must_match = True # Only crawl pages with the same subdomain as the startpoint. If the startpoint is not a subdomain, no subdomains will be crawled. Default is True.
options.scope.hostname_must_match = True # Only crawl pages with the same hostname as the startpoint (e.g. only `finnwea`). Default is True.
options.scope.tld_must_match = True # Only crawl pages with the same tld as the startpoint (e.g. only `.com`). Default is True.
options.scope.max_depth = None # The maximum search depth. 0 only crawls the start request. 1 will also crawl all the requests found on the start request. 2 goes one level deeper, and so on. Default is None (unlimited).
# Identity options
options.identity.auth = HTTPBasicAuth('user', 'pass') # Or any other authentication (http://docs.python-requests.org/en/master/user/authentication/). Default is None.
options.identity.cookies.set(name='tasty_cookie', value='yum', domain='finnwea.com', path='/cookies')
options.identity.cookies.set(name='gross_cookie', value='blech', domain='finnwea.com', path='/elsewhere')
options.identity.headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36" # The user agent to make requests with. Default is Chrome.
}
options.identity.proxies = {
# No authentication
# 'http': 'http://host:port',
# 'https': 'http://host:port',
# Basic authentication
# 'http': 'http://user:pass@host:port',
# 'https': 'https://user:pass@host:port',
# SOCKS
# 'http': 'socks5://user:pass@host:port',
# 'https': 'socks5://user:pass@host:port'
}
options.identity.headers.update({
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
})
# Performance options
options.performance.max_threads = 10 # The maximum amount of simultaneous threads to use for crawling. Default is 8.
# Misc options
options.misc.debug = False # If debug is enabled extra information will be logged to the console. Default is False.
crawler = Crawler(options)
crawler.start_with(Request("https://finnwea.com/"))
52 changes: 52 additions & 0 deletions docs/source/migration.rst
@@ -0,0 +1,52 @@
Migration
=========

.. contents:: Table of Contents
:depth: 2
:local:

From 1.5 to 1.6
---------------

**Headers have default values and are case insensitive**

From now on the ``headers`` identity option has default values and is a case-insensitive dict. When changing headers, the ``.update()`` method should be used so the default headers remain.

.. code:: python
# Old
options.identity.headers = {
"User-Agent": "MyCustomUserAgent"
}
# New
options.identity.headers.update({
"User-Agent": "MyCustomUserAgent"
})
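In practice this means both that lookups ignore case and that the built-in defaults survive an ``.update()``. A small sketch, assuming the ``CaseInsensitiveDict`` returned by ``requests.utils.default_headers()``, which the new ``Options`` code uses:

.. code:: python
from nyawc.Options import Options
options = Options()
options.identity.headers.update({"user-agent": "MyCustomUserAgent"})
print(options.identity.headers["User-Agent"])         # "MyCustomUserAgent" -- lookup ignores case
print("Accept-Encoding" in options.identity.headers)  # True -- the other default headers remain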
**New default user agent**

The default user agent for the crawler has changed. In version 1.5 it was a fake Chrome user agent; from 1.6 onwards it is built from the versions you use, e.g. ``nyawc/1.6.0 CPython/3.6.1 Windows/10``.

The Chrome user agent from version 1.5 can still be faked by using the code below.

.. code:: python
options.identity.headers.update({
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
})
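To check which default user agent your own installation would send, something like the following should work (the exact string depends on your nyawc, Python and OS versions):

.. code:: python
from nyawc.Options import Options
print(Options().identity.headers["User-Agent"])  # e.g. "nyawc/1.6.0 CPython/3.6.1 Windows/10"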
From 1.4 to 1.5
---------------

**Renamed the domain must match scope option**

As of version 1.5, the ``domain_must_match`` option is called ``hostname_must_match``.

.. code:: python
# Old
Options().scope.domain_must_match = True/False
# New
Options().scope.hostname_must_match = True/False
71 changes: 65 additions & 6 deletions docs/source/options_crawling_identity.rst
@@ -18,18 +18,49 @@ How to use identity options
options = Options()
options.identity.auth = HTTPBasicAuth('user', 'pass')
options.identity.cookies.set(name='tasty_cookie', value='yum', domain='finnwea.com', path='/cookies')
options.identity.cookies.set(name='gross_cookie', value='blech', domain='finnwea.com', path='/elsewhere')
options.identity.headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
}
options.identity.proxies = {
# No authentication
# 'http': 'http://host:port',
# 'https': 'http://host:port',
# Basic authentication
# 'http': 'http://user:pass@host:port',
# 'https': 'https://user:pass@host:port',
# SOCKS
'http': 'socks5://user:pass@host:port',
'https': 'socks5://user:pass@host:port'
}
options.identity.headers.update({
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
})
crawler = Crawler(options)
crawler.start_with(Request("https://finnwea.com/"))
Available identity options
--------------------------

**Authentication**

Set the authentication for the crawler. Please check `python-requests <http://docs.python-requests.org/en/master/user/authentication/>`__ authentication for all the options. Default is None (no authentication).

You can find examples of different types of authentication below.

.. code:: python
from requests.auth import HTTPBasicAuth
options.identity.auth = HTTPBasicAuth('user', 'pass')
from requests.auth import HTTPDigestAuth
options.identity.auth = HTTPDigestAuth('user', 'pass')
from requests_oauthlib import OAuth1
options.identity.auth = OAuth1('YOUR_APP_KEY', 'YOUR_APP_SECRET', 'USER_OAUTH_TOKEN', 'USER_OAUTH_TOKEN_SECRET')
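Because ``options.identity.auth`` appears to be handed straight to python-requests (as the link above suggests), any requests-compatible auth object should also work, including a custom ``AuthBase`` subclass. A hedged sketch; the bearer-token header below is purely illustrative and not part of nyawc:

.. code:: python
from requests.auth import AuthBase

class TokenAuth(AuthBase):
    """Attach a hypothetical bearer token to every crawled request."""

    def __init__(self, token):
        self.token = token

    def __call__(self, request):
        request.headers["Authorization"] = "Bearer " + self.token
        return request

options.identity.auth = TokenAuth("my-secret-token")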
**Cookies**

Set custom cookies for the crawler. Please check `python-requests <http://docs.python-requests.org/en/master/user/quickstart/#cookies>`__ cookie jar for all the cookie options.
@@ -38,12 +69,40 @@ Set custom cookies for the crawler. Please check `python-requests <http://docs.p
options.identity.cookies.set(name='tasty_cookie', value='yum', domain='finnwea.com', path='/cookies')
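Since ``options.identity.cookies`` is a ``requests.cookies.RequestsCookieJar`` (see ``nyawc/Options.py`` further down in this commit), the usual jar operations should be available as well; a small sketch:

.. code:: python
import requests

jar = requests.cookies.RequestsCookieJar()
jar.set(name='session_id', value='abc123', domain='finnwea.com', path='/')
options.identity.cookies = jar  # swap in a prepared jar all at once
print(options.identity.cookies.get('session_id', domain='finnwea.com', path='/'))  # "abc123"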
**Proxy**

Set a proxy for the crawler. Please check `python-requests <http://docs.python-requests.org/en/master/user/advanced/#proxies>`__ proxies for all the proxy options. Default is None (no proxy).

You can find examples of different types of proxies below.

.. code:: python
# Without authentication
options.identity.proxies = {
'http': 'http://host:port',
'https': 'http://host:port'
}
# With basic authentication
options.identity.proxies = {
'http': 'http://user:pass@host:port',
'https': 'https://user:pass@host:port'
}
# With SOCKS
options.identity.proxies = {
'http': 'socks5://user:pass@host:port',
'https': 'socks5://user:pass@host:port'
}
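One practical note, as an assumption about the underlying python-requests stack rather than about nyawc itself: SOCKS proxy URLs only work when the optional SOCKS dependency is installed (commonly ``pip install requests[socks]``), and the ``socks5h`` scheme resolves DNS through the proxy:

.. code:: python
# With SOCKS, resolving hostnames on the proxy side (socks5h is a python-requests convention)
options.identity.proxies = {
    'http': 'socks5h://user:pass@host:port',
    'https': 'socks5h://user:pass@host:port'
}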
**Headers**

Set custom headers for the crawler (as {key: value} object). For example, you can set a new user agent by using ``User-Agent`` as key, as shown below.
Set custom headers for the crawler (as {key: value} CaseInsensitiveDict). For example, you can set a new user agent by using ``User-Agent`` as key, as shown below.

Please note that you should use the ``.update()`` method so the default headers remain the same.

.. code:: python
options.identity.headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36" # The user agent to make requests with. Default is Chrome.
}
options.identity.headers.update({
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36" # The user agent to make requests with.
})
33 changes: 33 additions & 0 deletions docs/source/options_misc.rst
@@ -0,0 +1,33 @@
Misc
====

.. contents:: Table of Contents
:depth: 2
:local:

How to use misc options
------------------------------

.. code:: python
# misc_example.py
from nyawc.Options import Options
from nyawc.Crawler import Crawler
from nyawc.http.Request import Request
options = Options()
options.misc.debug = False
crawler = Crawler(options)
crawler.start_with(Request("https://finnwea.com/"))
Available misc options
----------------------

**Debug**

If debug is enabled extra information will be logged to the console. Default is False.

``options.misc.debug = True``
4 changes: 2 additions & 2 deletions example.py
@@ -104,8 +104,8 @@ def cb_form_after_autofill(queue_item, elements, form_data):
# Performance options
options.performance.max_threads = 10 # The maximum amount of simultaneous threads to use for crawling. Default is 8.

# Debug option
options.debug = False # If debug is enabled extra information will be logged to the console. Default is False.
# Misc options
options.misc.debug = False # If debug is enabled extra information will be logged to the console. Default is False.

crawler = Crawler(options)
crawler.start_with(Request("https://finnwea.com/"))
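For existing scripts the change above is a one-line rename; a minimal before/after sketch:

.. code:: python
# Old (nyawc 1.5 and earlier)
options.debug = True
# New (nyawc 1.6)
options.misc.debug = True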
2 changes: 1 addition & 1 deletion nyawc/CrawlerThread.py
@@ -85,7 +85,7 @@ def run(self):
except Exception as e:
new_status = QueueItem.STATUS_ERRORED

if self.__options.debug:
if self.__options.misc.debug:
print("Setting status of '{}' to '{}' because of an HTTP error.".format(self.__queue_item.request.url, QueueItem.STATUS_ERRORED))
print(e)

25 changes: 17 additions & 8 deletions nyawc/Options.py
@@ -35,7 +35,7 @@ class Options:
callbacks (:class:`nyawc.Options.OptionsCallbacks`): Can be used to define crawling callbacks.
performance (:class:`nyawc.Options.OptionsPerformance`): Can be used to define performance options.
identity (:class:`nyawc.Options.OptionsIdentity`): Can be used to define the identity/footprint options.
debug (bool): If debug is enabled extra information will be logged to the console. Default is False.
misc (:class:`nyawc.Options.OptionsMisc`): Can be used to define the other options.
"""

@@ -46,7 +46,7 @@ def __init__(self):
self.callbacks = OptionsCallbacks()
self.performance = OptionsPerformance()
self.identity = OptionsIdentity()
self.debug = False
self.misc = OptionsMisc()

class OptionsScope:
"""The OptionsScope class contains the scope options.
@@ -195,11 +195,20 @@ def __init__(self):
self.auth = None
self.cookies = requests.cookies.RequestsCookieJar()
self.headers = requests.utils.default_headers()
self.headers.update({
"User-Agent": user_agent("nyawc", semver.read())
})
self.proxies = {
}
self.headers.update({"User-Agent": user_agent("nyawc", semver.read())})
self.proxies = None

semver.close()

class OptionsMisc:
"""The OptionsMisc class contains all kind of misc options.
Attributes:
debug (bool): If debug is enabled extra information will be logged to the console. Default is False.
"""

def __init__(self):
"""Constructs an OptionsMisc instance."""

self.debug = False
