From f76a5f48a28992416a98e5a15783e7c6923aa254 Mon Sep 17 00:00:00 2001 From: "Vipul Gupta (@vipulgupta2048)" Date: Sun, 4 Aug 2019 11:12:05 +0530 Subject: [PATCH 01/15] Initial Cerberus docs --- docs/source/actions.rst | 4 +- docs/source/getting-started.rst | 5 ++- docs/source/item-validation.rst | 76 +++++++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 4 deletions(-) diff --git a/docs/source/actions.rst b/docs/source/actions.rst index 65198084..730b5c19 100644 --- a/docs/source/actions.rst +++ b/docs/source/actions.rst @@ -475,7 +475,7 @@ SPIDERMON_REPORT_S3_REGION_ENDPOINT .. _actions-sentry-action: Sentry action -============ +============= This action allows you to send custom messages to `Sentry`_ when your monitor suites finish their execution. To use this action @@ -533,7 +533,7 @@ It could be set to any level provided by `Sentry Log Level`_ .. _SPIDERMON_SENTRY_FAKE: SPIDERMON_SENTRY_FAKE --------------------- +--------------------- Default: ``False`` diff --git a/docs/source/getting-started.rst b/docs/source/getting-started.rst index f1d3391f..de3d1252 100644 --- a/docs/source/getting-started.rst +++ b/docs/source/getting-started.rst @@ -229,8 +229,8 @@ Item validation Item validators allows you to match your returned items with predetermined structure ensuring that all fields contains data in the expected format. Spidermon allows -you to choose between schematics_ or `JSON Schema`_ to define the structure -of your item. +you to choose from schematics_, `JSON Schema`_ or `cerberus`_ to define the structure and +validation rules for your item. In this tutorial, we will use a schematics_ model to make sure that all required fields are populated and they are all of the correct format. @@ -385,6 +385,7 @@ The resulted item will look like this: .. _`JSON Schema`: https://json-schema.org/ .. _`schematics`: https://schematics.readthedocs.io/en/latest/ +.. _`cerberus`: https://docs.python-cerberus.org/en/latest/index.html .. 
_`Scrapy`: https://scrapy.org/ .. _`Scrapy items`: https://docs.scrapy.org/en/latest/topics/items.html .. _`Scrapy Tutorial`: https://doc.scrapy.org/en/latest/intro/tutorial.html diff --git a/docs/source/item-validation.rst b/docs/source/item-validation.rst index 38953056..17bee914 100644 --- a/docs/source/item-validation.rst +++ b/docs/source/item-validation.rst @@ -87,6 +87,48 @@ an example of a schema for the quotes item from the :doc:`tutorial `. +# TEST This will be tested in system testing and delete comment + + +.. code-block:: json + + { + "quote": { + "type": "string", + "required": "True" + }, + "author": { + "type": "string", + "required": "True" + }, + "author_url": { + "type": "string" + "regex": "" + }, + "tags": { + "type": "list" + } + } + + +To use Cerberus validation, you would need to add +:ref:`SPIDERMON_VALIDATION_CERBERUS` setting to your `settings.py` + Settings -------- @@ -193,6 +235,37 @@ as a `dict`: OtherItem: '/path/to/otheritem_schema.json', } +.. _SPIDERMON_VALIDATION_CERBERUS: + +SPIDERMON_VALIDATION_CERBERUS +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Default: ``None`` + +A `list` containing the local path of the item schema. + +.. code-block:: python + + # settings.py + + SPIDERMON_VALIDATION_CERBERUS: [ + '/path/to/schema.json', + # ADD URL FUNCTIONALITY, delete comment + ] + +If you are working on a spider that produces multiple items types, you can define it +as a `dict`: + +.. code-block:: python + + # settings.py + + SPIDERMON_VALIDATION_SCHEMAS: { + # Check this FUNCTIONALITY and delete comment + DummyItem: '/path/to/dummyitem_schema.json', + OtherItem: '/path/to/otheritem_schema.json', + } + Validation in Monitors ---------------------- @@ -238,3 +311,6 @@ Some examples: .. _`guide`: http://json-schema.org/learn/getting-started-step-by-step.html .. _`schematics models`: https://schematics.readthedocs.io/en/latest/usage/models.html .. _`jsonschema`: https://pypi.org/project/jsonschema/ +.. 
_`cerberus`: https://pypi.org/project/Cerberus/ +.. _`usage`: http://docs.python-cerberus.org/en/latest/usage.html +.. _`validation-rules`: http://docs.python-cerberus.org/en/latest/validation-rules.html From bbfb8eb31952b516bcff5764e4c9a5c491c96e76 Mon Sep 17 00:00:00 2001 From: "Vipul Gupta (@vipulgupta2048)" Date: Sun, 4 Aug 2019 14:40:51 +0530 Subject: [PATCH 02/15] Fix JSON schema --- docs/source/item-validation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/item-validation.rst b/docs/source/item-validation.rst index 17bee914..0d5bab14 100644 --- a/docs/source/item-validation.rst +++ b/docs/source/item-validation.rst @@ -117,7 +117,7 @@ schema. Here we have an example of a schema for the quotes item from the "required": "True" }, "author_url": { - "type": "string" + "type": "string", "regex": "" }, "tags": { From 4e22ed721aa98994d4afbdf973a58cf60792f195 Mon Sep 17 00:00:00 2001 From: "Vipul Gupta (@vipulgupta2048)" Date: Sun, 4 Aug 2019 15:08:40 +0530 Subject: [PATCH 03/15] Add functionality, deleted comments Signed-off-by: Vipul Gupta (@vipulgupta2048) --- docs/source/item-validation.rst | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/docs/source/item-validation.rst b/docs/source/item-validation.rst index 0d5bab14..fbb4bb87 100644 --- a/docs/source/item-validation.rst +++ b/docs/source/item-validation.rst @@ -108,23 +108,11 @@ schema. Here we have an example of a schema for the quotes item from the .. 
code-block:: json { - "quote": { - "type": "string", - "required": "True" - }, - "author": { - "type": "string", - "required": "True" - }, - "author_url": { - "type": "string", - "regex": "" - }, - "tags": { - "type": "list" - } - } - + "quote": {"type": "string", "required": True}, + "author": {"type": "string", "required": True}, + "author_url": {"type": "string", "regex": ""}, + "tags": {"type": "list"}, +} To use Cerberus validation, you would need to add :ref:`SPIDERMON_VALIDATION_CERBERUS` setting to your `settings.py` From 55b28520862305a99a020d9c3e1534369036b9dc Mon Sep 17 00:00:00 2001 From: "Vipul Gupta (@vipulgupta2048)" Date: Sun, 4 Aug 2019 16:19:49 +0530 Subject: [PATCH 04/15] Final Changes Signed-off-by: Vipul Gupta (@vipulgupta2048) --- docs/source/item-validation.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/item-validation.rst b/docs/source/item-validation.rst index fbb4bb87..abde5915 100644 --- a/docs/source/item-validation.rst +++ b/docs/source/item-validation.rst @@ -107,11 +107,11 @@ schema. Here we have an example of a schema for the quotes item from the .. 
code-block:: json - { - "quote": {"type": "string", "required": True}, - "author": {"type": "string", "required": True}, - "author_url": {"type": "string", "regex": ""}, - "tags": {"type": "list"}, +{ + "quote": {"type": "string", "required": true}, + "author": {"type": "string", "required": true}, + "author_url": {"type": "string"}, + "tags": {"type": "list"} } To use Cerberus validation, you would need to add From ad9f2895ba862d5f8f147625b7faa30851f1ae82 Mon Sep 17 00:00:00 2001 From: "Vipul Gupta (@vipulgupta2048)" Date: Sun, 4 Aug 2019 16:28:05 +0530 Subject: [PATCH 05/15] Last final changes Signed-off-by: Vipul Gupta (@vipulgupta2048) --- docs/source/item-validation.rst | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/source/item-validation.rst b/docs/source/item-validation.rst index abde5915..4adf8237 100644 --- a/docs/source/item-validation.rst +++ b/docs/source/item-validation.rst @@ -102,8 +102,6 @@ each field, as well as dependencies and regex. This `usage`_ and `validation-rules`_ guide explains main keywords and how to make a schema. Here we have an example of a schema for the quotes item from the :doc:`tutorial `. -# TEST This will be tested in system testing and delete comment - .. code-block:: json @@ -238,7 +236,7 @@ A `list` containing the local path of the item schema. 
SPIDERMON_VALIDATION_CERBERUS: [ '/path/to/schema.json', - # ADD URL FUNCTIONALITY, delete comment + 'https://api.myjson.com/bins/gmdgl' ] If you are working on a spider that produces multiple items types, you can define it From b43986a674ec9577c6d79a3daaf65cae113582f7 Mon Sep 17 00:00:00 2001 From: "Vipul Gupta (@vipulgupta2048)" Date: Sun, 4 Aug 2019 16:39:59 +0530 Subject: [PATCH 06/15] Fix Cerberus Schema Signed-off-by: Vipul Gupta (@vipulgupta2048) --- docs/source/item-validation.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/item-validation.rst b/docs/source/item-validation.rst index 4adf8237..81c02171 100644 --- a/docs/source/item-validation.rst +++ b/docs/source/item-validation.rst @@ -247,7 +247,6 @@ as a `dict`: # settings.py SPIDERMON_VALIDATION_SCHEMAS: { - # Check this FUNCTIONALITY and delete comment DummyItem: '/path/to/dummyitem_schema.json', OtherItem: '/path/to/otheritem_schema.json', } From fc7f1f5fe4fdfc6565b75056ade4981055257be2 Mon Sep 17 00:00:00 2001 From: "Vipul Gupta (@vipulgupta2048)" Date: Tue, 6 Aug 2019 19:44:16 +0530 Subject: [PATCH 07/15] Suggestions implemented as suggested Signed-off-by: Vipul Gupta (@vipulgupta2048) --- docs/source/item-validation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/item-validation.rst b/docs/source/item-validation.rst index 81c02171..5000b797 100644 --- a/docs/source/item-validation.rst +++ b/docs/source/item-validation.rst @@ -236,7 +236,7 @@ A `list` containing the local path of the item schema. 
SPIDERMON_VALIDATION_CERBERUS: [ '/path/to/schema.json', - 'https://api.myjson.com/bins/gmdgl' + 'http://example.com/mycerberusschema' ] If you are working on a spider that produces multiple items types, you can define it From d711e53dc4771c30a4219d090a06822a767fa05a Mon Sep 17 00:00:00 2001 From: "Vipul Gupta (@vipulgupta2048)" Date: Mon, 12 Aug 2019 17:47:45 +0530 Subject: [PATCH 08/15] Improve Docs Signed-off-by: Vipul Gupta (@vipulgupta2048) --- docs/source/item-validation.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/source/item-validation.rst b/docs/source/item-validation.rst index 5000b797..a353b172 100644 --- a/docs/source/item-validation.rst +++ b/docs/source/item-validation.rst @@ -99,7 +99,7 @@ each field, as well as dependencies and regex. You need to install `cerberus`_ to use this feature. -This `usage`_ and `validation-rules`_ guide explains main keywords and how to make a +This `usage`_ and `validation-rules`_ guide explain the main keywords and how to make a schema. Here we have an example of a schema for the quotes item from the :doc:`tutorial `. @@ -236,7 +236,8 @@ A `list` containing the local path of the item schema. 
SPIDERMON_VALIDATION_CERBERUS: [ '/path/to/schema.json', - 'http://example.com/mycerberusschema' + 'http://example.com/mycerberusschema', + {"Field": {"type": "number", "required":True}} ] If you are working on a spider that produces multiple items types, you can define it @@ -246,7 +247,7 @@ as a `dict`: # settings.py - SPIDERMON_VALIDATION_SCHEMAS: { + SPIDERMON_VALIDATION_CERBERUS: { DummyItem: '/path/to/dummyitem_schema.json', OtherItem: '/path/to/otheritem_schema.json', } From d37aa8f86825eaa9adf46ea2d218939f732f43e7 Mon Sep 17 00:00:00 2001 From: "Vipul Gupta (@vipulgupta2048)" Date: Thu, 15 Aug 2019 11:07:29 +0530 Subject: [PATCH 09/15] Add indentation to code block Signed-off-by: Vipul Gupta (@vipulgupta2048) --- docs/source/item-validation.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/item-validation.rst b/docs/source/item-validation.rst index a353b172..dc29cf26 100644 --- a/docs/source/item-validation.rst +++ b/docs/source/item-validation.rst @@ -105,12 +105,12 @@ schema. Here we have an example of a schema for the quotes item from the .. 
code-block:: json -{ - "quote": {"type": "string", "required": true}, - "author": {"type": "string", "required": true}, - "author_url": {"type": "string"}, - "tags": {"type": "list"} -} + { + "quote": {"type": "string", "required": true}, + "author": {"type": "string", "required": true}, + "author_url": {"type": "string"}, + "tags": {"type": "list"} + } To use Cerberus validation, you would need to add :ref:`SPIDERMON_VALIDATION_CERBERUS` setting to your `settings.py` From bcc7295fd131e76111727fc631ad3fcc41f29830 Mon Sep 17 00:00:00 2001 From: "Vipul Gupta (@vipulgupta2048)" Date: Thu, 15 Aug 2019 17:57:13 +0530 Subject: [PATCH 10/15] Add Multiple Item example Signed-off-by: Vipul Gupta (@vipulgupta2048) --- docs/source/item-validation.rst | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/docs/source/item-validation.rst b/docs/source/item-validation.rst index 98bbaf97..741b689a 100644 --- a/docs/source/item-validation.rst +++ b/docs/source/item-validation.rst @@ -240,13 +240,36 @@ A `list` containing the local path of the item schema. {"Field": {"type": "number", "required":True}} ] -If you are working on a spider that produces multiple items types, you can define it -as a `dict`: +If you are working on a spider that produces multiple items types, you can define paths to schema for each item as `dict` like the example below: + +.. code-block:: python + + # Sample quotes spider yielding two items types + + import scrapy + from quotes.items import DummyItem, OtherItem + + class QuotesToscrapeComSpider(scrapy.Spider): + name = "quotes.toscrape.com" + allowed_domains = ["quotes.toscrape.com"] + start_urls = ["http://quotes.toscrape.com/"] + + def parse(self, response): + x = DummyItem() + y = OtherItem() + for quote in response.css(".quote"): + x['quote'] = quote.css(".text::text").get() + y['author'] = quote.css(".author::text").get() + yield x + yield y + .. 
code-block:: python # settings.py + from quotes.items import DummyItem, OtherItem + SPIDERMON_VALIDATION_CERBERUS: { DummyItem: '/path/to/dummyitem_schema.json', OtherItem: '/path/to/otheritem_schema.json', From dbfbf9c7f4d5fa3a450752a3aa3944101311a8e0 Mon Sep 17 00:00:00 2001 From: "Vipul Gupta (@vipulgupta2048)" Date: Fri, 16 Aug 2019 00:28:01 +0530 Subject: [PATCH 11/15] Add mention of Cerberus in L20 Signed-off-by: Vipul Gupta (@vipulgupta2048) --- docs/source/item-validation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/item-validation.rst b/docs/source/item-validation.rst index 741b689a..0360cc43 100644 --- a/docs/source/item-validation.rst +++ b/docs/source/item-validation.rst @@ -18,7 +18,7 @@ the first step is to enable the built-in item pipeline in your project settings: } After that, you need to choose which validation library will be used. Spidermon -accepts schemas defined using schematics_ or `JSON Schema`_. +accepts schemas defined using schematics_, `JSON Schema`_ or cerberus_. With schematics --------------- From 35c9cbeeaf258ba97beff4d9eb47045440e5a33e Mon Sep 17 00:00:00 2001 From: "Vipul Gupta (@vipulgupta2048)" Date: Tue, 20 Aug 2019 01:41:58 +0530 Subject: [PATCH 12/15] Replace : with = Signed-off-by: Vipul Gupta (@vipulgupta2048) --- docs/source/item-validation.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/item-validation.rst b/docs/source/item-validation.rst index 0360cc43..9811687c 100644 --- a/docs/source/item-validation.rst +++ b/docs/source/item-validation.rst @@ -234,7 +234,7 @@ A `list` containing the local path of the item schema. 
# settings.py - SPIDERMON_VALIDATION_CERBERUS: [ + SPIDERMON_VALIDATION_CERBERUS = [ '/path/to/schema.json', 'http://example.com/mycerberusschema', {"Field": {"type": "number", "required":True}} @@ -270,7 +270,7 @@ If you are working on a spider that produces multiple items types, you can defin from quotes.items import DummyItem, OtherItem - SPIDERMON_VALIDATION_CERBERUS: { + SPIDERMON_VALIDATION_CERBERUS = { DummyItem: '/path/to/dummyitem_schema.json', OtherItem: '/path/to/otheritem_schema.json', } From 2778b9e6248082f50f1891aef8ccdf27dd567118 Mon Sep 17 00:00:00 2001 From: "Vipul Gupta (@vipulgupta2048)" Date: Tue, 20 Aug 2019 01:46:06 +0530 Subject: [PATCH 13/15] Add ... as seperators Signed-off-by: Vipul Gupta (@vipulgupta2048) --- docs/source/item-validation.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/item-validation.rst b/docs/source/item-validation.rst index 9811687c..13411ded 100644 --- a/docs/source/item-validation.rst +++ b/docs/source/item-validation.rst @@ -270,6 +270,8 @@ If you are working on a spider that produces multiple items types, you can defin from quotes.items import DummyItem, OtherItem + ... + SPIDERMON_VALIDATION_CERBERUS = { DummyItem: '/path/to/dummyitem_schema.json', OtherItem: '/path/to/otheritem_schema.json', From 039399d523a353ccb1e8b2dba07c34ba40da07d9 Mon Sep 17 00:00:00 2001 From: "Vipul Gupta (@vipulgupta2048)" Date: Tue, 20 Aug 2019 01:46:33 +0530 Subject: [PATCH 14/15] Ready to go Signed-off-by: Vipul Gupta (@vipulgupta2048) --- docs/source/item-validation.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/source/item-validation.rst b/docs/source/item-validation.rst index 13411ded..9811687c 100644 --- a/docs/source/item-validation.rst +++ b/docs/source/item-validation.rst @@ -270,8 +270,6 @@ If you are working on a spider that produces multiple items types, you can defin from quotes.items import DummyItem, OtherItem - ... 
- SPIDERMON_VALIDATION_CERBERUS = { DummyItem: '/path/to/dummyitem_schema.json', OtherItem: '/path/to/otheritem_schema.json', From 5c62b90db925a7add410e0c7ffcd7059214a0c0c Mon Sep 17 00:00:00 2001 From: "Vipul Gupta (@vipulgupta2048)" Date: Tue, 20 Aug 2019 18:44:02 +0530 Subject: [PATCH 15/15] Example removed Signed-off-by: Vipul Gupta (@vipulgupta2048) --- docs/source/item-validation.rst | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/docs/source/item-validation.rst b/docs/source/item-validation.rst index 9811687c..1d1a98f5 100644 --- a/docs/source/item-validation.rst +++ b/docs/source/item-validation.rst @@ -240,31 +240,7 @@ A `list` containing the local path of the item schema. {"Field": {"type": "number", "required":True}} ] -If you are working on a spider that produces multiple items types, you can define paths to schema for each item as `dict` like the example below: - -.. code-block:: python - - # Sample quotes spider yielding two items types - - import scrapy - from quotes.items import DummyItem, OtherItem - - class QuotesToscrapeComSpider(scrapy.Spider): - name = "quotes.toscrape.com" - allowed_domains = ["quotes.toscrape.com"] - start_urls = ["http://quotes.toscrape.com/"] - - def parse(self, response): - x = DummyItem() - y = OtherItem() - for quote in response.css(".quote"): - x['quote'] = quote.css(".text::text").get() - y['author'] = quote.css(".author::text").get() - yield x - yield y - - -.. code-block:: python +If you are working on a spider that produces multiple item types, you can define the path to the schema for each item as a `dict`, as shown below:: # settings.py