Skip to content

Commit

Permalink
Merge a616617 into 047a512
Browse files Browse the repository at this point in the history
  • Loading branch information
BurnzZ committed Nov 30, 2023
2 parents 047a512 + a616617 commit 3342da2
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 4 deletions.
5 changes: 5 additions & 0 deletions docs/setup.rst
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,11 @@ The following additional settings are recommended:

ItemAdapter.ADAPTER_CLASSES.appendleft(ZyteItemAdapter)

- Update :setting:`SPIDER_MIDDLEWARES <scrapy:SPIDER_MIDDLEWARES>` to include
``"zyte_spider_templates.middlewares.AllowOffsiteMiddleware": 500`` and
  ``"scrapy.spidermiddlewares.offsite.OffsiteMiddleware": None``. This allows
  crawling item links outside of the allowed domains.

For an example of a properly configured ``settings.py`` file, see `the one
in zyte-spider-templates-project`_.

Expand Down
15 changes: 14 additions & 1 deletion tests/test_ecommerce.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from pydantic import ValidationError
from scrapy_poet import DummyResponse
from scrapy_spider_metadata import get_spider_metadata
from zyte_common_items import Product, ProductNavigation
from zyte_common_items import ProbabilityRequest, Product, ProductNavigation

from zyte_spider_templates import BaseSpiderParams
from zyte_spider_templates._geolocations import (
Expand Down Expand Up @@ -456,3 +456,16 @@ def test_metadata():
def test_validation_url(url, valid):
url_re = BaseSpiderParams.model_fields["url"].metadata[0].pattern
assert bool(re.match(url_re, url)) == valid


def test_get_parse_product_request():
    """Product page requests should bypass offsite filtering by default."""
    url = "https://example.com"
    spider = EcommerceSpider.from_crawler(get_crawler(), url=url)

    # By default, product links outside the allowed domains are still crawled.
    product_request = ProbabilityRequest(url=url)
    converted = spider.get_parse_product_request(product_request)
    assert converted.meta.get("allow_offsite") is True
35 changes: 34 additions & 1 deletion tests/test_middlewares.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
import pytest
from freezegun import freeze_time
from scrapy import Spider
from scrapy.http import Request, Response
from scrapy.statscollectors import StatsCollector
from scrapy.utils.test import get_crawler

from zyte_spider_templates.middlewares import CrawlingLogsMiddleware
from zyte_spider_templates.middlewares import (
AllowOffsiteMiddleware,
CrawlingLogsMiddleware,
)


@freeze_time("2023-10-10 20:09:29")
Expand Down Expand Up @@ -215,3 +222,29 @@ def results_gen():
" }\n"
"}"
)


@pytest.mark.parametrize(
    "req,allowed",
    (
        (Request("https://example.com"), True),
        (Request("https://outside-example.com"), False),
        (Request("https://outside-example.com", meta={"allow_offsite": True}), True),
    ),
)
def test_item_offsite_middleware(req, allowed):
    """Offsite requests are dropped unless ``allow_offsite`` is set in meta."""

    class TestSpider(Spider):
        name = "test"
        allowed_domains = ("example.com",)

    crawler = get_crawler(TestSpider)
    middleware = AllowOffsiteMiddleware(StatsCollector(crawler))
    spider = TestSpider()
    middleware.spider_opened(spider)

    processed = list(middleware.process_spider_output(Response(""), [req], spider))
    expected = [req] if allowed else []
    assert processed == expected
12 changes: 11 additions & 1 deletion zyte_spider_templates/middlewares.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@
from datetime import datetime
from typing import Any, Dict

from scrapy import Request
from scrapy import Request, Spider
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.spidermiddlewares.offsite import OffsiteMiddleware
from scrapy.utils.request import request_fingerprint

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -109,3 +110,12 @@ def crawl_logs(self, response, result):
json.dumps(data, indent=2),
]
return "\n".join(report)


class AllowOffsiteMiddleware(OffsiteMiddleware):
    """OffsiteMiddleware variant that lets individual requests opt out of
    domain filtering by setting ``allow_offsite`` in their ``meta``."""

    def _filter(self, request: Any, spider: Spider) -> bool:
        filterable = isinstance(request, Request) and not request.meta.get(
            "allow_offsite"
        )
        if filterable:
            return super()._filter(request, spider)
        # Non-Request objects (e.g. items) and explicitly allowed requests
        # always pass through.
        return True
4 changes: 3 additions & 1 deletion zyte_spider_templates/spiders/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ def get_parse_product_request(

probability = request.get_probability()

return request.to_scrapy(
scrapy_request = request.to_scrapy(
callback=callback,
priority=priority,
meta={
Expand All @@ -181,3 +181,5 @@ def get_parse_product_request(
}
},
)
scrapy_request.meta["allow_offsite"] = True
return scrapy_request

0 comments on commit 3342da2

Please sign in to comment.