encapsulate out processors into LocationLoader class
Alexandr Nesterenko committed Jan 9, 2015
1 parent 0feded0 commit ffd0e36
Showing 4 changed files with 34 additions and 37 deletions.
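
The refactoring moves the four output-processor assignments out of each spider's parse method and into a dedicated ItemLoader subclass, so both spiders share one declaration. This works because Scrapy's ItemLoader (the 2015-era scrapy.contrib.loader API used here) resolves a field's output processor from a class attribute named <field>_out, falling back to default_output_processor. A minimal sketch of that convention, with a hypothetical ExampleItem/ExampleLoader pair standing in for the real classes:

    from scrapy import Item, Field
    from scrapy.contrib.loader import ItemLoader
    from scrapy.contrib.loader.processor import TakeFirst, Identity

    class ExampleItem(Item):      # hypothetical item, for illustration only
        name = Field()
        tags = Field()

    class ExampleLoader(ItemLoader):
        default_item_class = ExampleItem        # load_item() builds this item
        default_output_processor = TakeFirst()  # collapse collected lists to one value
        tags_out = Identity()                   # but keep 'tags' as a full list

    il = ExampleLoader()
    il.add_value('name', 'Acme Store')
    il.add_value('tags', ['grocery', 'pharmacy'])
    il.load_item()  # {'name': 'Acme Store', 'tags': ['grocery', 'pharmacy']}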
trial/items.py (26 additions, 1 deletion)

@@ -4,6 +4,9 @@
 # http://doc.scrapy.org/en/latest/topics/items.html
 
 from scrapy import Item, Field
+from scrapy.contrib.loader import ItemLoader
+from scrapy.contrib.loader.processor import TakeFirst, Identity, Compose
+
 
 class LocationItem(Item):
     city = Field()
@@ -22,6 +25,7 @@ class LocationItem(Item):
     weekly_ad_url = Field()
     zipcode = Field()
 
+
 class ProductItem(Item):
     currency = Field()
     current_price = Field()
@@ -42,4 +46,25 @@ class ProductItem(Item):
     rating = Field()
     available_instore = Field()
     available_online = Field()
-
+
+
+_week = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday',
+         3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}
+
+
+def _get_hours_item_value(time_pairs):
+    days = {}
+    for idx in _week:
+        if idx in time_pairs:
+            open, close = time_pairs[idx]
+            days[_week[idx]] = {'open': open, 'close': close}
+    return days
+
+
+class LocationLoader(ItemLoader):
+    default_item_class = LocationItem
+
+    default_output_processor = TakeFirst()
+    address_out = Identity()
+    hours_out = Compose(lambda x: x[0], _get_hours_item_value)
+    services_out = Identity()
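
hours_out is the one non-trivial processor: Compose applies its callables left to right, and since an ItemLoader always accumulates collected values in a list, lambda x: x[0] first unwraps the single hours dict before _get_hours_item_value maps weekday indices to names. An illustrative run, assuming the spiders collect hours as one {index: (open, close)} dict:

    collected = [{0: ('10:00', '21:00'), 6: ('11:00', '18:00')}]  # as stored by add_value
    _get_hours_item_value(collected[0])
    # -> {'Monday': {'open': '10:00', 'close': '21:00'},
    #     'Sunday': {'open': '11:00', 'close': '18:00'}}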
trial/spiders/__init__.py (0 additions, 12 deletions)

@@ -2,15 +2,3 @@
 #
 # Please refer to the documentation for information on how to create and manage
 # your spiders.
-
-_week = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday',
-         3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}
-
-
-def get_hours_item_value(days_time_pairs):
-    days = {}
-    for idx in _week:
-        if idx in days_time_pairs:
-            open, close = days_time_pairs[idx]
-            days[_week[idx]] = {'open': open, 'close': close}
-    return days
trial/spiders/applelocation.py (4 additions, 12 deletions)

@@ -1,11 +1,9 @@
 from scrapy.contrib.spiders import CrawlSpider
 from scrapy.http import Request
 from scrapy.contrib.linkextractors import LinkExtractor
-from scrapy.contrib.loader import ItemLoader
-from scrapy.contrib.loader.processor import TakeFirst, Identity
+from scrapy.contrib.loader.processor import TakeFirst
 
-from trial.items import LocationItem
-from . import get_hours_item_value
+from trial.items import LocationLoader
 
 
 class AppleLocationSpider(CrawlSpider):
@@ -40,7 +38,7 @@ def parse_store(self, response):
         @returns items 1 1
         @scrapes address phone_number services state store_image_url store_name store_id store_url weekly_ad_url zipcode
         """
-        il = ItemLoader(item=LocationItem(), response=response)
+        il = LocationLoader(response=response)
         il.selector = response.xpath('(//address)[1]')
 
         il.add_xpath('city', './/span[@class="locality"]/text()')
@@ -73,12 +71,6 @@ def parse_store(self, response):
         il.add_xpath('zipcode', './/span[@class="postal-code"]/text()',
                      TakeFirst())
 
-        # output processors
-        il.default_output_processor = TakeFirst()
-        il.address_out = Identity()
-        il.hours_out = Identity()
-        il.services_out = Identity()
-
         yield il.load_item()
 
     def parse_hours(self, trs):
@@ -111,4 +103,4 @@ def parse_hours(self, trs):
 
         for i in day_idxes:
             days[i] = time_interval
-        return get_hours_item_value(days)
+        return days
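
With the name mapping gone from the spider, parse_hours now returns the raw {weekday_index: (open, close)} dict and LocationLoader.hours_out performs the conversion exactly once, at load time. A sketch of the resulting flow inside parse_store (the xpath here is illustrative, not the spider's actual selector):

    il = LocationLoader(response=response)
    il.add_value('hours', self.parse_hours(response.xpath('//table[@class="hours"]//tr')))
    yield il.load_item()  # hours_out turns {0: (...), ...} into day-name keys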
trial/spiders/wetseallocation.py (4 additions, 12 deletions)

@@ -1,11 +1,9 @@
 from scrapy.contrib.spiders import CrawlSpider
 from scrapy.http import FormRequest
 from scrapy.contrib.linkextractors import LinkExtractor
-from scrapy.contrib.loader import ItemLoader
-from scrapy.contrib.loader.processor import TakeFirst, Identity
+from scrapy.contrib.loader.processor import TakeFirst
 
-from trial.items import LocationItem
-from . import get_hours_item_value
+from trial.items import LocationLoader
 
 
 class WetsealLocationSpider(CrawlSpider):
@@ -42,7 +40,7 @@ def parse_start_url(self, response):
     def parse_stores(self, response):
         """Parse items"""
         for tr in response.xpath('//table[@id="store-location-results"]/tbody/tr'):
-            il = ItemLoader(item=LocationItem(), response=response)
+            il = LocationLoader(response=response)
 
             address_lines = tr.xpath('td[@class="store-address"]/text()')
             il.add_value('phone_number', address_lines.pop().extract().strip())
@@ -74,12 +72,6 @@ def parse_stores(self, response):
                 restrict_xpaths='//a[@id="%s"]' % store_id).extract_links(response)[0].url)
             # weekly_ad_url: not found
 
-            # output processors
-            il.default_output_processor = TakeFirst()
-            il.address_out = Identity()
-            il.hours_out = Identity()
-            il.services_out = Identity()
-
            yield il.load_item()
 
     def parse_hours(self, lines):
@@ -99,4 +91,4 @@ def parse_hours(self, lines):
             idxes = [sitedays[pieces.pop(0)]]
             for i in idxes:
                 days[i] = pieces
-        return get_hours_item_value(days)
+        return days
