In [1]:
from lxml import html
import requests
from BeautifulSoup import BeautifulSoup
import re

In [2]:
def get_page_url(key_words, page_num, proxy):
    """Build the Amazon search-results URL for a keyword query.

    Args:
        key_words: Raw (not yet URL-encoded) search text, e.g. 'rain jacket'.
        page_num: Number of the result page; converted with str().
        proxy: Unused by this function; kept so existing callers keep working.

    Returns:
        The search URL as a string.
    """
    # Function-local import with a fallback so the cell runs on both
    # Python 3 (urllib.parse) and Python 2 (urllib).
    try:
        from urllib.parse import quote_plus
    except ImportError:
        from urllib import quote_plus

    search_key_words = 'http://www.amazon.com/s?url=search-alias%3Daps&field-keywords='
    # Encode the keywords so spaces, '&', '#', '=' etc. cannot break the
    # query string or inject extra parameters.
    return search_key_words + quote_plus(key_words) + '&page=' + str(page_num)

In [3]:
def _parse_price(fragments):
    """Return the first numeric amount found in XPath text fragments as a float.

    Thousands separators (',') are removed before conversion. Returns None
    when the fragments are empty or contain no number.
    """
    match = re.search(r'\d[\d,]*(?:\.\d+)?|\.\d+', ''.join(fragments))
    if match is None:
        return None
    return float(match.group(0).replace(',', ''))


def product_parser(url):
    """Fetch a product page and extract its attributes.

    Args:
        url: Link to a single product page.

    Returns:
        A dict with the keys 'Name', 'Sale price', 'Category' and
        'Availability'. A value is None when the corresponding XPath query
        matches nothing; 'Sale price' is also None when the matched text
        contains no number.
    """
    # read a url page
    page = requests.get(url)
    tree = html.fromstring(page.content)

    # match the tags to find product attributes
    name = tree.xpath('//h1[@id="title"]//text()')
    sale_price = tree.xpath('//span[contains(@id,"ourprice") or contains(@id,"saleprice")]/text()')
    category = tree.xpath('//a[@class="a-link-normal a-color-tertiary"]//text()')
    availability = tree.xpath('//div[@id="availability"]//text()')

    # The original/list price is not part of the returned dict, so it is not
    # queried here.
    info = {
        # collapse every run of whitespace in the title into a single space
        'Name': ' '.join(''.join(name).split()) if name else None,
        # tolerant of thousands separators and whitespace-only matches
        'Sale price': _parse_price(sale_price),
        # breadcrumb entries joined into one path-like string
        'Category': ' > '.join([i.strip() for i in category]) if category else None,
        'Availability': ''.join(availability).strip() if availability else None,
    }
    return info

In [6]:
def get_product_asin(page_url):
    """Collect the ``data-asin`` values of the product entries on a listing page.

    Args:
        page_url: URL of a page that lists several products.

    Returns:
        A list with the ``data-asin`` attribute of every ``<li>`` tag selected
        by the ``data-asin`` filter, in the order ``findAll`` returns them.
    """
    response = requests.get(page_url)
    parsed = BeautifulSoup(response.content)
    # Only <li> tags whose data-asin attribute passes the regex '.{8}' are kept.
    asin_pattern = re.compile('.{8}')
    entries = parsed.findAll('li', {'data-asin': asin_pattern})
    return [entry['data-asin'] for entry in entries]

In [14]:
def page_parser(page_url):
    """Parse every product listed on a search-results page.

    Args:
        page_url: URL of a product listing page.

    Returns:
        The string 'Empty URL' when ``page_url`` has length zero; otherwise a
        list holding the ``product_parser`` result for each ASIN that
        ``get_product_asin`` finds on the page, in the same order.
    """
    if not len(page_url):
        return 'Empty URL'

    # A product's page link is this prefix followed by its ASIN number.
    link_prefix = 'https://www.amazon.com/dp/'
    return [product_parser(link_prefix + code) for code in get_product_asin(page_url)]

In [16]:
# Test case: build the search URL for the keyword 'jacket' (result page 1) and
# parse every product listed on that page. This issues live HTTP requests
# (one for the listing page, then one per product found).
# NOTE(review): the saved output below is a flat list of ASIN strings, while
# page_parser as defined in this notebook returns a list of dicts (or
# 'Empty URL') - the output looks stale; re-run to confirm.
page_parser(get_page_url('jacket', 1, 0))

[u'B0058YHKDI',
 u'B0065Q0RL0',
 u'B00D7G6UXK',
 u'B01FFJN9GW',
 u'B072X91CJ3',
 u'B002MFW7D4',
 u'B01N3UENOB',
 u'B01N5V78VH',
 u'B00DQYWIHA',
 u'B0746CSC1H',
 u'B071X4CSSQ',
 u'B06XJ2699G',
 u'B00LEX0WRI',
 u'B06ZZ5676J',
 u'B008G4LKJ2',
 u'B06XDBSNJ1']