# Scraping Amazon best seller data

### Version 1.0 
### Completed on June 8th, 2016
### Reference: Web Scraping with Python

In [1]:
import urllib2
import lxml.html
import pandas as pd
import time
import winsound

In [2]:
def download(url, user_agent='wswp', num_retries=2):
    """Fetch *url* and return the response body, or None on failure.

    The URL is percent-encoded once before the request is sent, so that
    non-ASCII characters become legal. Requests answered with a 5xx status
    are retried up to *num_retries* more times; any other URLError gives up
    immediately.

    Args:
        url: Address to fetch (byte string or unicode).
        user_agent: Value sent in the ``User-agent`` request header.
        num_retries: Number of extra attempts allowed after a 5xx response.

    Returns:
        The raw response body, or None if every attempt failed.
    """
    # Only unicode needs encoding; calling .encode() on a Python 2 byte
    # string would first decode it as ASCII and fail on non-ASCII bytes.
    raw = url if isinstance(url, bytes) else url.encode('utf-8')

    # Keep URL delimiters and existing %-escapes intact: escaping '&' would
    # merge the query parameters into one value, and escaping '%' would
    # double-encode anything that is already percent-encoded.
    quoted = urllib2.quote(raw, "%/:=&?~#+!$,;'@()*[]")
    headers = {'User-agent': user_agent}

    attempts_left = num_retries
    while True:
        print('Downloading: %s' % url)
        request = urllib2.Request(quoted, headers=headers)
        try:
            response = urllib2.urlopen(request)
            try:
                return response.read()
            finally:
                # Release the connection even if read() fails.
                response.close()
        except urllib2.URLError as e:
            print('Download error: %s' % (e.reason,))
            # Only HTTP errors carry a status code; retry server-side ones.
            server_error = hasattr(e, 'code') and 500 <= e.code < 600
            if attempts_left > 0 and server_error:
                attempts_left -= 1
                continue
            return None

In [3]:
def get_item_20(raw_web):
    """Extract the best-seller items listed on one page into a DataFrame.

    Args:
        raw_web: HTML source of a best-seller page. None or an empty string
            (e.g. a failed download) yields an empty frame instead of an
            exception.

    Returns:
        A DataFrame with the columns ``price``, ``reviews``, ``star``,
        ``title`` and ``title_url``, one row per item found.

    Raises:
        ValueError: Raised by pandas when the selectors match different
            numbers of elements (for example an item without a price),
            because the columns are collected independently.
    """
    # Alphabetical order is what pandas produced from the plain dict under
    # Python 2; fixing it explicitly keeps the CSV layout stable.
    columns = ['price', 'reviews', 'star', 'title', 'title_url']
    if not raw_web:
        return pd.DataFrame(columns=columns)

    # Parse once and reuse the tree for both CSS and XPath queries.
    tree = lxml.html.fromstring(raw_web)

    title = [node.text_content() for node in tree.cssselect('.zg_title')]
    title_url = [
        anchor.get('href').lstrip()
        for anchor in tree.xpath(u"//div[@class='zg_title']/a")
    ]

    # The review text is split on parentheses: the piece before the first
    # parenthesis starts with the star value, the piece inside is taken as
    # the review count. Text without parentheses is stored unchanged in
    # both columns.
    reviews_new = []
    star = []
    for node in tree.cssselect('.zg_reviews'):
        text = node.text_content().replace('\n', '').strip()
        # Splitting on '(' or ')' == turn ')' into '(' and split on '('.
        parts = text.replace(')', '(').split('(')
        if len(parts) > 1:
            reviews_new.append(parts[1])
            star.append(parts[0].split(' ')[0])
        else:
            reviews_new.append(parts[0])
            star.append(parts[0])

    # Use the second-to-last space-separated token of the price text when
    # there is more than one token, otherwise the only token.
    price_new = []
    for node in tree.cssselect('.zg_itemPriceBlock_compact > .zg_price'):
        tokens = node.text_content().replace('\n', '').split(' ')
        price_new.append(tokens[-2] if len(tokens) > 1 else tokens[-1])

    return pd.DataFrame(
        data={
            'title': title,
            'title_url': title_url,
            'reviews': reviews_new,
            'star': star,
            'price': price_new,
        },
        columns=columns,
    )

In [4]:
def get_item_100(url_first, department):
    """Gather all best-seller pages of one department into ``<department>.csv``.

    Args:
        url_first: HTML source of the department's first page (despite the
            name this is page content, not a URL). When it is None or empty,
            e.g. after a failed download, nothing is written.
        department: Value stored in the ``department`` column; also used as
            the CSV file name (``department + '.csv'``).
    """
    if not url_first:
        print('No page content for %s, skipping.' % department)
        return

    # Pagination anchors carry a ``page`` attribute; the first match is
    # dropped so that only the links for the following pages remain.
    other_page = [
        anchor.get('href')
        for anchor in lxml.etree.HTML(url_first).xpath(u"//a[@page]")[1:]
    ]

    # Items of the first page, then one frame per additional page.
    frames = [get_item_20(url_first)]
    for link in other_page:
        page_html = download(url=link)
        if page_html is None:
            # download() already reported the error; keep what we have.
            print('Skipping page: %s' % link)
            continue
        frames.append(get_item_20(page_html))

    # A single concat avoids re-copying the accumulated frame on every
    # page, which repeated DataFrame.append would do.
    output_page = pd.concat(frames)
    output_page['department'] = department

    output_page.to_csv(path_or_buf=department + '.csv', encoding='utf-8')

In [6]:
def get_level(your_url):
    """Scrape a department's best sellers and those of its sub-departments.

    The department page is downloaded and exported first; then each
    sub-department link found in its ``zg_browseRoot`` list is fetched,
    with a five-second pause before every request, and exported the same
    way. A short alert sound is played three times at the end.

    Args:
        your_url: URL of the department's first best-seller page. The
            fourth '/'-separated piece of the URL is used as the
            department name.
    """
    department = your_url.split('/')[3]

    # First page of the top-level department.
    url_first_level = download(url=your_url)
    if not url_first_level:
        # Without the landing page there are no sub-department links.
        print('Could not download %s, nothing scraped.' % your_url)
        return

    # Links to the second-level (sub-department) pages.
    links = [
        anchor.get('href')
        for anchor in lxml.etree.HTML(url_first_level).xpath(
            u"//ul[@id='zg_browseRoot']/ul/ul/li/a[@href]")
    ]

    get_item_100(url_first_level, department)

    total = len(links)
    for done, link in enumerate(links, 1):
        # Pause between sub-departments to space out the requests.
        print("Time Sleep Start : %s" % time.ctime())
        time.sleep(5)
        print("Time Sleep End : %s" % time.ctime())
        depart = link.split('/')[3]
        raw_url = download(link)
        if raw_url is None:
            # One failed download should not abort the remaining ones.
            print('Skipping sub department %s: download failed' % depart)
        else:
            get_item_100(raw_url, depart)
        print("Finshed %d sub department, have %d department need to deal with." % (done, total - done))

    # Audible signal that the run is over.
    for _ in range(3):
        winsound.PlaySound('alert', winsound.SND_ASYNC)
        time.sleep(0.5)

In [7]:
# Starting page for the scrape: the Amazon best-seller list this run begins at.
url = (
    'http://www.amazon.com/Best-Sellers-Home-Kitchen'
    '/zgbs/home-garden/ref=zg_bs_nav_0'
)

In [8]:
# Scrape the page held in `url` and every sub-department it links to.
get_level(url)

Downloading: http://www.amazon.com/Best-Sellers-Home-Kitchen/zgbs/home-garden/ref=zg_bs_nav_0
Downloading: http://www.amazon.com/Best-Sellers-Home-Kitchen/zgbs/home-garden/ref=zg_bs_home-garden_pg_2/179-3521453-9734723?_encoding=UTF8&pg=2
Downloading: http://www.amazon.com/Best-Sellers-Home-Kitchen/zgbs/home-garden/ref=zg_bs_home-garden_pg_3/179-3521453-9734723?_encoding=UTF8&pg=3
Downloading: http://www.amazon.com/Best-Sellers-Home-Kitchen/zgbs/home-garden/ref=zg_bs_home-garden_pg_4/179-3521453-9734723?_encoding=UTF8&pg=4
Downloading: http://www.amazon.com/Best-Sellers-Home-Kitchen/zgbs/home-garden/ref=zg_bs_home-garden_pg_5/179-3521453-9734723?_encoding=UTF8&pg=5
Time Sleep Start : Wed Jun 08 22:30:39 2016
Time Sleep End : Wed Jun 08 22:30:44 2016
Downloading: http://www.amazon.com/Best-Sellers-Home-Kitchen-Kids-Store/zgbs/home-garden/3206325011/ref=zg_bs_nav_hg_1_hg/179-3521453-9734723
Downloading: http://www.amazon.com/Best-Sellers-Home-Kitchen-Kids-Store/zgbs/home-garden/320632501