# Amazon Web Scraper
### Requirements:
- Selenium
- BeautifulSoup

In [1]:
import csv
from bs4 import BeautifulSoup

In [2]:
# Selenium WebDriver (can drive either Firefox or Chrome)
from selenium import webdriver

## Start up the webdriver

In [3]:
# Start a Chrome browser session; use webdriver.Firefox() here to drive Firefox instead
driver = webdriver.Chrome()

In [4]:
# Load the Amazon home page in the browser window opened above
url = 'https://www.amazon.com'
driver.get(url)

In [5]:
def get_url(search_term):
    """Build the Amazon search-results URL for a search term.

    Args:
        search_term: Free-text query, e.g. ``'ultrawide monitor'``.

    Returns:
        The search URL with the term percent-encoded into the ``k`` query
        parameter (spaces become ``+``).
    """
    # Imported locally so this cell can be run on its own.
    from urllib.parse import quote_plus

    template = 'https://www.amazon.com/s?k={}&ref=nb_sb_noss_1'
    # quote_plus also escapes reserved characters such as '&', '#' and '+',
    # which a plain space-to-plus replacement would let corrupt the query.
    return template.format(quote_plus(search_term))

In [6]:
# Build the search URL for a sample query and display it
url = get_url('ultrawide monitor')
print(url)

https://www.amazon.com/s?k=ultrawide+monitor&ref=nb_sb_noss_1


In [7]:
driver.get(url)

## Extract the collection

In [8]:
# Parse the HTML of the page currently loaded in the browser with the built-in 'html.parser'
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [9]:
# Collect every <div> whose data-component-type attribute is 's-search-result' (one per search hit)
results = soup.find_all('div', {'data-component-type': 's-search-result'})

In [10]:
len(results)

22

## Prototype the record

In [11]:
# Use the first search result as the sample for prototyping the field extraction
item = results[0]

In [12]:
# First <a> tag inside the result's first <h2>; it supplies both the title text and the link
atag = item.h2.a

In [13]:
# Trim surrounding whitespace and swap non-breaking spaces (\xa0) for ordinary spaces
description = atag.text.strip().replace(u'\xa0', u' ')

In [14]:
# Prefix the site root to the link target (assumes href is site-relative -- TODO confirm)
url = 'https://www.amazon.com' + atag.get('href')

In [15]:
# First <span> with CSS class 'a-price'; find() returns None when the result has no such span
price_parent = item.find('span', 'a-price')

In [16]:
# The price text is read from the nested <span> with class 'a-offscreen'
price = price_parent.find('span', 'a-offscreen').text

In [17]:
# Text of the first <i> tag in the result, which holds the star rating
rating = item.i.text

In [21]:
# Text of the first <span> carrying class 'a-size-base'; for this result it is the review count
review_count = item.find('span', {'class': 'a-size-base'}).text

In [22]:
review_count

'31'

## Generalise the pattern

In [19]:
def extract_record(item):
    """Pull the fields of interest out of one search-result element.

    Args:
        item: A parsed search-result tag (one entry of ``results``).

    Returns:
        A ``(description, price, rating, review_count, url)`` tuple of strings.

    Raises:
        AttributeError: If a lookup yields ``None`` because the expected tag is
            missing; this first version performs no error handling.
    """
    # Title text and link both come from the anchor inside the heading.
    link = item.h2.a
    title = link.text.strip().replace(u'\xa0', u' ')
    product_url = 'https://www.amazon.com' + link.get('href')

    # Price text sits in the 'a-offscreen' span nested in the 'a-price' span.
    price_box = item.find('span', 'a-price')
    price_text = price_box.find('span', 'a-offscreen').text

    # Star rating and number of reviews.
    stars = item.i.text
    count_tag = item.find(
        'span', {'class': 'a-size-base a-color-base s-underline-text'}
    )
    reviews = count_tag.text

    return (title, price_text, stars, reviews, product_url)

In [20]:
# Run the extractor over every search result on the page
records = []
results = soup.find_all('div', {'data-component-type': 's-search-result'})

records.extend(extract_record(entry) for entry in results)

AttributeError: 'NoneType' object has no attribute 'text'

## Error handling

In [39]:
def extract_record(item):
    """Extract and return data from a single search-result element.

    Args:
        item: A parsed search-result tag (one entry of ``results``).

    Returns:
        A ``(description, price, rating, review_count, url)`` tuple of strings,
        or ``None`` when the result has no price. ``rating`` and
        ``review_count`` are each ``''`` when their own tag is missing.

    Raises:
        AttributeError: If the result has no ``<h2>`` heading link.
    """
    # description and url
    atag = item.h2.a
    description = atag.text.strip().replace(u'\xa0', u' ')
    url = 'https://www.amazon.com' + atag.get('href')

    try:
        # price: results without one are skipped entirely
        price_parent = item.find('span', 'a-price')
        price = price_parent.find('span', 'a-offscreen').text
    except AttributeError:
        return None

    # Rating and review count are looked up independently so that a missing
    # one does not blank out the other.
    try:
        rating = item.i.text
    except AttributeError:
        rating = ''

    try:
        review_count = item.find(
            'span', {'class': 'a-size-base a-color-base s-underline-text'}
        ).text
    except AttributeError:
        review_count = ''

    return (description, price, rating, review_count, url)

In [41]:
# Keep only the results that produced a record (extract_record returns None for the rest)
records = []
results = soup.find_all('div', {'data-component-type': 's-search-result'})

records.extend(filter(None, map(extract_record, results)))

In [42]:
records[0]

('SAMSUNG 34 Inch Ultrawide QHD Curved Monitor, 100hz, 1440p Monitor, Computer Monitor, 1000R Curved, USB-C, HDR10, Vertical Monitor, TUV-Certified Intelligent Eye Care, S65UA (LS34A650UXNXGO)',
 '$649.99',
 '4.1 out of 5 stars',
 '34',
 'https://www.amazon.com/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A001299824R6GD09EHQKN&url=%2FSAMSUNG-Adjustable-TUV-Certified-Intelligent-LS34A650UXNXGO%2Fdp%2FB08V71HXY3%2Fref%3Dsr_1_1_sspa%3Fdchild%3D1%26keywords%3Dultrawide%2Bmonitor%26qid%3D1635857315%26sr%3D8-1-spons%26psc%3D1&qualifier=1635857315&id=3153756395255054&widgetName=sp_atf')

In [43]:
# Show just the price column (second field) of every record
for _description, price, *_others in records:
    print(price)

$649.99
$315.07
$249.99
$357.26
$309.99
$239.99
$499.99
$549.99
$32.99
$399.99
$249.99
$598.99
$429.97
$549.99
$397.99
$687.65
$329.99
$149.99
$279.23


## Getting the next page

In [44]:
def get_url(search_term):
    """Build a paginated Amazon search URL template for a search term.

    Args:
        search_term: Free-text query, e.g. ``'ultrawide monitor'``.

    Returns:
        The search URL with the term percent-encoded and a trailing
        ``&page={}`` placeholder; call ``.format(page_number)`` on it to get
        the URL of a specific results page.
    """
    # Imported locally so this cell can be run on its own.
    from urllib.parse import quote_plus

    template = 'https://www.amazon.com/s?k={}&ref=nb_sb_noss_1'

    # add term query to url; quote_plus turns spaces into '+' and escapes
    # reserved characters (including braces, which would otherwise break the
    # later .format(page) call)
    url = template.format(quote_plus(search_term))

    # add page query placeholder
    url += '&page={}'

    return url

## Putting it all together

In [23]:
import csv
from bs4 import BeautifulSoup

# Selenium WebDriver (can drive either Firefox or Chrome)
from selenium import webdriver


def get_url(search_term):
    """Build a paginated Amazon search URL template for a search term.

    Args:
        search_term: Free-text query, e.g. ``'ultrawide monitor'``.

    Returns:
        The search URL with the term percent-encoded and a trailing
        ``&page={}`` placeholder; call ``.format(page_number)`` on it to get
        the URL of a specific results page.
    """
    # Imported locally so the function does not depend on the import header.
    from urllib.parse import quote_plus

    template = 'https://www.amazon.com/s?k={}&ref=nb_sb_noss_1'

    # add term query to url; quote_plus turns spaces into '+' and escapes
    # reserved characters (including braces, which would otherwise break the
    # later .format(page) call)
    url = template.format(quote_plus(search_term))

    # add page query placeholder
    url += '&page={}'

    return url

def extract_record(item):
    """Extract and return data from a single search-result element.

    Args:
        item: A parsed search-result tag, as returned by ``soup.find_all``.

    Returns:
        A ``(description, price, rating, review_count, url)`` tuple of strings,
        or ``None`` when the result has no price. ``rating`` and
        ``review_count`` are each ``''`` when their own tag is missing.

    Raises:
        AttributeError: If the result has no ``<h2>`` heading link.
    """
    # description and url
    atag = item.h2.a
    description = atag.text.strip().replace(u'\xa0', u' ')
    url = 'https://www.amazon.com' + atag.get('href')

    try:
        # price: results without one are skipped entirely
        price_parent = item.find('span', 'a-price')
        price = price_parent.find('span', 'a-offscreen').text
    except AttributeError:
        return None

    # Rating and review count are looked up independently so that a missing
    # one does not blank out the other.
    try:
        rating = item.i.text
    except AttributeError:
        rating = ''

    try:
        # NOTE(review): 'a-size-base' alone is a broad selector; confirm the
        # first matching span is always the review count.
        review_count = item.find('span', {'class': 'a-size-base'}).text
    except AttributeError:
        review_count = ''

    return (description, price, rating, review_count, url)

def main(search_term, pages=20, output_path='results.csv'):
    """Scrape Amazon search results for a term and save them to a CSV file.

    Args:
        search_term: Free-text query passed to ``get_url``.
        pages: Number of result pages to visit, starting from page 1.
        output_path: Path of the CSV file to write (overwritten if present).

    The URL returned by ``get_url`` is formatted with each page number in
    turn, every search result on the page is passed to ``extract_record``,
    and results for which it returns a falsy value are skipped.
    """
    url = get_url(search_term)
    records = []

    # start the webdriver
    driver = webdriver.Chrome()
    try:
        for page in range(1, pages + 1):
            driver.get(url.format(page))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            results = soup.find_all('div', {'data-component-type': 's-search-result'})
            records.extend(filter(None, map(extract_record, results)))
    finally:
        # quit() ends the whole browser session even when a page fails to
        # load or parse; close() would only close the current window.
        driver.quit()

    # save data to csv file
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Description', 'Price', 'Rating', 'ReviewCount', 'Url'])
        writer.writerows(records)

In [24]:
main('ultrawide monitor')