### Amazon web scraping

#### requirements: 
#1. Selenium
#2. BeautifulSoup

In [37]:
# !pip install selenium
# !pip install BeautifulSoup
from bs4 import BeautifulSoup

In [10]:

# firefox & chrome
from selenium import webdriver

# microsoft edge
# from msedge.selenium_tools import Edge, EdgeOptions

In [15]:
# Startup the webdriver
# firefox & chrome
driver = webdriver.Chrome(executable_path=r"D:\Programs\chromedriver.exe")

In [18]:
url = 'https://www.amazon.com'
driver.get(url)

In [20]:
def get_url(search_term):
    '''Generate a url from search term'''
    template = 'https://www.amazon.com/s?k={}&crid=3HANHGCT8DU6Q&sprefix=ultr%2Caps%2C238&ref=nb_sb_ss_ts-a-p_1_4'
    search_term = search_term.replace(' ','+')
    return template.format(search_term)

In [33]:
url = get_url('ultrawide monitor')
print(url)

https://www.amazon.com/s?k=ultrawide+monitor&crid=3HANHGCT8DU6Q&sprefix=ultr%2Caps%2C238&ref=nb_sb_ss_ts-a-p_1_4


In [60]:
driver.get(url)

#### Extract the collection


In [39]:
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [41]:
results = soup.find_all('div',{'data-component-type':'s-search-result'})
len(results)

22

#### Prototype Record

In [44]:
item = results[0]

In [54]:
atag= item.h2.a
atag

<a class="a-link-normal a-text-normal" href="/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&amp;adId=A06897513SCFNU00XHI0Q&amp;url=%2FLG-34WN650-W-34-Inch-UltraWide-DisplayHDR%2Fdp%2FB087JB656Q%2Fref%3Dsr_1_1_sspa%3Fcrid%3D3HANHGCT8DU6Q%26dchild%3D1%26keywords%3Dultrawide%2Bmonitor%26qid%3D1610172179%26sprefix%3Dultr%252Caps%252C238%26sr%3D8-1-spons%26psc%3D1&amp;qualifier=1610172179&amp;id=6646012076797055&amp;widgetName=sp_atf">
<span class="a-size-medium a-color-base a-text-normal" dir="auto">LG 34WN650-W 34-Inch 21:9 UltraWide Full HD (2560 x 1080) IPS Display with VESA DisplayHDR 400 and AMD FreeSync, Silver</span>
</a>

In [53]:
description = atag.text.strip()
description

'LG 34WN650-W 34-Inch 21:9 UltraWide Full HD (2560 x 1080) IPS Display with VESA DisplayHDR 400 and AMD FreeSync, Silver'

In [55]:
url = 'https://www.amazon.com'+atag.get('href')
url

'https://www.amazon.com/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A06897513SCFNU00XHI0Q&url=%2FLG-34WN650-W-34-Inch-UltraWide-DisplayHDR%2Fdp%2FB087JB656Q%2Fref%3Dsr_1_1_sspa%3Fcrid%3D3HANHGCT8DU6Q%26dchild%3D1%26keywords%3Dultrawide%2Bmonitor%26qid%3D1610172179%26sprefix%3Dultr%252Caps%252C238%26sr%3D8-1-spons%26psc%3D1&qualifier=1610172179&id=6646012076797055&widgetName=sp_atf'

In [57]:
price_parent = item.find('span','a-price')
price_parent

<span class="a-price" data-a-color="base" data-a-size="l"><span class="a-offscreen">$346.99</span><span aria-hidden="true"><span class="a-price-symbol">$</span><span class="a-price-whole">346<span class="a-price-decimal">.</span></span><span class="a-price-fraction">99</span></span></span>

In [76]:
price =price_parent.find('span','a-offscreen').text
price

'$346.99'

rating = item.i.text
rating

In [70]:
review_count = item.find('span',{'class':'a-size-base','dir':'auto'}).text
review_count

'189'

### Generalized the pattern

In [78]:
def extract_record(item):
    '''Extract and return data from a single record'''
    
    # description and url
    atag= item.h2.a
    description = atag.text.strip()
    url = 'https://www.amazon.com'+atag.get('href')

    # price
    price_parent = item.find('span','a-price')
    price =price_parent.find('span','a-offscreen').text    
    
    # rank and rating
    rating = item.i.text
    review_count = item.find('span',{'class':'a-size-base','dir':'auto'}).text

    result = {description,price,rating,review_count,url}
    
    return result

In [79]:
records = []
results = soup.find_all('div',{'data-component-type':'s-search-result'})

for item in results:
    records.append(extract_record(item))

AttributeError: 'NoneType' object has no attribute 'find'

### Error handling

In [84]:
def extract_record(item):
    '''Extract and return data from a single record'''
    
    # description and url
    atag= item.h2.a
    description = atag.text.strip()
    url = 'https://www.amazon.com'+atag.get('href')
    
    # price
    try:
        price_parent = item.find('span','a-price')
        price =price_parent.find('span','a-offscreen').text    
    except AttributeError:
        return
    
    # rank and rating
    try:
        rating = item.i.text
        review_count = item.find('span',{'class':'a-size-base','dir':'auto'}).text
    except AttributeError:
        rating = ''
        review_count = ''
        
    result = (description,price,rating,review_count,url)
    
    return result

In [85]:
records = []
results = soup.find_all('div',{'data-component-type':'s-search-result'})

for item in results:
    record = extract_record(item)
    # check if empty
    if record:
        records.append(record)


In [93]:
records[1]

{'$226.99',
 '4.6 out of 5 stars',
 '847',
 'LG 29WN600-W 29" 21:9 UltraWide WFHD IPS HDR1 0 Monitor with FreeSync, Silver',
 'https://www.amazon.com/LG-29WN600-W-29-21-UltraWide/dp/B0876DBCBX/ref=sr_1_2?crid=3HANHGCT8DU6Q&dchild=1&keywords=ultrawide+monitor&qid=1610172179&sprefix=ultr%2Caps%2C238&sr=8-2'}

In [97]:
for row in records:
    print(row)

{'189', 'https://www.amazon.com/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A06897513SCFNU00XHI0Q&url=%2FLG-34WN650-W-34-Inch-UltraWide-DisplayHDR%2Fdp%2FB087JB656Q%2Fref%3Dsr_1_1_sspa%3Fcrid%3D3HANHGCT8DU6Q%26dchild%3D1%26keywords%3Dultrawide%2Bmonitor%26qid%3D1610172179%26sprefix%3Dultr%252Caps%252C238%26sr%3D8-1-spons%26psc%3D1&qualifier=1610172179&id=6646012076797055&widgetName=sp_atf', '$346.99', '4.6 out of 5 stars', 'LG 34WN650-W 34-Inch 21:9 UltraWide Full HD (2560 x 1080) IPS Display with VESA DisplayHDR 400 and AMD FreeSync, Silver'}
{'847', 'LG 29WN600-W 29" 21:9 UltraWide WFHD IPS HDR1 0 Monitor with FreeSync, Silver', 'https://www.amazon.com/LG-29WN600-W-29-21-UltraWide/dp/B0876DBCBX/ref=sr_1_2?crid=3HANHGCT8DU6Q&dchild=1&keywords=ultrawide+monitor&qid=1610172179&sprefix=ultr%2Caps%2C238&sr=8-2', '4.6 out of 5 stars', '$226.99'}
{'Sceptre 35 Inch Curved UltraWide 21: 9 LED Creative Monitor QHD 3440x1440 Frameless AMD Freesync HDMI DisplayPort

### Getting the next page

In [98]:
def get_url(search_term):
    '''Generate a url from search term'''
    template = 'https://www.amazon.com/s?k={}&crid=3HANHGCT8DU6Q&sprefix=ultr%2Caps%2C238&ref=nb_sb_ss_ts-a-p_1_4'
    search_term = search_term.replace(' ','+')
   
    # add term query to url
    url = template.format(search_term)
    
    # add page query placeholder
    url += '&page{}'
    return url

### Putting it all together


In [115]:
import csv
from bs4 import BeautifulSoup
from selenium import webdriver


def get_url(search_term):
    '''Generate a url from search term'''
    template = 'https://www.amazon.com/s?k={}&crid=3HANHGCT8DU6Q&sprefix=ultr%2Caps%2C238&ref=nb_sb_ss_ts-a-p_1_4'
    search_term = search_term.replace(' ','+')
   
    # add term query to url
    url = template.format(search_term)
    
    # add page query placeholder
    url += '&page{}'
    return url

def extract_record(item):
    '''Extract and return data from a single record'''
    
    # description and url
    atag= item.h2.a
    description = atag.text.strip()
    url = 'https://www.amazon.com'+atag.get('href')
    
    # price
    try:
        price_parent = item.find('span','a-price')
        price =price_parent.find('span','a-offscreen').text    
    except AttributeError:
        return
    
    # rank and rating
    try:
        rating = item.i.text
        review_count = item.find('span',{'class':'a-size-base','dir':'auto'}).text
    except AttributeError:
        rating = ''
        review_count = ''
        
    result = (description,price,rating,review_count,url)
    
    return result


def main(search_term):
    '''Run main program routine'''
    driver = webdriver.Chrome(executable_path=r"D:\Programs\chromedriver.exe")    
    
    records = []
    url = get_url(search_term)
    
    for page in range(1,14):
        driver.get(url.format(page))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        results = soup.find_all('div',{'data-component-type':'s-search-result'})
        
        
        for item in results:
            record = extract_record(item)
            print(record)
            if record:
                records.append(record)
    driver.close()
    
    
    ## save data to csv file
    with open('results.csv','w',newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Description','Price','Rating','ReviewCount','Url'])
        writer.writerows(records)
    
    

In [116]:
main('urltrawide monitor')

('LG 34WL60TM-B 34 Inch 21:9 UltraWide 1080p Full HD IPS Monitor (Renewed)', '$279.99', '4.5 out of 5 stars', '30', 'https://www.amazon.com/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A02574201I7UKDWD8KI3C&url=%2FLG-34WL60TM-B-34-Inch-UltraWide%2Fdp%2FB084G7ZDGD%2Fref%3Dsr_1_1_sspa%3Fcrid%3D3HANHGCT8DU6Q%26dchild%3D1%26keywords%3Dultrawide%2Bmonitor%26qid%3D1610183684%26sprefix%3Dultr%252Caps%252C238%26sr%3D8-1-spons%26psc%3D1%26smid%3DAD0BANCB0IN2U&qualifier=1610183684&id=581323281687812&widgetName=sp_atf')
('SAMSUNG LU32R590CWNXZA 32-Inch UR590C UHD 4K Curved Gaming Monitor, Dark Blue Gray', '$419.99', '4.5 out of 5 stars', '693', 'https://www.amazon.com/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A0359020268NI7NFZOLPF&url=%2FSamsung-U32R590-Curved-Monitor-LU32R590CWNXZA%2Fdp%2FB07L9G1BFX%2Fref%3Dsr_1_2_sspa%3Fcrid%3D3HANHGCT8DU6Q%26dchild%3D1%26keywords%3Dultrawide%2Bmonitor%26qid%3D1610183684%26sprefix%3Dultr%252Caps%252

('LG 34WL60TM-B 34 Inch 21:9 UltraWide 1080p Full HD IPS Monitor (Renewed)', '$279.99', '4.5 out of 5 stars', '30', 'https://www.amazon.com/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A02574201I7UKDWD8KI3C&url=%2FLG-34WL60TM-B-34-Inch-UltraWide%2Fdp%2FB084G7ZDGD%2Fref%3Dsr_1_1_sspa%3Fcrid%3D3HANHGCT8DU6Q%26dchild%3D1%26keywords%3Dultrawide%2Bmonitor%26qid%3D1610183692%26sprefix%3Dultr%252Caps%252C238%26sr%3D8-1-spons%26psc%3D1%26smid%3DAD0BANCB0IN2U&qualifier=1610183692&id=4487895915369351&widgetName=sp_atf')
('SAMSUNG LU32R590CWNXZA 32-Inch UR590C UHD 4K Curved Gaming Monitor, Dark Blue Gray', '$419.99', '4.5 out of 5 stars', '693', 'https://www.amazon.com/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A0359020268NI7NFZOLPF&url=%2FSamsung-U32R590-Curved-Monitor-LU32R590CWNXZA%2Fdp%2FB07L9G1BFX%2Fref%3Dsr_1_2_sspa%3Fcrid%3D3HANHGCT8DU6Q%26dchild%3D1%26keywords%3Dultrawide%2Bmonitor%26qid%3D1610183692%26sprefix%3Dultr%252Caps%25

('LG 34WL60TM-B 34 Inch 21:9 UltraWide 1080p Full HD IPS Monitor (Renewed)', '$279.99', '4.5 out of 5 stars', '30', 'https://www.amazon.com/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A02574201I7UKDWD8KI3C&url=%2FLG-34WL60TM-B-34-Inch-UltraWide%2Fdp%2FB084G7ZDGD%2Fref%3Dsr_1_1_sspa%3Fcrid%3D3HANHGCT8DU6Q%26dchild%3D1%26keywords%3Dultrawide%2Bmonitor%26qid%3D1610183701%26sprefix%3Dultr%252Caps%252C238%26sr%3D8-1-spons%26psc%3D1%26smid%3DAD0BANCB0IN2U&qualifier=1610183701&id=3510838637299572&widgetName=sp_atf')
('SAMSUNG LU32R590CWNXZA 32-Inch UR590C UHD 4K Curved Gaming Monitor, Dark Blue Gray', '$419.99', '4.5 out of 5 stars', '693', 'https://www.amazon.com/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A0359020268NI7NFZOLPF&url=%2FSamsung-U32R590-Curved-Monitor-LU32R590CWNXZA%2Fdp%2FB07L9G1BFX%2Fref%3Dsr_1_2_sspa%3Fcrid%3D3HANHGCT8DU6Q%26dchild%3D1%26keywords%3Dultrawide%2Bmonitor%26qid%3D1610183701%26sprefix%3Dultr%252Caps%25

('LG 34WL60TM-B 34 Inch 21:9 UltraWide 1080p Full HD IPS Monitor (Renewed)', '$279.99', '4.5 out of 5 stars', '30', 'https://www.amazon.com/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A02574201I7UKDWD8KI3C&url=%2FLG-34WL60TM-B-34-Inch-UltraWide%2Fdp%2FB084G7ZDGD%2Fref%3Dsr_1_1_sspa%3Fcrid%3D3HANHGCT8DU6Q%26dchild%3D1%26keywords%3Dultrawide%2Bmonitor%26qid%3D1610183707%26sprefix%3Dultr%252Caps%252C238%26sr%3D8-1-spons%26psc%3D1%26smid%3DAD0BANCB0IN2U&qualifier=1610183707&id=8532164450374259&widgetName=sp_atf')
('SAMSUNG LU32R590CWNXZA 32-Inch UR590C UHD 4K Curved Gaming Monitor, Dark Blue Gray', '$419.99', '4.5 out of 5 stars', '693', 'https://www.amazon.com/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A0359020268NI7NFZOLPF&url=%2FSamsung-U32R590-Curved-Monitor-LU32R590CWNXZA%2Fdp%2FB07L9G1BFX%2Fref%3Dsr_1_2_sspa%3Fcrid%3D3HANHGCT8DU6Q%26dchild%3D1%26keywords%3Dultrawide%2Bmonitor%26qid%3D1610183707%26sprefix%3Dultr%252Caps%25

('LG 34WL60TM-B 34 Inch 21:9 UltraWide 1080p Full HD IPS Monitor (Renewed)', '$279.99', '4.5 out of 5 stars', '30', 'https://www.amazon.com/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A02574201I7UKDWD8KI3C&url=%2FLG-34WL60TM-B-34-Inch-UltraWide%2Fdp%2FB084G7ZDGD%2Fref%3Dsr_1_1_sspa%3Fcrid%3D3HANHGCT8DU6Q%26dchild%3D1%26keywords%3Dultrawide%2Bmonitor%26qid%3D1610183713%26sprefix%3Dultr%252Caps%252C238%26sr%3D8-1-spons%26psc%3D1%26smid%3DAD0BANCB0IN2U&qualifier=1610183713&id=4501380060984416&widgetName=sp_atf')
('SAMSUNG LU32R590CWNXZA 32-Inch UR590C UHD 4K Curved Gaming Monitor, Dark Blue Gray', '$419.99', '4.5 out of 5 stars', '693', 'https://www.amazon.com/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A0359020268NI7NFZOLPF&url=%2FSamsung-U32R590-Curved-Monitor-LU32R590CWNXZA%2Fdp%2FB07L9G1BFX%2Fref%3Dsr_1_2_sspa%3Fcrid%3D3HANHGCT8DU6Q%26dchild%3D1%26keywords%3Dultrawide%2Bmonitor%26qid%3D1610183713%26sprefix%3Dultr%252Caps%25

('LG 34WL60TM-B 34 Inch 21:9 UltraWide 1080p Full HD IPS Monitor (Renewed)', '$279.99', '4.5 out of 5 stars', '30', 'https://www.amazon.com/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A02574201I7UKDWD8KI3C&url=%2FLG-34WL60TM-B-34-Inch-UltraWide%2Fdp%2FB084G7ZDGD%2Fref%3Dsr_1_1_sspa%3Fcrid%3D3HANHGCT8DU6Q%26dchild%3D1%26keywords%3Dultrawide%2Bmonitor%26qid%3D1610183718%26sprefix%3Dultr%252Caps%252C238%26sr%3D8-1-spons%26psc%3D1%26smid%3DAD0BANCB0IN2U&qualifier=1610183718&id=4598947423285889&widgetName=sp_atf')
('SAMSUNG LU32R590CWNXZA 32-Inch UR590C UHD 4K Curved Gaming Monitor, Dark Blue Gray', '$419.99', '4.5 out of 5 stars', '693', 'https://www.amazon.com/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A0359020268NI7NFZOLPF&url=%2FSamsung-U32R590-Curved-Monitor-LU32R590CWNXZA%2Fdp%2FB07L9G1BFX%2Fref%3Dsr_1_2_sspa%3Fcrid%3D3HANHGCT8DU6Q%26dchild%3D1%26keywords%3Dultrawide%2Bmonitor%26qid%3D1610183718%26sprefix%3Dultr%252Caps%25

('LG 34WL60TM-B 34 Inch 21:9 UltraWide 1080p Full HD IPS Monitor (Renewed)', '$279.99', '4.5 out of 5 stars', '30', 'https://www.amazon.com/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A02574201I7UKDWD8KI3C&url=%2FLG-34WL60TM-B-34-Inch-UltraWide%2Fdp%2FB084G7ZDGD%2Fref%3Dsr_1_1_sspa%3Fcrid%3D3HANHGCT8DU6Q%26dchild%3D1%26keywords%3Dultrawide%2Bmonitor%26qid%3D1610183723%26sprefix%3Dultr%252Caps%252C238%26sr%3D8-1-spons%26psc%3D1%26smid%3DAD0BANCB0IN2U&qualifier=1610183723&id=887069314577220&widgetName=sp_atf')
('SAMSUNG LU32R590CWNXZA 32-Inch UR590C UHD 4K Curved Gaming Monitor, Dark Blue Gray', '$419.99', '4.5 out of 5 stars', '693', 'https://www.amazon.com/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A0359020268NI7NFZOLPF&url=%2FSamsung-U32R590-Curved-Monitor-LU32R590CWNXZA%2Fdp%2FB07L9G1BFX%2Fref%3Dsr_1_2_sspa%3Fcrid%3D3HANHGCT8DU6Q%26dchild%3D1%26keywords%3Dultrawide%2Bmonitor%26qid%3D1610183723%26sprefix%3Dultr%252Caps%252