In [1]:
import csv
from bs4 import BeautifulSoup
from msedge.selenium_tools import Edge, EdgeOptions

## Startup web driver

In [2]:
# Build the Edge driver options with use_chromium enabled, then start the
# browser session that the following cells drive through `driver`.
options = EdgeOptions()
options.use_chromium = True
driver = Edge(options=options)

In [87]:
url = 'https://www.amazon.com'
driver.get(url)

In [88]:
def get_url(search_term):
    """Build the Amazon search-results URL for a search term.

    Args:
        search_term: Free-text query, e.g. ``'ultrawide monitor'``.

    Returns:
        The search URL with the query encoded into the ``k`` parameter.
    """
    # Local import so this cell does not depend on imports made elsewhere.
    from urllib.parse import quote_plus

    template = 'https://www.amazon.com/s?k={}&ref=nb_sb_noss_1'
    # quote_plus still maps spaces to '+', but also escapes characters such
    # as '+', '&' and '#' that would otherwise corrupt the query string.
    return template.format(quote_plus(search_term))

In [89]:
url = get_url('ultrawide monitor')
print(url)

https://www.amazon.com/s?k=ultrawide+monitor&ref=nb_sb_noss_1


In [90]:
driver.get(url)

## Extract the collection

In [91]:
soup = BeautifulSoup(driver.page_source,'html.parser')

In [92]:
results = soup.find_all('div', {'data-component-type' : 's-search-result'})

In [93]:
len(results)

22

In [94]:
item = results[0]
print(item)

In [95]:
atag = item.h2.a

In [96]:
Name = atag.text.strip()
print(Name)

LG 29WN600-W 29" 21:9 UltraWide WFHD IPS HDR1 0 Monitor with FreeSync, Silver


In [97]:
atag.get('href')

'/LG-29WN600-W-29-21-UltraWide/dp/B0876DBCBX/ref=sr_1_1?dchild=1&keywords=ultrawide+monitor&qid=1609260204&sr=8-1'

In [98]:
url = 'https://www.amazon.com' + atag.get('href')
print(url)

https://www.amazon.com/LG-29WN600-W-29-21-UltraWide/dp/B0876DBCBX/ref=sr_1_1?dchild=1&keywords=ultrawide+monitor&qid=1609260204&sr=8-1


In [99]:
price_parent = item.find('span','a-price')

In [100]:
price = price_parent.find('span','a-offscreen').text
print(price)

$226.99


In [101]:
rating = item.i.text
print(rating)

4.6 out of 5 stars


In [102]:
reviews = item.find('span', {'class' : 'a-size-base' , 'dir' : 'auto'}).text
print(reviews)

757


## Generalise the pattern

In [116]:
def extract_record(item):
    """Extract one product record from a single search-result element.

    Args:
        item: A BeautifulSoup tag for one search result (in this notebook, a
            ``div`` with ``data-component-type="s-search-result"``).

    Returns:
        A tuple ``(name, price, rating, reviews, url)`` of strings, or
        ``None`` when the result has no title link or no price.
        ``rating`` and ``reviews`` are both ``' '`` when either lookup fails.
    """
    # Local import so this cell does not depend on imports made elsewhere.
    from urllib.parse import urljoin

    # description and url
    heading = item.h2
    atag = heading.a if heading is not None else None
    href = atag.get('href') if atag is not None else None
    if not href:
        # Without a title link there is nothing to record; skip the result
        # instead of letting an AttributeError/TypeError abort the scrape.
        return None
    name = atag.text.strip()
    # urljoin resolves relative hrefs against the site root and leaves
    # already-absolute hrefs untouched.
    url = urljoin('https://www.amazon.com', href)

    # price: results without a price are skipped entirely
    price_parent = item.find('span', 'a-price')
    price_tag = price_parent.find('span', 'a-offscreen') if price_parent is not None else None
    if price_tag is None:
        return None
    price = price_tag.text

    try:
        # rating and reviews are taken together: if either is missing,
        # both fall back to the blank placeholder.
        rating = item.i.text
        reviews = item.find('span', {'class': 'a-size-base', 'dir': 'auto'}).text
    except AttributeError:
        rating = ' '
        reviews = ' '

    return (name, price, rating, reviews, url)

In [127]:
# Re-collect every search-result container from the parsed page and report
# how many were found.
results = soup.find_all('div', {'data-component-type' : 's-search-result'})
print(len(results))

# Keep only the results for which extract_record produced a record.
records = []
for item in results:
    record = extract_record(item)
    if not record:
        continue
    records.append(record)

22


In [128]:
records[0]

('LG 29WN600-W 29" 21:9 UltraWide WFHD IPS HDR1 0 Monitor with FreeSync, Silver',
 '$226.99',
 '4.6 out of 5 stars',
 '757',
 'https://www.amazon.com/LG-29WN600-W-29-21-UltraWide/dp/B0876DBCBX/ref=sr_1_1?dchild=1&keywords=ultrawide+monitor&qid=1609260204&sr=8-1')

In [131]:
# Print the price field (index 1 of each record tuple) as a quick sanity check.
for row in records:
    print(row[1])

$226.99
$379.97
$549.99
$799.99
$195.99
$278.99
$622.99
$56.88
$339.46
$149.99
$396.99
$336.31
$199.99
$449.99
$346.99
$449.99
$16.89
$569.53
$29.99


## Getting next pages

In [132]:
def get_url(search_term):
    """Build a paginated Amazon search URL template for a search term.

    Args:
        search_term: Free-text query, e.g. ``'ultrawide monitor'``.

    Returns:
        A URL string containing one ``{}`` placeholder for the page number;
        call ``.format(page)`` on it to get the URL of a specific page.
    """
    # Local import so this cell does not depend on imports made elsewhere.
    from urllib.parse import quote_plus

    template = 'https://www.amazon.com/s?k={}&ref=nb_sb_noss_1'
    # quote_plus maps spaces to '+' and escapes '+', '&', '#', '{', '}' so the
    # query stays intact and the later .format(page) sees only one placeholder.
    url = template.format(quote_plus(search_term))
    # The '=' is required: '&page{}' would render as '&page2', which is not
    # a page parameter.
    url += '&page={}'

    return url

# Compiling everything

In [141]:
import csv
from bs4 import BeautifulSoup
from msedge.selenium_tools import Edge, EdgeOptions


def get_url(search_term):
    """Build a paginated Amazon search URL template for a search term.

    Args:
        search_term: Free-text query, e.g. ``'ultrawide monitor'``.

    Returns:
        A URL string containing one ``{}`` placeholder for the page number;
        call ``.format(page)`` on it to get the URL of a specific page.
    """
    # Local import so the function does not depend on imports made elsewhere.
    from urllib.parse import quote_plus

    template = 'https://www.amazon.com/s?k={}'
    # quote_plus maps spaces to '+' and escapes '+', '&', '#', '{', '}' so the
    # query stays intact and the later .format(page) sees only one placeholder.
    url = template.format(quote_plus(search_term))
    url += '&page={}'

    return url

def extract_record(item):
    """Extract one product record from a single search-result element.

    Args:
        item: A BeautifulSoup tag for one search result (in this notebook, a
            ``div`` with ``data-component-type="s-search-result"``).

    Returns:
        A tuple ``(name, price, rating, reviews, url)`` of strings, or
        ``None`` when the result has no title link or no price.
        ``rating`` and ``reviews`` are both ``' '`` when either lookup fails.
    """
    # Local import so the function does not depend on imports made elsewhere.
    from urllib.parse import urljoin

    # description and url
    heading = item.h2
    atag = heading.a if heading is not None else None
    href = atag.get('href') if atag is not None else None
    if not href:
        # Without a title link there is nothing to record; skip the result
        # instead of letting an AttributeError/TypeError abort the scrape.
        return None
    name = atag.text.strip()
    # urljoin resolves relative hrefs against the site root and leaves
    # already-absolute hrefs untouched.
    url = urljoin('https://www.amazon.com', href)

    # price: results without a price are skipped entirely
    price_parent = item.find('span', 'a-price')
    price_tag = price_parent.find('span', 'a-offscreen') if price_parent is not None else None
    if price_tag is None:
        return None
    price = price_tag.text

    try:
        # rating and reviews are taken together: if either is missing,
        # both fall back to the blank placeholder.
        rating = item.i.text
        reviews = item.find('span', {'class': 'a-size-base', 'dir': 'auto'}).text
    except AttributeError:
        rating = ' '
        reviews = ' '

    return (name, price, rating, reviews, url)

def main(search_term, pages=20, output_path='amazon01.csv'):
    """Scrape Amazon search results for a term and save them to a CSV file.

    Args:
        search_term: Free-text query passed to ``get_url``.
        pages: Number of result pages to visit, starting from page 1.
            Defaults to 20, the previously hard-coded value.
        output_path: Destination CSV path. Defaults to ``'amazon01.csv'``,
            the previously hard-coded filename.

    Returns:
        None. The collected records are written to ``output_path``.
    """
    # Imported up front so a missing pandas fails before a browser is opened.
    import pandas as pd

    options = EdgeOptions()
    options.use_chromium = True
    driver = Edge(options=options)

    records = []
    url = get_url(search_term)

    try:
        for page in range(1, pages + 1):
            driver.get(url.format(page))
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            results = soup.find_all('div', {'data-component-type': 's-search-result'})

            for item in results:
                record = extract_record(item)

                if record:
                    records.append(record)
    finally:
        # quit() ends the whole driver session (close() only closes the
        # current window); the finally clause makes sure the browser is
        # released even when a page load or parse step raises.
        driver.quit()

    ## save to csv
    df = pd.DataFrame(records)
    df.to_csv(output_path)

In [None]:
main('ultrawide monitor')