# What is web scraping
Web scraping is the process of using bots to extract content and data from a website.

Unlike screen scraping, which only copies pixels displayed onscreen, web scraping extracts underlying HTML code and, with it, data stored in a database. The scraper can then replicate entire website content elsewhere.

Web scraping is used in a variety of digital businesses that rely on data harvesting. Legitimate use cases include:

-Search engine bots crawling a site, analyzing its content and then ranking it.

-Price comparison sites deploying bots to auto-fetch prices and product descriptions for allied seller websites.

-Market research companies using scrapers to pull data from forums and social media (e.g., for sentiment analysis).

In [2]:
import csv
from bs4 import BeautifulSoup
from msedge.selenium_tools import Edge,EdgeOptions

In [84]:
options = EdgeOptions()
options.use_chromium = True
driver= Edge(options=options)

  driver= Edge(options=options)


In [85]:
url ='https://www.amazon.com'
driver.get(url)

In [86]:
def get_url(search_term):
    """generate a url from search term"""
    template='https://www.amazon.com/s?k={}&ref=nb_sb_ss_sc_2_10'
    search_term=search_term.replace(' ','+')
    return template.format(search_term)

In [87]:
url= get_url('ultrawide monitor')
print(url)

https://www.amazon.com/s?k=ultrawide+monitor&ref=nb_sb_ss_sc_2_10


In [88]:
driver.get(url)

In [89]:
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [90]:
result = soup.find_all('div',{'data-component-type':'s-search-result'})

In [91]:
len(result)

22

# prototype  the record

In [92]:
item =result[0]

In [93]:
atag=item.h2.a

In [94]:
description=atag.text.strip()

In [95]:
url='https://www.amazon.com'+ atag.get('href')

In [96]:
price_parent =item.find('span','a-price')

In [97]:
price=price_parent.find('span','a-offscreen').text

In [98]:
rating = item.i.text

In [99]:
review_count=item.find('span',{'class':'a-size-base s-underline-text'}).text

# generalize the pattern

In [112]:
def extract_record(item):
    #description and url
    atag=item.h2.a
    description=atag.text.strip()
    url='https://www.amazon.com'+ atag.get('href')
    
    #price
    price_parent =item.find('span','a-price')
    price=price_parent.find('span','a-offscreen').text
    
    # rank and rating
    rating = item.i.text
    review_count=item.find('span',{'class':'a-size-base s-underline-text'}).text
    
    result = (description,price,rating,review_count,url)
    
    return result

In [115]:
records=[]
result =soup.find_all('div',{'data-component-type' : 's-search-result'})

for item in result:
    records.append(extract_record(item))

# error handilng

In [114]:
def extract_record(item):
    
    #description and url
    atag=item.h2.a
    description=atag.text.strip()
    url='https://www.amazon.com'+ atag.get('href')
    
    try:
        #price
        price_parent =item.find('span','a-price')
        price=price_parent.find('span','a-offscreen').text
    except AttributeError:
        return
    
    try:
        # rank and rating
        rating = item.i.text
        review_count=item.find('span',{'class':'a-size-base s-underline-text'}).text
    except AttributeError:
        rating=''
        review_count=''
    

    
    
    result = (description,price,rating,review_count,url)
    
    return result

In [121]:
records=[]
result =soup.find_all('div',{'data-component-type' : 's-search-result'})

for item in result:
    record=extract_record(item)
    if record:
        records.append(record)

In [110]:
'''records=[]
result =soup.find_all('div',{'data-component-type' : 's-search-result'})

for item in result:
    records.append(extract_record(item))
    if records:
        records.append(result)'''

In [127]:
for row in records:
    print(row[1])

$336.99
$299.99
$393.75
$289.97
$176.99
$379.99
$449.99
$399.99
$137.93
$336.99
$296.99
$109.73
$389.18
$249.99
$199.99
$115.99
$611.97
$229.87
$169.99
$169.99
$139.99


In [None]:
'''for row in records:
    print(row[0])'''

# geting next page

In [147]:
def get_url(search_term):
    """generate a url from search term"""
    template='https://www.amazon.com/s?k={}&ref=nb_sb_ss_sc_2_10'
    search_term=search_term.replace(' ','+')
    
    #addterm query to url
    url=template.format(search_term)
    
    #add page query placeholder
    
    url +='&page{}'
    
    return url

# putting it all together

In [176]:
import csv
from bs4 import BeautifulSoup
from msedge.selenium_tools import Edge,EdgeOptions

def get_url(search_term):
    """generate a url from search term"""
    template='https://www.amazon.com/s?k={}&ref=nb_sb_ss_sc_2_10'
    search_term=search_term.replace(' ','+')
    
    #addterm query to url
    url=template.format(search_term)
    
    #add page query placeholder
    
    url +='&page{}'
    
    return url
def extract_record(item):
    #description and url
    atag=item.h2.a
    description=atag.text.strip()
    url='https://www.amazon.com'+ atag.get('href')
    
    try:
        #price
        price_parent =item.find('span','a-price')
        price=price_parent.find('span','a-offscreen').text
    except AttributeError:
        return
    
    try:
         # rank and rating
        rating = item.i.text
        review_count=item.find('span',{'class':'a-size-base s-underline-text'}).text
    except AttributeError:
        rating=''
        review_count=''
    

    
    
    result = (description,price,rating,review_count,url)
    
    return result

def main(search_term):
    #start the webdriver
    options = EdgeOptions()
    options.use_chromium = True
    driver= Edge(options=options)
    
    record=[]
    url=get_url(search_term)
    
    for page in range(1,21):
        driver.get(url.format(page))
        soup = BeautifulSoup(driver.page_source,'html.parser')
        results =soup.find_all('div',{'data-component-type' : 's-search-result'})
        
        for item in results:
            record=extract_record(item)
            if record:
                records.append(record)
                
    driver.close()
    
    #save the data to csv
    with open('ultrawide_monitor_data.csv','w',newline='',encoding='utf-8') as f:
        writer =csv.writer(f)
        writer.writerow(['Description','Price','Rating','ReviewCount','Url'])
        writer.writerows(records)
        

In [177]:
for row in records:
    print(row[2])

4.6 out of 5 stars
4.1 out of 5 stars
4.6 out of 5 stars
4.4 out of 5 stars
4.5 out of 5 stars
4.4 out of 5 stars
4.2 out of 5 stars
4.6 out of 5 stars
4.6 out of 5 stars
4.6 out of 5 stars
4.6 out of 5 stars
4.3 out of 5 stars
4.7 out of 5 stars
4.7 out of 5 stars
4.5 out of 5 stars
4.4 out of 5 stars
4.5 out of 5 stars
4.6 out of 5 stars
4.5 out of 5 stars
4.5 out of 5 stars
4.4 out of 5 stars
4.4 out of 5 stars
4.6 out of 5 stars
4.6 out of 5 stars
4.4 out of 5 stars
4.5 out of 5 stars
4.4 out of 5 stars
4.1 out of 5 stars
4.6 out of 5 stars
4.6 out of 5 stars
4.6 out of 5 stars
4.6 out of 5 stars
4.2 out of 5 stars
4.7 out of 5 stars
4.7 out of 5 stars
4.5 out of 5 stars
4.3 out of 5 stars
4.5 out of 5 stars
4.6 out of 5 stars
4.5 out of 5 stars
4.5 out of 5 stars
4.1 out of 5 stars
4.4 out of 5 stars
4.6 out of 5 stars
4.6 out of 5 stars
4.4 out of 5 stars
4.5 out of 5 stars
4.4 out of 5 stars
4.1 out of 5 stars
4.6 out of 5 stars
4.6 out of 5 stars
4.6 out of 5 stars
4.6 out of 5

4.2 out of 5 stars
4.6 out of 5 stars
4.6 out of 5 stars
4.6 out of 5 stars
4.6 out of 5 stars
4.3 out of 5 stars
4.7 out of 5 stars
4.7 out of 5 stars
4.5 out of 5 stars
4.3 out of 5 stars
4.5 out of 5 stars
4.6 out of 5 stars
4.5 out of 5 stars
4.5 out of 5 stars
4.4 out of 5 stars
4.6 out of 5 stars
4.1 out of 5 stars
4.6 out of 5 stars
4.4 out of 5 stars
4.5 out of 5 stars
4.4 out of 5 stars
4.2 out of 5 stars
4.6 out of 5 stars
4.6 out of 5 stars
4.6 out of 5 stars
4.6 out of 5 stars
4.3 out of 5 stars
4.7 out of 5 stars
4.7 out of 5 stars
4.5 out of 5 stars
4.3 out of 5 stars
4.5 out of 5 stars
4.6 out of 5 stars
4.5 out of 5 stars
4.5 out of 5 stars
4.4 out of 5 stars
4.6 out of 5 stars
4.1 out of 5 stars
4.6 out of 5 stars
4.4 out of 5 stars
4.5 out of 5 stars
4.4 out of 5 stars
4.2 out of 5 stars
4.6 out of 5 stars
4.6 out of 5 stars
4.6 out of 5 stars
4.6 out of 5 stars
4.3 out of 5 stars
4.7 out of 5 stars
4.7 out of 5 stars
4.5 out of 5 stars
4.3 out of 5 stars
4.5 out of 5

4.6 out of 5 stars
4.6 out of 5 stars
4.2 out of 5 stars
4.7 out of 5 stars
4.7 out of 5 stars
4.5 out of 5 stars
4.3 out of 5 stars
4.5 out of 5 stars
4.6 out of 5 stars
4.5 out of 5 stars
4.5 out of 5 stars
4.4 out of 5 stars
4.6 out of 5 stars
4.1 out of 5 stars
4.6 out of 5 stars
4.4 out of 5 stars
4.5 out of 5 stars
4.4 out of 5 stars
4.1 out of 5 stars
4.6 out of 5 stars
4.6 out of 5 stars
4.6 out of 5 stars
4.6 out of 5 stars
4.2 out of 5 stars
4.7 out of 5 stars
4.7 out of 5 stars
4.5 out of 5 stars
4.3 out of 5 stars
4.5 out of 5 stars
4.6 out of 5 stars
4.5 out of 5 stars
4.5 out of 5 stars
4.4 out of 5 stars
4.6 out of 5 stars
4.1 out of 5 stars
4.6 out of 5 stars
4.4 out of 5 stars
4.5 out of 5 stars
4.4 out of 5 stars
4.1 out of 5 stars
4.6 out of 5 stars
4.6 out of 5 stars
4.6 out of 5 stars
4.6 out of 5 stars
4.2 out of 5 stars
4.7 out of 5 stars
4.7 out of 5 stars
4.5 out of 5 stars
4.3 out of 5 stars
4.5 out of 5 stars
4.6 out of 5 stars
4.5 out of 5 stars
4.5 out of 5

In [178]:
main('ultrawide monitor')

  driver= Edge(options=options)
