# Scraping Amazon with BeautifulSoup

In this project I scraped the Haircare Products section of Amazon, after typing "revlon professional" into the search bar. 

## Extracting the first page

### Importing libraries and data

In [20]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [21]:
# Search-results URL for the first page; the "revlon professional" query is already part of it.
url = (
    'https://www.amazon.es/s?k=revlon+professional&i=beauty'
    '&rh=n%3A6198054031%2Cn%3A4347698031%2Cn%3A4347699031&dc'
    '&__mk_es_ES=%C3%85M%C3%85%C5%BD%C3%95%C3%91'
    '&qid=1617263751&rnid=6198054031&ref=sr_nr_n_3'
)

In [4]:
# Request headers carrying a browser User-Agent, sent in order to avoid being blocked by Amazon.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/89.0.4389.82 Safari/537.36'
    )
}

In [5]:
# Obtaining the response object.
# requests waits forever by default, so a timeout (in seconds) is passed to make
# the cell fail with an exception instead of hanging if the server never answers.
resp = requests.get(url, headers=headers, timeout=30)

In [6]:
# Checking that there are no errors: display the HTTP status code of the response
# (200 means the request succeeded).
resp.status_code

200

In [7]:
# Obtaining the HTML of the page (the response body as text) from the response object
html = resp.text

In [8]:
# Converting the HTML into a BeautifulSoup object, parsed with the 'html.parser' parser
soup = BeautifulSoup(html, 'html.parser')

In [9]:
# Finding all the products in the page: every <div> whose class attribute is the string below.
# find_all is the current Beautiful Soup method name; findAll is only kept as a legacy alias.
products = soup.find_all('div', attrs={"class":"sg-col-4-of-12 s-result-item s-asin sg-col-4-of-16 sg-col sg-col-4-of-20"})

### Looking for the fields we want to scrape and formatting them 

In [10]:
# Product name (str), read from the second product in the list:
(
    products[1]
    .find('span', attrs={'class': 'a-size-base-plus a-color-base a-text-normal'})
    .text
)

'UniqOne Revlon Professional - Tratamiento para el cabello, Coco, 150 ml'

In [11]:
# Price (float): the comma in the text is swapped for a dot so float() can parse it.
float(
    products[1]
    .find('span', attrs={'class': 'a-price-whole'})
    .text
    .replace(',', '.')
)

6.95

In [12]:
# Rating in stars (float): the part of the row text before ' de', with the comma turned into a dot.
float(
    products[1]
    .find('div', attrs={'class': 'a-row a-size-small'})
    .text
    .split(' de')[0]
    .replace(',', '.')
)

4.5

In [13]:
# Number of reviews (int): the text that follows 'estrellas' in the same row,
# with the '.' characters removed and the surrounding whitespace stripped.
int(
    products[1]
    .find('div', attrs={'class': 'a-row a-size-small'})
    .text
    .split('estrellas')[1]
    .replace('.', '')
    .strip()
)

8702

In [14]:
# Delivery date (str): the second ', '-separated piece of the row text, without newlines.
(
    products[1]
    .find('div', attrs={'class': 'a-row s-align-children-center'})
    .text
    .split(', ')[1]
    .replace('\n', '')
)

'6 de abril'

In [15]:
# Price per 100 ml (float): the text before '\xa0€/100 ml)', without the opening
# parenthesis and with the comma turned into a dot.
float(
    products[1]
    .find('span', attrs={'class': 'a-size-base a-color-secondary'})
    .text
    .split('\xa0€/100 ml)')[0]
    .replace('(', '')
    .replace(',', '.')
)

4.63

In [16]:
# Old price (float): the text before the first '\xa0€', with the comma turned into a dot.
float(
    products[1]
    .find('span', attrs={'class': 'a-price a-text-price'})
    .text
    .split('\xa0€')[0]
    .replace(',', '.')
)

11.66

### Extracting the chosen fields for all the products 

In [17]:
# We extract all the fields from every product with a for loop.
# Some products lack one or more fields, so each field is parsed in its own
# try/except and stored as None when it cannot be read.
# Only the errors that a missing or malformed field can raise are caught
# (a bare `except:` would also hide typos and swallow KeyboardInterrupt):
#   AttributeError -> find() returned None because the tag is not there
#   IndexError     -> the text does not contain the expected separator
#   ValueError     -> the text cannot be converted to a number
parse_errors = (AttributeError, IndexError, ValueError)

data = []
for product in products:
    try:
        prod_name = product.find('span', attrs={'class':'a-size-base-plus a-color-base a-text-normal'}).text
    except parse_errors:
        prod_name = None
    try:
        price = float(product.find('span', attrs={'class':'a-price-whole'}).text.replace(',', '.'))
    except parse_errors:
        price = None
    try:
        price_ml = float(product.find('span', attrs={'class':'a-size-base a-color-secondary'}).text.split('\xa0€/100 ml)')[0].replace('(', '').replace(',', '.'))
    except parse_errors:
        price_ml = None
    try:
        old_price = float(product.find('span', attrs={'class':'a-price a-text-price'}).text.split('\xa0€')[0].replace(',', '.'))
    except parse_errors:
        old_price = None

    # Stars and number of reviews are read from the same row, so it is looked up once.
    rating_row = product.find('div', attrs={'class':'a-row a-size-small'})
    try:
        stars = float(rating_row.text.split(' de')[0].replace(',', '.'))
    except parse_errors:
        stars = None
    try:
        reviews = int(rating_row.text.split('estrellas')[1].replace('.', '').strip())
    except parse_errors:
        reviews = None

    try:
        delivery_date = product.find('div', attrs={'class':'a-row s-align-children-center'}).text.split(', ')[1].replace('\n', '')
    except parse_errors:
        delivery_date = None

    # Appending a dict to a list cannot fail, so no try/except is needed here.
    # The key is spelled 'delivery_date' so the columns match the ones built
    # later for all the pages.
    data.append({'prod_name':prod_name, 'price':price, 'old_price':old_price, 'price_ml':price_ml,'stars':stars, 'reviews':reviews, 'delivery_date':delivery_date})

In [18]:
# Converting the extracted data (a list with one dict per product) into a pandas
# DataFrame to check that it looks right
df = pd.DataFrame(data)

In [22]:
# Displaying the first rows of the first-page DataFrame
df.head()

Unnamed: 0,prod_name,price,old_price,price_ml,stars,reviews,deliver_date
0,Revlon Professional UniqOne Champú y Acondicio...,,,,4.5,3003.0,
1,UniqOne Revlon Professional - Tratamiento para...,6.95,11.66,4.63,4.5,8702.0,6 de abril
2,UniqOne Revlon Professional Classico Tratamien...,18.61,,,4.5,70.0,9 de abril
3,Revlon Professional ProYou Textura de Peinado ...,12.99,,,4.0,71.0,6 de abril
4,REVLON PROFESSIONAL Nutri Color Filters #400 T...,11.71,15.0,,4.6,80.0,6 de abril


## Extracting all the pages

### Generating the URL for each page

If we look at the URL of each page, we will see that the URL of the first page is unique. From the second page to the 17th, the URLs all follow the same template, in which only three spots change:
https://www.amazon.es/s?k=revlon+professional&i=beauty&rh=n%3A6198054031%2Cn%3A4347698031%2Cn%3A4347699031&dc&page={number}&__mk_es_ES=%C3%85M%C3%85%C5%BD%C3%95%C3%91&qid={qid}&rnid=6198054031&ref=sr_pg_{number}

The page number appears twice (in the middle and at the end). The qid is a number that starts at 1617271529 and increases by 35 with each page.

With this in mind, we can create our URLs.

In [65]:
# List that will hold the URL of every page.
# It starts with the URL of the first page, which does not follow the pattern of the others.
url_list = [
    'https://www.amazon.es/s?k=revlon+professional&i=beauty'
    '&rh=n%3A6198054031%2Cn%3A4347698031%2Cn%3A4347699031&dc'
    '&__mk_es_ES=%C3%85M%C3%85%C5%BD%C3%95%C3%91'
    '&qid=1617263751&rnid=6198054031&ref=sr_nr_n_3'
]

In [66]:
# URL of page 2, written out to have it as a template for the following pages.
url_p2 = (
    'https://www.amazon.es/s?k=revlon+professional&i=beauty'
    '&rh=n%3A6198054031%2Cn%3A4347698031%2Cn%3A4347699031&dc'
    '&page=2'
    '&__mk_es_ES=%C3%85M%C3%85%C5%BD%C3%95%C3%91'
    '&qid=1617271529&rnid=6198054031&ref=sr_pg_2'
)

In [67]:
# One qid per page: the first one is 1617271529 and each following one is 35 higher (16 values).
# NOTE(review): these values were read off the browser's address bar; confirm the
# site actually needs a matching qid for each page.
page_qids = [1617271529 + 35 * step for step in range(16)]

# Page numbers from 2 to 17, one for each qid above.
page_numbers = [number for number in range(2, 18)]

In [133]:
# Creating the URLs using the elements of the two lists we previously defined.
# zip() pairs each qid in page_qids with its page number in page_numbers; the
# page number is inserted twice, in `page=` and at the end of `ref=sr_pg_`.
# Note: this cell appends to url_list, so running it twice adds the URLs twice.
for qid, number in zip(page_qids, page_numbers):
    url_list.append(f'https://www.amazon.es/s?k=revlon+professional&i=beauty&rh=n%3A6198054031%2Cn%3A4347698031%2Cn%3A4347699031&dc&page={number}&__mk_es_ES=%C3%85M%C3%85%C5%BD%C3%95%C3%91&qid={qid}&rnid=6198054031&ref=sr_pg_{number}')

### Parsing all the pages

In [69]:
# Headers with a browser User-Agent, sent to avoid being blocked by Amazon
# (same value as the headers used for the first page).
headers = {
    'User-Agent': ' '.join([
        'Mozilla/5.0',
        '(Windows NT 10.0; Win64; x64)',
        'AppleWebKit/537.36',
        '(KHTML, like Gecko)',
        'Chrome/89.0.4389.82',
        'Safari/537.36',
    ])
}

In [121]:
# Creating the empty list where we will put the parsed data of all pages
# (one entry per page, each entry being the list of products found on that page)
parsed_pages = []

In [122]:
# Repeating the process seen in the first part of the notebook, this time for every page.
# The loop variables carry a `page_` prefix so they do not overwrite the
# url/resp/html/soup/products objects of the first-page walkthrough above.
for page_url in url_list:
    # requests waits forever by default; the timeout (in seconds) makes a stalled
    # request raise instead of blocking the whole loop.
    page_resp = requests.get(page_url, headers=headers, timeout=30)
    if page_resp.status_code != 200:
        # The page is still parsed so parsed_pages keeps one entry per URL,
        # but a failed request no longer goes unnoticed.
        print(f'warning: HTTP {page_resp.status_code} for {page_url}')
    page_soup = BeautifulSoup(page_resp.text, 'html.parser')
    # find_all is the current Beautiful Soup method name; findAll is only a legacy alias.
    page_products = page_soup.find_all('div', attrs={"class":"sg-col-4-of-12 s-result-item s-asin sg-col-4-of-16 sg-col sg-col-4-of-20"})
    parsed_pages.append(page_products)  # one list of products per page

### Extracting the chosen fields for all the products 

In [123]:
# We create an empty list to which we will append one dictionary per product,
# built from the data in parsed_pages
data2 = []

In [124]:
# We run the same extraction as before, now for every product of every page in parsed_pages.
# The per-product parsing lives in parse_product() so the nested loop stays short.

# Errors that a missing or malformed field can raise (caught instead of using a
# bare `except:`, which would also hide typos and swallow KeyboardInterrupt):
#   AttributeError -> find() returned None because the tag is not there
#   IndexError     -> the text does not contain the expected separator
#   ValueError     -> the text cannot be converted to a number
_PARSE_ERRORS = (AttributeError, IndexError, ValueError)


def _try_parse(extract):
    """Call ``extract()`` and return its result, or None if the field is missing or malformed."""
    try:
        return extract()
    except _PARSE_ERRORS:
        return None


def parse_product(product):
    """Extract the scraped fields of one product into a dict.

    Parameters
    ----------
    product : object with a BeautifulSoup-style ``find(name, attrs=...)`` method
        One element of a page stored in ``parsed_pages``.

    Returns
    -------
    dict
        Keys ``prod_name``, ``price``, ``old_price``, ``price_ml``, ``stars``,
        ``reviews`` and ``delivery_date`` (in that order). A field that cannot
        be read from the product is None.
    """
    def text_of(tag_name, css_class):
        # Raises AttributeError when the tag is absent, because find() returns None.
        return product.find(tag_name, attrs={'class': css_class}).text

    return {
        'prod_name': _try_parse(lambda: text_of('span', 'a-size-base-plus a-color-base a-text-normal')),
        # Numbers use a decimal comma in the page text, hence the replace(',', '.').
        'price': _try_parse(lambda: float(text_of('span', 'a-price-whole').replace(',', '.'))),
        'old_price': _try_parse(lambda: float(text_of('span', 'a-price a-text-price').split('\xa0€')[0].replace(',', '.'))),
        'price_ml': _try_parse(lambda: float(text_of('span', 'a-size-base a-color-secondary').split('\xa0€/100 ml)')[0].replace('(', '').replace(',', '.'))),
        # Stars and number of reviews are both read from the same row.
        'stars': _try_parse(lambda: float(text_of('div', 'a-row a-size-small').split(' de')[0].replace(',', '.'))),
        'reviews': _try_parse(lambda: int(text_of('div', 'a-row a-size-small').split('estrellas')[1].replace('.', '').strip())),
        'delivery_date': _try_parse(lambda: text_of('div', 'a-row s-align-children-center').split(', ')[1].replace('\n', '')),
    }


# Note: this appends to data2, so running the cell twice adds every product twice.
for page in parsed_pages:
    for product in page:
        data2.append(parse_product(product))

### Creating a Pandas DataFrame and exporting it as a csv

In [126]:
# We create the DataFrame from the list of dictionaries that resulted from the loop
df2 = pd.DataFrame(data2)

In [127]:
# Checking there are no errors by displaying the first rows
df2.head()

Unnamed: 0,prod_name,price,old_price,price_ml,stars,reviews,delivery_date
0,Revlon Professional UniqOne Champú y Acondicio...,,,,4.5,3003.0,
1,UniqOne Revlon Professional - Tratamiento para...,6.95,11.66,4.63,4.5,8705.0,6 de abril
2,UniqOne Revlon Professional Classico Tratamien...,14.95,15.6,,4.5,70.0,
3,Revlon Professional ProYou Textura de Peinado ...,12.99,,,4.0,71.0,6 de abril
4,REVLON PROFESSIONAL Nutri Color Filters #400 T...,11.71,15.0,,4.6,80.0,6 de abril


In [128]:
# Checking there are as many rows as products in the Amazon section
# (shape is a (rows, columns) tuple)
df2.shape

(796, 7)

In [132]:
# Exporting it as a csv file in the current working directory.
# NOTE(review): `index=` is not passed, so pandas' default applies and the row
# index is presumably written as an unnamed first column; confirm that is wanted.
df2.to_csv('revlon_pro_amazon.csv')