## Extracción de la primera página

### Importamos librerías y datos

In [20]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [21]:
url = 'https://www.amazon.es/s?k=revlon+professional&i=beauty&rh=n%3A6198054031%2Cn%3A4347698031%2Cn%3A4347699031&dc&__mk_es_ES=%C3%85M%C3%85%C5%BD%C3%95%C3%91&qid=1617263751&rnid=6198054031&ref=sr_nr_n_3'

In [4]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
}

In [5]:
resp = requests.get(url, headers=headers)

In [6]:
resp.status_code

200

In [7]:
html = resp.text

In [8]:
soup = BeautifulSoup(html, 'html.parser')

In [9]:
products = soup.findAll('div', attrs={"class":"sg-col-4-of-12 s-result-item s-asin sg-col-4-of-16 sg-col sg-col-4-of-20"})

### Buscamos los campos que queremos extraer: 

In [10]:
#nombre producto en str:
products[1].find('span', attrs={'class':'a-size-base-plus a-color-base a-text-normal'}).text

'UniqOne Revlon Professional - Tratamiento para el cabello, Coco, 150 ml'

In [11]:
#precio en float:
float(products[1].find('span', attrs={'class':'a-price-whole'}).text.replace(',', '.'))

6.95

In [12]:
# estrellas en float:
float(products[1].find('div', attrs={'class':'a-row a-size-small'}).text.split(' de')[0].replace(',', '.'))

4.5

In [13]:
# nº reviews en int:
int(products[1].find('div', attrs={'class':'a-row a-size-small'}).text.split('estrellas')[1].replace('.', '').strip())

8702

In [14]:
# fecha delivery en str:
products[1].find('div', attrs={'class':'a-row s-align-children-center'}).text.split(', ')[1].replace('\n', '')

'6 de abril'

In [15]:
#precio por 100 ml en float: 
float(products[1].find('span', attrs={'class':'a-size-base a-color-secondary'}).text.split('\xa0€/100 ml)')[0].replace('(', '').replace(',', '.'))

4.63

In [16]:
#precio tachado
float(products[1].find('span', attrs={'class':'a-price a-text-price'}).text.split('\xa0€')[0].replace(',', '.'))

11.66

### Extraemos los datos de toda la página y creamos un DataFrame 

In [17]:
data = []
for product in products: 
    try:
        prod_name = product.find('span', attrs={'class':'a-size-base-plus a-color-base a-text-normal'}).text
    except:
        prod_name = None
    try:
        price = float(product.find('span', attrs={'class':'a-price-whole'}).text.replace(',', '.'))
    except:
        price = None
    try: 
        price_ml = float(product.find('span', attrs={'class':'a-size-base a-color-secondary'}).text.split('\xa0€/100 ml)')[0].replace('(', '').replace(',', '.'))
    except:
        price_ml = None
    try:
        old_price = float(product.find('span', attrs={'class':'a-price a-text-price'}).text.split('\xa0€')[0].replace(',', '.'))
    except:
        old_price = None
    try:
        stars = float(product.find('div', attrs={'class':'a-row a-size-small'}).text.split(' de')[0].replace(',', '.'))
    except:
        stars = None
    try:
        reviews = int(product.find('div', attrs={'class':'a-row a-size-small'}).text.split('estrellas')[1].replace('.', '').strip())
    except:
        reviews = None
    try:
        delivery_date = product.find('div', attrs={'class':'a-row s-align-children-center'}).text.split(', ')[1].replace('\n', '')
    except:
        delivery_date = None
    try: 
        data.append({'prod_name':prod_name, 'price':price, 'old_price':old_price, 'price_ml':price_ml,'stars':stars, 'reviews':reviews, 'deliver_date':delivery_date})
    except:
        print('error')

In [18]:
df = pd.DataFrame(data)

In [22]:
df.head()

Unnamed: 0,prod_name,price,old_price,price_ml,stars,reviews,deliver_date
0,Revlon Professional UniqOne Champú y Acondicio...,,,,4.5,3003.0,
1,UniqOne Revlon Professional - Tratamiento para...,6.95,11.66,4.63,4.5,8702.0,6 de abril
2,UniqOne Revlon Professional Classico Tratamien...,18.61,,,4.5,70.0,9 de abril
3,Revlon Professional ProYou Textura de Peinado ...,12.99,,,4.0,71.0,6 de abril
4,REVLON PROFESSIONAL Nutri Color Filters #400 T...,11.71,15.0,,4.6,80.0,6 de abril


## Extracción de todas las páginas

### Generamos las URLs

In [65]:
url_list = ['https://www.amazon.es/s?k=revlon+professional&i=beauty&rh=n%3A6198054031%2Cn%3A4347698031%2Cn%3A4347699031&dc&__mk_es_ES=%C3%85M%C3%85%C5%BD%C3%95%C3%91&qid=1617263751&rnid=6198054031&ref=sr_nr_n_3']

In [66]:
url_p2 = 'https://www.amazon.es/s?k=revlon+professional&i=beauty&rh=n%3A6198054031%2Cn%3A4347698031%2Cn%3A4347699031&dc&page=2&__mk_es_ES=%C3%85M%C3%85%C5%BD%C3%95%C3%91&qid=1617271529&rnid=6198054031&ref=sr_pg_2'

In [67]:
page_qids = list(range(1617271529, 1617272089, 35))
page_numbers = list(range(2, 18))

In [133]:
for qid, number in zip(page_ids, page_numbers): 
    url_list.append(f'https://www.amazon.es/s?k=revlon+professional&i=beauty&rh=n%3A6198054031%2Cn%3A4347698031%2Cn%3A4347699031&dc&page={number}&__mk_es_ES=%C3%85M%C3%85%C5%BD%C3%95%C3%91&qid={qid}&rnid=6198054031&ref=sr_pg_{number}')      

### Parseamos todas las páginas

In [69]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
}

In [121]:
parsed_pages = []

In [122]:
for url in url_list:
    resp = requests.get(url, headers=headers)
    html = resp.text
    soup = BeautifulSoup(html, 'html.parser')
    products = soup.findAll('div', attrs={"class":"sg-col-4-of-12 s-result-item s-asin sg-col-4-of-16 sg-col sg-col-4-of-20"})
    parsed_pages.append(products)

### Obtenemos los datos que queremos con BeautifulSoup

In [123]:
data2 = []

In [124]:
for page in parsed_pages:
    for product in page:
        try:
            prod_name = product.find('span', attrs={'class':'a-size-base-plus a-color-base a-text-normal'}).text
        except:
            prod_name = None
        try:
            price = float(product.find('span', attrs={'class':'a-price-whole'}).text.replace(',', '.'))
        except:
            price = None
        try: 
            price_ml = float(product.find('span', attrs={'class':'a-size-base a-color-secondary'}).text.split('\xa0€/100 ml)')[0].replace('(', '').replace(',', '.'))
        except:
            price_ml = None
        try:
            old_price = float(product.find('span', attrs={'class':'a-price a-text-price'}).text.split('\xa0€')[0].replace(',', '.'))
        except:
            old_price = None
        try:
            stars = float(product.find('div', attrs={'class':'a-row a-size-small'}).text.split(' de')[0].replace(',', '.'))
        except:
            stars = None
        try:
            reviews = int(product.find('div', attrs={'class':'a-row a-size-small'}).text.split('estrellas')[1].replace('.', '').strip())
        except:
            reviews = None
        try:
            delivery_date = product.find('div', attrs={'class':'a-row s-align-children-center'}).text.split(', ')[1].replace('\n', '')
        except:
            delivery_date = None
        try: 
            data2.append({'prod_name':prod_name, 'price':price, 'old_price':old_price, 'price_ml':price_ml,'stars':stars, 'reviews':reviews, 'delivery_date':delivery_date})
        except:
            print('error')

### Creamos un DataFrame y lo exportamos como csv

In [126]:
df2 = pd.DataFrame(data2)

In [127]:
df2.head()

Unnamed: 0,prod_name,price,old_price,price_ml,stars,reviews,delivery_date
0,Revlon Professional UniqOne Champú y Acondicio...,,,,4.5,3003.0,
1,UniqOne Revlon Professional - Tratamiento para...,6.95,11.66,4.63,4.5,8705.0,6 de abril
2,UniqOne Revlon Professional Classico Tratamien...,14.95,15.6,,4.5,70.0,
3,Revlon Professional ProYou Textura de Peinado ...,12.99,,,4.0,71.0,6 de abril
4,REVLON PROFESSIONAL Nutri Color Filters #400 T...,11.71,15.0,,4.6,80.0,6 de abril


In [128]:
df2.shape

(796, 7)

In [132]:
df2.to_csv('revlon_pro_amazon.csv')