### Creando un ejemplo completo de un producto

In [1]:
import requests
import bs4 
import pandas as pd
from bs4 import BeautifulSoup
import requests

def get_title(soup):
    """Return the product title from a parsed product page.

    Looks up the ``<span id="productTitle">`` element on ``soup`` and returns
    its ``.string`` with surrounding whitespace removed.

    Returns:
        The stripped title, or ``""`` when the lookup raises ``AttributeError``
        (for example, the span is missing or its ``.string`` is ``None``).
    """
    try:
        return soup.find("span", attrs={"id": 'productTitle'}).string.strip()
    except AttributeError:
        return ""

def get_price(soup):
    """Return the product price text with commas stripped out.

    Reads the ``.text`` of the first ``<span class="a-offscreen">`` found on
    ``soup`` and removes every ``,`` from it.

    Returns:
        The price text without commas, or ``""`` when the lookup raises
        ``AttributeError`` (for example, no such span exists).
    """
    try:
        price_tag = soup.find("span", attrs={'class': 'a-offscreen'})
        raw_price = price_tag.text
        return raw_price.replace(',', '')
    except AttributeError:
        return ""

def get_rating(soup):
    """Return the product rating text from a parsed product page.

    Two selectors are tried in order: the ``<i>`` element with class
    ``a-icon a-icon-star a-star-4-5``, then the first
    ``<span class="a-icon-alt">``. The first one whose ``.string`` can be
    read and stripped wins.

    Returns:
        The stripped rating text, or ``""`` when neither selector yields one.
    """
    candidates = (
        ("i", {'class': 'a-icon a-icon-star a-star-4-5'}),
        ("span", {'class': 'a-icon-alt'}),
    )
    for tag_name, attrs in candidates:
        try:
            return soup.find(tag_name, attrs=attrs).string.strip()
        except AttributeError:
            # Element missing or without a single string child: try the next
            # selector. Only AttributeError is treated as "not found" so that
            # KeyboardInterrupt/SystemExit are no longer swallowed here.
            continue
    return ""

def get_review_count(soup):
    """Return the review-count text from a parsed product page.

    Reads the ``.string`` of ``<span id="acrCustomerReviewText">`` and strips
    surrounding whitespace.

    Returns:
        The stripped text, or ``""`` when the lookup raises
        ``AttributeError`` (for example, the span is missing).
    """
    try:
        counter = soup.find("span", attrs={'id': 'acrCustomerReviewText'})
        return counter.string.strip()
    except AttributeError:
        return ""

def get_availability(soup):
    """Return the availability text from a parsed product page.

    Finds ``<div id="availability">``, then the first ``<span>`` inside it,
    and returns that span's ``.string`` stripped of surrounding whitespace.

    Returns:
        The stripped text, or ``""`` when any step of the lookup raises
        ``AttributeError`` (for example, the div or the span is missing).
    """
    try:
        container = soup.find("div", attrs={'id': 'availability'})
        return container.find("span").string.strip()
    except AttributeError:
        return ""

if __name__ == '__main__':
    # Fetch a single hard-coded product page and print the scraped fields.

    # Request headers sent with the page fetch (User-Agent and language).
    HEADERS = ({'User-Agent':
                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
                'Accept-Language': 'en-US, en;q=0.5'})

    URL = "https://www.amazon.com/ZOTAC-Graphics-IceStorm-Advanced-ZT-A30900J-10P/dp/B08ZL6XD9H/"

    # requests has no default timeout; without one a stalled connection would
    # block this cell indefinitely.
    webpage = requests.get(URL, headers=HEADERS, timeout=10)

    soup = BeautifulSoup(webpage.content, "lxml")

    # Each helper returns "" when its element cannot be found on the page.
    print("Product Title =", get_title(soup))
    print("Product Price =", get_price(soup))
    print("Product Rating =", get_rating(soup))
    print("Number of Product Reviews =", get_review_count(soup))
    print("Availability =", get_availability(soup))
    print()
    print()

Product Title = 
Product Price = 
Product Rating = 
Number of Product Reviews = 
Availability = 




### Caso real: Web scraping de auriculares inalámbricos de Amazon


In [2]:
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
import time
import csv
from bs4 import BeautifulSoup

def get_title(driver):
    """Return the product title shown on the page currently loaded in ``driver``.

    Reads the ``.text`` of the element with id ``productTitle`` and strips
    surrounding whitespace.

    Returns:
        The stripped title, or ``""`` when the element cannot be read.
    """
    try:
        return driver.find_element(By.ID, 'productTitle').text.strip()
    except Exception:
        # Best-effort lookup: any lookup error yields an empty field. Catching
        # Exception (not a bare except) lets Ctrl+C / SystemExit propagate.
        return ""

def get_price(driver):
    """Return the product price text from the page currently loaded in ``driver``.

    The page source is parsed with BeautifulSoup; the first
    ``<span class="a-price">`` is located and the stripped text of its first
    nested ``<span>`` is returned.

    Returns:
        The price text, or ``None`` when the price span is not found or an
        exception occurs (in which case the error is printed).
    """
    price = None
    try:
        # Parse the HTML of the page the driver is currently showing.
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Locate the price container; the price text sits in its first span.
        price_box = soup.find("span", {"class": "a-price"})
        if price_box:
            price = price_box.find("span").text.strip()
    except Exception as e:
        print(f"Error: {e}")

    return price

def get_rating(driver):
    """Return the product rating text from the page currently loaded in ``driver``.

    The page source is parsed with BeautifulSoup and the stripped text of the
    first ``<i>`` element matching class ``a-icon-star`` is returned.

    Returns:
        The rating text, or ``None`` when the element is not found or an
        exception occurs (in which case the error is printed).
    """
    rating = None
    try:
        # Parse the HTML of the page the driver is currently showing.
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # The star icon element carries the rating as its text.
        star_icon = soup.find("i", {"class": "a-icon-star"})
        if star_icon:
            rating = star_icon.text.strip()
    except Exception as e:
        print(f"Error: {e}")

    return rating
   

def get_review_count(driver):
    """Return the review-count text shown on the page loaded in ``driver``.

    Reads the ``.text`` of the element with id ``acrCustomerReviewText`` and
    strips surrounding whitespace.

    Returns:
        The stripped text, or ``""`` when the element cannot be read.
    """
    try:
        return driver.find_element(By.ID, 'acrCustomerReviewText').text.strip()
    except Exception:
        # Best-effort lookup: any lookup error yields an empty field. Catching
        # Exception (not a bare except) lets Ctrl+C / SystemExit propagate.
        return ""

def get_availability(driver):
    """Return the availability text shown on the page loaded in ``driver``.

    Finds the element with id ``availability``, then the first ``<span>``
    inside it, and returns that span's ``.text`` stripped of whitespace.

    Returns:
        The stripped text, or ``"Not Available"`` when the lookup fails.
    """
    try:
        container = driver.find_element(By.ID, 'availability')
        return container.find_element(By.TAG_NAME, 'span').text.strip()
    except Exception:
        # Best-effort lookup: any lookup error maps to the fallback label.
        # Catching Exception (not a bare except) lets Ctrl+C propagate.
        return "Not Available"

def main():
    """Scrape every product linked from an Amazon listing page into a CSV file.

    Starts a headless Chrome session, prompts for a URL, collects the ``href``
    of every ``a.a-link-normal.s-no-outline`` anchor on that page, then visits
    each link in turn. For each product the title, price, rating, review count
    and availability are printed and appended as a row to
    ``amazon_product.csv`` (overwritten on every run).

    The browser is always shut down, even if scraping fails part-way.
    """
    # Imported locally: the binary launched below is chromedriver, so it needs
    # the Chrome Service class rather than the Edge one imported at file level.
    from selenium.webdriver.chrome.service import Service as ChromeService

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_argument('--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36')

    driver_path = r'C:\Users\Alberto\Desktop\upgrade\mi_entorno\Scripts\chromedriver.exe'
    driver = webdriver.Chrome(service=ChromeService(driver_path), options=options)

    try:
        url = input("Enter the Amazon Product Url- ")
        driver.get(url)
        time.sleep(3)  # Wait for the page to fully load

        anchors = driver.find_elements(By.CSS_SELECTOR, 'a.a-link-normal.s-no-outline')
        # Resolve hrefs up front (the elements go stale once we navigate away)
        # and drop anchors without an href, which driver.get() cannot load.
        hrefs = (anchor.get_attribute('href') for anchor in anchors)
        links_list = [href for href in hrefs if href]

        # newline='' is required by the csv module; without it every row is
        # followed by a blank line on Windows.
        with open('amazon_product.csv', 'w', newline='', encoding="utf-8-sig") as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(['Title', 'Price', 'Rating', 'ReviewCount', 'Availability'])

            for link in links_list:
                driver.get(link)
                time.sleep(3)  # Wait for the page to fully load

                title = get_title(driver)
                price = get_price(driver)
                rating = get_rating(driver)
                review_count = get_review_count(driver)
                availability = get_availability(driver)

                print("Product Title =", title)
                print("Product Price =", price)
                print("Product Rating =", rating)
                print("Number of Product Reviews =", review_count)
                print("Availability =", availability)
                print('\n')

                # Write product information to CSV file
                csv_writer.writerow([title, price, rating, review_count, availability])
    finally:
        # Always release the browser process, even when a step above raises.
        driver.quit()

# Run the scraper only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


Product Title = Auriculares Inalámbricos Bluetooth 5.3, Auriculares Bluetooth HiFi Estéreo, Incorporado ENCCancelación de Ruido 42H Cascos Inalámbricos Control Táctil Pantalla LED, Auriculares IP6 Impermeables Blake
Product Price = 29,99€
Product Rating = 4,7 de 5 estrellas
Number of Product Reviews = 2.837 valoraciones
Availability = En stock


Product Title = QYCAITEK Auriculares Inalambricos Bluetooth Estéreo 5.3, 50 Horas de Reproducción con Pantalla LED, Cascos Inalambricos Bluetooth con 4 Mics, Auriculares Bluetooth Impermeables IP7 para Deporte
Product Price = 29,99€
Product Rating = 5,0 de 5 estrellas
Number of Product Reviews = 318 valoraciones
Availability = En stock


Product Title = Tukio Auriculares Inalámbricos Bluetooth, 2024 Auriculares Bluetooth 5.3 con 4 Mic,Mini Auriculares Inalambricos con LED Pantalla, 40 Horas de Reproducción, IPX7 Impermeable Cascos Inalambricos, Negro
Product Price = 16,99€
Product Rating = 4,7 de 5 estrellas
Number of Product Reviews = 611 valo