In [2]:
import os
import re
import time
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from tqdm import tqdm

In [31]:
# Show which version of the requests library this notebook run is using.
print('requests version:', requests.__version__)

requests version: 2.22.0


# Web scraping https://busca.carrefour.com.br

Main search: https://busca.carrefour.com.br/busca?q=smartphone&common_filter[1]=12668&results_per_page=2000&sort=6&page=1

All products on this website can be reached from a single main search URL by changing the page number. Many products are listed: some show a price (available) and others are marked "unavailable", but their price can still be read from the page source.

The best approach so far is:
* 1- Load the main search page and get the 1st soup.
* 2- Load the products from the soup, store them in a list of product tags (html chunks), and load the next page.
* 3- The process is repeated until a page returns a number of products different from the requested results per page (or no product list at all).
* 4- The product information is extracted from the list and stored in a dataframe.

# General Functions and main code

## Extraction

In [32]:
def get_url(page, results_per_page=1000):
    """Build the Carrefour smartphone-search URL for a given results page.

    Args:
        page: value written into the ``page`` query parameter.
        results_per_page: value written into the ``results_per_page`` query
            parameter (defaults to 1000).

    Returns:
        The complete search URL as a string.
    """
    base = 'https://busca.carrefour.com.br/busca?q=smartphone&common_filter[1]=12668'
    # Page size, sort order and page number are appended as extra query parameters.
    query_tail = f'&results_per_page={results_per_page}&sort=6' + f'&page={page}'
    return base + query_tail

In [33]:
def get_soup_from_url(url, timeout=30):
    """Download ``url`` and return its content parsed as a BeautifulSoup object.

    Args:
        url: address to request with an HTTP GET.
        timeout: seconds to wait for the server before ``requests`` gives up
            (defaults to 30). Without a timeout the request could block
            indefinitely.

    Returns:
        A ``BeautifulSoup`` object built from the response body.
    """
    # A browser-like user agent is sent with the request.
    headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Name the parser explicitly: otherwise bs4 picks whichever parser happens
    # to be installed (and warns), so results could differ between machines.
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup

In [46]:
def get_products_from_soup(soup, show=False):
    """Collect the available and unavailable product elements of a search page.

    The first ``ul`` with class "neemu-products-container nm-view-type-grid" is
    taken as the product grid. Inside it, ``li`` elements are looked up twice:
    once with class "nm-product-item nm-no-quantity" (available) and once with
    class "nm-product-item nm-no-quantity is-unavailable" (unavailable).

    Args:
        soup: parsed search-results page (bs4 soup).
        show: when True, print how many products of each kind were found.

    Returns:
        A tuple ``(available, unavailable)`` with the two lists of elements.

    Raises:
        IndexError: if the page has no product grid.
    """
    grid = soup.find_all('ul', attrs={'class': "neemu-products-container nm-view-type-grid"})[0]

    available = grid.find_all('li', attrs={'class': "nm-product-item nm-no-quantity"})
    unavailable = grid.find_all('li', attrs={'class': "nm-product-item nm-no-quantity is-unavailable"})

    if show:
        n_available, n_unavailable = len(available), len(unavailable)
        print(f'foram encontrados {n_available} produtos disponíveis e {n_unavailable} indisponíveis')
        print(f'\ntotalizando {n_available + n_unavailable} produtos.')

    return available, unavailable

In [56]:
def extraction_from_carrefour(results_per_page=1000, show=False):
    """Walk the search-result pages and collect every product element.

    Pages are requested one after another, starting at page 1, until a page
    either has no product grid or returns a number of products different from
    ``results_per_page`` (i.e. it was the last, partially filled page).

    Args:
        results_per_page: page size requested from the site and used to detect
            the last page (defaults to 1000).
        show: when True, the per-page counts of available and unavailable
            products are printed.

    Returns:
        A tuple ``(available, unavailable)`` of two lists holding the product
        elements ('soup pieces' / html chunks) gathered over all pages.
    """
    # Starting conditions.
    page = 1
    total_available_products, total_unavailable_products = [], []

    while True:
        # The page size must be forwarded: the stop test below compares the
        # number of products received against this same value.
        url = get_url(page, results_per_page)
        soup = get_soup_from_url(url)

        try:
            products, products_unavailable = get_products_from_soup(soup, show)
        except (IndexError, AttributeError):
            # No product grid on this page. Only lookup failures are caught so
            # that e.g. KeyboardInterrupt still stops the run.
            print('there are no products on this url, extraction concluded!')
            break

        total_available_products += products
        total_unavailable_products += products_unavailable

        # A page that is not completely filled is the last one.
        if len(products) + len(products_unavailable) != results_per_page:
            break

        page += 1

    return total_available_products, total_unavailable_products

## Transform

In [63]:
def _digits_to_price(text):
    """Join every digit found in ``text`` and read the result as an amount in cents."""
    # NOTE(review): assumes the label always ends with two decimal digits — confirm.
    return int(''.join(re.findall(r'\d', text))) / 100


def _drop_scheme(link):
    """Return ``link`` without everything up to its first '//'; unchanged if it has none."""
    return link.split('//', 1)[-1]


def get_info_from_product(product, product_available=True):
    """Extract the fields of a single product element into a one-row dataframe.

    Args:
        product: one product element (bs4 element) taken from the search page.
        product_available: whether the product came from the available list;
            stored as "Yes"/"No" in the 'produto_disponível' column.

    Returns:
        A single-row ``pandas.DataFrame`` with the columns 'title',
        'old_price', 'price', 'parcelas' (installments / payment plan),
        'image_link', 'product_link' and 'produto_disponível'.

    Raises:
        IndexError: if the element lacks an ``a``, ``img``, price or
            installment node.
        ValueError: if the price text contains no digits.
    """
    # The first anchor carries both the title and the product link.
    anchor = product.find_all('a')[0]
    title = anchor['title']

    price_text = product.find_all('span', attrs={'class':"nm-price-value"})[0].text.strip()
    price = _digits_to_price(price_text)

    old_price_list = product.find_all('span', attrs={'class':"nm-old-price-value"})
    try:
        old_price = _digits_to_price(old_price_list[0].text)
    except (IndexError, ValueError):
        # No (usable) previous price: fall back to the current price.
        old_price = price

    # Every space is removed and the remaining line breaks become single spaces.
    parcelas = product.find_all('div', attrs={'class':"nm-installment-container"})[0].text.strip().split(' ')
    parcelas = ''.join(parcelas).replace('\n',' ')

    # Links are stored without their scheme. Splitting only on the first '//'
    # keeps any later '//' in the path intact.
    image_link = _drop_scheme(product.find_all('img')[0]['src'])
    product_link = _drop_scheme(anchor['href'])

    produto_disponível = "Yes" if product_available else "No"

    minidf = pd.DataFrame({'title' : title,
                           'old_price' : old_price,
                           'price' : price,
                           'parcelas' : parcelas,
                           'image_link' : image_link,
                           'product_link' : product_link,
                           'produto_disponível' : produto_disponível}, index=[0])

    return minidf

In [37]:
def get_info_from_all_products(products, product_available=True):
    """Extract the information of every product into one dataframe.

    Each product is passed to ``get_info_from_product`` and the resulting
    single-row dataframes are concatenated once at the end.

    Args:
        products: list of product elements (bs4 elements).
        product_available: forwarded to ``get_info_from_product`` for every
            product of the list.

    Returns:
        A ``pandas.DataFrame`` with one row per product and a fresh 0..n-1
        index; an empty dataframe when ``products`` is empty.
    """
    # Collect the rows first: concatenating inside the loop copies the growing
    # frame on every iteration.
    frames = [get_info_from_product(product, product_available) for product in tqdm(products)]

    if not frames:
        # pd.concat refuses an empty list; keep returning an empty dataframe.
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

## Load

In [1]:
def save_to_csv(dataframe, name):
    """Write ``dataframe`` to 'storage/<name><stamp>.csv' with cp1252 encoding.

    The stamp is built from the current local time as
    '<year>y-<month>m-<day>d-<hour>h' (numbers are not zero-padded), e.g.
    '2020y-5m-15d-11h'. The 'storage' directory is created when missing.

    Args:
        dataframe: the ``pandas.DataFrame`` to store (the index is not written).
        name: file-name prefix placed before the time stamp.
    """
    t = datetime.now()
    # Named `stamp` rather than `time` so the imported `time` module is not shadowed.
    stamp = f'{t.year}y-{t.month}m-{t.day}d-{t.hour}h'
    # to_csv does not create missing directories.
    os.makedirs('storage', exist_ok=True)
    dataframe.to_csv('storage/' + name + stamp + '.csv', encoding='cp1252', index=False)
    print(name + ' saved at: ' + stamp)

# Main code

In [68]:
# Extract: collect the product elements of every search page.
print('extracting products')
products, products_unavailable = extraction_from_carrefour()
print(f'extraction concluded: \n{len(products)} available products\n{len(products_unavailable)} unavailable products')
# Transform: turn the available product elements into a dataframe.
print('getting available products:')
# NOTE(review): presumably this pause lets the print above show up before the progress bar is drawn — confirm.
time.sleep(1)
prod_av_res = get_info_from_all_products(products, product_available=True)
# Transform: same for the unavailable products.
print('getting unavailable products:')
time.sleep(1)
prod_un_res = get_info_from_all_products(products_unavailable, product_available=False)
# Available rows come first, followed by the unavailable ones, under a fresh index.
results = pd.concat([prod_av_res, prod_un_res]).reset_index(drop=True)
# Load: store the combined table as a time-stamped csv file.
save_to_csv(results, 'carrefour_raw')

extracting products
extraction concluded: 
727 available products
3448 unavailable products
getting available products:


100%|██████████| 727/727 [00:09<00:00, 75.10it/s]


getting unavailable products:


100%|██████████| 3448/3448 [00:47<00:00, 73.18it/s]


carrefour_raw saved at: 2020y-5m-15d-11h


In [69]:
# Display only the rows flagged as unavailable.
results.loc[results['produto_disponível'] == 'No']

Unnamed: 0,title,old_price,price,parcelas,image_link,product_link,produto_disponível
727,Iphone 11 256gb Desbloqueado Mwm72bz a Apple P...,9800.00,9800.00,"Até 10xdeR$980,00 sem juros",static.carrefour.com.br/medias/sys_master/imag...,www.carrefour.com.br/Iphone-11-256gb-Desbloque...,No
728,iPhone 11 Pro Max 256GB - Prata,8599.00,8599.00,"Até 10xdeR$955,44 sem juros",static.carrefour.com.br/medias/sys_master/imag...,www.carrefour.com.br/iPhone-11-Pro-Max-256GB-P...,No
729,"iPhone XS Max Apple Prata, 512GB Desbloqueado ...",9280.98,9280.98,"Até 10xdeR$928,10 sem juros",static.carrefour.com.br/medias/sys_master/imag...,www.carrefour.com.br/iPhone-XS-Max-Apple-Prata...,No
730,iPhone 11 Pro 512GB - Dourado,8805.55,8805.55,"Até 10xdeR$926,90 sem juros",static.carrefour.com.br/medias/sys_master/imag...,www.carrefour.com.br/iPhone-11-Pro-512GB-Doura...,No
731,Smartphone Xiaomi Mi 9 6.39Pol Ram 6GB 128GB 4...,8942.91,8942.91,"Até 10xdeR$894,29 sem juros",static.carrefour.com.br/medias/sys_master/imag...,www.carrefour.com.br/Smartphone-Xiaomi-Mi-9-6-...,No
...,...,...,...,...,...,...,...
4170,Cabo Flex com Conector de Carga iPhone 6 Plus ...,55.00,55.00,em1xnoCartãoouBoleto,static.carrefour.com.br/medias/sys_master/imag...,www.carrefour.com.br/Cabo-Flex-com-Conector-de...,No
4171,Cabo Flex com Conector de Carga iPhone 5S Pret...,37.00,37.00,em1xnoCartãoouBoleto,static.carrefour.com.br/medias/sys_master/imag...,www.carrefour.com.br/Cabo-Flex-com-Conector-de...,No
4172,Cabo Flex com Conector de Carga iPhone 6 Branco,36.90,36.90,em1xnoCartãoouBoleto,static.carrefour.com.br/medias/sys_master/imag...,www.carrefour.com.br/Cabo-Flex-com-Conector-de...,No
4173,SUPORTE UNIVERSAL DE CELULAR SMARTPHONE VEICUL...,28.16,28.16,em1xnoCartãoouBoleto,static.carrefour.com.br/medias/sys_master/imag...,www.carrefour.com.br/SUPORTE-UNIVERSAL-DE-CELU...,No
