<a href="https://colab.research.google.com/github/vinismachadoo/web_scrapping_farm/blob/main/catalogo_farm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests, threading
from multiprocessing.pool import ThreadPool, Pool
from bs4 import BeautifulSoup
import pandas as pd

In [26]:
# Download the FARM home page and collect the target of every level-3 menu
# link; these are the pages visited in the next step.
url_base = 'https://www.farmrio.com.br'
subcategorias_farm = []

print('------ Procurando links no site da FARM ------\n')
# A timeout keeps the cell from hanging forever if the server never answers.
pagina_inicial = requests.get(url_base, timeout=30)
soup = BeautifulSoup(pagina_inicial.content, 'html.parser')
menu = soup.find_all('a', {'class':'menu__link menu__link--level-3'})
for m in menu:
    href = m.get('href')
    if href is None:
        # An anchor without href has nothing to visit; testing membership
        # against None would raise TypeError.
        continue
    # Only a leading scheme makes the link absolute. A relative link that
    # merely contains "http" somewhere (e.g. in a query string) still needs
    # the base URL in front of it.
    if href.startswith('http'):
        subcategorias_farm.append(href)
    else:
        subcategorias_farm.append(url_base+href)

print(f'-- Encontradas {len(subcategorias_farm)} subpáginas')

------ Procurando links no site da FARM ------

-- Encontradas 79 subpáginas


In [24]:
def encontra_url_categorias(subcategoria):
    """Extract the paginated listing URL embedded in a sub-category page.

    Downloads ``subcategoria`` and inspects its ``<script>`` tags. For each
    script whose text contains ``var pagecount``, the text between ``load('``
    and ``' + page`` is taken, prefixed with the module-level ``url_base`` and
    appended to the module-level list ``lista_url_cartegorias``.

    Args:
        subcategoria: URL of the sub-category page to download.

    Returns:
        None. Results are delivered through ``lista_url_cartegorias``.
    """
    # Without a timeout one stalled connection would block its worker forever.
    pagina_categoria = requests.get(subcategoria, timeout=30)
    soup = BeautifulSoup(pagina_categoria.text, 'html.parser')
    for script in soup.find_all('script'):
        s = script.string
        if s is None or 'var pagecount' not in s:
            continue
        partes = s.split("load('")
        if len(partes) < 2:
            # The script mentions pagecount but has no load('...') call;
            # skip it instead of failing with IndexError.
            continue
        cat_url = partes[1].split("' + page")[0]
        lista_url_cartegorias.append(url_base+cat_url)

# Visit every sub-category page concurrently; the workers append what they
# find to lista_url_cartegorias.
lista_url_cartegorias = []
print('------ Acessando categorias da FARM ------\n')
# The context manager shuts the worker threads down once map() has returned,
# instead of leaving an unreferenced pool alive in the kernel on every re-run.
with ThreadPool(15) as pool:
    pool.map(encontra_url_categorias, subcategorias_farm)
print('-- Todas as subcategorias visitadas')
# dict.fromkeys removes duplicates while keeping first-seen order.
urls_farm = list(dict.fromkeys(lista_url_cartegorias))
print(f'-- Encontradas {len(urls_farm)} subcategorias')

------ Acessando categorias da FARM ------

-- Todas as subcategorias visitadas
-- Encontradas 35 subcategorias


In [25]:
def encontra_skus_farm(url_farm):
    """Collect the product ids listed on every page of one listing URL.

    Requests ``f'{url_farm}{page_number}'`` for page_number = 1, 2, 3, ... and
    appends the ``data-product-id`` attribute of each matching product ``div``
    to the module-level list ``lista_skus``.

    Paging stops at the first response with an empty (or whitespace-only)
    body, or at the first HTTP error status.

    Args:
        url_farm: Listing URL that only lacks its trailing page number.

    Returns:
        None. Results are delivered through ``lista_skus``.
    """
    page_number = 1
    while True:
        # Without a timeout one stalled connection would block its worker forever.
        page = requests.get(f'{url_farm}{page_number}', timeout=30)
        if not page.ok:
            # An error page has a non-empty body, so the empty-body check
            # alone would keep asking for the next page forever.
            print(f'-- Aviso: HTTP {page.status_code} em {url_farm}{page_number}, categoria interrompida')
            break
        if not page.text.strip():
            break
        soup = BeautifulSoup(page.content, 'html.parser')
        skus = soup.find_all("div", {"class":"shelf__product shelf-product js-vitrine-interativa"})
        for s in skus:
            sku = s.get('data-product-id')
            # A tile lacking the attribute yields None, which is not a usable id.
            if sku is not None:
                lista_skus.append(sku)
        page_number += 1

# Walk every category listing concurrently; the workers append the product
# ids they find to lista_skus.
lista_skus = []
print('------ Pegando SKUS da FARM ------\n')
# The context manager shuts the worker threads down once map() has returned,
# instead of leaving an unreferenced pool alive in the kernel on every re-run.
with ThreadPool(20) as pool:
    pool.map(encontra_skus_farm, urls_farm)
print('-- Site da FARM totalmente carregado')
# dict.fromkeys removes duplicates while keeping first-seen order.
skus_farm = list(dict.fromkeys(lista_skus))
print(f'-- Encontrados {len(skus_farm)} SKUS')

------ Pegando SKUS da FARM ------

-- Site da FARM totalmente carregado
-- Encontrados 1797 SKUS


In [27]:
def _extrai(dados, *caminho, padrao=''):
    """Follow ``caminho`` (keys/indexes) through nested containers, or return ``padrao``."""
    atual = dados
    try:
        for passo in caminho:
            atual = atual[passo]
    except (KeyError, IndexError, TypeError):
        return padrao
    return atual


def raspador_farm(sku):
    """Fetch one product and append one catalogue row per entry in its ``items``.

    Queries the product search endpoint with ``productId:{sku}``, takes the
    first element of the JSON response and, for every entry in its ``items``
    list, appends a dict to the module-level list ``farm``. Each dict merges
    the product-level fields with the item-level fields. Any field that is
    missing from the payload is stored as ``''``.

    A SKU whose request fails, whose body is not valid JSON, or whose search
    result is empty is skipped silently (best effort). A product without an
    ``items`` key contributes no rows.

    Args:
        sku: Product id interpolated into the search query.

    Returns:
        None. Results are delivered through ``farm``.
    """
    try:
        # Without a timeout one stalled connection would block its worker forever.
        r = requests.get(
            f'https://www.farmrio.com.br/api/catalog_system/pub/products/search/?fq=productId:{sku}',
            timeout=30,
        )
        product_data = r.json()[0]
    except (requests.RequestException, ValueError, LookupError, TypeError):
        # Network failure, undecodable body, or empty/unsupported payload.
        # Named exception types let KeyboardInterrupt still stop the run.
        return

    general_product_info = {
        'ID': _extrai(product_data, 'productId'),
        'Nome': _extrai(product_data, 'productName'),
        'URL': _extrai(product_data, 'link'),
        'Marca': _extrai(product_data, 'Marca', 0),
        'Colecao': _extrai(product_data, 'Coleção', 0),
        'Composicao': _extrai(product_data, 'Composição', 0),
        'Estampa': _extrai(product_data, 'Nome Estampa', 0),
    }

    # 'items' gets the same tolerance as every other field: a payload without
    # it yields no rows instead of raising KeyError inside the worker thread.
    for item in _extrai(product_data, 'items', padrao=[]) or []:
        # Price and availability are read from the first seller's offer.
        size_product_info = {
            'Tamanho': _extrai(item, 'Tamanho', 0),
            'EAN': _extrai(item, 'ean'),
            'Em falta': _extrai(item, 'sellers', 0, 'commertialOffer', 'GetInfoErrorMessage'),
            'Preço default': _extrai(item, 'sellers', 0, 'commertialOffer', 'ListPrice'),
            'Preco atual': _extrai(item, 'sellers', 0, 'commertialOffer', 'Price'),
        }

        farm.append({**general_product_info, **size_product_info})

# Fetch the details of every SKU concurrently; the workers append one row per
# product item to farm.
farm = []
print('------ Montando catálogo FARM ------\n')
# The context manager shuts the worker threads down once map() has returned,
# instead of leaving 100 idle threads alive in the kernel on every re-run.
with ThreadPool(100) as pool:
    pool.map(raspador_farm, skus_farm)
print(f'-- Catálogo completo com {len(farm)} EANs')

------ Montando catálogo FARM ------

-- Catálogo completo com 8039 EANs


In [28]:
# Build the catalogue table from the accumulated row dicts; the bare name on
# the last line makes the notebook render it as the cell output.
df_farm = pd.DataFrame(data=farm)
df_farm

Unnamed: 0,ID,Nome,URL,Marca,Colecao,Composicao,Estampa,Tamanho,EAN,Em falta,Preço default,Preco atual
0,42085,Pochete Padang Royal,https://www.farmrio.com.br/pochete-padang-roya...,Farm,Inverno 2020,Indefinida,,U,2815418016U,,298.0,189.0
1,50470,Bolsa Praiana,https://www.farmrio.com.br/bolsa-praiana-est-n...,Farm,Verao 2021,Indefinida,,U,29363110172U,,129.0,129.0
2,48235,Saia Gode Floral Praiano,https://www.farmrio.com.br/saia-gode-floral-pr...,Farm,Alto Verao 2021,94% Viscose 6% Elastano,FLORAL PRAIANO_AMARELO FL,PP,29066110641PP,Code: withoutStock Status:error Message: O ite...,0.0,0.0
3,48235,Saia Gode Floral Praiano,https://www.farmrio.com.br/saia-gode-floral-pr...,Farm,Alto Verao 2021,94% Viscose 6% Elastano,FLORAL PRAIANO_AMARELO FL,P,29066110641P,,179.0,179.0
4,48235,Saia Gode Floral Praiano,https://www.farmrio.com.br/saia-gode-floral-pr...,Farm,Alto Verao 2021,94% Viscose 6% Elastano,FLORAL PRAIANO_AMARELO FL,M,29066110641M,,179.0,179.0
...,...,...,...,...,...,...,...,...,...,...,...,...
8034,14996,Short Alto Estrela Foil,https://www.farmrio.com.br/short-alto-estrela-...,Farm,Verao 2018,100% Algodão,,34,259743014234,Code: withoutStock Status:error Message: O ite...,0.0,0.0
8035,14996,Short Alto Estrela Foil,https://www.farmrio.com.br/short-alto-estrela-...,Farm,Verao 2018,100% Algodão,,36,259743014236,Code: withoutStock Status:error Message: O ite...,0.0,0.0
8036,14996,Short Alto Estrela Foil,https://www.farmrio.com.br/short-alto-estrela-...,Farm,Verao 2018,100% Algodão,,38,259743014238,Code: withoutStock Status:error Message: O ite...,0.0,0.0
8037,14996,Short Alto Estrela Foil,https://www.farmrio.com.br/short-alto-estrela-...,Farm,Verao 2018,100% Algodão,,40,259743014240,,259.0,129.5
