# Coleta de Dados com o Selenium
    Coletar dados das 250 séries com as maiores avaliações do site `https://www.imdb.com/pt/`.

In [5]:
import logging
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from concurrent.futures import ThreadPoolExecutor, as_completed

# Logging setup: emit INFO and above, one line per record in the form
# "HH:MM:SS [LEVEL] message".
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
)

def scrape_serie(serie_data):
    """Open a dedicated Chrome session and collect details for one series.

    Args:
        serie_data: Mapping that provides at least ``"link"`` (the URL that
            is opened) and ``"info"`` (copied unchanged into the result).

    Returns:
        A dict with the keys ``info``, ``link``, ``popularidade`` and
        ``elenco_principal``, or ``None`` if any step fails. Failures are
        logged at ERROR level and never raised to the caller.
    """
    driver = None
    try:
        # The browser is started inside the ``try`` so that a start-up
        # failure is logged and reported as ``None`` like any other error,
        # instead of escaping into the thread pool that runs this function.
        driver = webdriver.Chrome()
        wait = WebDriverWait(driver, 10)

        driver.get(serie_data["link"])

        # Text of the single element matched by this absolute XPath
        # (presumably the popularity widget; the path is layout-dependent).
        popularidade = wait.until(
            EC.presence_of_element_located((
                By.XPATH,
                '//*[@id="__next"]/main/div/section[1]/section/div[3]/section/section/div[2]/div[2]/div/div[3]/a/span/div/div[2]/div[1]'
            ))
        ).text

        # All elements matched by the XPath; each contributes its full text.
        elenco_section = wait.until(
            EC.presence_of_all_elements_located((
                By.XPATH,
                '//*[@id="__next"]/main/div/section[1]/div/section/div/div[1]/section[5]'
            ))
        )
        elenco_principal = [ator.text for ator in elenco_section]

        return {
            "info": serie_data["info"],
            "link": serie_data["link"],
            "popularidade": popularidade,
            "elenco_principal": elenco_principal
        }

    except Exception as e:
        logging.error(f"Erro em {serie_data['link']}: {e}")
        return None
    finally:
        # Only quit a browser that was actually created.
        if driver is not None:
            driver.quit()


# ----------------------
# LINK COLLECTION
# ----------------------
# Drives one browser through the IMDb home page menu to the top-rated series
# list and records, for every list item, its text lines and its first link.
driver = webdriver.Chrome()
try:
    wait = WebDriverWait(driver, 10)

    driver.get("https://www.imdb.com/pt/")

    # Open the navigation drawer.
    menu_icon = wait.until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="imdbHeader-navDrawerOpen"]'))
    )
    menu_icon.click()

    # Follow the drawer entry addressed by this absolute XPath.
    link_series_bem_avaliadas = wait.until(
        EC.element_to_be_clickable((
            By.XPATH,
            '//*[@id="imdbHeader"]/div/aside[1]/div/div[2]/div/div[2]/div[1]/span/div/div/ul/a[2]'
        ))
    )
    link_series_bem_avaliadas.click()

    series = wait.until(
        EC.presence_of_all_elements_located((
            By.XPATH,
            '//*[@id="__next"]/main/div/div[3]/section/div/div[2]/div/ul/li'
        ))
    )

    series_data = []
    for serie in series:
        # Best effort per item: a broken entry is logged and skipped.
        try:
            info = serie.text.split("\n")
            link = serie.find_element(By.TAG_NAME, 'a').get_attribute('href')
            series_data.append({"info": info, "link": link})
        except Exception as e:
            logging.warning(f"Erro ao extrair série: {e}")
finally:
    # Always release the browser, even when a wait times out or a click
    # fails; otherwise the Chrome/chromedriver processes are left running.
    driver.quit()

# ----------------------
# PARALLEL COLLECTION
# ----------------------
# Fans the per-series scraping out to a small thread pool and reports
# progress as each task finishes.
logging.info(f"Encontradas {len(series_data)} séries. Iniciando coleta em paralelo...")

start_time = time.time()
dados_completos = []
total = len(series_data)
coletadas = 0

with ThreadPoolExecutor(max_workers=5) as executor:  # up to 5 browsers at once
    futures = [executor.submit(scrape_serie, s) for s in series_data]
    for future in as_completed(futures):
        # ``result()`` re-raises whatever the worker raised. Treat that as a
        # failed series so one bad task cannot abort the whole run and
        # discard the progress made so far.
        try:
            result = future.result()
        except Exception as e:
            logging.error(f"Erro inesperado em uma tarefa de coleta: {e}")
            result = None
        coletadas += 1
        if result:
            dados_completos.append(result)
            logging.info(
                f"Coletada: {result['info'][0]} "
                f"({coletadas}/{total} - {coletadas/total*100:.1f}%)"
            )
        else:
            logging.warning(f"Série falhou ({coletadas}/{total} - {coletadas/total*100:.1f}%)")

end_time = time.time()
logging.info(f"Coleta finalizada em {end_time - start_time:.2f} segundos.")

print(dados_completos)


AttributeError: 'NoneType' object has no attribute 'is_displayed'

# Teste com BeautifulSoup

In [4]:
import asyncio
import aiohttp
import logging
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Logging setup: INFO and above, formatted as "HH:MM:SS [LEVEL] message".
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
)

# ---------------------- GET LINKS WITH SELENIUM ----------------------
def coletar_links():
    """Collect the text and link of every entry in the top-rated series list.

    Opens the IMDb home page in Chrome, opens the navigation drawer, follows
    the drawer entry addressed by an absolute XPath and reads every ``<li>``
    of the resulting list.

    Returns:
        A list of dicts, one per list item that could be read, each with
        ``"info"`` (the item's text split on newlines) and ``"link"`` (the
        ``href`` of the first ``<a>`` inside the item).

    Raises:
        Whatever Selenium raises when the page cannot be loaded or an
        expected element does not show up within the 10 second wait. The
        browser is closed before the exception propagates.
    """
    driver = webdriver.Chrome()
    try:
        wait = WebDriverWait(driver, 10)

        driver.get("https://www.imdb.com/pt/")

        menu_icon = wait.until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="imdbHeader-navDrawerOpen"]'))
        )
        menu_icon.click()

        link_series_bem_avaliadas = wait.until(
            EC.element_to_be_clickable((
                By.XPATH,
                '//*[@id="imdbHeader"]/div/aside[1]/div/div[2]/div/div[2]/div[1]/span/div/div/ul/a[2]'
            ))
        )
        link_series_bem_avaliadas.click()

        series = wait.until(
            EC.presence_of_all_elements_located((
                By.XPATH,
                '//*[@id="__next"]/main/div/div[3]/section/div/div[2]/div/ul/li'
            ))
        )

        series_data = []
        for serie in series:
            # Best effort per item: a broken entry is logged and skipped.
            try:
                info = serie.text.split("\n")
                link = serie.find_element(By.TAG_NAME, 'a').get_attribute('href')
                series_data.append({"info": info, "link": link})
            except Exception as e:
                logging.warning(f"Erro ao extrair série: {e}")

        return series_data
    finally:
        # Runs on success and on failure, so no browser process is leaked
        # when a wait times out or a click fails.
        driver.quit()

# ---------------------- ASYNC COLLECTION FUNCTION ----------------------
async def fetch_and_parse(session, serie_data, idx, total):
    """Download one series page and extract popularity and cast from its HTML.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding the response (an ``aiohttp.ClientSession`` in this file).
        serie_data: Mapping with ``"link"`` (URL to fetch) and ``"info"``
            (copied into the result; its first element is used in the log).
        idx: Position of this series, used only in the progress log.
        total: Total number of series, used only in the progress log.

    Returns:
        A dict with ``info``, ``link``, ``popularidade`` and
        ``elenco_principal``, or ``None`` when the request or the parsing
        fails. Failures are logged at ERROR level and never raised.
    """
    try:
        async with session.get(serie_data["link"]) as response:
            # Reject 4xx/5xx answers instead of parsing an error page and
            # returning "N/A" with an empty cast as if it were real data.
            response.raise_for_status()
            html = await response.text()
            soup = BeautifulSoup(html, "html.parser")

            # Popularity: first <span> inside a link whose href contains
            # "ratings" (an approximation of the XPath used with Selenium).
            popularidade_tag = soup.select_one('a[href*="ratings"] span')
            popularidade = popularidade_tag.get_text(strip=True) if popularidade_tag else "N/A"

            # Main cast: text of every inline-list item in the cast section.
            elenco_section = soup.select("section[data-testid='title-cast'] li.ipc-inline-list__item")
            elenco_principal = [ator.get_text(" ", strip=True) for ator in elenco_section]

            logging.info(f"Coletada {idx}/{total} - {serie_data['info'][0]}")

            return {
                "info": serie_data["info"],
                "link": serie_data["link"],
                "popularidade": popularidade,
                "elenco_principal": elenco_principal
            }
    except Exception as e:
        logging.error(f"Erro em {serie_data['link']}: {e}")
        return None

async def coletar_detalhes(series_data):
    """Fetch and parse every series page concurrently over one HTTP session.

    Args:
        series_data: Sequence of per-series mappings, each handed to
            ``fetch_and_parse`` together with its 1-based position.

    Returns:
        The truthy results of ``fetch_and_parse``, in input order; entries
        that came back as ``None`` are dropped.
    """
    began_at = time.time()
    how_many = len(series_data)

    async with aiohttp.ClientSession() as http:
        pending = []
        for position, item in enumerate(series_data, start=1):
            pending.append(fetch_and_parse(http, item, position, how_many))
        outcomes = await asyncio.gather(*pending)

    took = time.time() - began_at
    logging.info(f"Coleta finalizada em {took:.2f} segundos.")
    return list(filter(None, outcomes))

# ---------------------- EXECUTION IN JUPYTER ----------------------
# Step 1 (blocking): drive a browser to gather the list of series links.
series_data = coletar_links()
logging.info(f"{len(series_data)} links coletados. Iniciando coleta assíncrona...")

# Step 2: fetch the detail pages concurrently. This is a top-level ``await``,
# which works in a Jupyter/IPython cell; in a plain Python script it is a
# syntax error and the coroutine would have to be run another way.
dados_completos = await coletar_detalhes(series_data)

# Final result
print(dados_completos)


Exception ignored in: <coroutine object coletar_detalhes at 0x1092817e0>
Traceback (most recent call last):
  File "<string>", line 1, in <lambda>
KeyError: '__import__'
Exception ignored in: <coroutine object coletar_detalhes at 0x1092817e0>
Traceback (most recent call last):
  File "<string>", line 1, in <lambda>
KeyError: '__import__'


MaxRetryError: HTTPConnectionPool(host='localhost', port=51828): Max retries exceeded with url: /session/811213ba62595f4041942b87a64d3173/element (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x10a7f4e60>: Failed to establish a new connection: [Errno 61] Connection refused'))