# 🚀 Web Scraping do GLOBE com Selenium

Este Computador realiza web scraping no site do GLOBE, lidando com paginação e salvando os dados extraídos em JSON.

In [1]:

# Instalar as bibliotecas necessárias
!pip install selenium webdriver-manager
    

Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv, webdriver-manager
Successfully installed python-dotenv-1.0.1 webdriver-manager-4.0.2


In [7]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time

# Configuração do navegador
options = Options()
options.add_argument("--headless")  # Rodar em background
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# URL base da pesquisa
base_url = "https://www.globe.gov/do-globe/research-resources/student-research-reports"
driver.get(base_url)
time.sleep(5)  # Espera inicial para carregamento da página

# Lista para armazenar os links extraídos
report_links = set()
page_count = 1
max_pages = 244  # Número máximo de páginas a percorrer

while page_count <= max_pages:
    try:
        print(f"📄 Processando página {page_count}...")

        # Capturar todos os links de relatórios na página atual
        reports = driver.find_elements(By.CSS_SELECTOR, "a[href*='/projectdetail/globe/']")
        for report in reports:
            link = report.get_attribute("href")
            if link and link not in report_links:
                report_links.add(link)
                print(f"✅ Extraído: {link}")

        # Captura o número de links antes de tentar carregar mais
        old_count = len(report_links)

        # Rolar a página até o final para carregar mais elementos
        for _ in range(5):  # Rola 5 vezes para garantir que novos elementos apareçam
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)  # Esperar carregamento dos novos relatórios
        
        # Captura o número de links após o carregamento
        new_count = len(report_links)

        # Se o número de links não mudou, tentar clicar no botão "Next"
        if new_count == old_count:
            print("🔄 Nenhum novo link carregado após rolagem. Tentando clicar no botão 'Next'.")

            # Procurar e clicar no botão "Next"
            next_button = None
            possible_selectors = [
                "//a[contains(@aria-label, 'Next')]",
                "//a[contains(text(), 'Next')]",
                "//span[contains(text(), 'Next')]/ancestor::a",
                "//li[contains(@class, 'page-item') and not(contains(@class, 'disabled'))]/a"
            ]

            for selector in possible_selectors:
                buttons = driver.find_elements(By.XPATH, selector)
                if buttons:
                    next_button = buttons[0]
                    break

            # Se encontrou o botão "Next", clicar nele
            if next_button:
                driver.execute_script("arguments[0].scrollIntoView();", next_button)  # Rolar até o botão
                driver.execute_script("arguments[0].click();", next_button)  # Forçar clique via JavaScript
                time.sleep(7)  # Aguardar carregamento da nova página
                page_count += 1
            else:
                print("🚫 Nenhum botão 'Next' encontrado. Fim da navegação.")
                break

    except Exception as e:
        print(f"⚠️ Erro na página {page_count}: {str(e)}. Tentando novamente...")
        time.sleep(5)  # Pequena espera antes de tentar novamente
        continue  # Repetir a tentativa

# Fechar o navegador
driver.quit()

# Salvar os links extraídos
with open("report_links.txt", "w") as f:
    for link in report_links:
        f.write(link + "\n")

print(f"📂 Coleta concluída! {len(report_links)} links salvos em {page_count} páginas.")


📄 Processando página 1...
✅ Extraído: https://www.globe.gov/do-globe/research-resources/student-research-reports/-/projectdetail/globe/soil-characteristics-and-their-effect-on-the-quality-of-red-onion-plant-in-al-zulfi-governorate-1?backURL=https%3A%2F%2Fwww.globe.gov%3A443%2Fdo-globe%2Fresearch-resources%2Fstudent-research-reports%3Fp_p_id%3Dgov_globe_cms_projects_ProjectsWebPortlet%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26_gov_globe_cms_projects_ProjectsWebPortlet_reportTypes%3D%26_gov_globe_cms_projects_ProjectsWebPortlet_titleFilter%3D%26_gov_globe_cms_projects_ProjectsWebPortlet_schoolNameFilter%3D%26_gov_globe_cms_projects_ProjectsWebPortlet_articleIdFilter%3D%26_gov_globe_cms_projects_ProjectsWebPortlet_yearFilter%3D0%26_gov_globe_cms_projects_ProjectsWebPortlet_orgFilterId%3D0%26_gov_globe_cms_projects_ProjectsWebPortlet_languageFilter%3D%26_gov_globe_cms_projects_ProjectsWebPortlet_gradeLevel%3D%26_gov_globe_cms_projects_ProjectsWebPortlet_collegeCategory%3

In [9]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
import pandas as pd

# Configuração do navegador
options = Options()
options.add_argument("--headless")  # Rodar sem abrir a janela
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Ler os links dos relatórios coletados
with open("report_links.txt", "r") as f:
    report_links = [line.strip() for line in f.readlines()]

# Lista para armazenar os dados extraídos
extracted_reports = []

# Processar cada relatório
for index, link in enumerate(report_links):
    try:
        driver.get(link)
        wait = WebDriverWait(driver, 10)  # Espera até 10s pelos elementos

        # Extrair os campos
        title = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "h2"))).text

        def get_text(xpath):
            elements = driver.find_elements(By.XPATH, xpath)
            return elements[0].text if elements else "N/A"

        organization = get_text("//span[contains(text(),'Organization')]/following-sibling::span")
        country = get_text("//span[contains(text(),'Country')]/following-sibling::span")
        students = get_text("//span[contains(text(),'Student')]/following-sibling::span")
        protocols = get_text("//span[contains(text(),'Protocols')]/following-sibling::span")
        date_submitted = get_text("//span[contains(text(),'Date Submitted')]/following-sibling::span")

        extracted_reports.append({
            "Title": title,
            "Organization": organization,
            "Country": country,
            "Students": students,
            "Protocols": protocols,
            "Date Submitted": date_submitted,
            "Link": link
        })

        print(f"✅ Extraído ({index + 1}/{len(report_links)}): {title}")

    except Exception as e:
        print(f"⚠️ Erro ao processar {link}: {str(e)}")

# Fechar o navegador
driver.quit()

# Salvar os dados em um arquivo CSV
df = pd.DataFrame(extracted_reports)
df.to_csv("extracted_reports.csv", index=False)

print(f"📂 Extração concluída! {len(extracted_reports)} relatórios salvos.")


✅ Extraído (1/1215): GLOBE Main Banner
✅ Extraído (2/1215): GLOBE Main Banner
✅ Extraído (3/1215): GLOBE Main Banner
✅ Extraído (4/1215): GLOBE Main Banner
✅ Extraído (5/1215): GLOBE Main Banner
✅ Extraído (6/1215): GLOBE Main Banner
✅ Extraído (7/1215): GLOBE Main Banner
✅ Extraído (8/1215): GLOBE Main Banner
✅ Extraído (9/1215): GLOBE Main Banner
✅ Extraído (10/1215): GLOBE Main Banner
✅ Extraído (11/1215): GLOBE Main Banner
✅ Extraído (12/1215): GLOBE Main Banner
✅ Extraído (13/1215): GLOBE Main Banner
✅ Extraído (14/1215): GLOBE Main Banner
✅ Extraído (15/1215): GLOBE Main Banner
✅ Extraído (16/1215): GLOBE Main Banner
✅ Extraído (17/1215): GLOBE Main Banner
✅ Extraído (18/1215): GLOBE Main Banner
✅ Extraído (19/1215): GLOBE Main Banner
✅ Extraído (20/1215): GLOBE Main Banner
✅ Extraído (21/1215): GLOBE Main Banner
✅ Extraído (22/1215): GLOBE Main Banner
✅ Extraído (23/1215): GLOBE Main Banner
✅ Extraído (24/1215): GLOBE Main Banner
✅ Extraído (25/1215): GLOBE Main Banner
✅ Extraíd

### ✅ **Conclusão**
Este notebook executa o web scraping do site do GLOBE, navegando automaticamente entre as páginas e salvando os dados em JSON.