# 🚀 Web Scraping do GLOBE com Selenium

Este Computador realiza web scraping no site do GLOBE, lidando com paginação e salvando os dados extraídos em JSON.

In [1]:

# Instalar as bibliotecas necessárias
!pip install selenium webdriver-manager
    

Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv, webdriver-manager
Successfully installed python-dotenv-1.0.1 webdriver-manager-4.0.2


In [13]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time

# Configuração do navegador
options = Options()
options.add_argument("--headless")  # Rodar sem abrir a janela
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# URL principal
url = "https://www.globe.gov/do-globe/research-resources/student-research-reports?p_p_id=gov_globe_cms_projects_ProjectsWebPortlet&_gov_globe_cms_projects_ProjectsWebPortlet_titleFilter=&_gov_globe_cms_projects_ProjectsWebPortlet_schoolNameFilter=&_gov_globe_cms_projects_ProjectsWebPortlet_articleIdFilter=&_gov_globe_cms_projects_ProjectsWebPortlet_reportTypes=&_gov_globe_cms_projects_ProjectsWebPortlet_yearFilter=0&_gov_globe_cms_projects_ProjectsWebPortlet_orgFilterId=0&_gov_globe_cms_projects_ProjectsWebPortlet_languageFilter=&_gov_globe_cms_projects_ProjectsWebPortlet_gradeLevel=&_gov_globe_cms_projects_ProjectsWebPortlet_collegeCategory=&_gov_globe_cms_projects_ProjectsWebPortlet_protocolIds=&_gov_globe_cms_projects_ProjectsWebPortlet_sortCol=4&_gov_globe_cms_projects_ProjectsWebPortlet_displayStart=0"
driver.get(url)
time.sleep(5)  # Aguarda o carregamento da página

# Lista para armazenar os links extraídos
report_links = set()
#max_pages = 5  # Número de páginas a percorrer (teste em amostra)

# Verificar total de páginas
try:
    page_selector = Select(driver.find_element(By.ID, "pagelist"))
    total_pages = len(page_selector.options)
    print(f"🔍 Total de páginas encontradas: {total_pages}")
except:
    print("🚫 Erro ao encontrar o seletor de páginas.")
    driver.quit()
    exit()

# Percorrer todas as páginas
for page in range(total_pages):
    print(f"\n📄 Extraindo links da página {page + 1}/{total_pages}...")

    # Coletar os links corretamente
    project_elements = driver.find_elements(By.CSS_SELECTOR, ".srr-list-item a[href*='/projectdetail/globe/']")
    page_links = {el.get_attribute("href") for el in project_elements if el.get_attribute("href")}

    # Adicionar links novos
    new_links = page_links - report_links
    report_links.update(new_links)

    # Exibir progresso
    for link in new_links:
        print(f"✅ {link}")

    print(f"🔍 Total de links extraídos até agora: {len(report_links)}")

    # Avançar para a próxima página se houver mais
    if page < total_pages - 1:
        try:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")  # Rola até o final da página
            time.sleep(3)  # Aguarda antes de mudar de página
            page_selector = Select(driver.find_element(By.ID, "pagelist"))
            page_selector.select_by_index(page + 1)
            time.sleep(5)  # Aguardar carregamento da nova página
        except:
            print("🚫 Erro ao mudar de página.")
            break

# Fechar navegador
driver.quit()

# Salvar os links extraídos
with open("report_links.txt", "w") as f:
    for link in report_links:
        f.write(link + "\n")

print(f"\n📂 Extração concluída! {len(report_links)} links salvos em 'report_links.txt'.")


🔍 Total de páginas encontradas: 244

📄 Extraindo links da página 1/244...
✅ https://www.globe.gov/do-globe/research-resources/student-research-reports/-/projectdetail/globe/pine-forests-in-labin-s-yesterday-today-tomorrow?backURL=https%3A%2F%2Fwww.globe.gov%3A443%2Fdo-globe%2Fresearch-resources%2Fstudent-research-reports%3Fp_p_id%3Dgov_globe_cms_projects_ProjectsWebPortlet%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26_gov_globe_cms_projects_ProjectsWebPortlet_reportTypes%3D%26_gov_globe_cms_projects_ProjectsWebPortlet_titleFilter%3D%26_gov_globe_cms_projects_ProjectsWebPortlet_schoolNameFilter%3D%26_gov_globe_cms_projects_ProjectsWebPortlet_articleIdFilter%3D%26_gov_globe_cms_projects_ProjectsWebPortlet_yearFilter%3D0%26_gov_globe_cms_projects_ProjectsWebPortlet_orgFilterId%3D0%26_gov_globe_cms_projects_ProjectsWebPortlet_languageFilter%3D%26_gov_globe_cms_projects_ProjectsWebPortlet_gradeLevel%3D%26_gov_globe_cms_projects_ProjectsWebPortlet_collegeCategory%3D%26_gov_gl

In [14]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time

# Carregar links já extraídos
with open("report_links.txt", "r") as f:
    extracted_links = set(f.read().splitlines())

# Configuração do navegador
options = Options()
options.add_argument("--headless")  # Rodar sem abrir a janela
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# URL principal
url = "https://www.globe.gov/do-globe/research-resources/student-research-reports"
driver.get(url)
wait = WebDriverWait(driver, 10)
time.sleep(5)

# Lista para armazenar os novos links extraídos
new_scraping_links = set()

# Verificar total de páginas
try:
    page_selector = wait.until(EC.presence_of_element_located((By.ID, "pagelist")))
    total_pages = len(Select(page_selector).options)
    print(f"🔍 Total de páginas encontradas: {total_pages}")
except:
    print("🚫 Erro ao encontrar o seletor de páginas.")
    driver.quit()
    exit()

# Percorrer todas as páginas novamente
for page in range(total_pages):
    print(f"\n📄 Extraindo links da página {page + 1}/{total_pages}...")

    try:
        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".srr-list-item a[href*='/projectdetail/globe/']")))
    except:
        print("🚫 Nenhum link encontrado nesta página.")
        continue

    project_elements = driver.find_elements(By.CSS_SELECTOR, ".srr-list-item a[href*='/projectdetail/globe/']")
    page_links = {el.get_attribute("href") for el in project_elements if el.get_attribute("href")}

    new_scraping_links.update(page_links)

    if page < total_pages - 1:
        try:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)

            page_selector = wait.until(EC.element_to_be_clickable((By.ID, "pagelist")))
            select = Select(page_selector)
            select.select_by_index(page + 1)

            time.sleep(5)
        except:
            print("🚫 Erro ao mudar de página.")
            break

# Fechar navegador
driver.quit()

# Comparar os links extraídos com os do arquivo salvo
missing_links = new_scraping_links - extracted_links

# Exibir os links que faltam
if missing_links:
    print("\n🔎 Links faltantes encontrados:")
    for link in missing_links:
        print(f"❌ {link}")

    # Salvar os links faltantes em um novo arquivo
    with open("missing_links.txt", "w") as f:
        for link in missing_links:
            f.write(link + "\n")

    print(f"\n📂 {len(missing_links)} links faltantes salvos em 'missing_links.txt'.")
else:
    print("\n✅ Nenhum link faltando foi encontrado, todos já estavam no arquivo!")



🔍 Total de páginas encontradas: 244

📄 Extraindo links da página 1/244...

📄 Extraindo links da página 2/244...

📄 Extraindo links da página 3/244...

📄 Extraindo links da página 4/244...

📄 Extraindo links da página 5/244...

📄 Extraindo links da página 6/244...

📄 Extraindo links da página 7/244...

📄 Extraindo links da página 8/244...

📄 Extraindo links da página 9/244...

📄 Extraindo links da página 10/244...

📄 Extraindo links da página 11/244...

📄 Extraindo links da página 12/244...

📄 Extraindo links da página 13/244...

📄 Extraindo links da página 14/244...

📄 Extraindo links da página 15/244...

📄 Extraindo links da página 16/244...

📄 Extraindo links da página 17/244...

📄 Extraindo links da página 18/244...

📄 Extraindo links da página 19/244...

📄 Extraindo links da página 20/244...

📄 Extraindo links da página 21/244...

📄 Extraindo links da página 22/244...

📄 Extraindo links da página 23/244...

📄 Extraindo links da página 24/244...

📄 Extraindo links da página 25/244..

In [18]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import json
import time

# Configuração do navegador
options = Options()
options.add_argument("--headless")  # Rodar sem abrir a janela
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Ler os links dos relatórios coletados
with open("report_links.txt", "r") as f:
    report_links = [line.strip() for line in f.readlines()]

# Lista para armazenar os dados extraídos
extracted_reports = []

# Função auxiliar para extrair texto de elementos
def get_text(xpath):
    try:
        element = driver.find_element(By.XPATH, xpath)
        return element.text.strip() if element else "N/A"
    except:
        return "N/A"

# Processar **TODOS** os links da lista
for index, link in enumerate(report_links):
    try:
        driver.get(link)
        wait = WebDriverWait(driver, 10)  # Espera até 10s pelos elementos

        # Extração do título
        title = wait.until(EC.presence_of_element_located((By.XPATH, "//div[@id='report-container']/h2"))).text.strip()
        
        organization = get_text("//span[contains(text(),'Organization')]/following-sibling::span")
        students = get_text("//span[contains(text(),'Student')]/following-sibling::span")
        grade_level = get_text("//span[contains(text(),'Grade Level')]/following-sibling::span")
        educator = get_text("//span[contains(text(),'GLOBE Educator')]/following-sibling::span")
        contributors = get_text("//span[contains(text(),'Contributors')]/following-sibling::span")
        report_type = get_text("//span[contains(text(),'Report Type')]/following-sibling::span")
        protocols = get_text("//span[contains(text(),'Protocols')]/following-sibling::span")
        language = get_text("//span[contains(text(),'Language')]/following-sibling::span")
        date_submitted = get_text("//span[contains(text(),'Date Submitted')]/following-sibling::span")
        
        # Link do "View Research Report"
        try:
            report_link = driver.find_element(By.XPATH, "//a[b[contains(text(),'View Research Report')]]").get_attribute("href")
        except:
            report_link = "N/A"

        # Link do "Presentation Poster"
        try:
            poster_link = driver.find_element(By.XPATH, "//a[contains(text(),'View Document')]").get_attribute("href")
        except:
            poster_link = "N/A"

        # Salvar dados extraídos
        extracted_reports.append({
            "Title": title,
            "Organization": organization,
            "Students": students,
            "Grade Level": grade_level,
            "GLOBE Educator(s)": educator,
            "Contributors": contributors,
            "Report Type(s)": report_type,
            "Protocols": protocols,
            "Presentation Poster": poster_link,
            "Language(s)": language,
            "Date Submitted": date_submitted,
            "View Research Report": report_link,
            "Original Link": link
        })

        print(f"✅ ({index + 1}/{len(report_links)}) Extraído: {title}")

    except Exception as e:
        print(f"⚠️ Erro ao processar {link}: {str(e)}")

    # Pequeno atraso para evitar bloqueios
    time.sleep(1)

# Fechar o navegador
driver.quit()

# Salvar os dados em um arquivo JSON
with open("extracted_reports.json", "w", encoding="utf-8") as f:
    json.dump(extracted_reports, f, indent=4, ensure_ascii=False)

print(f"\n📂 Extração concluída! {len(extracted_reports)} relatórios salvos em 'extracted_reports.json'.")


✅ (1/2409) Extraído: Garonne, a river under surveillance
✅ (2/2409) Extraído: WOODLANDS NEAR THE SHORE OF THE BALTIC SEA IN MATSI
✅ (3/2409) Extraído: Scientific research on The effect of well water and soil and its relationship to agriculture in Tabuk
✅ (4/2409) Extraído: Sample study of polluted water gathered at the entrance to the school
✅ (5/2409) Extraído: The Silent Danger
✅ (6/2409) Extraído: Examining the effects of rising sea temperatures on the germination, growth and survival rate of Enhalus acoroides.
✅ (7/2409) Extraído: Taking Learning Outside
✅ (8/2409) Extraído: Do clouds form due to temperature
✅ (9/2409) Extraído: ESTUDIO PRELIMINAR DESCRIPTIVO DE POTENCIALES HABITATS DE MOSQUITOS EN LA COMUNIDAD DEL COLEGIO COOPERATIVO C. A. R. B DE APARTADÓ ANTIOQUIA Y CIRCUNDANTES Y SU INCIDENCIA EN LA SALUD COMUNITARIA.
✅ (10/2409) Extraído: TerraRover 2 Engineered to Detect and Record Potential Atmospheric Implications of the 2024 Solar Eclipse
✅ (11/2409) Extraído: Study on the

### ✅ **Conclusão**
Este notebook executa o web scraping do site do GLOBE, navegando automaticamente entre as páginas e salvando os dados em JSON.