In [None]:
import time
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

import os
os.environ["LANG"] = "en_US.UTF-8"

In [None]:

# URL prefix for lesson pages; the crawl appends an integer to it to build each page URL.
base_url = 'https://llis.nasa.gov/lesson/'

In [None]:

# Run Firefox with the --headless argument.
ff_options = Options()
ff_options.add_argument("--headless")

# Single browser session shared by collect_lesson_links and scrape_lesson_content,
# which both read this module-level `driver`.
driver = webdriver.Firefox(options=ff_options)

def collect_lesson_links(base_url):
    """Open ``base_url`` in the shared browser and return the lesson links on it.

    Args:
        base_url: URL of the page to load.

    Returns:
        A list of absolute URLs, one per ``<a>`` element whose ``href``
        attribute starts with ``/lesson/``. The list is empty when the page
        cannot be loaded or no matching anchor shows up within 10 seconds.
    """
    lesson_anchor = (By.CSS_SELECTOR, 'a[href^="/lesson/"]')
    try:
        driver.get(base_url)
        # Wait for dynamic content to load; `until` returns the located
        # elements, so there is no need to query the page a second time.
        anchors = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(lesson_anchor)
        )
        hrefs = [anchor.get_attribute('href') for anchor in anchors]
    except WebDriverException:
        # Covers load failures and wait timeouts only, so Ctrl-C and genuine
        # programming errors still propagate instead of being swallowed.
        print("Error getting links from: " + base_url)
        return []
    # Skip anchors without an href and resolve any relative href against the
    # page URL; already-absolute hrefs pass through urljoin unchanged.
    return [urljoin(base_url, href) for href in hrefs if href]

def scrape_lesson_content(links):
    """Visit each link and collect the text of its ``<p>`` elements.

    Args:
        links: Iterable of page URLs to open in the shared browser.

    Returns:
        A list of ``[link, paragraph_text]`` pairs. Whitespace-only paragraphs
        are dropped, and a given paragraph text is recorded only the first
        time it is seen across all visited pages. Links that fail to load are
        reported and skipped.
    """
    data = []
    seen_texts = set()  # texts already recorded, for constant-time duplicate checks
    for link in links:
        try:
            driver.get(link)
        except WebDriverException:
            # One bad page should not discard everything collected so far.
            print("Error loading: " + link)
            continue
        # Allow some time for the page to load and JavaScript to render content
        time.sleep(5)
        # Parse the rendered page source and walk its <p> elements
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        for p in soup.find_all('p'):
            paragraph_text = p.text.strip()
            if paragraph_text and paragraph_text not in seen_texts:
                seen_texts.add(paragraph_text)
                data.append([link, paragraph_text])
    return data

In [None]:
# Probe base_url + 1 .. base_url + 9999 and gather every distinct lesson URL,
# preserving first-seen order in master_list.
master_list = []
seen_links = set()  # mirrors master_list so membership checks stay constant-time

for i in range(1, 10000):
    test_link = base_url + str(i)
    # try to collect some links
    links = collect_lesson_links(test_link)
    # Nothing returned: the probed page yielded no lesson links, so skip it
    if not links:
        continue
    # Record the probed page itself first, then every link found on it,
    # ignoring anything already collected
    for link in [test_link, *links]:
        if link not in seen_links:
            seen_links.add(link)
            master_list.append(link)



In [None]:
# Report how many URLs were collected, then order the list in place
# (plain string ordering of the URLs).
link_count = len(master_list)
print(f"Found:  {link_count}  links.")
master_list.sort()

In [None]:
# Write the collected URLs to a single-column CSV file.
output_path = './Downloads/urls.csv'
# to_csv does not create missing parent folders, so make sure the target
# directory exists before writing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df = pd.DataFrame(master_list, columns=['URL'])
df.to_csv(output_path, index=False)

In [None]:

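# Scrape content from each link
# NOTE: `links` holds only the result of the last crawl iteration; pass
# `master_list` instead to scrape every collected URL.
#content = scrape_lesson_content(links)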

# Convert the list to a DataFrame
#df = pd.DataFrame(content, columns=['URL', 'Paragraph'])

# Save the DataFrame to a CSV file
#df.to_csv('./Downloads/lesson_content.csv', index=False)

In [None]:
# Shut down the shared browser session created by webdriver.Firefox above.
driver.quit()