In [1]:
# Native
import io
import os
import re
import time

# Third party
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist

In [10]:
def init_browser_bot(url, preload=False):
    global browser, executable_path, html, soup
    
    try:
        browser.visit(url)
    except:
        executable_path = {'executable_path': 'chromedriver.exe'}
        browser = Browser('chrome', **executable_path, headless=False)
        browser.visit(url)
    
    if preload:
        time.sleep(3)
        for _ in range(3):
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
    html = browser.html
    soup = bs(html, 'html.parser')

In [6]:
def scrape_mars_NASA_articles():
    url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    init_browser_bot(url)
    
    articles = []
    pageHasLoaded = False

    while not pageHasLoaded:
        if(browser.is_element_present_by_css("li.slide", wait_time=5)):
            pageHasLoaded = True
            results = soup.find_all('div', class_='list_text')

            for result in results:
                title = result.find('a').text
                teaser = result.find("div", class_="article_teaser_body").text
                articles.append((title, teaser))
                # print(f"{title}:\n{teaser}\n")
                
    return articles

scrape_mars_NASA_articles()

[("From JPL's Mailroom to Mars and Beyond",
  'Bill Allen has thrived as the mechanical systems design lead for three Mars rover missions, but he got his start as a teenager sorting letters for the NASA center.'),
 ("5 Hidden Gems Are Riding Aboard NASA's Perseverance Rover",
  "The symbols, mottos, and small objects added to the agency's newest Mars rover serve a variety of purposes, from functional to decorative."),
 ('MOXIE Could Help Future Rockets Launch Off Mars',
  "NASA's Perseverance rover carries a device to convert Martian air into oxygen that, if produced on a larger scale, could be used not just for breathing, but also for fuel."),
 ("Hear Audio From NASA's Perseverance As It Travels Through Deep Space",
  "The first to be rigged with microphones, the agency's latest Mars rover picked up the subtle sounds of its own inner workings during interplanetary flight."),
 ('Mars Is Getting a New Robotic Meteorologist',
  "Sensors on NASA's Perseverance will help prepare for future

In [7]:
def scrape_mars_NASA_featured_image():
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    init_browser_bot(url)
    
    featured_image_url = ""
    pageHasLoaded = False

    while not pageHasLoaded:
        if(browser.is_element_present_by_id("page", wait_time=5)):
            pageHasLoaded = True
            browser.click_link_by_id('full_image')
            browser.click_link_by_partial_text('more info')
            browser.click_link_by_partial_href('//photojournal.jpl.nasa.gov/jpeg/')
            featured_image_url = browser.url
            # print(f'Featured image url: {featured_image_url}')
            
    return featured_image_url
            
scrape_mars_NASA_featured_image()



'https://photojournal.jpl.nasa.gov/jpeg/PIA18903.jpg'

In [14]:
def scrape_mars_weather_tweets():
    url = "https://twitter.com/marswxreport?lang=en"
    init_browser_bot(url, True)
    
    tweets = []
    pageHasLoaded = False

    while not pageHasLoaded:
        if(browser.is_element_present_by_css('div[data-testid="tweet"]', wait_time=5)):
            pageHasLoaded = True
            results = re.findall(r'InSight sol.*?<', html, re.DOTALL)
            
            for result in results:
                    tweet = result
                    if 'InSight sol' in tweet:
                        tweets.append(tweet[:-1].replace("hPa", "hPa "))

    return tweets
scrape_mars_weather_tweets()

['InSight sol 689 (2020-11-03) low -96.8ºC (-142.2ºF) high -18.4ºC (-1.0ºF)\nwinds from the WNW at 8.5 m/s (18.9 mph) gusting to 25.7 m/s (57.5 mph)\npressure at 7.50 hPa ',
 'InSight sol 688 (2020-11-02) low -95.7ºC (-140.2ºF) high -12.8ºC (9.0ºF)\nwinds from the WNW at 5.4 m/s (12.2 mph) gusting to 17.2 m/s (38.5 mph)\npressure at 7.40 hPa ',
 'InSight sol 687 (2020-11-01) pressure at 7.40 hPa ',
 'InSight sol 686 (2020-10-31) pressure at 7.40 hPa ',
 'InSight sol 685 (2020-10-30) low -97.2ºC (-142.9ºF) high -11.9ºC (10.6ºF)\nwinds from the W at 6.4 m/s (14.3 mph) gusting to 18.3 m/s (41.0 mph)\npressure at 7.40 hPa ',
 'InSight sol 684 (2020-10-29) low -95.5ºC (-139.8ºF) high -17.9ºC (-0.1ºF)\npressure at 7.40 hPa ',
 'InSight sol 681 (2020-10-25) low -95.4ºC (-139.8ºF) high -4.4ºC (24.0ºF)\nwinds from the WNW at 5.6 m/s (12.6 mph) gusting to 18.6 m/s (41.6 mph)\npressure at 7.40 hPa ',
 'InSight sol 676 (2020-10-21) low -96.9ºC (-142.4ºF) high -16.5ºC (2.3ºF)\nwinds from the W at 8

In [20]:
def scrape_mars_space_facts_html_table():
    url = "https://space-facts.com/mars/"
    space_facts_html = pd.read_html(url)
    stringBuffer = io.StringIO()
    facts_df = pd.DataFrame({"Property": space_facts_html[0][0], "Value": space_facts_html[0][1]})
    facts_df.set_index("Property").to_html(buf=stringBuffer, classes='table table-striped')
    facts_html_table = stringBuffer.getvalue()
    
    return facts_html_table

scrape_mars_space_facts_html_table()

<table border="1" class="dataframe table table-striped">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Value</th>
    </tr>
    <tr>
      <th>Property</th>
      <th></th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>Equatorial Diameter:</th>
      <td>6,792 km</td>
    </tr>
    <tr>
      <th>Polar Diameter:</th>
      <td>6,752 km</td>
    </tr>
    <tr>
      <th>Mass:</th>
      <td>6.39 × 10^23 kg (0.11 Earths)</td>
    </tr>
    <tr>
      <th>Moons:</th>
      <td>2 (Phobos &amp; Deimos)</td>
    </tr>
    <tr>
      <th>Orbit Distance:</th>
      <td>227,943,824 km (1.38 AU)</td>
    </tr>
    <tr>
      <th>Orbit Period:</th>
      <td>687 days (1.9 years)</td>
    </tr>
    <tr>
      <th>Surface Temperature:</th>
      <td>-87 to -5 °C</td>
    </tr>
    <tr>
      <th>First Record:</th>
      <td>2nd millennium BC</td>
    </tr>
    <tr>
      <th>Recorded By:</th>
      <td>Egyptian astronomers</td>
    </tr>
  </tbody>
</table>


In [21]:
def get_mars_hemispheres():
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    init_browser_bot(url)
    
    pageHasLoaded = False

    while not pageHasLoaded:
        if(browser.is_element_present_by_tag("section", wait_time=5)):
            pageHasLoaded = True
            results = soup.find_all('div', class_='item') 
            counter = -1

            for result in results:
                counter += 1

                names = soup.find_all('h3')
                name = names[counter].get_text().rstrip("Enhanced")
                print(name)

                browser.find_by_tag("h3")[counter].click()
                html2 = browser.html
                soup2 = bs(html2, 'html.parser')
                new_url = soup2.find("a", string="Sample").get('href')
                browser.visit(new_url)
                img_url = browser.url
                print(img_url)
                hemisphere_image_urls.append({"title": name, "img_url": img_url})

                browser.back()
                browser.back()
   
get_mars_hemispheres()

Cerberus Hemisphere 
https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg
{a: text= 'Sample'} element was not found.


In [31]:
news_title = titles[0]
news_p = teasers[0]
mars_weather = tweets[0]
hemisphere_image_urls = hemisphere_image_urls

In [32]:
browser.quit()