In [24]:
# Import Splinter and BeautifulSoup
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager

import pandas as pd

In [3]:
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)



Current google-chrome version is 95.0.4638
Get LATEST chromedriver version for 95.0.4638 google-chrome
Driver [C:\Users\trsam\.wdm\drivers\chromedriver\win32\95.0.4638.69\chromedriver.exe] found in cache


In [4]:
# Visit the mars nasa news site
url = 'https://redplanetscience.com'
browser.visit(url)

# Optional delay for loading the page
browser.is_element_present_by_css('div.list_text', wait_time=1)
# line accomplishes two things: 1, searches for elements with a specific combo of tag (div) and attribute (list_tex) ul.item_list = <ul class="item_list". in HTML
# 2, telling the browser to wait one second before searching for the components

True

In [6]:
html = browser.html
news_soup = soup(html, 'html.parser')
slide_elem = news_soup.select_one('div.list_text') # looks for (div) tag and its decsendents, the parent element, pinpoints the (div) tag with the class of (list_tex)

In [7]:
slide_elem.find('div', class_='content_title') 
# use the find function within the slide_elem variable that was assigned in previous cell

<div class="content_title">7 Things to Know About the Mars 2020 Perseverance Rover Mission</div>

In [8]:
# Use the parent element to find the first "a" tag and save it as a "news_title"
news_title = slide_elem.find('div', class_='content_title').get_text() # only the text will be returned
news_title

# will result int he most recent title published on the website

'7 Things to Know About the Mars 2020 Perseverance Rover Mission'

In [9]:
# Use the parent element to find the paragraph text
news_p = slide_elem.find('div', class_='article_teaser_body').get_text()
news_p

"NASA's next rover to the Red Planet is slated to launch no earlier than July 30. These highlights will get you up to speed on the ambitious mission."

### Featured Images

In [19]:
# Visit URL
url = 'https://spaceimages-mars.com'
browser.visit(url)

In [20]:
# Find and click the full image button
full_image_elem = browser.find_by_tag('button')[1] # The browser finds its element by its tag
full_image_elem.click() # Splinter will click the image to view its full size

In [21]:
# Parse the resulting html with soup
html = browser.html
img_soup = soup(html, 'html.parser')

In [22]:
# Find the relative image url
img_url_rel = img_soup.find('img', class_='fancybox-image').get('src') # an img is nested within this HTML, .get.('src') pulls the link to the image
img_url_rel 

# BeautifulSoup to look inside the img tag for an image with a class of fancybox-image "This is where the image we want lives - use the link that's inside these tags."

'image/featured/mars1.jpg'

In [23]:
# Use the base URL to the code
img_url = f'https://spaceimages_mars.com/{img_url_rel}'
img_url

'https://spaceimages_mars.com/image/featured/mars1.jpg'

In [26]:
df = pd.read_html('https://galaxyfacts-mars.com')[0] #creating a new DataFrame from the HTML table. the read_html() specifically searches for and returns a list of tables found in HTML. By specifying an index of 0, telling Pandas to pull only the first table it encounters or the first item in the list then it turns it into a DataFrame
df.columns = ['description', 'Mars', 'Earth']  # assigns columns to the new DataFrame for additional clarity
df.set_index('description', inplace=True) # turns the Description column into the DataFrame's index.  (inplace=True) means that the updated index will remain in place without reassigning the DataFrame to a new variable
df

Unnamed: 0_level_0,Mars,Earth
description,Unnamed: 1_level_1,Unnamed: 2_level_1
Mars - Earth Comparison,Mars,Earth
Diameter:,"6,779 km","12,742 km"
Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
Moons:,2,1
Distance from Sun:,"227,943,824 km","149,598,262 km"
Length of Year:,687 Earth days,365.24 days
Temperature:,-87 to -5 °C,-88 to 58°C


In [27]:
df.to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Mars</th>\n      <th>Earth</th>\n    </tr>\n    <tr>\n      <th>description</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Mars - Earth Comparison</th>\n      <td>Mars</td>\n      <td>Earth</td>\n    </tr>\n    <tr>\n      <th>Diameter:</th>\n      <td>6,779 km</td>\n      <td>12,742 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg</td>\n      <td>5.97 × 10^24 kg</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>Distance from Sun:</th>\n      <td>227,943,824 km</td>\n      <td>149,598,262 km</td>\n    </tr>\n    <tr>\n      <th>Length of Year:</th>\n      <td>687 Earth days</td>\n      <td>365.24 days</td>\n    </tr>\n    <tr>\n      <th>Temperature:</th>\n      <td>-87 to -5 °C</td>\n      <td>-88 to 58°C</td>\n    </tr>\n  </tbody>

In [28]:
browser.quit()