In [45]:
# Import Splinter and BeautifulSoup
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd 

In [6]:
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)




Current google-chrome version is 97.0.4692
Get LATEST chromedriver version for 97.0.4692 google-chrome
Driver [C:\Users\hulya\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe] found in cache


In [7]:
# Visit the mars nasa news site
url = 'https://redplanetscience.com'
browser.visit(url)
# Optional delay for loading the page
browser.is_element_present_by_css('div.list_text', wait_time=1)


True

In [18]:
html = browser.html
news_soup = soup(html, 'html.parser')
slide_elem = news_soup.select_one('div.list_text')
slide_elem

<div class="list_text">
<div class="list_date">January 24, 2022</div>
<div class="content_title">NASA's Mars 2020 Rover Closer to Getting Its Name</div>
<div class="article_teaser_body">155 students from across the U.S. have been chosen as semifinalists in NASA's essay contest to name the Mars 2020 rover, and see it launch from Cape Canaveral this July.</div>
</div>

In [19]:
slide_elem.find('div', class_='content_title')

<div class="content_title">NASA's Mars 2020 Rover Closer to Getting Its Name</div>

In [20]:
# Use the parent element to find the first `a` tag and save it as `news_title`
news_title = slide_elem.find('div', class_='content_title').get_text()
news_title


"NASA's Mars 2020 Rover Closer to Getting Its Name"

In [21]:
# Use the parent element to find the paragraph text
news_p = slide_elem.find('div', class_='article_teaser_body').get_text()
news_p


"155 students from across the U.S. have been chosen as semifinalists in NASA's essay contest to name the Mars 2020 rover, and see it launch from Cape Canaveral this July."

### Featured Images 

In [36]:
# Visit URL
url = 'https://spaceimages-mars.com'
browser.visit(url)
# Parse the resulting html with soup
html = browser.html
img_soup = soup(html, 'html.parser')


In [37]:
# Find and click the full image button
full_image_elem = browser.find_by_tag('button')[1]
# full_image_elem.click()
full_image_elem

<splinter.driver.webdriver.WebDriverElement at 0x23611540e50>

In [43]:
# Find the relative image url
img_url_rel = img_soup.find('img', class_='thumbimg').get('src')
img_url_rel


'image/mars/Icaria Fossae7.jpg'

In [44]:
# If we look at our address bar in the webpage, we can see 
# the entire URL up there already; we just need to add the first portion to our app.
# Use the base URL to create an absolute URL
img_url = f'https://spaceimages-mars.com/{img_url_rel}'
img_url


'https://spaceimages-mars.com/image/mars/Icaria Fossae7.jpg'

In [46]:
# The Pandas function read_html() specifically searches for and 
# returns a list of tables found in the HTML. By specifying an index of 
# 0, we're telling Pandas to pull only the first table it encounters,
# or the first item in the list. Then, it turns the table into a DataFrame.
df = pd.read_html('https://galaxyfacts-mars.com')[0]
# •	df.columns=['description', 'Mars', 'Earth'] Here, we assign columns 
# to the new DataFrame for additional clarity.
# •	df.set_index('description', inplace=True) By using the .set_index() 
# function, we're turning the Description column 
# into the DataFrame's index. inplace=True means that the updated index 
# will remain in place, without having to reassign the DataFrame to a new variable.
# Now, when we call the DataFrame, we're presented with a tidy, 
# Pandas-friendly representation of the HTML table we were just viewing 
df.columns=['description', 'Mars', 'Earth']
df.set_index('description', inplace=True)
df


Unnamed: 0_level_0,Mars,Earth
description,Unnamed: 1_level_1,Unnamed: 2_level_1
Mars - Earth Comparison,Mars,Earth
Diameter:,"6,779 km","12,742 km"
Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
Moons:,2,1
Distance from Sun:,"227,943,824 km","149,598,262 km"
Length of Year:,687 Earth days,365.24 days
Temperature:,-87 to -5 °C,-88 to 58°C


In [47]:
# How do we add the DataFrame to a web application? Robin's web app is going to be an actual webpage
# Pandas also has a way to easily convert our DataFrame back into HTML-ready code using the .to_html() function
df.to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Mars</th>\n      <th>Earth</th>\n    </tr>\n    <tr>\n      <th>description</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Mars - Earth Comparison</th>\n      <td>Mars</td>\n      <td>Earth</td>\n    </tr>\n    <tr>\n      <th>Diameter:</th>\n      <td>6,779 km</td>\n      <td>12,742 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg</td>\n      <td>5.97 × 10^24 kg</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>Distance from Sun:</th>\n      <td>227,943,824 km</td>\n      <td>149,598,262 km</td>\n    </tr>\n    <tr>\n      <th>Length of Year:</th>\n      <td>687 Earth days</td>\n      <td>365.24 days</td>\n    </tr>\n    <tr>\n      <th>Temperature:</th>\n      <td>-87 to -5 °C</td>\n      <td>-88 to 58°C</td>\n    </tr>\n  </tbody>

In [48]:
# we can end the automated browsing session. This is an important line to 
# add to our web app also. Without it, the automated browser won't know to 
# shut down—it will continue to listen for instructions and use the computer's
# resources (it may put a strain on memory or a laptop's battery if left on).
#            We really only want the automated browser to remain active while
#            we're scraping data. It's like turning off a light switch when 
#            you're ready to leave the room or home. 
browser.quit()

In [None]:
# we can't automate the scraping using the Jupyter Notebook. 
# To fully automate it, it will need to be converted into a .py file.