In [81]:
import os
import re

import pandas as pd
import requests

from pathlib import Path
from bs4 import BeautifulSoup

In [82]:
# Set this to True to force a fresh download of the source pages.
# With False, a cached raw HTML file under data/ is reused whenever it exists.
force_download = False

In [83]:
# Output attributes, in order. Some of these fields come from the
# 'Planets and Pluto: Physical Characteristics' web page; for those, the second
# element is the raw header markup of the matching HTML table column. The last
# three entries carry the placeholder header 'None'.
_column_specs = [
    ('planet', '<b>Planet</b>'),
    ('equatorial_radius', '<b>Equatorial<br/>Radius</b>'),
    ('mean_radius', '<b>Mean<br/>Radius</b>'),
    ('mass', '<b>Mass</b>'),
    ('bulk_density', '<b>Bulk<br/>Density</b>'),
    ('sidereal_rotation_period', '<b>Sidereal<br/>Rotation Period</b>'),
    ('sidereal_orbit_period', '<b>Sidereal<br/>Orbit Period</b>'),
    ('v_1_0', '<b>V(1,0)</b>'),
    ('geometric_albedo', '<b>Geometric<br/>Albedo</b>'),
    ('equatorial_gravity', '<b>Equatorial<br/>Gravity</b>'),
    ('escape_velocity', '<b>Escape<br/>Velocity</b>'),
    ('begin_time', 'None'),
    ('end_time', 'None'),
    ('horizons', 'None'),
]

# Each entry's 'index' is its position in the list.
index_cols = [
    {'index': position, 'attribute': attribute, 'table_column_name': header}
    for position, (attribute, header) in enumerate(_column_specs)
]

In [84]:
# Basic info about the planets: one record per planet, in the field order below.
_planet_fields = ('name', 'begin_time', 'end_time', 'horizons', 'Moon Info')
_planet_records = [
    ('Mercury', '1999-10-05-00-00-00', '2000-01-01-00-00-00', '199', None),
    ('Venus', '1999-05-21-07-12-00', '2000-01-01-00-00-00', '299', None),
    ('Earth', '1998-12-31-19-12-01', '2000-01-01-00-00-00', '399', "Earth's Moon"),
    ('Mars', '1998-02-13-00-00-01', '2000-01-01-00-00-00', '499', "Martian System"),
    ('Jupiter', '1988-02-22-00-00-08', '2000-01-01-00-00-00', '599', "Jovian System"),
    ('Saturn', '1970-07-30-00-00-23', '2000-01-01-00-00-00', '699', "Saturnian System"),
    ('Uranus', '1916-04-02-00-00-32', '2000-01-01-00-00-00', '799', "Uranian Satellites"),
    ('Neptune', '1836-04-10-00-00-32', '2000-01-01-00-00-00', '899', "Neptunian Satellites"),
]

# Pair every record with the field names to get one dict per planet.
planets = [dict(zip(_planet_fields, record)) for record in _planet_records]

In [85]:
# Row labels taken from the 'Planetary Fact Sheet - Metric' webpage.
# Add further labels to this list to pull more rows from that table.
coordinated_columns = ['Orbital Period']

In [86]:
# URLs of the source pages.
# planets_page is requested when the planets HTML is not read from the cache.
# coordinated_data_page is requested when the fact sheet HTML is not read from the cache.
# NOTE(review): moons_page is not used by any cell shown in this notebook - confirm whether it is still needed.
planets_page = 'https://ssd.jpl.nasa.gov/?planet_phys_par'
moons_page = 'https://ssd.jpl.nasa.gov/?sat_phys_par'
coordinated_data_page = 'https://nssdc.gsfc.nasa.gov/planetary/factsheet/'

## Planets and Pluto: Physical Characteristics processing

In [87]:
# Open the cached copy of the planets page, or fetch the page and cache it.
raw_planets_file = Path('data', 'raw_planets.html')
if raw_planets_file.is_file() and not force_download:
    # Parse inside a context manager so the file handle is closed afterwards.
    with open(raw_planets_file) as f:
        planets_soup = BeautifulSoup(f, 'lxml')
else:
    # Fetch the page; stop on an HTTP error instead of caching an error page.
    planets_results = requests.get(planets_page)
    planets_results.raise_for_status()
    planets_content = planets_results.content
    # Save the file, creating the data directory if it does not exist yet.
    raw_planets_file.parent.mkdir(parents=True, exist_ok=True)
    with open(raw_planets_file, 'wb') as f:
        f.write(planets_content)
    # Parse the page
    planets_soup = BeautifulSoup(planets_content, 'lxml')

In [88]:
# Find the div with the class 'page_title'; the planets table is the first
# table that follows it in the document.
page_title = planets_soup.find('div', 'page_title')

# Look the table up only when the div exists: chaining straight off find()
# raises an AttributeError on None when the page layout differs.
planets_table = page_title.find_next('table') if page_title is not None else None

# Stop here with a clear message; the following cells cannot work without the table.
if planets_table is None:
    raise RuntimeError('Planets table not found.')
print('Planets table found.')

Planets table found.


In [89]:
# Figure out which columns in the HTML table represent the values in index_cols.
# This assumes that the first row in the planets table represents the headers.
first_row = planets_table.find('tr')

# Render the first child of every header cell once. An empty cell becomes None
# so that it can never match a header name (and never raises an IndexError).
header_labels = [
    str(cell.contents[0]) if cell.contents else None
    for cell in first_row.find_all('td')
]

for col in index_cols:
    if col['table_column_name'] in header_labels:
        # Position of the first header cell whose markup matches exactly.
        col['table_pos'] = header_labels.index(col['table_column_name'])
        print('Found column {}.'.format(col['attribute']))
    else:
        # -1 marks an attribute that has no column in the HTML table.
        col['table_pos'] = -1

Found column planet.
Found column equatorial_radius.
Found column mean_radius.
Found column mass.
Found column bulk_density.
Found column sidereal_rotation_period.
Found column sidereal_orbit_period.
Found column v_1_0.
Found column geometric_albedo.
Found column equatorial_gravity.
Found column escape_velocity.


In [90]:
# Parse data for planets: one dict per planet, keyed by the attribute names in index_cols.
rows = []
for planet in planets:
    val = {}

    # Locate the text node holding the planet name, then the table row around it.
    name_node = planets_table.find(string=planet['name'])
    if name_node is None:
        raise ValueError('Planet {} not found in the planets table.'.format(planet['name']))
    parent_row = name_node.find_parent('tr')

    # Direct child cells of the row, collected once and reused for every attribute.
    cells = parent_row.find_all('td', recursive=False)
    column_count = len(cells)

    for col in index_cols:
        if 0 <= col['table_pos'] < column_count:
            found = cells[col['table_pos']]
            if found.table is None:
                val[col['attribute']] = found.contents[0]
            else:
                # The cell wraps a nested table; take the first cell of its first row.
                val[col['attribute']] = found.table.tr.td.contents[0]
        else:
            # No usable HTML column for this attribute: take the value from the planets list.
            val[col['attribute']] = planet[col['attribute']]

    rows.append(val)
    print('{} row parsed.'.format(planet['name']))

Mercury row parsed.
Venus row parsed.
Earth row parsed.
Mars row parsed.
Jupiter row parsed.
Saturn row parsed.
Uranus row parsed.
Neptune row parsed.


In [91]:
# Build the dataframe in one step from the parsed rows, with the columns in the
# key order of the first row. (DataFrame.append was removed in pandas 2.0.)
planet_df_ = pd.DataFrame(rows, columns=list(rows[0].keys()))

# 'planet' is kept as a regular column with the default integer index: the
# merge and the full_name computation further down read planet_df_['planet'].

# Convert the numeric columns to float.
numeric_columns = [
    'equatorial_radius',
    'mean_radius',
    'mass',
    'bulk_density',
    'sidereal_rotation_period',
    'sidereal_orbit_period',
    'v_1_0',
    'geometric_albedo',
    'equatorial_gravity',
    'escape_velocity',
]
planet_df_ = planet_df_.astype({name: float for name in numeric_columns})
planet_df_

Unnamed: 0,planet,equatorial_radius,mean_radius,mass,bulk_density,sidereal_rotation_period,sidereal_orbit_period,v_1_0,geometric_albedo,equatorial_gravity,escape_velocity,begin_time,end_time,horizons
0,Mercury,2440.53,2439.4,0.330114,5.4291,58.6462,0.240847,-0.6,0.106,3.7,4.25,1999-10-05-00-00-00,2000-01-01-00-00-00,199
1,Venus,6051.8,6051.8,4.86747,5.243,-243.018,0.615197,-4.47,0.65,8.87,10.36,1999-05-21-07-12-00,2000-01-01-00-00-00,299
2,Earth,6378.1366,6371.0084,5.97237,5.5136,0.99727,1.000017,-3.86,0.367,9.8,11.19,1998-12-31-19-12-01,2000-01-01-00-00-00,399
3,Mars,3396.19,3389.5,0.641712,3.9341,1.025957,1.880848,-1.52,0.15,3.71,5.03,1998-02-13-00-00-01,2000-01-01-00-00-00,499
4,Jupiter,71492.0,69911.0,1898.187,1.3262,0.41354,11.862615,-9.4,0.52,24.79,60.2,1988-02-22-00-00-08,2000-01-01-00-00-00,599
5,Saturn,60268.0,58232.0,568.336,0.6871,0.44401,29.447498,-8.88,0.47,10.44,36.09,1970-07-30-00-00-23,2000-01-01-00-00-00,699
6,Uranus,25559.0,25362.0,86.8127,1.27,-0.71833,84.016846,-7.19,0.51,8.87,21.38,1916-04-02-00-00-32,2000-01-01-00-00-00,799
7,Neptune,24764.0,24622.0,102.4126,1.638,0.67125,164.79132,-6.87,0.41,11.15,23.56,1836-04-10-00-00-32,2000-01-01-00-00-00,899


## Planetary Fact Sheet - Metric processing

In [92]:
# Open the cached copy of the fact sheet page, or fetch the page and cache it.
raw_coordinated_file = Path('data', 'raw_coordinated.html')
if raw_coordinated_file.is_file() and not force_download:
    # Parse inside a context manager so the file handle is closed afterwards.
    with open(raw_coordinated_file) as f:
        coordinated_soup = BeautifulSoup(f, 'lxml')
else:
    # Fetch the page; stop on an HTTP error instead of caching an error page.
    coordinated_data_results = requests.get(coordinated_data_page)
    coordinated_data_results.raise_for_status()
    coordinated_data_content = coordinated_data_results.content
    # Save the file, creating the data directory if it does not exist yet.
    raw_coordinated_file.parent.mkdir(parents=True, exist_ok=True)
    with open(raw_coordinated_file, 'wb') as f:
        f.write(coordinated_data_content)
    # Parse the content that was just downloaded for this page.
    # (Parsing planets_content here would read the wrong page, or fail with a
    # NameError when the planets page was loaded from the cache.)
    coordinated_soup = BeautifulSoup(coordinated_data_content, 'lxml')

In [93]:
# The page is expected to hold a single table; take the first <table> element.
data_table = coordinated_soup.table

In [94]:
# Work out which planet each data column belongs to, using the first table row.
first_row = data_table.find('tr')

# Cells of the first row.
first_row_columns = first_row.find_all('td')

# Accumulator for the cell values read from the table.
planet_data = []

# Counter of the row being processed.
row_count = 0

# Planet names are the link text of every cell after the first one.
planet_cols = [header_cell.find('a').string for header_cell in first_row_columns[1:]]

print('{} columns found.'.format(len(planet_cols)))

10 columns found.


In [95]:
# Find all the rows in the data table.
rows = data_table.find_all('tr')

# Start from an empty list so that re-running this cell does not append duplicates.
planet_data = []

# The first and last rows don't need to be processed. They are skipped with a
# slice instead of a running counter, so the result no longer depends on
# counter state left over from an earlier cell or an earlier run.
for row in rows[1:-1]:
    columns = row.find_all('td')
    # The row label is the link text of the first cell.
    index = row.find('td').find('a').string
    # Remaining cells are numbered from 1, in table order.
    for col_count, column in enumerate(columns[1:], start=1):
        planet_data.append({'index': index, 'column': col_count, 'value': column.string})

print('Planetary fact has {} rows'.format(len(rows)))

Planetary fact has 22 rows


In [96]:
# Hold the data from the coordinated data source
coordinated_data = []

# Loop over the master list of planets.
for planet in planets:
    planet_coordinated_data = {'planet': planet['name']}

    # Column number of this planet in planet_data (columns are numbered from 1).
    # It is the same for every field, so it is computed once per planet.
    planet_column = planet_cols.index(planet['name'].upper()) + 1

    # Pick only the desired fields from planet_data.
    for coordinated_field in coordinated_columns:
        for item in planet_data:
            if item['index'] == coordinated_field and item['column'] == planet_column:
                # The first matching entry wins.
                planet_coordinated_data[coordinated_field] = item['value']
                break
        else:
            # Name the missing field instead of surfacing a bare StopIteration.
            raise LookupError(
                'No {} value found for {}.'.format(coordinated_field, planet['name'])
            )
    coordinated_data.append(planet_coordinated_data)

print('Coordinated data has {} entries.'.format(len(coordinated_data)))

Coordinated data has 8 entries.


In [97]:
# Build the dataframe in one step from the coordinated data, with the columns in
# the key order of the first entry. (DataFrame.append was removed in pandas 2.0.)
planet_coordinated_df_ = pd.DataFrame(
    coordinated_data, columns=list(coordinated_data[0].keys())
)

# 'planet' is kept as a regular column: it is the key used to merge this frame
# with planet_df_ further down.

print('Dataframe shape {}.'.format(planet_coordinated_df_.shape))
planet_coordinated_df_

Dataframe shape (8, 2).


Unnamed: 0,planet,Orbital Period
0,Mercury,88.0
1,Venus,224.7
2,Earth,365.2
3,Mars,687.0
4,Jupiter,4331.0
5,Saturn,10747.0
6,Uranus,30589.0
7,Neptune,59800.0


In [None]:
# Join the planets and coordinated planet data on the shared 'planet' column.
# NOTE: this cell rebinds planet_df_; re-run the cells that build planet_df_
# before running this cell a second time.
planet_df_ = pd.merge(planet_df_, planet_coordinated_df_, on='planet', how='inner')

# Add a field called full_name, e.g. "[199] Mercury".
planet_df_['full_name'] = "[" + planet_df_['horizons'] + "] " + planet_df_['planet']

# Add a field called diameter (twice the mean radius).
planet_df_['diameter'] = planet_df_['mean_radius'] * 2.0

# Select the fields to include in the dataset. 'planet' appears twice on purpose:
# the CSV export writes nine headers, labelling the first column 'name' and the
# second copy 'class'.
planet_df_ = planet_df_[['planet', 'full_name', 'diameter', 'mean_radius', 'end_time', 'begin_time', 'horizons', 'planet', 'Orbital Period']]

In [99]:
# Delete any previous export first - presumably so that the existence check at
# the end reflects this run rather than an older file.
output_path = Path('data', 'planets.csv')
if output_path.is_file():
    output_path.unlink()

# Column headers written to the CSV, in the order of the selected dataframe columns.
export_header = ['name', 'full_name', 'diameter', 'mean_radius', 'end_time', 'begin_time', 'horizons', 'class', 'per']

# Write the csv without the dataframe index.
planet_df_.to_csv(output_path, header=export_header, index=False)

# Report whether the file is now present.
status_template = '{} written' if output_path.is_file() else '{} not written'
print(status_template.format(output_path))

data\planets.csv written
