In [124]:
import pandas as pd
import pymongo
import json

# Extract

In [148]:
# get the 2018 data occupation and demographics data
url = 'https://www.bls.gov/cps/cpsaat11.htm'
labor_force_stats = pd.read_html(url)
labor_force_stats = labor_force_stats[0]

# get the 2028 projected demographics data
url = 'https://www.bls.gov/emp/tables/civilian-labor-force-summary.htm'
demographics_2028 = pd.read_html(url)
demographics_2028 = demographics_2028[0]

# get the 2028 projected occupation data
url = 'https://www.bls.gov/emp/tables/occupational-projections-and-characteristics.htm'
occupations_2028 = pd.read_html(url)
occupations_2028 = occupations_2028[0]

# Transform

In [149]:
# clean up columns
labor_force_stats.columns = ['Occupation', 'Total Employed', 'Women', 'White',
                            'Black or African American', 'Asian', 'Hispanic or Latino']

# drop NAN's
labor_force_stats = labor_force_stats.dropna()

# delete the last row
labor_force_stats.drop(labor_force_stats.tail(1).index, inplace=True)
labor_force_stats.head()

Unnamed: 0,Occupation,Total Employed,Women,White,Black or African American,Asian,Hispanic or Latino
0,"Total, 16 years and over",155761,46.9,78.0,12.3,6.3,17.3
2,"Management, professional, and related occupations",62436,51.5,79.4,9.6,8.5,9.7
3,"Management, business, and financial operations...",25850,44.0,82.4,8.2,6.8,9.8
4,Management occupations,18263,40.0,83.9,7.6,5.9,10.3
5,Chief executives,1573,26.9,89.5,3.5,5.9,6.1


In [150]:
# clean up the 2028 projected demographics data
# ----------------------
# drop NAN's
demographics_2028 = demographics_2028.dropna()

# drop last row
demographics_2028.drop(demographics_2028.tail(1).index, inplace=False)

# get only the data we care about and rename columns
demographics_2028 = demographics_2028.iloc[[0,24,36,39,42,48],[0,4,14]]
demographics_2028.columns = ['Occupation', 'Total Employed', 'Total, 16 years and over']

# transpose to align with labor_force_stats dataframe
projections_2028 = demographics_2028.T

# clean up the table
projections_2028.reset_index(drop=False, inplace=True)
projections_2028.columns = ['Ocupation', 'Total Employed', 'Women', 'White', 'Black or African American', 'Asian', 'Hispanic or Latino']

projections_2028.iloc[1,1] = projections_2028.iloc[0,1]
projections_2028.drop(projections_2028.head(1).index, inplace=True)
projections_2028.drop(projections_2028.head(1).index, inplace=True)
projections_2028

Unnamed: 0,Ocupation,Total Employed,Women,White,Black or African American,Asian,Hispanic or Latino
2,"Total, 16 years and over",100.0,47.6,75.4,13.0,7.3,20.9


In [151]:
# clean up the occupation data
# ----------------------
# drop NAN's
occupations_2028 = occupations_2028.dropna()

# drop last row
occupations_2028.drop(occupations_2028.tail(1).index, inplace=True)

# get columns of interest and rename columns
occupations_2028 = occupations_2028.iloc[:,[0,4]]
occupations_2028.columns = ['Occupations', 'Total Employed']
occupations_2028

Unnamed: 0,Occupations,Total Employed
0,"Total, all occupations",169435.9
1,Management occupations,10900.2
2,Top executives,2844.8
3,Chief executives,248.8
4,General and operations managers,2541.4
...,...,...
1071,Wellhead pumpers,14.3
1072,Refuse and recyclable material collectors,143.9
1073,Mine shuttle car operators,1.3
1074,"Tank car, truck, and ship loaders",9.2


# Load

In [156]:
# connect to the mongo client
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# create occupations database
db = client.occupationsdb

# convert to a dictionary to insert into mongo
labor_dict_2018 = labor_force_stats.to_dict('record')

# insert into mongo database
db['2018'].insert_many(labor_dict_2018)

<pymongo.results.InsertManyResult at 0x12578dc48>

In [155]:
# load the 2028 data into mongo
# convert to a dictionary to insert into mongo
projections_2028_dict = projections_2028.to_dict('record')
occupations_2028_dict = occupations_2028.to_dict('record')

# insert into mongo database
db['2028'].insert_many(projections_2028_dict)
db['2028'].insert_many(occupations_2028_dict)

<pymongo.results.InsertManyResult at 0x12546ee48>

In [157]:
data = [{'headline': 'Valles Marineris Hemisphere Enhanced', 'teaser': "Teams with NASA's Mars 2020 and ESA's ExoMars practiced hunting for fossilized microbial life in the Australian Outback in preparation for their Red Planet missions. ", 'mars_featured_img_url': '/spaceimages/images/mediumsize/PIA18614_ip.jpg', 'mars_weather': 'InSight sol 346 (2019-11-16) low -101.5ºC (-150.8ºF) high -23.5ºC (-10.3ºF)\nwinds from the SSE at 4.8 m/s (10.8 mph) gusting to 20.0 m/s (44.7 mph)\npressure at 6.80 hPapic.twitter.com/zAXfs9KpgE', 'mars_facts_html': '<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Fact Name</th>\n      <th>Fact Value</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Mass:</td>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Moons:</td>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Orbit Distance:</td>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>Orbit Period:</td>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>Surface Temperature:</td>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>First Record:</td>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>Recorded By:</td>\n      <td>Egyptian astronomers</td>\n    </tr>\n  </tbody>\n</table>', 'hemispheres': [{'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif', 'title': 'Cerberus Hemisphere Enhanced'}, {'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif', 'title': 'Schiaparelli Hemisphere Enhanced'}, {'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif', 'title': 'Syrtis Major Hemisphere Enhanced'}, {'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif', 'title': 'Valles Marineris Hemisphere Enhanced'}]}]

In [161]:
data[0]['headline']

'Valles Marineris Hemisphere Enhanced'

In [163]:
# connect to the mongo client
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)
db = client.mars_app

In [164]:
db['mars'].insert_many(data)

<pymongo.results.InsertManyResult at 0x1241c0388>