# Scrape data of stay-at-home orders

Data is scraped from

https://www.kff.org/coronavirus-policy-watch/stay-at-home-orders-to-fight-covid19/

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
from pymongo import MongoClient
import pprint

import copy
import pandas as pd

# Requests sends and recieves HTTP requests.
import requests

# Beautiful Soup parses HTML documents in python.
from bs4 import BeautifulSoup

### Step 1: Check out the website in a browser.

In [3]:
stay_at_home_url = 'https://www.kff.org/coronavirus-policy-watch/stay-at-home-orders-to-fight-covid19/'

### Step 2: Send a GET request for the data.

In [4]:
r = requests.get(stay_at_home_url)

In [5]:
r.status_code

200

### Step 3: Save all the hypertext into mongo for later use.

In [7]:
client = MongoClient('localhost', 27017)
db = client.kff_org
pages = db.pages

pages.insert_one({'html': r.content})

<pymongo.results.InsertOneResult at 0x120309500>

### Step 4: Parse the hypertext with BeautifulSoup

In [8]:
soup = BeautifulSoup(r.content, "html")

In [42]:
#print(soup.prettify())

### Step 5: Navigate the data to pull out the table information.

In [57]:
table = soup.find("table", {"class": 
                            "less-padding alternate-highlights-gray"})
rows = table.find_all("tr")

rows[3].text.split('\n')

#print(type(table))
#print(type(rows))

['', 'Alaska', 'March 27', 'March 28', '']

In [62]:
all_rows = []

# Store each row as a dictionary

empty_row = {
    "state": None, "Date Announced": None, "Effective Date": None
}

# The first two rows contains header information, skip it. 
# The last row contains footer information, skip that too.

for row in rows[2:len(rows)-1]:
    new_row = copy.copy(empty_row)
    each_row = row.text.split('\n')
    new_row['state'] = each_row[1]
    new_row['Date Announced'] = each_row[2]
    new_row['Effective Date'] = each_row[3]

    all_rows.append(new_row)

In [64]:
#all_rows

### Step 6: Load all the rows into a Mongo database.

In [65]:
db = client.kff_org
stay_home_states = db.stay_home_states

In [66]:
for row in all_rows:
    stay_home_states.insert_one(row)

### Step 7: Load all the rows into a pandas dataframe

In [68]:
stay_home_orders_state = pd.DataFrame(all_rows)

### Step 8: Save pandas dataframe to a CSV.

In [70]:
stay_home_orders_state.to_csv('datasets/stay_home_orders_state.csv')

### Step 9: Show the table

In [71]:
stay_home_orders_state

Unnamed: 0,state,Date Announced,Effective Date,_id
0,Alabama,April 3,April 4,5e8b9794aded4fdccab0eca4
1,Alaska,March 27,March 28,5e8b9794aded4fdccab0eca5
2,Arizona,March 30,March 31,5e8b9794aded4fdccab0eca6
3,Arkansas,–,–,5e8b9794aded4fdccab0eca7
4,California,March 19,March 19,5e8b9794aded4fdccab0eca8
5,Colorado,March 26,March 26,5e8b9794aded4fdccab0eca9
6,Connecticut,March 20,March 23,5e8b9794aded4fdccab0ecaa
7,Delaware,March 22,March 24,5e8b9794aded4fdccab0ecab
8,District of Columbia,March 30,April 1,5e8b9794aded4fdccab0ecac
9,Florida,April 1,April 3,5e8b9794aded4fdccab0ecad
