In [1]:
import os
import datetime
import re
import requests
import json
import urllib.request
import glob
from bs4 import BeautifulSoup

In [14]:
def scrape_json(url):
    """Download ``url`` and save whatever JSON-LD it exposes under ``./output/``.

    Two sources are checked, in this order:

    1. a ``<link type="application/ld+json" href=...>`` inside ``<head>`` --
       the linked file is downloaded and written unchanged;
    2. a ``<script type="application/ld+json">`` inside ``<body>`` -- its
       content is parsed, a ``{"source", "accessed"}`` record is appended and
       the result is written as a JSON array.

    Args:
        url: Address of the page to scrape (anything
            ``urllib.request.urlopen`` accepts).

    Returns:
        A status string: ``"file written from linked json"``,
        ``"file written from embedded json"``, ``"no json-ld data found"``,
        or a message starting with ``"ERROR: "`` when the embedded JSON cannot
        be parsed or the linked file cannot be downloaded.

    Raises:
        urllib.error.URLError: if ``url`` itself cannot be fetched.
        requests.RequestException: if the linked file request fails at the
            transport level (connection error, timeout).
    """
    # Function-scope import keeps this cell self-contained.
    from urllib.parse import urljoin

    # for dev, remove export files if present
    for stale in ("./embedded_json.json", "./linked_json.json"):
        if os.path.exists(stale):
            os.remove(stale)

    # Fetch the page; the context manager closes the response even when
    # reading or decoding fails.
    with urllib.request.urlopen(url) as fp:
        data = fp.read().decode("utf8")

    # convert data to bs4 html object
    html = BeautifulSoup(data, 'html.parser')

    # Search html for json-ld information. ``html.head`` / ``html.body`` are
    # None when the page lacks those tags, so guard before calling ``find``.
    linked_json = html.head.find("link", type="application/ld+json") if html.head else None
    embedded_json = html.body.find("script", type="application/ld+json") if html.body else None

    # TODO: need a naming convention; two URLs that share these five
    # characters overwrite each other's output.
    file = "./output/" + url[-10:-5]
    error_message = f"ERROR: {url}\n{linked_json}\n{embedded_json}"

    # A <link> without an href cannot be downloaded; fall through to the
    # embedded check in that case.
    href = linked_json.get("href") if linked_json is not None else None

    # if json file is linked in the header, download and save the linked file
    if href:
        # The href may be relative to the page, so resolve it against ``url``.
        r = requests.get(urljoin(url, href), timeout=30)
        if not r.ok:
            # Do not save an HTTP error page as if it were JSON-LD.
            return error_message
        os.makedirs("./output", exist_ok=True)
        with open(file, "wb") as f:
            f.write(r.content)
        return "file written from linked json"

    # if json information is embedded in body, parse text, convert to json, and save file
    if embedded_json is not None:
        try:
            # ``strip`` removes the surrounding whitespace without touching
            # the payload; ``strict=False`` tolerates raw newlines inside
            # string values.
            load_json = json.loads(embedded_json.text.strip(), strict=False)
        except json.JSONDecodeError:
            # catch error if JSON data is invalid
            return error_message

        # add source and date metadata
        metadata = {"source": url, "accessed": str(datetime.datetime.now())}
        if isinstance(load_json, list):
            load_json.append(metadata)
        else:
            # A single top-level object has no ``append``; wrap it so the
            # output is always an array whose last element is the metadata.
            load_json = [load_json, metadata]

        # export json
        os.makedirs("./output", exist_ok=True)
        with open(file, "w") as output:
            output.write(json.dumps(load_json))
        return "file written from embedded json"

    return "no json-ld data found"

In [15]:
# Test against court sites

# url = 'http://localhost:8000/very-simple-test.html'

# Sort the globbed paths so every run visits the sites in the same order.
linked_json_sites = sorted(glob.glob("sites/linked/*"))
embedded_json_sites = sorted(glob.glob("sites/embedded/*"))
no_json_sites = sorted(glob.glob("sites/no-data/*"))

all_sites = [linked_json_sites, embedded_json_sites, no_json_sites]

# scrape_json reports its outcome (including "ERROR: ..." results) through its
# return value, so record and show each status instead of discarding it.
scrape_results = {}
for sites in all_sites:
    for site in sites:
        scrape_results[site] = scrape_json("http://localhost:8000/" + site)
        print(f"{site}: {scrape_results[site]}")

# Errors observed in embedded sites:
#   il-adams, nm-bernalillo, mi-flint
#   (2 of these are WordPress sites, 1 is Angular)
# The 2 embedded sites that work are neither WordPress nor Angular.