In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess
from pyshacl import validate
import glob
import os
import regex as re
import json
import requests
import datetime
from rdflib import plugin
from rdflib.graph import Graph
from rdflib.store import Store
from rdflib_sqlalchemy import registerplugins
from sqlalchemy import insert
import sqlite3

### Crawl Webpages and Scrape JSON-LD Information

In [2]:
### dev ###

# clear destination dir
files = glob.glob("./raw_json/*")
for file in files:
    os.remove(file)

# use local files
dev_sites = glob.glob("sites/*/*.html")

### /dev ###

# list of sites to scrape
sites = dev_sites
class JsonSpider(scrapy.Spider):
    
    """ 
    scrape .json-ld data from court websites.
    prefer linked .json, 
    scrape embedded data otherwise.
    """

    name = "court-data-spider"

    def start_requests(self):
          
        global sites
        
        # GET request, pass res to parse()
        for url in sites:
            yield scrapy.Request(url=f"http://localhost:8000/{url}", callback=self.parse)

            
    def parse(self, response):

        # look for json data. 
        linked_json = response.selector.xpath(
            '//link[@type="application/ld+json"]/@href').get()
        embedded_json = response.selector.xpath(
            '//script[@type="application/ld+json"]/text()').get()

        # use page source as filename, replace "/" 
        # need a better convention
        page_source = response.url.replace("/", ".")
        filename = (page_source + ".json").replace(".html","")

        
        if (linked_json is not None) or (embedded_json is not None):    
            if linked_json is not None:
                # follow link to json file and grab data
                req = requests.get(linked_json)
                # to append source and date metadata below
                load_json = json.loads(req.content)        
            elif embedded_json is not None:
                # parse json data from html source
                # remove whitespace that is not in a value
                embedded_json = re.sub(r'\s+[^\:\S\"]', "", embedded_json)  
                # to append source and date metadata below
                load_json = json.loads(embedded_json[1:-1])

            # append source and date metadata
            load_json.append(
                {"source": response.url, "accessed": str(datetime.datetime.now())})
            
            # write json file
            json_out = json.dumps(load_json)
            with open(f"raw_json/{filename}", "w") as output:
                output.write(json_out)

process = CrawlerProcess(
    # requests throttled due to limitations of python http.server
    settings = {
        "DOWNLOAD_DELAY": 1,
        "CONCURRENT_REQUESTS_PER_DOMAIN": 10
    }
)

process.crawl(JsonSpider)
process.start()

2022-10-07 13:19:43 [scrapy.utils.log] INFO: Scrapy 2.6.3 started (bot: scrapybot)
2022-10-07 13:19:43 [scrapy.utils.log] INFO: Versions: lxml 4.9.1.0, libxml2 2.9.4, cssselect 1.1.0, parsel 1.6.0, w3lib 2.0.1, Twisted 22.8.0, Python 3.10.2 (v3.10.2:a58ebcc701, Jan 13 2022, 14:50:16) [Clang 13.0.0 (clang-1300.0.29.30)], pyOpenSSL 22.1.0 (OpenSSL 3.0.5 5 Jul 2022), cryptography 38.0.1, Platform macOS-12.5.1-x86_64-i386-64bit
2022-10-07 13:19:43 [scrapy.crawler] INFO: Overridden settings:
{'CONCURRENT_REQUESTS_PER_DOMAIN': 10, 'DOWNLOAD_DELAY': 1}
2022-10-07 13:19:43 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-10-07 13:19:43 [scrapy.extensions.telnet] INFO: Telnet Password: 18e7202320b4920a
2022-10-07 13:19:43 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2022-10-07 13:19:43 [

### Validate JSON-LD Files

In [3]:
### dev ### 

# clear destination dir
files = glob.glob("./valid_json/*")
for file in files:
    os.remove(file)
    
### /dev ###

# scraped JSON-LD files 
scraped_json_files = glob.glob("./raw_json/*.json")

# SHACL file to validate data against
shacl_file = './court-data-standard-shacl.ttl'

errors = []

def validate_json(scraped_json_files, shacl_file):
    """
    validate scraped json files
    if valid, move to valid_json folder
    if not, append error msg to errors
    """

    for file in scraped_json_files:
        try:
            r = validate(file,
                         shacl_graph=shacl_file,
                         inference='none',
                         abort_on_first=True,
                         allow_infos=False,
                         allow_warnings=False,
                         meta_shacl=False,
                         advanced=True,
                         js=False,
                         debug=False)
            
            # if error, append msg to errors list
            if r[0] != True:
                msg = r[2]
                errors.append(file + "\n" + msg + "\n")
            # otherwise, move file to valid_json folder
            else:
                renamed_file = str(file.split(".")[-2] + ".json")
                file.replace("./raw_json/","")
                os.rename(f"{file}", f"./valid_json/{renamed_file}")
                
        except json.JSONDecodeError:
            errors.append(file + "\nBad JSON format. Validation aborted.")
            pass
    
    print(*errors, sep="\n") if errors else print("All files successfully validated.")

validate_json(scraped_json_files, shacl_file)

  re_matcher = re.compile(re_pattern, re_flags)



All files successfully validated.


### Store JSON-LD Data in RDF... Store

In [5]:
registerplugins()

valid_json_files = glob.glob("./valid_json/*.json")
SQLALCHEMY_URL = "sqlite:///db/court_data.db"

store = plugin.get("SQLAlchemy", Store)(identifier="court_data_store")
graph = Graph(store, identifier="court_data_graph")
graph.open(SQLALCHEMY_URL, create=True)

files = []
for file in valid_json_files:
    if os.path.isfile("court-data.db") == "I'm a bird":
        graph.open(SQLALCHEMY_URL, create=True)
        graph.parse(file)
        graph.close()
    else:
        graph.open(SQLALCHEMY_URL)
        graph.parse(file)
        graph.close()

graph.open(SQLALCHEMY_URL)        
result = graph.query("select * where {?s ?p ?o} limit 10")    
for subject, predicate, object_ in result:
    print(subject, predicate, object_)

graph.close()

State of Illinois Circuit Court http://www.illinoiscourts.gov/Circuit http://schema.org/name
http://localhost:8000/court-data-definitions.jsonld#CourtSystem http://www.illinoiscourts.gov/Circuit http://www.w3.org/1999/02/22-rdf-syntax-ns#type
http://www.illinoiscourts.gov/Circuit#Circuit1District1 http://www.illinoiscourts.gov/Circuit#Circuit4 http://schema.org/areaServed
http://www.illinoiscourts.gov/Circuit#Circuit4 http://www.illinoiscourts.gov/Circuit http://schema.org/areaServed
State of Illinois Circuit 1 http://www.illinoiscourts.gov/Circuit#Circuit4 http://schema.org/name
http://schema.org/AdministrativeArea http://www.illinoiscourts.gov/Circuit#Circuit4 http://www.w3.org/1999/02/22-rdf-syntax-ns#type
State of Illinois Circuit 1 District 1 http://www.illinoiscourts.gov/Circuit#Circuit1District1 http://schema.org/name
http://schema.org/AdministrativeArea http://www.illinoiscourts.gov/Circuit#Circuit1District1 http://www.w3.org/1999/02/22-rdf-syntax-ns#type
http://www.illinoiscou