Here we'll aquire and prepare all (1375) documents that define Jersey (Channel Islands, UK) law. 

Set the root directory you want to put your data in below:

In [18]:
from config2py import config_getter
import os

rootdir = config_getter('jersey_laws')

# The above will fetch the rootdir from a config file, or from the environment, 
# or ask the user for it (and store it for next time)
# If you prefer to just define it in the code, you can do that too by uncommenting 
# and editing the following line:
# rootdir = '/Users/thorwhalen/Dropbox/py/notebooks/tw/scraping/jersey_laws'


print(f"{rootdir=}")

if not os.path.isdir(rootdir):
    raise NotADirectoryError(f"rootdir ({rootdir}) is not a directory. Please make it!")

rootdir='/Users/thorwhalen/Dropbox/py/notebooks/tw/scraping/jersey_laws'


# Data acquisition

## Get the htmls

In [176]:
import os

law_htmls_rootdir = os.path.join(rootdir, 'law_htmls')

print(f"{law_htmls_rootdir=}")

law_htmls_rootdir='/Users/thorwhalen/Dropbox/py/notebooks/tw/scraping/jersey_laws/law_htmls'


I tried to write an automatic data slurper, but was resisting, so a tiny bit of
**MANUAL WORK NEEDED**:

1. Go to https://www.jerseylaw.je/laws/current/Pages/search.aspx?size=n_500_n
2. Copy the html from the "inspect" tool in your browser.
3. Save it into an `.html` file in a folder called `law_htmls` in the root directory
4. Repeat for each page of results, and save each page to a different file.


## Parsing the htmls to get the info (pdf urls, etc.)

In [177]:
from imbed.mdat.jersey_laws import *

laws_info = get_laws_info(law_htmls_rootdir)

In [178]:
len(laws_info)

1375

In [168]:
laws_info[0]

{'name': 'Access to Justice (Jersey) Law 2019',
 'url': 'https://www.jerseylaw.je/laws/current/Pages/07.025.aspx',
 'ref': '07.025',
 'pdf': 'https://www.jerseylaw.je/laws/current/PDFs/07.025.pdf'}

## Download the pdfs

The cell below will make a `pdfs` store and populate it with anything that is in 
laws_info but not found in the `law_pdfs` folder.

In [181]:
import requests
from dol import Files
import os

pdfs = Files(os.path.join(rootdir, 'law_pdfs'))

# TODO: The following SOMEHOW always things the same 18 pfds are missing. WTF!!!?
def acquire_missing_pdfs(pdfs=pdfs, laws_info=laws_info, *, ask_user_confirmation=True, verbose=True):
    pdf_urls = {x['name'] + '.pdf': x['pdf'] for x in laws_info}
    existing_pdfs = list(pdfs)
    
    # TODO: the two following lines SOMEHOW do not give the same thing. WTF!!!?
    # missing_pdfs = set(pdf_urls) - set(existing_pdfs)
    missing_pdfs = [x for x in pdf_urls if x not in pdfs]
    
    if missing_pdfs:
        n_missing_pdfs = len(missing_pdfs)
        missing_pdf_urls = {pdf_name: pdf_urls[pdf_name] for pdf_name in missing_pdfs}

        if ask_user_confirmation:
            print(f"The following {len(missing_pdfs)} pdfs are missing:")
            print("The first (up to) 5 are:")
            print('\t' + '\n\t'.join(list(missing_pdfs)[:5]))
            if input("Do you want to acquire them? (y/n) ").lower() != 'y':
                print("Aborting (not getting missing pdfs)")
                return missing_pdf_urls
        
        error_pdfs = {}
        
        for i, (pdf_name, pdf_url) in enumerate(missing_pdf_urls.items(), 1):
            if verbose: 
                print(f"({i}/{n_missing_pdfs}): Aquiring {pdf_name}")
            r = requests.get(pdf_url)
            if r.status_code == 200:
                pdfs[pdf_name] = r.content
            else:
                error_pdfs.update({pdf_name: pdf_url})
                if verbose:
                    print(
                        f"----> Failed to get {pdf_name} ", 
                        f"({pdf_url} returned status code: {r.status_code}"
                    )
                    
        return error_pdfs
    else:
        if verbose:
            print("You got all the pdfs (that are present in the htmls)!")

missing_pdf_urls = acquire_missing_pdfs()

The following 59 pdfs are missing:
The first (up to) 5 are:
	Loi (1871) sur le mode d’élection des VingteniersTranslated.pdf
	Loi (1913) pour empêcher l’usage à Jersey, sans autorité, des Armoiries Royales.pdf
	Loi (1885) touchant l’Administration des Marchés PublicsTranslated.pdf
	Loi (1884) sur le prêt sur gages.pdf
	Nouveaux Districts Ecclésiastique de “All Saints” et de “Saint Andrew”.pdf
Aborting (not getting missing pdfs)


### Strange problem I couldn't solve

In [224]:
pdf_urls = {x['name'] + '.pdf': x['pdf'] for x in laws_info}
existing_pdfs = list(pdfs)
a = sorted(set(pdf_urls) - set(existing_pdfs))
b = sorted([x for x in pdf_urls if x not in pdfs])
len(a), len(b)  # These should be the same, but the a list has 59 more items than b. WTF!?

(59, 0)

In [196]:
# Let's look at the first key of a.
# Note that it's in pdf (i.e. in it's keys), but not in the set, or list, of those keys
k = a[0]
print(f"{k in pdfs=}")
print(f"{k in pdfs.keys()=}")
print(f"{k in set(pdfs)=}")
print(f"{k in list(pdfs)=}")

k in pdfs=True
k in pdfs.keys()=True
k in set(pdfs)=False
k in list(pdfs)=False


In [212]:
# I thought maybe pdfs (which is a custom class is lying when it does:

assert pdfs.__contains__(k)

# which really means:
import os
filepath = os.path.join(pdfs.rootdir, k)
assert os.path.exists(filepath)

# but the file exists indeed


In [213]:
# So I thought maybe it's because of the accents in the k values
k

'Amendes en vertu des Règlements triennaux.pdf'

In [219]:
# though
k == k

True

In [221]:
# see this too (should be True, but is False)
s = [k for k in pdfs]
k in s

False

In [223]:
k in set(a)

True

In [215]:
hash(k) in {hash(x) for x in set(pdfs)}

False