## Wavelength Task 1 - Data Extraction from British and Irish Legal Information Institute Website

Import required libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from datetime import datetime
from collections import Counter
import spacy
from spacy.tokens import Token, Span, Doc
from legislation_linker import *
from openpyxl import load_workbook

Declare web page url strings and number of results to retrieve.

In [139]:
BASE_URL = "https://www.bailii.org/cgi-bin/lucy_search_1.cgi?query=%22Planning+court%22+AND+%28%22The+Royal+Courts+of+Justice%22+OR+%22Supreme+Court%22+OR+%22Manchester+Civil+Justice+Centre%22%29&datelow=201901&datehigh=202002&sort=date&highlight=1"
HOMEPAGE = 'https://www.bailii.org/'
CGIBIN = 'https://www.bailii.org//cgi-bin/format.cgi?doc='
N_CASES = 50
QUERY_URL = BASE_URL + "&show=" + str(N_CASES)

Define helper functions

In [245]:
def get_soup(url):
    '''Returns a BeautifulSoup object from the provided url string'''
    HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'}
    r = requests.get(url,headers=HEADERS)
    soup = BeautifulSoup(r.content, "html.parser")
    return soup

def truncate_href(h_ref):
     return h_ref[:h_ref.find("&query=")]
    
def get_link(url):
    return CGIBIN+url

def get_date_time(date_string):
    '''returns a datetime object from provided date_string'''
    return datetime.strptime(date_string, '%d %B %Y')

def get_authorities(party_list):
    
    auth_list = [re.split(r'[/()](?=\d)',x) for x in party_list if (
        "council" or "borough" or "authority" or "county") in x.lower()]
    return auth_list

def taglist_to_string(tag_list):
    '''
    retrieve the text from each Tag item in a list of Tags and 
    concatenate into a string seperated by spaces
    '''
    return "\n".join([x.text for x in tag_list])

def text_from_tags(tag_list):
    '''
    retrieve the text from each Tag item in a list or resultset of Tags and 
    concatenate into a string seperated by spaces
    '''
    return [x.text for x in tag_list]

def split_party_string(party_string):
    
    if re.search("-and-|- and -",party_string):
        return re.split("-and-|- and -",party_string)
    
    if re.search("\([2-9]\)",party_string):
        m = re.finditer("\([2-9]\)",party_string)
        string_parts = []
        start=0
        cut=0
        for n in m:
            cut = n.span()[0]
            string_parts.append(party_string[start:cut])
            start=cut
        string_parts.append(party_string[cut:len(party_string)])
        return string_parts
    
    else: return [party_string]

def filter_parties(party_tags):
    
    party_list = text_from_tags(party_tags)
    split_list = [item.strip() for x in party_list for item in split_party_string(x)]
    
    auth_strings = ["council","borough","county","authority"]
    auth_list = [x for x in split_list if any([auth in x.lower() for auth in auth_strings])]
    
    if len(auth_list) == 0:
        return "None"
    
    if len(auth_list) == 1:
        return auth_list
    
    return " \n ".join(auth_list)

def get_case_link(tag):
    
    case_link = tag.find_next_sibling("a")
    if not(case_link is None): 
        return get_link(case_link.get("href"))
    
    else: return ""

def filter_cases(tags):
        
    vs_strings = [" v "," v. ","vs"]
    case_tags = [x for x in tags if any([vs_string in x.text.lower() for vs_string in vs_strings])]
    
    if len(case_tags) == 0:
        return "None"
    
    cnt = Counter()
    for res in case_tags:
        cnt[res] += 1

    return "\n ".join([f"{tag.text} //{count} \n : {get_case_link(tag)}" for tag,count in cnt.items()])


Define Spacy / Blackstone model helper functions

In [246]:
#Spacy / Blackstone Helper functions

def ent_has_text(ent,strings):
    '''
    Accepts a string or list of strings and an entity object and will return True
    if the there are any of of the provided strings present in the entity text, 
    otherwise it will return False.
    '''
    if type(strings) == str:
        strings = [strings]
    for tok in ent:
        if tok.lower_ in strings:
            return True
    return False

def add_act_year(ent): 
    '''
    If the given "INSTRUMENT" entity has a left parenthesis as the next token, 
    this function locates the matching right parenthesis and return a SPAN object
    from the start of the original entity object to the closing parenthesis token.
    '''
    if ent[len(ent)].lower_ == "(":
        for t in ent.doc[ent.end:ent.end+10]:
            if t.lower_ == ")":
                end_loc = t.i + 1
                return doc[ent.start:end_loc]
    
    else: return ent

def get_acts_from_doc(doc):
    ''' 
    get all relevant acts and sections from a doc object based on blackstone entity
    entity recognition, returns a string showing the count of each matched occurence
    of a relevant Act and/or Section.
    '''
    results = []
    ents = doc.ents
    
    for idx,e in enumerate(ents):
        if ent_has_text(e,["s.","section"]) and idx < len(ents)-1:
            if e.label_ == "PROVISION" and ents[idx + 1].label_ == "INSTRUMENT":
                start = e.start
                end = ents[idx + 1].end

                if not(add_act_year(doc[start:end]) is None) and (end - start < 15):
                    results.append(add_act_year(doc[start:end]).lower_)
                    e._.processed = True
                    ents[idx+1]._.processed = True
            
            if e.label_ == "PROVISION":
                results.append(e.text)
        
        if ent_has_text(e,"act") and e.label_ == "INSTRUMENT" and e._.processed == False:
            if not(add_act_year(e) is None):
                results.append(add_act_year(e).lower_)
     
    return count_results(results)
    
def get_related_legislation(doc):
    '''
    takes a spacy doc object and applies the Blackstone function extract_legislation_relations 
    from linked_legislation.py to return links between sections and acts as well as web links 
    to information on legislation.gov.uk
    '''
    results = []
    relations = extract_legislation_relations(doc)
    for prov, prov_url, instr, instr_url in relations:
        results.append(f"{prov} | {prov_url} | {instr} | {instr_url}".replace("None |","").replace("| None","").strip()+" ") 
        
    return count_results(results)

def count_results(results):
    ''' 
    takes a list of results and uses a Counter collection object to transform the data into 
    the required form for export to excel '''
    cnt = Counter()
    for res in results:
        cnt[res] += 1

    return "\n ".join([f"{desc} //{count}" for desc,count in cnt.items()])

Create Beautiful Soup object 

In [247]:
soup = get_soup(QUERY_URL)

Create a Dataframe called cases to store scraped information on top 50 cases

In [248]:
cases = pd.DataFrame(columns=['Case URL','Case Date','Week','Case Name',
                              'Cite As','Judge(s)','Case No','Court',
                              'Relevant Local Authorities','Key Cases','Acts','Linked Legislation'])

Append scraped information into cases dataframe from query summary page

In [249]:
items =soup.find_all("li")

for i,item in enumerate(items):
    
    case_name = item.a.text
    case_url = HOMEPAGE + truncate_href(item.a['href'])
    case_date = get_date_time(re.findall('\((\d+ \w+ \d+)\)',case_name)[0])
    
    cases = cases.append({
                  'Case URL': case_url,
                  'Case Date':datetime.timestamp(case_date),
                  'Week': datetime.strftime(case_date, '%YWk%W'),
                  'Case Name' : case_name
                 },ignore_index=True,)

Set up Spacy / blackstone model for extracting relevant sections and acts

In [250]:
nlp = spacy.load("en_blackstone_proto")

Declare a custom Span extension to flag entities as processed

In [251]:
Span.set_extension("processed",default=False,force=True)

Use the stored urls to create a Bautiful Soup object from each detailed html document to scrape further information. 

In [252]:
for i,url in enumerate(cases['Case URL']):
    
    case_soup = get_soup(url)
    
    if case_soup.citation is not None:
        
        cases.loc[i,'Cite As'] = case_soup.citation.text[25:]
        cases.loc[i,'Court'] = " ".join(case_soup.court.strings)
        cases.loc[i,'Judge(s)'] = case_soup.panel.text 
        cases.loc[i,'Case No'] =  case_soup.casenum.text[9:]
        cases.loc[i,'Relevant Local Authorities'] = filter_parties(case_soup.parties.find_all("td", {"align" : "CENTER"}))
        cases.loc[i,'Key Cases'] = filter_cases(case_soup.find_all(["u","i"]))
        
    if case_soup.find("meta", {"content":'Microsoft Word 14 (filtered)'}) is not None:
        
        cover_text = case_soup.find_all("p", {"class": "CoverText"})
        cover_main = case_soup.find("p", {"class": "CoverMain"})
        court_lines = case_soup.find("div", {"class": "WordSection1"}).find_all("p")[2:5]
        parties = case_soup.find_all("a", {"name": re.compile("bkParty")})
        
        cases.loc[i,'Cite As'] = cover_text[0].text[25:]
        cases.loc[i,'Court'] = " ".join(text_from_tags(court_lines))
        cases.loc[i,'Judge(s)'] = ""
        cases.loc[i, 'Case No'] =  cover_text[1].text[9:]
        cases.loc[i,'Relevant Local Authorities'] = chr(10).join(filter_parties(parties))
    
    # create spacy doc object from text on web page 
    doc = nlp("".join(case_soup.stripped_strings))
    
    # get details of relevant acts and sections
    cases.loc[i,'Acts'] = get_acts_from_doc(doc)
    
    # use the Blackstone entity linking model to retrieve linked sections and acts as well as hyperlinks
    try:
        cases.loc[i,'Linked Legislation'] = get_related_legislation(doc)
    except UnboundLocalError:
        cases.loc[i,'Linked Legislation'] = ""

In [253]:
cases.head()

Unnamed: 0,Case URL,Case Date,Week,Case Name,Cite As,Judge(s),Case No,Court,Relevant Local Authorities,Key Cases,Acts,Linked Legislation
0,https://www.bailii.org//cgi-bin/format.cgi?doc...,1580429000.0,2020Wk04,"Gluck v Secretary of State for Housing, Commun...",[2020] EWHC 161 (Admin),The Hon. Mr Justice Holgate,"CO/2292/2019, CO/2293/2019, CO/2302/2019, and ...",IN THE HIGH COURT OF JUSTICE QUEEN'S BENCH DIV...,(2) Crawley Borough Council,James v Secretary of State for Wales [1996] 1 ...,"town and country planning act 1990 (""tcpa 1990...",Town and Country Planning Act 1990 | https://w...
1,https://www.bailii.org//cgi-bin/format.cgi?doc...,1580170000.0,2020Wk04,"Sykes v Secretary of State for Housing, Commun...",[2020] EWHC 112 (Admin),MRS JUSTICE LANG DBE,CO/2382/2019,IN THE HIGH COURT OF JUSTICE QUEEN'S BENCH DIV...,(2) RUNNYMEDE BOROUGH COUNCIL,Bloor Homes East Midlands Ltd v Secretary of S...,section 288 of the town and country planning a...,section 288 | https://www.legislation.gov.uk/u...
2,https://www.bailii.org//cgi-bin/format.cgi?doc...,1579133000.0,2020Wk02,"Bailey, R (on the application of) v St Albans ...",[2020] EWHC 24 (Admin),UPPER TRIBUNAL JUDGE GRUBB(SITTING AS A DEPUTY...,CO/1708/2019,IN THE HIGH COURT OF JUSTICE QUEEN'S BENCH DIV...,ST ALBANS CITY AND DISTRICT COUNCIL,Gladman v Canterbury City Council //2 \n : htt...,section 38(6) of the planning and compulsory p...,Section 70(2) | https://www.legislation.gov.uk...
3,https://www.bailii.org//cgi-bin/format.cgi?doc...,1579046000.0,2020Wk02,Thorpe Hall Leisure Ltd v Secretary of State f...,[2020] EWHC 44 (Admin),SIR DUNCAN OUSELEYSitting as a High Court Judge,CO/2847/2019,IN THE HIGH COURT OF JUSTICE QUEEN'S BENCH DIV...,TENDRING DISTRICT COUNCIL,Landelijke Vereniging tot Behoud van de Wadden...,s288 town and country planning act 1990 //1\n ...,s288 Town and Country Planning Act 1990 //1\n...
4,https://www.bailii.org//cgi-bin/format.cgi?doc...,1576800000.0,2019Wk50,Flynn v London Borough of Southwark [2019] EWH...,[2019] EWHC 3575 (Admin),,CO/807/2019,IN THE HIGH COURT OF JUSTICE QUEEN'S BENCH DIV...,London Borough of Southwark,,section 106 of the town and country planning a...,section 106 | https://www.legislation.gov.uk/u...


In [254]:
with pd.ExcelWriter('Data Extraction.xlsx', mode='a') as writer:  
    cases.to_excel(writer, sheet_name='Data',startrow=4,header=False,index=False)