In [1]:
import urllib

from lxml import html


In [2]:
response = urllib.request.urlopen("https://www.paradescommission.org/")

In [3]:
data = html.parse(response)

In [4]:
tables = data.xpath('//table[@class="HomePageTable"]')
tables

[<Element table at 0x7f3944a18d60>,
 <Element table at 0x7f394015e4f0>,
 <Element table at 0x7f3940159090>]

In [5]:
parade = tables[0].xpath("./tr")[0]
parade

<Element tr at 0x7f3940157180>

In [6]:
parade_data = parade.xpath("./td")
parade_data

[]

In [7]:
parade = tables[0].xpath("./tr")[1]
parade

<Element tr at 0x7f39401111d0>

In [8]:
parade_data = parade.xpath("./td")
parade_data

[<Element td at 0x7f3940111770>,
 <Element td at 0x7f39401117c0>,
 <Element td at 0x7f3940111810>,
 <Element td at 0x7f3940111860>,
 <Element td at 0x7f39401118b0>]

In [9]:
parade_detail = {}

In [10]:
parade_detail["date"] = parade_data[0].xpath("./span/text()")[0]

In [11]:
parade_detail["title"] = parade_data[1].xpath("./a/text()")

In [12]:
parade_detail["url"] = urllib.parse.urljoin("https://www.paradescommission.org/", parade_data[1].xpath("./a/@href")[0])

In [13]:
parade_detail["town"] = parade_data[2].xpath("./span/text()")[0]

In [14]:
parade_detail["start_time"] = parade_data[3].xpath("./text()")[0]

In [15]:
parade_detail["determination"] = parade_data[4].xpath("./span/text()")[0]

In [16]:
parade_detail

{'date': '06/08/2022',
 'title': ['**The Royal Scottish Pipe Band Ass...'],
 'url': 'https://www.paradescommission.org/viewparade.aspx?id=79282',
 'town': 'Newcastle',
 'start_time': '17:00',
 'determination': 'N/A'}

In [17]:
response = urllib.request.urlopen(parade_detail["url"])
data = html.parse(response)

In [18]:
tables = data.xpath("//table[contains(@class, 'HomePageTable')]")
tables

[<Element table at 0x7f3940103c20>, <Element table at 0x7f3940103450>]

In [19]:
parade_detail["reference"] = tables[0].xpath("./tr/td/text()")[0]

In [20]:
for row in tables[1].xpath("./tr"):
    title = row.xpath("./th/text()")
    if len(title) > 2:
        continue
    title = title[0]
    data = row.xpath("./td/text()")
    if data:
        data = data[0]
    if title == "Start Time of Outward Route":
        parade_detail["start_time"] = data
    elif title == "Proposed Outward Route":
        parade_detail["proposed_outward_route"] = data
    elif title == "End Time of Outward Route":
        parade_detail["end_time_of_outward_route"] = data
    elif title == "Start Time of Return Route":
        parade_detail["start_time_of_return_route"] = data
    elif title == "Proposed Return Route":
        parade_detail["proposed_return_route"] = data
    elif title == "End Time of Return Route":
        parade_detail["end_time_of_return_route"] = data
    elif title == "Number of Bands":
        parade_detail["number_of_bands"] = data
    elif title == "Bands":
        parade_detail["bands"] = data
    elif title == "Expected Number of Participants":
        parade_detail["expected_participants"] = data
    elif title == "Expected Number of Supporters":
        parade_detail["expected_supporters"] = data

In [21]:
parade_detail

{'date': '06/08/2022',
 'title': ['**The Royal Scottish Pipe Band Ass...'],
 'url': 'https://www.paradescommission.org/viewparade.aspx?id=79282',
 'town': 'Newcastle',
 'start_time': '17:00',
 'determination': 'N/A',
 'reference': 'PAR\\92990',
 'proposed_outward_route': 'Donard Car park',
 'end_time_of_outward_route': '18:00',
 'start_time_of_return_route': ' - ',
 'proposed_return_route': [],
 'end_time_of_return_route': ' - ',
 'number_of_bands': '40',
 'bands': "Drumlough pipe Band,Closkelt Pipe Band,Altnaveigh Memorial Pipe Band,Ballyboley Pipe Band,Bessbrook Crimson Arrow Pipe Band,Broughshane & District Pipe Band,Gransha Pipe Band,Harry Ferguson Memorial Pipe Band,Joseph Forde Memorial Pipe Band,Major Sinclair Memorial Pipe Band,McDonald Memorial Pipe Band,Mountjoy Pipe Band,Sgt Walker Memorial Pipe Band,Tullylagan Pipe Band,Wicklow Pipe Band,McDonald Academy Pipe Band,Augharan Pipe Band,Clontibret pipe band,Cloughfin Pipe Band,Cullybackey Junior Pipe Band,Kildoag Pipe Band,Lisn

In [27]:
import datetime
from dataclasses import dataclass


@dataclass
class ParadeItem:
    reference: str = ""
    date: str = ""
    title: str = ""
    town: str = ""
    start_time: str = ""
    determination: str = ""
    proposed_outward_route: str = ""
    end_time_of_outward_route: str = ""
    start_time_of_return_route: str = ""
    proposed_return_route: str = ""
    end_time_of_return_route: str = ""
    number_of_bands: str = ""
    bands: str = ""
    expected_participants: str = ""
    expected_supporters: str = ""
    determination_document: str = ""
    vehicle_procession: bool = False
    subject_to_covid: bool = False
    is_under_consideration: bool = False
    is_sensitive: bool = False
    is_protest: bool = False
    url: str = ""


def get_parades():
    data = html.parse(urllib.request.urlopen("https://www.paradescommission.org/"))
    tables = data.xpath('//table[@class="HomePageTable"]')
    parades = []
    for parade in tables[0].xpath("./tr"):
        parade_data = parade.xpath("./td")
        if parade_data:
            item = ParadeItem()
            item.date = parade_data[0].xpath("./span/text()")[0]

            title = parade_data[1].xpath("./a/text()")[0]
            if "**" in title:
                item.subject_to_covid = True
                title = title.replace("**", "")

            if "##" in title:
                item.vehicle_procession = True
                title = title.replace("##", "")

            item.title = title
            item.town = parade_data[2].xpath("./span/text()")[0]
            item.start_time = parade_data[3].xpath("./text()")[0]
            determination = parade_data[4].xpath("./span/text()")
            if determination:
                item.determination = determination
            item.url = (
                "https://www.paradescommission.org/"
                + parade_data[1].xpath("./a/@href")[0]
            )
            parades.append(item)
    return parades


def get_parade_detail(parade: ParadeItem):
    data = html.parse(urllib.request.urlopen(parade.url))
    tables = data.xpath("//table[contains(@class, 'HomePageTable')]")

    parade.reference = tables[0].xpath("./tr/td/text()")[0]

    for row in tables[1].xpath("./tr"):
        title = row.xpath("./th/text()")
        if len(title) > 2:
            continue
        title = title[0]
        data = row.xpath("./td/text()")
        if data:
            data = data[0]

        if title == "Start Time of Outward Route":
            parade.start_time = data
        elif title == "Proposed Outward Route":
            parade.proposed_outward_route = data
        elif title == "End Time of Outward Route":
            parade.end_time_of_outward_route = data
        elif title == "Start Time of Return Route":
            parade.start_time_of_return_route = data
        elif title == "Proposed Return Route":
            parade.proposed_return_route = data
        elif title == "End Time of Return Route":
            parade.end_time_of_return_route = data
        elif title == "Number of Bands":
            parade.number_of_bands = data
        elif title == "Bands":
            parade.bands = data
        elif title == "Expected Number of Participants":
            parade.expected_participants = data
        elif title == "Expected Number of Supporters":
            parade.expected_supporters = data
        elif title == "Determination Document":
            try:
                parade.determination_document = row.xpath("./td/a/@href")[0]
            except Exception as e:
                logger.error("error settings determination document")
    return parade
                
parades = get_parades()
results = []
start = datetime.datetime.now()
for parade in parades:
    results.append(get_parade_detail(parade))
print(datetime.datetime.now() - start)

0:01:04.625112


In [28]:
results

[ParadeItem(reference='PAR\\92990', date='06/08/2022', title='The Royal Scottish Pipe Band Ass...', town='Newcastle', start_time='17:00', determination=['N/A'], proposed_outward_route='Donard Car park', end_time_of_outward_route='18:00', start_time_of_return_route=' - ', proposed_return_route=[], end_time_of_return_route=' - ', number_of_bands='40', bands="Drumlough pipe Band,Closkelt Pipe Band,Altnaveigh Memorial Pipe Band,Ballyboley Pipe Band,Bessbrook Crimson Arrow Pipe Band,Broughshane & District Pipe Band,Gransha Pipe Band,Harry Ferguson Memorial Pipe Band,Joseph Forde Memorial Pipe Band,Major Sinclair Memorial Pipe Band,McDonald Memorial Pipe Band,Mountjoy Pipe Band,Sgt Walker Memorial Pipe Band,Tullylagan Pipe Band,Wicklow Pipe Band,McDonald Academy Pipe Band,Augharan Pipe Band,Clontibret pipe band,Cloughfin Pipe Band,Cullybackey Junior Pipe Band,Kildoag Pipe Band,Lisnamulligan Pipe Band,McNeillstown Pipe Band ,Raphoe Pipe Band,Tamlaght O'Crilly Pipe Band,Battlehill Pipe Band,Cl