# Senate

> The Senate is the upper chamber of Congress, composed of 24 senators who serve 6-year terms.

In [None]:
# | default_exp senate
# | export
import datetime
import re
import black
import requests

from pydantic import BaseModel
from typing import Dict, Optional, List
from bs4 import BeautifulSoup, NavigableString
from itertools import islice, chain
from urllib.parse import urljoin, urlencode, parse_qsl, urlparse
from more_itertools import split_when, grouper
from pathlib import Path
from diskcache import Cache
from fastcore.utils import patch
from nbdev.showdoc import show_doc

from legisph.website import Website, NotFoundError, ServerError, Link
from legisph.core import logger

## Senate Bills

Our first task is to extract Senate Bills and their related information from the [Legislative Information System](http://legacy.senate.gov.ph/lis/leg_sys.aspx?congress=19&type=bill). This contains data on senate bills, their sponsors, evolution, and timings of actions from the 13th congress onwards. 

### Base Models

There are some common elements found in the Senate Bill information, such as Senators, Senate Committees, and of course Senate Bills. We create `{pydantic}` models to encapsulate their data and get validation for free:  

#### Senator 

In [None]:
# | export
class Senator(BaseModel):
    """A member of the Senate"""

    name: str

    def __eq__(self, other):
        # Only another Senator carrying the same name compares equal.
        return isinstance(other, Senator) and other.name == self.name

    def __hash__(self):
        # Hash on the name so that equal senators share a hash value.
        return hash(self.name)

In [None]:
assert Senator(name="Juan de la Cruz") == Senator(name="Juan de la Cruz")
assert Senator(name="Juan de la Cruz") != Senator(name="Jane de la Cruz")

#### Senate Committee

In [None]:
# | export
class SenateCommittee(BaseModel):
    """A committee in the Senate"""

    name: str
    congress: int

    def __eq__(self, other):
        # Identity is the committee name alone; `congress` is deliberately
        # left out, so the same committee matches across congresses.
        return isinstance(other, SenateCommittee) and other.name == self.name

    def __hash__(self):
        # Must agree with __eq__, hence the name is the only hashed field.
        return hash(self.name)

In [None]:
assert SenateCommittee(name="Ways and Means", congress=18) == SenateCommittee(
    name="Ways and Means", congress=19
)
assert SenateCommittee(name="Finance", congress=19) != SenateCommittee(
    name="Ways and Means", congress=19
)

#### Senate Bill

The SenateBill is the main model that we'd like to extract and will be the main focus of this part of the analysis. 

In [None]:
# | export
class SenateBill(BaseModel):
    """
    These are general measures, which if passed upon, may become laws.
    A bill is prefixed with S., followed by a number assigned the measure based
    on the order in which it is introduced. The vast majority of legislative
    proposals recommendations dealing with the economy, increasing penalties
    for certain crimes, regulation on commerce and trade, etc., are drafted in
    the form of bills. They also include budgetary appropriation of the
    government and many others. When passed by both chambers in identical
    form and signed by the President or repassed by Congress over a presidential
    veto, they become laws.

    [Source](http://legacy.senate.gov.ph/about/legpro.asp)
    """

    # A dated status entry; used both for the current legislative status and
    # for each row of the legislative history.
    class SenateBillStatus(BaseModel):
        date: datetime.date
        item: str

    # One row of the "Floor activity" table.
    class FloorActivity(BaseModel):
        date: datetime.date
        parliamentary_status: str
        senators: Optional[List[Senator]]

    # A subject tag attached to the bill.
    class Subject(BaseModel):
        name: str

    # One recorded vote: `tally` maps each vote label found on the page
    # (e.g. "Abstained") to the senators listed under it.
    class Vote(BaseModel):
        type: str
        date: datetime.date
        tally: Dict[str, List[Senator]]

    url: str  # Bill page URL including its query string
    congress: int  # Congress number
    billno: str  # Bill number, in the format SBN-XXXX
    congress_text: str  # Congress as shown on the page, e.g. "18th Congress"
    billno_text: str  # Bill number as shown, e.g. "Senate Bill No. 2332"
    title: str  # Short title
    long_title: str  # Full "AN ACT ..." title
    filing_date: datetime.date
    filers: Optional[List[Senator]]  # None when the page lists no filer
    links: Optional[List[Link]]  # Links from the page's download section
    scope: str  # e.g. "National"
    legislative_status: SenateBillStatus  # Current status and its date
    subjects: Optional[List[Subject]]
    primary_committee: Optional[List[SenateCommittee]]
    secondary_committee: Optional[List[SenateCommittee]]
    committee_reports: Optional[List[Link]]
    sponsors: Optional[List[Senator]]
    cosponsors: Optional[List[Senator]]
    document_certification: Optional[str]
    floor_activity: Optional[List[FloorActivity]]
    votes: Optional[List[Vote]]
    legislative_history: List[SenateBillStatus]

    def __str__(self):
        """Return a short label of the form "[billno] title"."""
        return f"[{self.billno}] {self.title}"

    def __repr__(self):
        """Return the default model repr, reformatted by `black` so that it spans multiple lines."""
        return black.format_str(super().__repr__(), mode=black.Mode())

### Senate Website 

We create a class for the Senate Website from which we can extract various information including Senate Bills and other documents later on. In order to be responsible users of the public senate website, we make sure to cache our page requests so that we don't overload the website whenever we are doing our analysis. 

In [None]:
# | export
class SenateWebsite(Website):
    """
    The Senate Website, accessible at [senate.gov.ph](https://senate.gov.ph/).
    """

    def __init__(self, cache_dir=Path(".cache/senate"), **kwargs):
        """
        Set up the request session and the on-disk result cache.

        `cache_dir` may be a `Path` or a plain string; the disk cache is
        stored in its `senate_cache` subdirectory. Extra keyword arguments
        are forwarded to `Website.__init__`.
        """
        # Normalise first so that a `str` path works with the `/` operator.
        cache_dir = Path(cache_dir)
        super().__init__("senate_requests", cache_dir=cache_dir, **kwargs)
        self.cache = Cache(str(cache_dir / "senate_cache"))
        # `fetch_bills` is attached to the class further down the file; once
        # it exists, shadow it on the instance with a disk-memoized version.
        if hasattr(self, "fetch_bills"):
            self.fetch_bills = self.cache.memoize(ignore=["self"])(self.fetch_bills)

### Fetching Senate Bills

We create a method that:

1. Accesses the "All Information" tab of a senate bill given its congress number and Senate Bill number. This requires first retrieving the main page, then sending a POST request with some session parameters.
2. Breaks up the HTML source into its component parts, in order to generate a SenateBill object.
3. Handles the case where the server returns a 200 response but the body reports an internal server error.

In [None]:
# | export
@patch
def fetch_bill(
    self: SenateWebsite,
    congress: int,  # Congress Number
    billno: str,  # Bill number (in the format SBN-XXXX)
) -> SenateBill:
    """
    Fetch the "All Information" view of one Senate Bill and parse it.

    The bill page is requested once with GET to collect the form inputs,
    then re-requested with POST and `__EVENTTARGET` set to `lbAll`. The
    resulting HTML is broken into its titled sections and assembled into a
    `SenateBill`.

    Raises `NotFoundError` when the page content is exactly "Not found." and
    `ServerError` when it is the site's internal-error message.
    """
    # Initial parameters
    url = "http://legacy.senate.gov.ph/lis/bill_res.aspx"
    params = {"q": billno, "congress": congress}

    # Initial call to get form session parameters
    resp = self.session.get(url=url, params=params)
    html = BeautifulSoup(resp.text, features="html5lib")

    # Handle error cases
    # NOTE(review): the strings below are compared verbatim against the page
    # text (including the "occured" spelling) - presumably the site's exact
    # wording, so they must not be "corrected".
    content_str = html.find("td", {"id": "content"}).text.strip()
    resource = f"Senate Bill {billno} in Congress {congress}"
    match content_str:
        case "Not found.":
            raise NotFoundError(f"{resource} not found", resource)
        case "An error has occured. Exception has been logged.":
            raise ServerError(f"Internal Error for {resource}", resource)

    # Access the form data
    form = html.find(name="form", attrs={"name": "form1"})
    inputs = form.find_all(name="input")
    # Keyed by each input's `id`; inputs without a `value` post an empty string.
    data = {i["id"]: i.attrs.get("value", "") for i in inputs}

    # Set to fetch all information
    data.update({"__EVENTTARGET": "lbAll", "__EVENTARGUMENT": ""})

    # Fetch bill information
    resp = self.session.post(url=url, params=params, data=data)

    # Break into parts
    html = BeautifulSoup(resp.text, features="html5lib")
    content = html.find("td", attrs={"id": "content"})
    # First five child nodes of the content cell; indexes 0, 2 and 4 are read
    # below as the congress text, the bill number text and the filing subtitle.
    title = list(islice(content.children, 5))
    # `data` is rebound here: it now maps the text of the element preceding
    # each top-level blockquote (the section heading) to that blockquote.
    data = {
        item.find_previous().text.strip(): item
        for item in content.find_all("blockquote", recursive=False)
    }

    # Parse through complex votes table
    votes = data.get("Vote(s)")
    if votes:

        def parse_tally(votes):
            # Build {vote label: [Senator, ...]} from the elements of one vote.
            tally = {}
            for vote in votes:
                elems = vote.find_all("td")
                # The first half of the cells is zipped with the second half:
                # labels with their voter cells. Inside the comprehension
                # `vote` is rebound to the label cell, shadowing the loop
                # variable.
                tally = tally | {
                    vote.text.strip(): (
                        [
                            Senator(name=voter.text.strip())
                            for voter in voters.find("blockquote").children
                            if voter.text.strip() != ""
                        ]
                        if vote.text.strip() != "Abstained"
                        # The "Abstained" cell is read as plain text split on
                        # ", " and re-joined in pairs, since each name itself
                        # contains a comma ("Last, First").
                        else [
                            Senator(name=f"{senator[0]}, {senator[1]}")
                            for senator in grouper(voters.text.strip().split(", "), 2)
                        ]
                    )
                    for vote, voters in zip(
                        elems[0 : (len(elems) // 2)],
                        elems[len(elems) // 2 : len(elems) + 1],
                    )
                }
            return tally

        # Drop bare text nodes, then start a new group at every blockquote
        # element; each group's first element supplies "Type (mm/dd/YYYY)"
        # and the remaining elements are handed to parse_tally.
        elems = (e for e in votes.children if not isinstance(e, NavigableString))
        votes = list(split_when(elems, lambda _, y: y.name == "blockquote"))
        votes = [
            SenateBill.Vote(
                type=vote[0].text.split("(")[0].strip(),
                date=datetime.datetime.strptime(
                    vote[0].text.split("(")[1].replace(")", ""), "%m/%d/%Y"
                ),
                tally=parse_tally(vote[1:]),
            )
            for vote in votes
        ]

    # Parse subtitle
    # After the split: subtitle[0] is the filing date text and subtitle[1]
    # the comma-separated filer names (empty when none follow " by").
    subtitle = title[4].text.strip()
    subtitle = subtitle.split("Filed on ")[1].split(" by")

    # Construct Senate Bill
    # Optional sections use `data.get(...)` and become None when the page has
    # no such heading; required sections are indexed directly.
    bill = SenateBill(
        url=url + "?" + urlencode(params),
        congress=congress,
        billno=billno,
        congress_text=title[0].text.strip(),
        billno_text=title[2].text.strip(),
        title=content.find("div", class_="lis_doctitle").text.strip(),
        long_title=data["Long title"].text.strip(),
        filing_date=datetime.datetime.strptime(subtitle[0], "%B %d, %Y"),
        # Names are re-paired two pieces at a time after splitting on ", ".
        # NOTE(review): an odd number of pieces would leave the last pair
        # padded by `grouper` - confirm the page never produces that.
        filers=(
            [
                Senator(name=f"{s[0]}, {s[1]}".strip())
                for s in grouper(subtitle[1].split(", "), 2)
            ]
            if subtitle[1] != ""
            else None
        ),
        links=(
            [
                Link(url=urljoin(url, t["href"]), name=t.text.strip())
                for t in links.find_all("a")
            ]
            if (links := content.find("div", id="lis_download"))
            else None
        ),
        scope=data["Scope"].text.strip(),
        # The status text has the shape "<item> (mm/dd/YYYY)".
        legislative_status=SenateBill.SenateBillStatus(
            date=datetime.datetime.strptime(
                re.findall(r"\((.+)\)", (s := data["Legislative status"].text.strip()))[
                    0
                ],
                "%m/%d/%Y",
            ),
            item=re.findall(r"(.+) \(", s)[0],
        ),
        subjects=(
            [
                SenateBill.Subject(name=subject.text)
                for subject in subjects
                if subject.text != ""
            ]
            if (subjects := data.get("Subject(s)"))
            else None
        ),
        primary_committee=(
            [
                SenateCommittee(name=committee.text, congress=congress)
                for committee in committees.children
                if committee.text != ""
            ]
            if (committees := data.get("Primary committee"))
            else None
        ),
        secondary_committee=(
            [
                SenateCommittee(name=committee.text, congress=congress)
                for committee in committees.children
                if committee.text != ""
            ]
            if (committees := data.get("Secondary committee"))
            else None
        ),
        committee_reports=(
            [
                Link(url=urljoin(url, report["href"]), name=report.text)
                for report in reports.find_all("a")
            ]
            if (reports := data.get("Committee report"))
            else None
        ),
        sponsors=(
            [
                Senator(name=f"{senator[0]}, {senator[1]}")
                for senator in grouper(sponsors.text.split(", "), 2)
            ]
            if (sponsors := data.get("Sponsor(s)"))
            else None
        ),
        cosponsors=(
            [
                Senator(name=f"{senator[0]}, {senator[1]}")
                for senator in grouper(cosponsors.text.split(", "), 2)
            ]
            if (cosponsors := data.get("Co-sponsor(s)"))
            else None
        ),
        document_certification=(
            certification.text
            if (certification := data.get("Document certification"))
            else None
        ),
        # The first table row is skipped ([1:]) - presumably the header row.
        floor_activity=(
            [
                SenateBill.FloorActivity(
                    date=datetime.datetime.strptime(cols[0].text.strip(), "%m/%d/%Y"),
                    parliamentary_status=cols[1].text.strip(),
                    senators=(
                        [
                            Senator(name=senator.text.strip())
                            for senator in cols[2].children
                            if senator.text.strip() != ""
                        ]
                        if len(list(cols[2].children)) > 0
                        else None
                    ),
                )
                for cols in [
                    row.find_all("td") for row in floor_activity.find_all("tr")
                ][1:]
            ]
            if (floor_activity := data.get("Floor activity"))
            else None
        ),
        votes=votes,
        # Only rows with exactly two cells (date, item) are history entries.
        legislative_history=[
            SenateBill.SenateBillStatus(
                date=datetime.datetime.strptime(row[0].text.strip(), "%m/%d/%Y"),
                item=row[1].text.strip(),
            )
            for row in (
                row.find_all("td") for row in data["Legislative History"].find_all("tr")
            )
            if len(row) == 2
        ],
    )

    return bill


show_doc(SenateWebsite.fetch_bill)

---

### SenateWebsite.fetch_bill

>      SenateWebsite.fetch_bill (congress:int, billno:str)

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| congress | int | Congress Number |
| billno | str | Bill number (in the format SBN-XXXX) |
| **Returns** | **Response** |  |

We provide this example of SBN-2332 in the 18th congress, which was a particularly complex one, leading to an abstention vote on Third Reading but eventually still being passed into law:

In [None]:
senate = SenateWebsite()
senate.fetch_bill(18, "SBN-2332")

SenateBill(
    url="http://legacy.senate.gov.ph/lis/bill_res.aspx?q=SBN-2332&congress=18",
    congress=18,
    billno="SBN-2332",
    congress_text="18th Congress",
    billno_text="Senate Bill No. 2332",
    title="STATUTORY RAPE (INCREASING THE AGE)",
    long_title="AN ACT INCREASING THE AGE FOR DETERMINING STATUTORY RAPE AND OTHER ACTS OF SEXUAL ABUSE AND EXPLOITATION TO PROTECT CHILDREN, AMENDING FOR THIS PURPOSE ACT NO. 3815, AS AMENDED, ALSO KNOWN AS THE REVISED PENAL CODE, REPUBLIC ACT NO. 8353, ALSO KNOWN AS THE ANTI-RAPE LAW OF 1997, AND REPUBLIC ACT NO. 7610, ALSO KNOWN AS THE SPECIAL PROTECTION OF CHILDREN AGAINST ABUSE, EXPLOITATION AND DISCRIMINATION ACT",
    filing_date=datetime.date(2021, 7, 26),
    filers=[
        Senator(name="Hontiveros, Risa"),
        Senator(name='Zubiri, Juan Miguel "Migz" F.'),
        Senator(name="De Lima, Leila M."),
        Senator(name="Gatchalian, Win"),
        Senator(name="Binay, Maria Lourdes Nancy S."),
        Senator(name="Marc

We then want to extract a series of Senate Bills from a particular congress. The listing is paginated, so we create a generator function that eventually returns all of the bills in that congress:

In [None]:
# | export
from typing import Generator


@patch
def generate_bills(
    self: SenateWebsite,
    congress: int,  # Number of the congress from which to fetch bills
) -> Generator[SenateBill | Exception, None, None]:
    """
    Generator function that eventually produces all bills from a congress.

    Walks the paginated bill listing page by page and fetches every bill
    linked from it. A bill that fails with `NotFoundError` or `ServerError`
    is logged and the exception instance is yielded in its place, so one bad
    record does not stop the run. Any other exception is logged and re-raised.
    """

    page = 1
    while True:
        # Fetch and parse the page
        resp = self.session.get(
            url="http://legacy.senate.gov.ph/lis/leg_sys.aspx",
            params={"type": "bill", "congress": congress, "p": page},
        )
        html = BeautifulSoup(resp.text, features="html5lib")
        # Each listing link carries the bill's `q` and `congress` in its query.
        bills = [
            dict(parse_qsl(urlparse(a["href"]).query))
            for a in html.find("div", class_="alight").find_all("a")
        ]
        # Extract and yield each bill, handle exceptions
        for b in bills:
            # Only the fetch sits inside the `try`; the yield happens after
            # it, so nothing raised at the yield point is caught here.
            try:
                # Query-string values are strings; `fetch_bill` declares an int.
                result = self.fetch_bill(congress=int(b["congress"]), billno=b["q"])
            except (NotFoundError, ServerError) as exc:
                logger.error(exc.message)
                result = exc
            except Exception:
                # `.get` keeps the log line from raising when a key is missing,
                # which would otherwise mask the original error.
                logger.error(f"Exception at {b.get('q')}, congress {b.get('congress')}")
                raise
            yield result
        # Try to find the next button and advance if found
        if html.find("a", string="Next\n"):
            page += 1
        else:
            break


show_doc(SenateWebsite.generate_bills)

---

### SenateWebsite.generate_bills

>      SenateWebsite.generate_bills (congress:int)

Generator function that eventually produces all bills from a congress.

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| congress | int | Number of the congress from which to fetch bills |
| **Returns** | **Generator** |  |

In [None]:
from itertools import islice

senate = SenateWebsite()
list(islice(senate.generate_bills(19), 3))

[SenateBill(
     url="http://legacy.senate.gov.ph/lis/bill_res.aspx?q=SBN-1332&congress=19",
     congress=19,
     billno="SBN-1332",
     congress_text="19th Congress",
     billno_text="Senate Bill No. 1332",
     title="CIVIL SERVICE ELIGIBILITY TO GOVERNMENT EMPLOYEES",
     long_title="AN ACT GRANTING CIVIL SERVICE ELIGIBILITY TO GOVERNMENT EMPLOYEES WHOSE STATUS OF APPOINTMENT IS EITHER CASUAL OR CONTRACTUAL, AND WHO HAVE CONTINUOUSLY RENDERED AT LEAST SEVEN (7) YEARS OF EFFICIENT SERVICE",
     filing_date=datetime.date(2022, 9, 19),
     filers=[Senator(name="Marcos, Imee R.")],
     links=[
         Link(
             url="http://legacy.senate.gov.ph/lisdata/3944835844!.pdf",
             name="SBN-1332 (as filed)",
         )
     ],
     scope="National",
     legislative_status=SenateBillStatus(
         date=datetime.date(2022, 9, 21), item="Pending in the Committee"
     ),
     subjects=[
         Subject(name="Civil Service Eligibility"),
         Subject(name="Govern

We then create a simple function to materialize that generator function into the final list of bills. Because we don't expect a congress's data to change once that congress is complete, we memoize this function to disk to improve performance and to set the stage for extending the pipeline in the future.

In [None]:
# | export
@patch
def fetch_bills(
    self: SenateWebsite, congress: int  # Congress from which to fetch bills
) -> List[SenateBill | Exception]:
    """
    Materialize `generate_bills` for one congress into a list.

    The list holds every item the generator yields: parsed bills as well as
    the exception instances yielded in place of bills that could not be
    fetched.
    """
    return list(self.generate_bills(congress))


show_doc(SenateWebsite.fetch_bills)

---

### SenateWebsite.fetch_bills

>      SenateWebsite.fetch_bills (congress:int)

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| congress | int | Congress from which to fetch bills |
| **Returns** | **List** |  |

In [None]:
senate = SenateWebsite()

In order to retrieve all bills from all congresses, we first need to fetch all congress numbers. We retrieve them by scraping the dropdown menu in the LIS.

In [None]:
# | export
@patch
def get_congresses(self: SenateWebsite) -> List[int]:
    """
    Return the congress numbers linked from the LIS "change congress" menu.

    The numbers are returned in the order in which their links appear on the
    page. Raises `requests.HTTPError` on a non-success status and
    `requests.Timeout` when the server does not answer in time.
    """
    resp = requests.get(
        "https://legacy.senate.gov.ph/lis/leg_sys.aspx",
        headers=self.session.headers,
        # Without a timeout a stalled server would block this call forever.
        timeout=30,
    )
    # Fail with a clear HTTP error instead of an opaque parsing error below.
    resp.raise_for_status()
    html = BeautifulSoup(resp.content, features="html5lib")
    links = html.find(id="div_ChangeCongress").find("ul").find_all("a")
    # Read the `congress` value from each link's query string.
    return [
        int(dict(parse_qsl(urlparse(link["href"]).query))["congress"])
        for link in links
    ]


show_doc(SenateWebsite.get_congresses)

---

### SenateWebsite.get_congresses

>      SenateWebsite.get_congresses ()

In [None]:
senate = SenateWebsite()
senate.get_congresses()

[19, 18, 17, 16, 15, 14, 13]

Finally, fetching all bills requires chaining together all of those generator functions into one very big list of Senate Bills.  

In [None]:
# | export
@patch
def fetch_all_bills(self: SenateWebsite):
    """
    Collect the bills of every congress into one flat list.
    """
    return [
        bill
        for congress in self.get_congresses()
        for bill in self.fetch_bills(congress)
    ]


show_doc(SenateWebsite.fetch_all_bills)

---

### SenateWebsite.fetch_all_bills

>      SenateWebsite.fetch_all_bills ()

Returns all bills from all congresses.

Now that we have all of that infra, we can start extracting the data.

In [None]:
senate = SenateWebsite()
bills = senate.fetch_all_bills()
# Failed fetches sit in the list as exception instances; pick them out.
errors = [entry for entry in bills if isinstance(entry, Exception)]
print(
    f"""
    {len(bills)} bills attempted.
    {len(errors)} server errors found.
    {len(bills) - len(errors)} bills successfully collected.
    """
)


    19016 bills attempted.
    19 server errors found.
    18997 bills successfully collected.
    


### Parsing Senate Bill Status

In the current legislative status field as well as the Legislative History section, there are free-text strings that depict the actions taken on these bills, their dates, and the potential actors. However, they are not particularly well-formatted, so we'll need to do some custom string parsing to get this to work.


In [None]:
senate = SenateWebsite()
# Keep only the successfully parsed bills, dropping the exception entries.
bills = [
    entry for entry in senate.fetch_all_bills() if not isinstance(entry, Exception)
]

In [None]:
# import polars as pl

# status_df = pl.DataFrame(
#     {
#         "status": [
#             *[b.legislative_status.item for b in bills],
#             *list(chain(*[[h.item for h in b.legislative_history] for b in bills])),
#         ]
#     }
# )

# list(
#     (
#         status_df.groupby("status")
#         .agg([pl.count()])
#         .sort("count", reverse=True)
#         .filter(pl.col("count") > 100)
#     )["status"]
# )