# Scraping for the Philadelphia Bail Bond

This code scrapes data from the Philadelphia Courts website, cleans it, and outputs a CSV file. The number of result pages for the record date is detected automatically by `get_last_page`, so the end page no longer has to be entered manually.

## Import Libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
from datetime import date 

In [2]:
def main():
    """Scrape all pages of today's new criminal filings and write them to a CSV.

    The record date is today's date. The number of result pages is looked up
    with ``get_last_page``; each page is then downloaded, parsed with
    BeautifulSoup (https://www.crummy.com/software/BeautifulSoup/bs4/doc/),
    and its filings are converted to rows by ``extract_attributes``. The
    combined rows are handed to ``create_csv``.
    """
    record_date = date.today()
    last_page = get_last_page(str(record_date))

    # Rows collected so far, across all pages visited
    all_rows = []

    # Result pages are numbered from 1 up to and including last_page
    for page_number in range(1, last_page + 1):
        # The original author notes that one page lists up to 24 filings
        page_url = "https://www.courts.phila.gov/NewCriminalFilings/date/default.aspx?search={}&searchdt=&searchtype=&page={}".format(record_date, page_number)
        page_html = requests.get(page_url).text
        page_soup = BeautifulSoup(page_html)

        # Each filing on the page is rendered inside a div with this class
        filing_divs = page_soup.findAll("div", {"class": "well well-sm"})

        # Rows of the page just scraped go in FRONT of the earlier ones, so
        # the final list starts with the last page's filings
        page_rows = extract_attributes(filing_divs)
        all_rows = page_rows + all_rows

    create_csv(all_rows)

In [3]:
# Dynamically get the number of result pages for a record date
def get_last_page(record_date):
    """Return the number of result pages listed for ``record_date``.

    The search page for the date is downloaded and its first
    ``<ul class="pagination">`` is inspected: the text of the second-to-last
    ``<li>`` is taken as the last page number (presumably the final ``<li>``
    is a "next" control -- TODO confirm against the live page).

    When no usable pagination list is present, the result defaults to a
    single page, unless the page shows the "No records found" notice.

    Args:
        record_date: Date string inserted into the ``search`` query parameter.

    Returns:
        The last page number as an ``int`` (1 when there is no pagination).

    Raises:
        ValueError: If the page reports that no records were found for the
            date, or if the pagination entry's text is not an integer.
    """
    link = "https://www.courts.phila.gov/NewCriminalFilings/date/default.aspx?search={}".format(record_date)
    source = requests.get(link).text
    soup = BeautifulSoup(source)

    pagination_lists = soup.findAll("ul", {"class": "pagination"})
    try:
        return int(pagination_lists[0].findAll("li")[-2].text)
    except IndexError:
        # No pagination list (or fewer than two entries): fall through and
        # decide between "single page" and "no records at all" below.
        pass

    # Guard the lookup: the notice paragraph may be absent altogether, in
    # which case indexing it would raise a second, misleading IndexError.
    notices = soup.findAll("p", {"class": "margin-top-20"})
    if notices:
        state = notices[0].text
        if "No records found.  Please try again." in state:
            raise ValueError(state)
    return 1

In [4]:
def extract_attributes(list_of_criminal_filings):
    """Turn one page's filing elements into a list of scraped rows.

    Every element's ``.text`` is passed through ``scrape_and_store``; the
    resulting rows are returned in the same order as the input elements.
    """
    return [
        scrape_and_store(filing.text)
        for filing in list_of_criminal_filings
    ]

In [5]:
# The fields are pulled out with regular expressions; for background on regex
# see https://docs.python.org/3/library/re.html
def scrape_and_store(text):
    """Extract the individual fields from the text of one criminal filing.

    The text is split into lines and each field is read from a fixed line
    position: lines 3, 4, 6, 11, 12, 13, 15 and 16 counted from the top, and
    the lines 10 down to 6 from the bottom for the bail details. The label at
    the start of each line is removed by splitting on it.

    Args:
        text: The full text of a single filing block.

    Returns:
        A list of 17 values in this order: defendant name, age, city, state,
        zip code, docket number, filing date, filing time, charge,
        represented, in custody, bail status, bail date, bail time, bail
        type, bail amount, outstanding bail amount.

    Raises:
        IndexError: If an expected line or label is missing from ``text``.
    """
    rows = text.splitlines()

    # Splitting on "<label> " leaves the value as the final piece
    name = re.split('Name (.*?)', rows[3])[-1]
    years = re.split('Age (.*?)', rows[4])[-1]

    # Address line: the city sits before the first comma (after a tab and a
    # space); the state and zip code follow the comma
    address_line = rows[6]
    town = re.split('\t ', address_line.split(',')[0])[1]
    state_and_zip = re.split(" (.*?) ", re.split(",", address_line)[1])
    state_code = state_and_zip[1]
    postal_code = state_and_zip[2]

    docket = re.split("Number (.*?)", rows[11])[2]

    # Filing line: third token is the date, the next two form the time
    filing_tokens = re.split(" ", rows[12])
    filed_on = filing_tokens[2]
    filed_at = " ".join(filing_tokens[3:5])

    offense = re.split("Charge ", rows[13])[1]
    counsel = rows[15].strip()

    # A one-character custody line is kept verbatim; otherwise take what
    # follows the "Custody " label, or an empty string if the label is absent
    custody = rows[16]
    if len(custody) != 1:
        custody_parts = re.split("Custody (.*?)", custody)
        custody = custody_parts[2] if len(custody_parts) > 2 else ""

    # Bail details are addressed from the end of the block
    status = re.split("\t(.*?)", rows[-10])[-1]
    bail_tokens = re.split(" ", rows[-9])
    bail_on = bail_tokens[2]
    bail_at = " ".join(bail_tokens[3:5])
    kind_of_bail = re.split(": (.*?)", rows[-8])[-1]
    amount = re.split(": (.*?)", rows[-7])[-1]
    outstanding = re.split(" ", rows[-6])[-1]

    return [
        name, years, town, state_code, postal_code, docket,
        filed_on, filed_at, offense, counsel, custody,
        status, bail_on, bail_at, kind_of_bail, amount, outstanding,
    ]

In [6]:
# Convert the list of scraped rows into a CSV file with pandas
def create_csv(list_of_criminal_file_scraped):
    """Write the scraped rows to ``output.csv`` in the current directory.

    Args:
        list_of_criminal_file_scraped: A list of rows, each a list of 17
            values in the column order given below. May be empty, in which
            case only the header line is written.

    Raises:
        ValueError: If the rows do not have 17 values each.
    """
    column_names = [
        "Defendant Name", "Age", "City", "State", "Zip Code",
        "Docket Number", "Filing Date", "Filing Time", "Charge",
        "Represented", "In Custody", "Bail Status", "Bail Date",
        "Bail Time", "Bail Type", "Bail Amount", "Outstanding Bail Amount",
    ]
    # Naming the columns at construction time (instead of via to_csv's
    # header argument) keeps the frame 17 columns wide even with no rows,
    # so an empty scrape yields a header-only file rather than an error.
    df = pd.DataFrame(list_of_criminal_file_scraped, columns=column_names)
    df.to_csv("output.csv", index=False)

In [7]:
# Run the scraper only when this file is executed directly, not when it is imported
if __name__ == "__main__":
    main()

ValueError: 
No records found.  Please try again.
