In [3]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import pandas as pd

In [4]:
# Create function to get HTML from URL
def simple_get(url):
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None
            
    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None

def is_good_response(resp):
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)

def log_error(e):
    print(e)

In [5]:
page = get('https://www.lacourt.org/judicialofficers/ui/SearchResult.aspx')
# dir(page)

In [6]:
page.status_code # if starts with 2, GET was successful, if 4 or 5, it's an error
page.content # provides content of page
page.headers

{'Cache-Control': 'private', 'Content-Type': 'text/html; charset=utf-8', 'Server': 'Microsoft-IIS/7.0', 'Set-Cookie': 'ASP.NET_SessionId=gtxhury4fpoiljqyxugk0qay; path=/; HttpOnly', 'X-AspNet-Version': '4.0.30319', 'X-Powered-By': 'ASP.NET', 'Access-Control-Allow-Headers': 'Content-Type', 'Access-Control-Allow-Methods': 'GET,POST,PUT', 'Access-Control-Allow-Origin': 'https://crmportal.lacourt.org', 'Access-Control-Max-Age': '86400', 'Date': 'Tue, 18 Dec 2018 21:25:00 GMT', 'Content-Length': '380467'}

In [7]:
raw_html = simple_get('https://www.lacourt.org/judicialofficers/ui/SearchResult.aspx')
print(len(raw_html))

380453


In [8]:
# creaate beautifulsoup object from html
soup = BeautifulSoup(raw_html)

In [9]:
# The prettify method allows us to print out the content of the HTML in a nicely
# formatted manner
soup.prettify()

'<!DOCTYPE html>\n<html xmlns="http://www.w3.org/1999/xhtml">\n <head id="head1">\n  <title>\n   Judicial Officers - Contacts and Locations - LA Court\n  </title>\n  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>\n  <link href="../commonv4/images/favicon-16x16.png" rel="icon" sizes="16x16" type="image/png"/>\n  <link href="../commonv4/images/favicon-32x32.png" rel="icon" sizes="32x32" type="image/png"/>\n  <!-- Stylesheet -->\n  <link href="../commonv4/styles/sitemaster.css" id="Link5" media="all" rel="stylesheet" type="text/css"/>\n  <link href="../commonv4/styles/fonts.css" id="Link4" media="all" rel="stylesheet" type="text/css"/>\n  <link href="../commonv4/styles/common.css" id="Link3" media="all" rel="stylesheet" type="text/css"/>\n  <link href="../commonv4/styles/wrapper.css" id="Link6" media="all" rel="stylesheet" type="text/css"/>\n  <link href="../commonv4/styles/breadcrumb.css" id="Link7" media="all" rel="stylesheet" type="text/css"/>\n  <link href="../co

In [10]:
list(soup.children)
print('# children:', len(list(soup.children))) # view children of soup object, only 3 children
[type(item) for item in soup.children] # we have a Doctype object, a Tag object, and a NavigableString object
# Doctype object = which contains information about the type of the document, here it is html
# Tag object = contains other nested tags. The Tag object allows us to navigate through an HTML document, and 
#              extract other tags and text. 
# NavigableString object = text found in HTML document

# children: 3


[bs4.element.Doctype, bs4.element.Tag, bs4.element.NavigableString]

In [11]:
# We can look at the children of our Tag object
parent_tag = list(soup.children)[1]
[type(item) for item in list(parent_tag.children)] # look at class of grandchildren like we did above

[bs4.element.NavigableString,
 bs4.element.Tag,
 bs4.element.NavigableString,
 bs4.element.Tag,
 bs4.element.NavigableString]

In [12]:
# Use find_all method  to find all instances of a tag at once.
# Find_all method using 'table' argument tag returns us a ResultSet object
# This returns a list of Tag objects that match the parameters. 
type(soup.find_all(name = 'table'))
len(soup.find_all(name = 'table')) # there is only 1 table on this page

1

In [13]:
table = soup.find_all('table')[0] # Grab the first table. there is only one table on this page
print(type(table)) # The table is again a Tag object
len(list(table.children)) # Our table has 561 children, so we have 561 rows

<class 'bs4.element.Tag'>


561

In [14]:
print(type(table.find_all('tr'))) # if we call find_all on 'tr' tag, we get a Resultset object, or a list of Tags
len(list(table.find_all('tr'))) # the object is the same as the table.children call above

<class 'bs4.element.ResultSet'>


561

In [15]:
# getting column names
headrow = table.find_all('thead') # gets first row in body of table
type(headrow) #bs4.element.tag - Represents a found HTML tag with its attributes (e.g. href) and contents
headtuples = headrow[0].find_all('th') # gets all cells in the row
type(headtuples) #bs4.element.ResultSet - A list that keeps track of the SoupStrainer that created it
header_list = []
for item in headtuples:
    header_list.append(item.string)
header_list

['Name', 'Title', 'Location', 'Dept', 'Phone']

In [16]:
row = table.find_all('tr')[1] # gets first row in body of table
type(row) #bs4.element.tag - Represents a found HTML tag with its attributes (e.g. href) and contents
tuples = row.find_all('td') # gets all cells in the row
type(tuples) #bs4.element.ResultSet - A list that keeps track of the SoupStrainer that created it
tuple_list = []
for item in tuples:
    tuple_list.append(item.string)
tuple_list

# find links in row
row.find_all('a') # returns a Resultset list (there should only be one bc only one link in row for courthouse)
if row.find_all('a')[0].has_attr('href'):
    print(row.find_all('a')[0].get('href'))

../../Courthouse/info/CCB


In [17]:
for row in table:
    tuples = row.find_all('td')
    tuple_list = []
    for item in tuples:
        tuple_list.append(item.string)
    print(tuple_list)

[]
['Abzug, Michael D.', 'Judge', 'Clara Shortridge Foltz Criminal Justice Center', '112', '(213) 628-7412']
['Acevedo, Victor M.', 'Commissioner', 'Compton', '007', '(310) 761-7987']
['Aceves, Efrain M.', 'Judge', 'Clara Shortridge Foltz Criminal Justice Center', '046', '(213) 628-7746']
['Adams, Rashida A.', 'Judge', "Edmund D. Edelman Children's Court", '423', '(323) 307-8023']
['Aenlle-Rocha, Fernando L.', 'Judge', 'East Los Angeles', '006', '(323) 780-2011']
['Ahnn, Michelle M.', 'Judge', 'Compton', '004', '(310) 761-4304']
['Alarcon, Gregory W.', 'Judge', 'Stanley Mosk', '036', '(213) 633-0156']
['Amerian, Michael R.', 'Judge', 'Van Nuys - East', 'L', '(818) 901-4625']
['Applegate, Robert P.', 'Judge', 'Burbank', '001', '(818) 260-8401']
['Arakaki, Akemi ', 'Judge', "Edmund D. Edelman Children's Court", '414', '(323) 307-8014']
['Archuleta, Debra R.', 'Judge', 'Michael D. Antonovich Antelope Valley', 'A05', '\xa0']
['Armendariz, Maria Lucy', 'Judge', 'Metropolitan', '074', '(213)

In [18]:

for link in table.find_all('a'):
    print(link.get('href'))

../../Courthouse/info/CCB
../../Courthouse/info/COM
../../Courthouse/info/CCB
../../Courthouse/info/CCJ
../../Courthouse/info/ELA
../../Courthouse/info/COM
../../Courthouse/info/LA
../../Courthouse/info/NW
../../Courthouse/info/BUR
../../Courthouse/info/CCJ
../../Courthouse/info/ATP
../../Courthouse/info/MET
../../Courthouse/info/CCB
../../Courthouse/info/MET
../../Courthouse/info/LA
../../Courthouse/info/NW
../../Courthouse/info/LAV
../../Courthouse/info/ELA
../../Courthouse/info/CCJ
../../Courthouse/info/ATP
../../Courthouse/info/SE
../../Courthouse/info/LA
../../Courthouse/info/LA
../../Courthouse/info/LB
../../Courthouse/info/CCB
../../Courthouse/info/CCB
../../Courthouse/info/SS
../../Courthouse/info/SE
../../Courthouse/info/SBA
../../Courthouse/info/MH
../../Courthouse/info/CCJ
../../Courthouse/info/SBA
../../Courthouse/info/CCJ
../../Courthouse/info/CCJ
../../Courthouse/info/EA
../../Courthouse/info/LA
../../Courthouse/info/ATP
../../Courthouse/info/LAV
../../Courthouse/info/PAS

In [19]:
# create pandas dataframe
df = pd.DataFrame(columns = header_list)

# Add rows to dataframe
for row in table:
    tuples = row.find_all('td')
    tuple_list = []
    for item in tuples:
        tuple_list.append(item.string)
    if len(tuple_list) > 0:
        df_row = pd.Series(tuple_list, index = header_list)
        df = df.append(df_row, ignore_index = True)

df.head()

Unnamed: 0,Name,Title,Location,Dept,Phone
0,"Abzug, Michael D.",Judge,Clara Shortridge Foltz Criminal Justice Center,112,(213) 628-7412
1,"Acevedo, Victor M.",Commissioner,Compton,7,(310) 761-7987
2,"Aceves, Efrain M.",Judge,Clara Shortridge Foltz Criminal Justice Center,46,(213) 628-7746
3,"Adams, Rashida A.",Judge,Edmund D. Edelman Children's Court,423,(323) 307-8023
4,"Aenlle-Rocha, Fernando L.",Judge,East Los Angeles,6,(323) 780-2011


### Get links in table

In [50]:
# find links in row
court_links = []
    
for row in table:
    links = row.find_all('a') # returns a Resultset list (there should only be one bc only one link in row for courthouse)
    if len(links) > 0:
        print(links[0]['href'])

../../Courthouse/info/CCB
../../Courthouse/info/COM
../../Courthouse/info/CCB
../../Courthouse/info/CCJ
../../Courthouse/info/ELA
../../Courthouse/info/COM
../../Courthouse/info/LA
../../Courthouse/info/NW
../../Courthouse/info/BUR
../../Courthouse/info/CCJ
../../Courthouse/info/ATP
../../Courthouse/info/MET
../../Courthouse/info/CCB
../../Courthouse/info/MET
../../Courthouse/info/LA
../../Courthouse/info/NW
../../Courthouse/info/LAV
../../Courthouse/info/ELA
../../Courthouse/info/CCJ
../../Courthouse/info/ATP
../../Courthouse/info/SE
../../Courthouse/info/LA
../../Courthouse/info/LA
../../Courthouse/info/LB
../../Courthouse/info/CCB
../../Courthouse/info/CCB
../../Courthouse/info/SS
../../Courthouse/info/SE
../../Courthouse/info/SBA
../../Courthouse/info/MH
../../Courthouse/info/CCJ
../../Courthouse/info/SBA
../../Courthouse/info/CCJ
../../Courthouse/info/CCJ
../../Courthouse/info/EA
../../Courthouse/info/LA
../../Courthouse/info/ATP
../../Courthouse/info/LAV
../../Courthouse/info/PAS

In [21]:
page2 = get('http://www.lacourt.org/selfhelp/familiesandchildren/SH_FM003.aspx')

In [26]:
soup2 = BeautifulSoup(page2.content)
[type(item) for item in soup2.children]

[bs4.element.Doctype, bs4.element.Tag, bs4.element.NavigableString]

In [29]:
tables = soup2.find_all('table')
[type(table) for table in tables]

[bs4.element.Tag, bs4.element.Tag, bs4.element.Tag]