In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import html5lib
import re
from datetime import datetime

## Get unincorporated area boundaries (Do only once)

from lxml import html 
import time

#Get page of links to each unincorporated area
unincorpurl = 'http://maps.latimes.com/neighborhoods/unincorporated/list/page/1/'
unincorpresponse = requests.get(unincorpurl)
unincorpsoup = BeautifulSoup(unincorpresponse.text, "html.parser")

#Get hyperlinks
alist = unincorpsoup.findAll('a', href=True)

#Loop over each unincorporated area.
#The feature text of every area is collected in memory first so the file can be
#written exactly once, with commas only BETWEEN features (no manual cleanup).
outfile = "data/unincorporated_latimes.geojson" #Same path that is read back below
features = []
for link in alist:
    if 'Unincorporated' not in str(link):
        continue
    print('Working on ', link)
    download_url = 'http://maps.latimes.com/'+ link['href']

    pageContent = requests.get(download_url) #Use lxml to be able to scrape javascript.
    tree = html.fromstring(pageContent.content)
    js = tree.xpath('//*[@id="content"]/div[1]/script/text()') #XPath for the script

    #Raw string: '\[' is a regex escape, not a Python string escape.
    #Extract geoJSON feature (None when the script or the pattern is missing).
    result = re.search(r'features": \[(.*)]    };    ', str(js[0].replace('\n', ''))) if js else None
    if result is None:
        #Report and move on instead of crashing half-way through the scrape
        print('No geoJSON feature found, skipping ', download_url)
    else:
        features.append(result.group(1))
    time.sleep(1) #pause the code for a sec
count = len(features) #Number of areas actually collected

#Mode "w" (not "a+") so that re-running this cell replaces the file instead of
#appending a second copy of the collection to it.
#NOTE(review): assumes the text captured from the page script is itself valid JSON - confirm.
with open(outfile, "w") as text_file:
    text_file.write('{\n "type": "FeatureCollection", \n "features": [\n')
    text_file.write(', \n'.join(features))
    text_file.write('\n ] }')


#Check file output
import geopandas as gpd
demog = gpd.read_file(outfile)
demog.plot()

## Daily scrape of LA County Public Health table

In [2]:
#Choose what to scrape
archive = False      #True: parse a locally saved copy; False: fetch the live page
titlestring='0515'   #MMDD tag; names the archived input file and the output CSV

if archive:
    url = 'data/'+titlestring+'.html' #Uses locally downloaded file from wayback machine to lookup archived site
    #Context manager so the file handle is closed once the page has been parsed
    with open(url) as archived_page:
        soup = BeautifulSoup(archived_page,"html.parser")
else:
    url ='http://publichealth.lacounty.gov/media/Coronavirus/locations.htm'
    response = requests.get(url)
    #Fail here on an HTTP error rather than parsing an error page as if it were the table
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
#    datetime.today().strftime('%Y-%m-%d')
#    titlestring=datetime.today().strftime('%m%d') #Use today's date

In [3]:
#Data rows are every sibling that follows the LAST header row (class "blue text-white")
header_rows = soup.find_all('tr', {'class' : 'blue text-white'})
tr_elements = header_rows[-1].find_next_siblings()

#Column names are fixed here rather than read from the page's header cells
colnames = ['Locations','Total Cases', 'Rate']

#Pre-allocate one row per scraped <tr> plus one spare row at the end.
#NOTE(review): the original comment says the extra space is for Total AND Pasadena,
#but only one spare row is allocated - confirm the later write to row -2 does not
#overwrite the last scraped row.
df = pd.DataFrame(columns=colnames, index = range(0,len(tr_elements)+1))

#Copy each table cell into the frame, stripping prefixes/markers from the text
for row_marker, row in enumerate(tr_elements):
    columns = row.find_all('td')
    for column_marker, column in enumerate(columns):
        cleaned = column.get_text().replace('--', 'suppressed')
        for unwanted in ('City of ', 'Unincorporated - ', 'Los Angeles - ', '*'):
            cleaned = cleaned.replace(unwanted, '')
        df.iat[row_marker, column_marker] = cleaned
row_marker = len(tr_elements) #Leave the counter where a manual increment would

#Add total - currently hardcoded
total_cells = soup.find_all('tr')[5].find_all('td') #Changed value to 5. Previously 1?
for column_marker, column in enumerate(total_cells):
    if column_marker != 0:
        df.iat[-1, column_marker] = column.get_text()
        print(column.get_text())
    else:
        #First cell holds the row label; store it as 'Total' in the last row
        print('Renaming ', column.get_text(), 'to Total')
        df.iat[-1, column_marker] = 'Total'
column_marker = len(total_cells) #Leave the counter where a manual increment would
    

Renaming  Laboratory Confirmed Cases (LCC) to Total
36259



In [4]:
#Display the frame built from the scraped table rows
df

Unnamed: 0,Locations,Total Cases,Rate
0,Agoura Hills,33,158.02
1,Alhambra,143,164.89
2,Arcadia,65,112.55
3,Artesia,19,113.13
4,Avalon,0,0
...,...,...,...
338,Willowbrook,107,306.48
339,Wiseburn,9,149.33
340,- Under Investigation,1712,
341,,,


In [5]:
#Inspection only: print every cell of table row 6. Nothing is written to df here.
column_marker = 0
pasadenapop = 141371.
row_cells = soup.find_all('tr')[6].find_all('td')
for column in row_cells:
    print(column.get_text())

- Los Angeles County (excl. LB and Pas)
34440



In [6]:
#Add Pasadena back in: copy its name and case count from table row 8 into the
#second-to-last row of df and derive the rate per 100,000 from a fixed population.
column_marker = 0
pasadenapop = 141371.
pasadena_cells = soup.find_all('tr')[8].find_all('td') #Previously 4, change as of May ~15
for cell_position, column in enumerate(pasadena_cells):
    if cell_position >= 2:
        #Only the first two cells (name, count) are used
        print('Skipping:', column.get_text())
        continue
    print(column.get_text())
    #Drop the leading dash and all spaces from the cell text
    df.iat[-2, cell_position] = column.get_text().replace('-','').replace(' ','')
    if cell_position == 1:
        #Rate column (next column over) = cases / population * 100000
        df.iat[-2, cell_position+1] = int(column.get_text().replace('-','')) / pasadenapop * 100000
column_marker = len(pasadena_cells) #Leave the counter where a manual increment would
    

- Pasadena
662
Skipping: 


In [7]:
#Show which locations have duplicates - due to Unincorporated or City of LA
#(the prefixes were stripped during parsing, so such rows now share one name)
is_repeated = df['Locations'].duplicated(keep=False)
repeated_rows = df[is_repeated]
grouped = repeated_rows.groupby('Locations') #One group per repeated name, used by the merge step
df[is_repeated]

Unnamed: 0,Locations,Total Cases,Rate
2,Arcadia,65,112.55
5,Azusa,114,227.82
11,Bradbury,2,187.09
15,Cerritos,83,165.78
16,Claremont,30,82.23
19,Covina,118,240.66
24,Duarte,95,431.5
25,El Monte,299,254.97
29,Glendora,86,162.99
31,Hawthorne,287,323.25


In [8]:
#Merge locations that appear more than once (see `grouped`): every row of a group
#receives the group's summed case count and a combined rate, so that the rows
#become identical and a later drop_duplicates() can collapse them into one.
#
#Failed alt methods:
#[['Total Cases', 'Rate']].sum().head(10)
#.agg({'Total Cases' : 'sum'}).head(10)#, 'Rate' : lambda x: x.iloc[n]})

for name, group in grouped:
    localsum = 0   #Summed cases over the rows that report a positive count
    localpop = 0   #Summed population, back-calculated from cases and rate
    for row_index, row in group.iterrows():
        if (('suppressed' not in row['Total Cases']) and (int(row['Total Cases']) >0)):
            cases = int(row['Total Cases'])
            localsum = localsum + cases
            #rate = cases/pop*100000  =>  pop = cases*100000/rate
            localpop = localpop + cases*100000/float(row['Rate'])

    if localsum >0:
        localrate = localsum*100000/localpop #This overestimates the rates in communities whose data are suppressed or have 0 confirmed cases
    else:
        localrate = 0

    #Write the merged values to EVERY row of the group, addressed by index label.
    #Updating only the first and last row left the middle rows of groups with
    #three or more entries stale, and using 0 as the "first row not yet seen"
    #sentinel skipped the first row whenever its index label was 0.
    df.loc[group.index, 'Total Cases'] = localsum
    df.loc[group.index, 'Rate'] = localrate

In [9]:
#Check that it updated successfully: show the rows whose location name is still
#repeated, now carrying the merged counts and rates
df[df['Locations'].duplicated(keep=False)]

Unnamed: 0,Locations,Total Cases,Rate
2,Arcadia,69,104.97
5,Azusa,151,228.918
11,Bradbury,2,187.09
15,Cerritos,83,165.78
16,Claremont,30,82.23
19,Covina,161,244.491
24,Duarte,108,408.408
25,El Monte,299,254.97
29,Glendora,88,164.72
31,Hawthorne,291,318.73


In [10]:
#Remove rows that are identical in every column, keeping the first occurrence.
#Modifies df in place.
df.drop_duplicates(keep='first', inplace=True)

In [11]:
#Number of non-null entries per column after de-duplication
df.count() 

Locations      320
Total Cases    320
Rate           320
dtype: int64

In [12]:
#Display the de-duplicated frame (the commented-out filter would show only the
#'Under Investigation' row)
df#.loc[df['Locations'].str.contains('Under Investigation')]

Unnamed: 0,Locations,Total Cases,Rate
0,Agoura Hills,33,158.02
1,Alhambra,143,164.89
2,Arcadia,69,104.97
3,Artesia,19,113.13
4,Avalon,0,0
...,...,...,...
338,Willowbrook,107,306.48
339,Wiseburn,9,149.33
340,- Under Investigation,1712,
341,Pasadena,662,468.271


In [13]:
#Other cleanup - this is different once the index has been changed
#(at this point 'Locations' is still a regular column).
#na=False: rows without a location name count as "no match" instead of
#putting NaN into the boolean mask, which .loc cannot index with.
df.loc[df['Locations'].str.contains('Under Investigation', na=False), 'Locations'] = 'Under Investigation'
#str.match anchors at the start only, so any name beginning with 'Los Angeles' is relabelled
df.loc[df['Locations'].str.match('Los Angeles', na=False), 'Locations'] = 'Los Angeles - AGGREGATE'
#Literal prefix test: as a regex, '(LCC)' is a capture group and would never
#match the parentheses that are actually in the label.
df.loc[df['Locations'].str.startswith('Laboratory Confirmed Cases (LCC)', na=False), 'Locations'] = 'Total'

df[df['Locations'].str.contains('AGGREGATE', na=False)]

In [14]:
#Make the location name the index (in place) so rows can be addressed by name,
#e.g. df.loc['Long Beach'], then display the result
df.set_index('Locations', inplace=True)
df

Unnamed: 0_level_0,Total Cases,Rate
Locations,Unnamed: 1_level_1,Unnamed: 2_level_1
Agoura Hills,33,158.02
Alhambra,143,164.89
Arcadia,69,104.97
Artesia,19,113.13
Avalon,0,0
...,...,...
Willowbrook,107,306.48
Wiseburn,9,149.33
Under Investigation,1712,
Pasadena,662,468.271


#Fix Long Beach and Pasadena
#Copies each cell of table row 3 into the 'Long Beach' row, under the column
#named colnames[column_marker]. Only Long Beach is touched in this loop.
#NOTE(review): colnames[0] is 'Locations', which is the index by now, not a
#column - presumably this write creates a new 'Locations' column; confirm.
#NOTE(review): the Long Beach assignment that follows reads table row 7 and
#notes it was "Previously 3" - confirm whether this loop is still needed.
column_marker = 0
for column in soup.find_all('tr')[3].find_all('td'):
    #Show the Long Beach row as it stands before this cell's value is written
    print(df.loc['Long Beach'])
    df.at['Long Beach', colnames[column_marker]] = column.get_text()
    #    df.iat[df.loc['Long Beach'] , column_marker] = column.get_text()
    #    print('Before', df[df['Locations'].str.contains('Long Beach')])
    #    df.iat[df.index(df['Locations'].str.contains('Long Beach')).tolist(), column_marker] = column.get_text()
    column_marker += 1
    #    print('After', df[df['Locations'].str.contains('Long Beach')])
    

In [15]:
#Overwrite the Long Beach row with the count found in table row 7 and a rate
#per 100,000 derived from a fixed population figure.
longbeachpop = 467354.
long_beach_cases = soup.find_all('tr')[7].find_all('td')[1].text #Previously 3, not 7
df.at['Long Beach', colnames[1]] = long_beach_cases
df.at['Long Beach', colnames[2]] = int(long_beach_cases) / longbeachpop * 100000

In [16]:
#Show the Long Beach row after the override
df.loc['Long Beach']

Total Cases       1157
Rate           247.564
Name: Long Beach, dtype: object

In [17]:
#Spot-check a single row by location name
df.loc['Woodland Hills']

Total Cases       136
Rate           199.84
Name: Woodland Hills, dtype: object

In [18]:
#Display the frame as it will be written out
df

Unnamed: 0_level_0,Total Cases,Rate
Locations,Unnamed: 1_level_1,Unnamed: 2_level_1
Agoura Hills,33,158.02
Alhambra,143,164.89
Arcadia,69,104.97
Artesia,19,113.13
Avalon,0,0
...,...,...
Wiseburn,9,149.33
Under Investigation,1712,
Pasadena,662,468.271
Total,36259,


In [19]:
#Write out to CSV. Make the format match what covid_la.ipynb expects:
#index named 'city', columns 'count' and 'rate'.
df.index.names = ['city']
csv_column_names = {'Locations': 'city', 'Total Cases': 'count', 'Rate': 'rate'}
csv_path = "./data/covid_"+titlestring+".csv"
export_frame = df.rename(columns=csv_column_names)
export_frame.to_csv(csv_path, index=True, encoding="utf-8")
#THERE IS A KNOWN BUG that Under Investigation is dropped in archived scrapes! Must be readded