In [1]:
import numpy as np
import pandas as pd
import datetime
import re
import requests
from bs4 import BeautifulSoup
import warnings

In [2]:
# Newyork:
# https://raw.githubusercontent.com/nychealth/coronavirus-data/master/totals/data-by-modzcta.csv
# City of San Antonio:
# https://cosacovid-cosagis.hub.arcgis.com/datasets/CoSAGIS::covid19-deaths-by-zip-code/about
# Wisconsin:
# https://data.dhsgis.wi.gov/datasets/wi-dhs::covid-19-data-by-zip-code-tabulation-area-v2/about
ny_df = pd.read_csv('Data/Covid Data/newyork-covid19-cases-and-deaths.csv')
ny_df = ny_df[['MODIFIED_ZCTA', 'COVID_CASE_COUNT', 'COVID_DEATH_COUNT']]
ny_df.columns = ['Zipcode', 'Case Counts', 'Death Counts']
display(ny_df)
sa_df = pd.read_csv('Data/Covid Data/san-antonio-covid19-cases-and-deaths.csv')
sa_df = sa_df[['ZIP_CODE', 'Positive','Deaths']]
sa_df.columns = ['Zipcode', 'Case Counts', 'Death Counts']
display(sa_df)
wi_df = pd.read_csv('Data/Covid Data/wisconsin-covid19-cases-and-deaths.csv')
wi_df = wi_df[['GEOID', 'POS_CUM_CP', 'DTH_CUM_CP']]
wi_df.columns = ['Zipcode', 'Case Counts', 'Death Counts']
display(wi_df)
frames = [ny_df, sa_df, wi_df]
df = pd.concat(frames)
df

Unnamed: 0,Zipcode,Case Counts,Death Counts
0,10001,7745,57
1,10002,19171,412
2,10003,13634,84
3,10004,973,2
4,10005,2323,1
...,...,...,...
172,11691,19207,614
173,11692,5068,154
174,11693,3256,63
175,11694,5783,133


Unnamed: 0,Zipcode,Case Counts,Death Counts
0,78002,2712,21
1,78015,2407,11
2,78023,5907,41
3,78069,405,8
4,78073,3006,18
...,...,...,...
61,78259,5505,36
62,78260,7417,29
63,78261,5200,14
64,78263,1305,7


Unnamed: 0,Zipcode,Case Counts,Death Counts
0,54403,6922,90
1,54405,935,24
2,54406,932,7
3,54407,329,1
4,54408,252,3
...,...,...,...
770,54540,186,1
771,54541,391,13
772,54542,82,1
773,54545,165,6


Unnamed: 0,Zipcode,Case Counts,Death Counts
0,10001,7745,57
1,10002,19171,412
2,10003,13634,84
3,10004,973,2
4,10005,2323,1
...,...,...,...
770,54540,186,1
771,54541,391,13
772,54542,82,1
773,54545,165,6


In [3]:
df[df['Zipcode'] == 'ZCTA N/A']

Unnamed: 0,Zipcode,Case Counts,Death Counts
122,ZCTA N/A,39854,408


In [4]:
def scrape(z):
        z = str(z)
        source = requests.get('https://censusreporter.org/profiles/86000US{zip}-{zip}/'.format(zip=z)).text
        soup = BeautifulSoup(source, 'lxml')
        s = soup.findAll('script',type="text/javascript")[1]
        return str(s)

def parse(param, sp):
    param = '"'+param+'", '
    try:
        var = float(re.findall(param+'"values": {"this":\s*([+-]?[0-9]+\.[0-9]+)',sp)[0])
    except:
        var = 0

    return var

def getSocioDem(zipcodes):
    df = pd.DataFrame(columns = [ 'Zipcode', 'Population', 'Median age', 'Under 18(%)', '18 to 64(%)', '65 and over(%)', 'Male(%)', 'Female(%)', 'White(%)', "Black(%)", "Native(%)", "Asian(%)", "Islander(%)", "Two plus(%)", "Hispanic(%)", "Per capita income (USD)", "Median household income (USD)", "Below poverty line(%)",
                                'Mean travel time to work (Minutes)', 'Drove Alone (%)', 'Carpooled (%)', 'Public Transit (%)', 'Bicycle (%)', 'Walked (%)', 'Other (%)', 'Worked at home (%)', 'Number of households', 'Persons per household', 'Married (%)', 'Single (%)',
                                'Number of housing units', 'Occupied housing (%)', 'Vacant housing (%)', 'Owner Occupied (%)', 'Renter Occupied (%)', 'Median housing value',
                                'Moved Since Prev Year(%)', 'Same House Prev Year(%)', 'No Degree(%)', 'High School(%)', 'Some College(%)', "Bachelor's(%)", "Post-grad(%)", 'Foriegn Born Population(%)', 'Europe(%)', 'Asia(%)', 'Africa(%)', 'Oceania(%)', 'Latin America(%)', 'North America(%)'],
                                index = list(range(0,len(zipcodes))))

    i = 0
    fail_zip = []
    for zip in zipcodes:
        print(i, zip)
        try:
            s = scrape(zip)
        except:
            fail_zip.append(zip)
            continue
        var = '"full_geoid": "86000US{z}", "total_population":'.format(z=str(zip))
        population = int(re.findall(var+'\s*([+-]?[0-9]+)',s)[0])

        median_age = parse("Median age",s)
        percent_under18 = parse("Under 18",s)
        percent_18to64 = parse("18 to 64",s)
        percent_65andOver = parse("65 and over",s)

        percent_male = parse("Male",s)
        percent_female = parse("Female",s)

        percent_white = parse("White",s)
        percent_black = parse("Black",s)
        percent_native = parse("Native",s)
        percent_asian = parse("Asian",s)
        percent_islander = parse("Islander",s)
        pecent_two_plus = parse("Two\+",s)
        percent_hispanic = parse("Hispanic",s)

        per_capita = parse("Per capita income", s)
        median_household_income = parse("Median household income",s)

        percent_below_poverty = parse("Persons below poverty line",s)

        mean_travel_time = parse("Mean travel time to work",s)
        drove_alone = parse("Drove alone",s)
        carpooled = parse("Carpooled",s)
        public_transit = parse("Public transit",s)
        bicycle = parse("Bicycle",s)
        walked = parse("Walked",s)
        other = parse("Other",s)
        worked_at_home = parse("Worked at home",s)

        number_of_households = parse("Number of households",s)
        persons_per_household = parse("Persons per household",s)

        married = parse("Married",s)
        single = parse("Single",s)

        number_of_housing_units = parse("Number of housing units",s)
        occupied_housing_units = parse("Occupied",s)
        vacant_housing_units = parse("Vacant",s)
        owner_housing_units = parse("Owner occupied",s)
        renter_housing_units = parse("Renter occupied",s)

        median_value_owner_occupied = parse("Median value of owner-occupied housing units",s)

        moved_since_previous_year = parse("Moved since previous year",s)
        same_house_year_ago = parse("Same house year ago",s)

        no_degree = parse("No degree",s)
        high_school = parse("High school",s)
        some_college = parse("Some college",s)
        bachelors = parse("Bachelor's",s)
        post_grad = parse("Post-grad",s)

        foriegn_born_pop = parse("Foreign-born population",s)
        europe = parse('Europe',s)
        asia = parse('Asia',s)
        africa = parse('Africa',s)
        oceania = parse('Oceania',s)
        latin_america = parse('Latin America',s)
        north_america = parse('North America',s)

        df.iloc[i] = [zip, population, median_age, percent_under18, percent_18to64, percent_65andOver, percent_male, percent_female, percent_white, percent_black, percent_native, percent_asian, percent_islander, pecent_two_plus, percent_hispanic, per_capita, median_household_income, percent_below_poverty, mean_travel_time,
        drove_alone, carpooled, public_transit, bicycle, walked, other, worked_at_home, number_of_households, persons_per_household, married, single, number_of_housing_units, occupied_housing_units, vacant_housing_units, owner_housing_units, renter_housing_units,
        median_value_owner_occupied, moved_since_previous_year, same_house_year_ago, no_degree, high_school, some_college, bachelors, post_grad, foriegn_born_pop, europe, asia, africa, oceania, latin_america, north_america]

        i+=1
    print(fail_zip)
    return df

In [5]:
# z = 60106
# source = requests.get('https://censusreporter.org/profiles/86000US{zip}-{zip}/'.format(zip=z)).text
# soup = BeautifulSoup(source, 'lxml')
# s = soup.findAll('script',type="text/javascript")
# str(s[1])

In [6]:
# scrape social demographic data 
zipcodes = df.Zipcode.tolist()
socio_df = getSocioDem(zipcodes)
socio_df

0 10001
1 10002
2 10003
3 10004
4 10005
5 10006
6 10007
7 10009
8 10010
9 10011
10 10012
11 10013
12 10014
13 10016
14 10017
15 10018
16 10019
17 10021
18 10022
19 10023
20 10024
21 10025
22 10026
23 10027
24 10028
25 10029
26 10030
27 10031
28 10032
29 10033
30 10034
31 10035
32 10036
33 10037
34 10038
35 10039
36 10040
37 10044
38 10065
39 10069
40 10075
41 10128
42 10280
43 10282
44 10301
45 10302
46 10303
47 10304
48 10305
49 10306
50 10307
51 10308
52 10309
53 10310
54 10312
55 10314
56 10451
57 10452
58 10453
59 10454
60 10455
61 10456
62 10457
63 10458
64 10459
65 10460
66 10461
67 10462
68 10463
69 10464
70 10465
71 10466
72 10467
73 10468
74 10469
75 10470
76 10471
77 10472
78 10473
79 10474
80 10475
81 11004
82 11101
83 11102
84 11103
85 11104
86 11105
87 11106
88 11109
89 11201
90 11203
91 11204
92 11205
93 11206
94 11207
95 11208
96 11209
97 11210
98 11211
99 11212
100 11213
101 11214
102 11215
103 11216
104 11217
105 11218
106 11219
107 11220
108 11221
109 11222
110 11223


829 53594
830 53597
831 53598
832 53599
833 53702
834 54411
835 54412
836 54413
837 54414
838 54416
839 54417
840 54418
841 54420
842 54421
843 54422
844 54423
845 54424
846 54425
847 54426
848 53703
849 53704
850 53705
851 53706
852 53711
853 53713
854 53714
855 54499
856 54501
857 54511
858 54512
859 54513
860 54514
861 54515
862 54517
863 54519
864 54520
865 54521
866 54524
867 54525
868 54526
869 54547
870 54548
871 54550
872 54552
873 54554
874 54555
875 54556
876 54557
877 54558
878 54559
879 54560
880 54561
881 54562
882 54563
883 54564
884 54565
885 54566
886 54568
887 54601
888 54603
889 54610
890 53715
891 53716
892 53717
893 53718
894 53719
895 53726
896 53792
897 54619
898 54621
899 54622
900 54623
901 54624
902 54636
903 54637
904 54638
905 54639
906 54641
907 54642
908 54625
909 54626
910 54627
911 54632
912 54634
913 54635
914 54628
915 54629
916 54630
917 54631
918 53206
919 53207
920 53208
921 53209
922 53210
923 53211
924 53212
925 53213
926 53214
927 53215
928 53216


Unnamed: 0,Zipcode,Population,Median age,Under 18(%),18 to 64(%),65 and over(%),Male(%),Female(%),White(%),Black(%),...,Some College(%),Bachelor's(%),Post-grad(%),Foriegn Born Population(%),Europe(%),Asia(%),Africa(%),Oceania(%),Latin America(%),North America(%)
0,10001,25026,36.1,10.98,76.38,12.64,49.01,50.99,54.51,6.14,...,10.28,43.84,28.37,28.71,19.08,49.55,2.25,3.52,21.46,4.13
1,10002,74363,44.8,13.09,63.18,23.74,49.29,50.71,22.16,7.77,...,12.98,23.86,10.65,41.04,5.55,79.01,0.61,1.27,12.85,0.7
2,10003,54671,31.9,7.18,80.16,12.66,48.78,51.22,67.98,5.01,...,9.23,45.16,36.0,18.84,30.11,41.21,1.84,6.09,15.72,5.03
3,10004,3310,38.4,7.37,84.23,8.4,47.34,52.66,66.89,4.5,...,3.48,45.67,48.33,20.36,27.89,53.26,2.97,7.42,2.08,6.38
4,10005,8664,30.4,10.2,88.73,1.06,47.68,52.32,70.16,2.01,...,1.6,51.52,40.15,17.68,36.36,41.91,0.72,9.01,8.49,3.52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1013,54541,1392,45.1,20.55,58.84,20.62,44.4,55.6,80.1,4.17,...,29.82,9.58,6.09,1.22,11.76,88.24,0,0,0,0
1014,54542,500,52.9,13.0,56.2,30.8,50.0,50.0,79.2,0,...,32.74,10.49,4.86,0,0,0,0,0,0,0
1015,54545,859,61.1,11.06,50.99,37.95,51.11,48.89,92.67,0,...,29.62,27.17,16.98,0.7,100.0,0,0,0,0,0
1016,54546,1084,52.5,18.08,57.56,24.35,49.72,50.28,93.82,0,...,42.58,13.16,2.99,0.55,0,100.0,0,0,0,0


In [19]:
socio_df['Zipcode'].isnull().sum()

1