In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import urllib

## Townhall data

In [4]:
# each page has a summary table that rolls up results at the state level
# get rid of it
def cond(x):
    if x:
        return x.startswith("table ec-table") and not "table ec-table ec-table-summary" in x
    else:
        return False

In [5]:
# list of state abbreviations
states = ['AL','AK','AZ','AR','CA','CO','CT','DE','FL','GA','HI','ID','IL','IN','IA','KS','KY','LA','ME','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC','ND','OH','OK','OR','PA','RI','SC','SD','TN','TX','UT','VT','VA','WA','WV','WI','WY']

# headers for csv export
data = [['state_abbr', 'county_name', 'per_reporting', 'candidate', 'party', 'votes_total', '% Won']]

In [22]:
# loop through each state's web page http://townhall.com/election/2016/president/%s/county, where %s is the state abbr
for state in states:
    r = urllib.urlopen('http://townhall.com/election/2016/president/' + state + '/county').read()
    soup = BeautifulSoup(r, "html.parser")

    # loop through each <table> tag with .ec-table class
    tables = soup.findAll('table', attrs={'class':cond})

    for table in tables:
        if table.findParent("table") is None:
            table_body = table.find('tbody')

            rows = table_body.find_all('tr')
            for row in rows:
                cols = row.find_all('td')
                # first tbody tr has four td
                if len(cols) == 4:
                    # strip text from each td
                    divs = cols[0].find_all('div')
                    county = divs[0].text.strip()
                    per_reporting = divs[1].text.strip()
                    candidate = cols[1].text.strip()
                    party = cols[1]['class'][0]
                    votes = int(cols[2].text.strip().replace(',','').replace('-','0'))
                    per_won = cols[3].text.strip()
                # all other tbody tr have three td
                else:
                    candidate = cols[0].text.strip()
                    party = cols[1]['class'][0]
                    votes = int(cols[1].text.strip().replace(',','').replace('-','0'))
                    per_won = cols[2].text.strip()
                    
                #combine each row's results
                rowData = [state,county,per_reporting,candidate,party,votes,per_won]
                data.append(rowData)

In [105]:
townhall = pd.DataFrame(data) # throw results in dataframe
new_header = townhall.iloc[0] #grab the first row for the header
townhall = townhall[1:] #take the data less the header row
townhall.columns = new_header #set the header row as the df header
print(townhall.shape[0])
townhall[townhall['state_abbr'] == 'MD']

60964


Unnamed: 0,state_abbr,county_name,per_reporting,candidate,party,votes_total,% Won
5269,MD,Allegany,100%,Donald Trump,GOP,20025,72.0%
5270,MD,Allegany,100%,Hillary Clinton,DEM,6665,24.0%
5271,MD,Allegany,100%,Gary Johnson,LIB,778,2.8%
5272,MD,Allegany,100%,Jill Stein,GRN,336,1.2%
5273,MD,Anne Arundel,100%,Hillary Clinton,DEM,116074,47.8%
5274,MD,Anne Arundel,100%,Donald Trump,GOP,114509,47.1%
5275,MD,Anne Arundel,100%,Gary Johnson,LIB,9365,3.9%
5276,MD,Anne Arundel,100%,Jill Stein,GRN,2991,1.2%
5277,MD,Baltimore City,99.3%,Hillary Clinton,DEM,178562,85.4%
5278,MD,Baltimore City,99.3%,Donald Trump,GOP,22726,10.9%


In [88]:
# strip out state abbreviations and county names from townhall
townhall_counties = townhall[['state_abbr','county_name']].drop_duplicates().reset_index().drop('index',1)
print('Townhall data has ' + str(townhall_counties.shape[0]) + ' counties')
townhall_counties[townhall_counties['state_abbr'] == 'MD']

Townhall data has 3112 counties


Unnamed: 0,state_abbr,county_name
1163,MD,Allegany
1164,MD,Anne Arundel
1165,MD,Baltimore City
1166,MD,Baltimore County
1167,MD,Calvert
1168,MD,Caroline
1169,MD,Carroll
1170,MD,Cecil
1171,MD,Charles
1172,MD,Dorchester


In [106]:
# combine state and county names
townhall['combined'] = townhall['state_abbr'] + townhall['county_name'].apply(lambda x: x.replace('County','').replace(' ',''))
townhall[townhall['state_abbr'] == 'MD']

Unnamed: 0,state_abbr,county_name,per_reporting,candidate,party,votes_total,% Won,combined
5269,MD,Allegany,100%,Donald Trump,GOP,20025,72.0%,MDAllegany
5270,MD,Allegany,100%,Hillary Clinton,DEM,6665,24.0%,MDAllegany
5271,MD,Allegany,100%,Gary Johnson,LIB,778,2.8%,MDAllegany
5272,MD,Allegany,100%,Jill Stein,GRN,336,1.2%,MDAllegany
5273,MD,Anne Arundel,100%,Hillary Clinton,DEM,116074,47.8%,MDAnneArundel
5274,MD,Anne Arundel,100%,Donald Trump,GOP,114509,47.1%,MDAnneArundel
5275,MD,Anne Arundel,100%,Gary Johnson,LIB,9365,3.9%,MDAnneArundel
5276,MD,Anne Arundel,100%,Jill Stein,GRN,2991,1.2%,MDAnneArundel
5277,MD,Baltimore City,99.3%,Hillary Clinton,DEM,178562,85.4%,MDBaltimoreCity
5278,MD,Baltimore City,99.3%,Donald Trump,GOP,22726,10.9%,MDBaltimoreCity


## Census data

In [2]:
# county_fips data from https://www.census.gov/geo/reference/codes/cou.html
census = pd.read_csv('http://www2.census.gov/geo/docs/reference/codes/files/national_county.txt',sep=',',header=None, dtype=str)
census.columns = ['state_abbr', 'state_fips', 'county_fips', 'county_name', 'fips_class_code']
print(census.shape)
census.head()

(3235, 5)


Unnamed: 0,state_abbr,state_fips,county_fips,county_name,fips_class_code
0,AL,1,1,Autauga County,H1
1,AL,1,3,Baldwin County,H1
2,AL,1,5,Barbour County,H1
3,AL,1,7,Bibb County,H1
4,AL,1,9,Blount County,H1


In [52]:
census[census['state_abbr'] == 'VA']

Unnamed: 0,state_abbr,state_fips,county_fips,county_name,fips_class_code
2820,VA,51,001,Accomack County,H1
2821,VA,51,003,Albemarle County,H1
2822,VA,51,005,Alleghany County,H1
2823,VA,51,007,Amelia County,H1
2824,VA,51,009,Amherst County,H1
2825,VA,51,011,Appomattox County,H1
2826,VA,51,013,Arlington County,H1
2827,VA,51,015,Augusta County,H1
2828,VA,51,017,Bath County,H1
2829,VA,51,019,Bedford County,H1


In [3]:
# get state and county fips
#census.columns[[0,1,2,3,4]]
fips_codes_census = census.drop(census.columns[[4]],axis=1)
print(fips_codes_census['county_fips'].count())
fips_codes_census.head()

3235


Unnamed: 0,state_abbr,state_fips,county_fips,county_name
0,AL,1,1,Autauga County
1,AL,1,3,Baldwin County
2,AL,1,5,Barbour County
3,AL,1,7,Bibb County
4,AL,1,9,Blount County


In [84]:
# strip out state abbreviations and state names from census
census_states = census[['state_abbr','state_fips']].drop_duplicates().reset_index().drop('index',1)
# drop US territories
census_states = census_states[(census_states['state_abbr'] != 'AS') & (census_states['state_abbr'] != 'GU') & (census_states['state_abbr'] != 'MP') & (census_states['state_abbr'] != 'PR') & (census_states['state_abbr'] != 'UM') & (census_states['state_abbr'] != 'VI')]
print(str(census_states.shape[0]) + ' states')

51 states


In [87]:
# strip out state abbreviations and county names from census
census_counties = census[['state_abbr','state_fips','county_name','county_fips']].drop_duplicates().reset_index().drop('index',1)
census_counties = census_counties[(census_counties['state_abbr'] != 'AS') & (census_counties['state_abbr'] != 'GU') & (census_counties['state_abbr'] != 'MP') & (census_counties['state_abbr'] != 'PR') & (census_counties['state_abbr'] != 'UM') & (census_counties['state_abbr'] != 'VI')]
print('Census data has ' + str(census_counties.shape[0]) + ' counties')
census_counties[census_counties['state_abbr'] == 'MD']

Census data has 3143 counties


Unnamed: 0,state_abbr,state_fips,county_name,county_fips
1193,MD,24,Allegany County,1
1194,MD,24,Anne Arundel County,3
1195,MD,24,Baltimore County,5
1196,MD,24,Calvert County,9
1197,MD,24,Caroline County,11
1198,MD,24,Carroll County,13
1199,MD,24,Cecil County,15
1200,MD,24,Charles County,17
1201,MD,24,Dorchester County,19
1202,MD,24,Frederick County,21


In [99]:
census_counties['combined'] = census_counties['state_abbr'] + census_counties['county_name'].apply(lambda x: x.replace('County','').replace(' ',''))
census_counties[census_counties['state_abbr'] == 'MD']

Unnamed: 0,state_abbr,state_fips,county_name,county_fips,combined
1193,MD,24,Allegany County,1,MDAllegany
1194,MD,24,Anne Arundel County,3,MDAnneArundel
1195,MD,24,Baltimore County,5,MDBaltimore
1196,MD,24,Calvert County,9,MDCalvert
1197,MD,24,Caroline County,11,MDCaroline
1198,MD,24,Carroll County,13,MDCarroll
1199,MD,24,Cecil County,15,MDCecil
1200,MD,24,Charles County,17,MDCharles
1201,MD,24,Dorchester County,19,MDDorchester
1202,MD,24,Frederick County,21,MDFrederick


In [114]:
# merge census and townhall data on the 'combined' column
combined = pd.merge(townhall, census_counties, on='combined')
print(combined.shape[0])
combined[combined['state_abbr_y'] == 'MD']

58536


Unnamed: 0,state_abbr_x,county_name_x,per_reporting,candidate,party,votes_total,% Won,combined,state_abbr_y,state_fips,county_name_y,county_fips
23724,MD,Allegany,100%,Donald Trump,GOP,20025,72.0%,MDAllegany,MD,24,Allegany County,001
23725,MD,Allegany,100%,Hillary Clinton,DEM,6665,24.0%,MDAllegany,MD,24,Allegany County,001
23726,MD,Allegany,100%,Gary Johnson,LIB,778,2.8%,MDAllegany,MD,24,Allegany County,001
23727,MD,Allegany,100%,Jill Stein,GRN,336,1.2%,MDAllegany,MD,24,Allegany County,001
23728,MD,Allegany,100%,Donald Trump,GOP,20025,72.0%,MDAllegany,MD,24,Allegany County,001
23729,MD,Allegany,100%,Hillary Clinton,DEM,6665,24.0%,MDAllegany,MD,24,Allegany County,001
23730,MD,Allegany,100%,Gary Johnson,LIB,778,2.8%,MDAllegany,MD,24,Allegany County,001
23731,MD,Allegany,100%,Jill Stein,GRN,336,1.2%,MDAllegany,MD,24,Allegany County,001
23732,MD,Allegany,100%,Donald Trump,GOP,20025,72.0%,MDAllegany,MD,24,Allegany County,001
23733,MD,Allegany,100%,Hillary Clinton,DEM,6665,24.0%,MDAllegany,MD,24,Allegany County,001


In [115]:
# drop irrelevant columns
county_level_combined = combined.drop(combined.columns[[0,1,7]], axis=1)

#reformat column labels
county_level_combined.columns = ['per_reporting','candidate','party','votes_total','per_won','state_abbr','state_fips','county_name','county_fips']
county_level_combined

Unnamed: 0,per_reporting,candidate,party,votes_total,per_won,state_abbr,state_fips,county_name,county_fips
0,100%,Donald Trump,GOP,18110,73.4%,AL,01,Autauga County,001
1,100%,Hillary Clinton,DEM,5908,24.0%,AL,01,Autauga County,001
2,100%,Gary Johnson,IND,538,2.2%,AL,01,Autauga County,001
3,100%,Jill Stein,IND,105,0.4%,AL,01,Autauga County,001
4,100%,Donald Trump,GOP,18110,73.4%,AL,01,Autauga County,001
5,100%,Hillary Clinton,DEM,5908,24.0%,AL,01,Autauga County,001
6,100%,Gary Johnson,IND,538,2.2%,AL,01,Autauga County,001
7,100%,Jill Stein,IND,105,0.4%,AL,01,Autauga County,001
8,100%,Donald Trump,GOP,18110,73.4%,AL,01,Autauga County,001
9,100%,Hillary Clinton,DEM,5908,24.0%,AL,01,Autauga County,001


In [117]:
county_level_combined.to_csv('2016_US_County_Level_Presidential_Results.csv',sep=',')