In [1]:
import urllib

import pandas as pd
from bs4 import BeautifulSoup

# urlopen lives in urllib.request on Python 3 and in urllib on Python 2
try:
    from urllib.request import urlopen
except ImportError:
    from urllib import urlopen

## Townhall data

In [2]:
# Class-attribute filter handed to BeautifulSoup: keep the "table ec-table"
# tables but reject the summary table that rolls up results at the state level.
def cond(x):
    """Return True when the class string `x` marks a non-summary results table.

    `x` may be None or empty (no class attribute value), in which case the
    answer is False.
    """
    if not x:
        return False
    is_results_table = x.startswith("table ec-table")
    is_summary_table = "table ec-table ec-table-summary" in x
    return is_results_table and not is_summary_table

In [3]:
# Two-letter abbreviations of the 50 states whose pages get scraped.
# NOTE(review): 'DC' is not listed, while the census cell further down reports
# 51 entries after dropping territories -- confirm the omission is intentional.
states = [
    'AL','AK','AZ','AR','CA','CO','CT','DE','FL','GA',
    'HI','ID','IL','IN','IA','KS','KY','LA','ME','MD',
    'MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ',
    'NM','NY','NC','ND','OH','OK','OR','PA','RI','SC',
    'SD','TN','TX','UT','VT','VA','WA','WV','WI','WY',
]

# Accumulator for scraped rows; the first entry is the header row used
# later as the column labels.
data = [[
    'state_abbr',
    'county_name',
    'per_reporting',
    'candidate',
    'party',
    'votes_total',
    '% Won',
]]

In [4]:
# Scrape every state's county results page,
# http://townhall.com/election/2016/president/%s/county, where %s is the state
# abbr, and append one row per (county, candidate) to `data`.
for state in states:
    # `urlopen` comes from the import cell (Python 2/3 compatible import);
    # always close the response so the connection is not leaked.
    response = urlopen('http://townhall.com/election/2016/president/' + state + '/county')
    try:
        r = response.read()
    finally:
        response.close()
    soup = BeautifulSoup(r, "html.parser")

    # every <table> whose class passes `cond` (i.e. not the state summary table)
    tables = soup.find_all('table', attrs={'class': cond})

    for table in tables:
        # only top-level tables are parsed; tables nested in another table are skipped
        if table.find_parent("table") is not None:
            continue

        table_body = table.find('tbody')
        if table_body is None:
            # no <tbody> means there are no result rows to read
            continue

        for row in table_body.find_all('tr'):
            cols = row.find_all('td')
            if len(cols) < 3:
                # too few cells to hold candidate / votes / percentage
                continue

            # first tbody tr has four td: the extra leading cell carries the
            # county name and the reporting percentage in two <div>s
            if len(cols) == 4:
                divs = cols[0].find_all('div')
                county = divs[0].text.strip()
                per_reporting = divs[1].text.strip()
                candidate = cols[1].text.strip()
                party = cols[1]['class'][0]
                # drop thousands separators; a '-' placeholder becomes 0
                votes = int(cols[2].text.strip().replace(',','').replace('-','0'))
                per_won = cols[3].text.strip()
            # all other tbody tr have three td; county and per_reporting are
            # carried over from the preceding four-cell row
            else:
                candidate = cols[0].text.strip()
                # NOTE(review): party is read from the votes cell here, while the
                # four-cell branch reads it from the candidate cell -- confirm
                # both cells carry the party class.
                party = cols[1]['class'][0]
                votes = int(cols[1].text.strip().replace(',','').replace('-','0'))
                per_won = cols[2].text.strip()

            # combine each row's results
            data.append([state, county, per_reporting, candidate, party, votes, per_won])

In [5]:
# Load the scraped rows into a frame; row 0 of `data` holds the column labels.
townhall = pd.DataFrame(data)
new_header = townhall.iloc[0]       # header row, promoted to column labels below
townhall = townhall.iloc[1:]        # everything after the header row
townhall.columns = new_header
print(len(townhall))
townhall.loc[townhall['state_abbr'] == 'MD']

14184


Unnamed: 0,state_abbr,county_name,per_reporting,candidate,party,votes_total,% Won
5269,MD,Allegany,100%,Donald Trump,GOP,20025,72.0%
5270,MD,Allegany,100%,Hillary Clinton,DEM,6665,24.0%
5271,MD,Allegany,100%,Gary Johnson,LIB,778,2.8%
5272,MD,Allegany,100%,Jill Stein,GRN,336,1.2%
5273,MD,Anne Arundel,100%,Hillary Clinton,DEM,116074,47.8%
5274,MD,Anne Arundel,100%,Donald Trump,GOP,114509,47.1%
5275,MD,Anne Arundel,100%,Gary Johnson,LIB,9365,3.9%
5276,MD,Anne Arundel,100%,Jill Stein,GRN,2991,1.2%
5277,MD,Baltimore City,99.3%,Hillary Clinton,DEM,178562,85.4%
5278,MD,Baltimore City,99.3%,Donald Trump,GOP,22726,10.9%


In [6]:
# Unique (state abbreviation, county name) pairs found in the townhall data.
# reset_index(drop=True) renumbers the rows without materialising the old index;
# the former `.reset_index().drop('index', 1)` passed `axis` positionally, which
# recent pandas versions no longer accept.
townhall_counties = townhall[['state_abbr','county_name']].drop_duplicates().reset_index(drop=True)
print('Townhall data has ' + str(townhall_counties.shape[0]) + ' counties')
townhall_counties[townhall_counties['state_abbr'] == 'MD']

Townhall data has 3112 counties


Unnamed: 0,state_abbr,county_name
1163,MD,Allegany
1164,MD,Anne Arundel
1165,MD,Baltimore City
1166,MD,Baltimore County
1167,MD,Calvert
1168,MD,Caroline
1169,MD,Carroll
1170,MD,Cecil
1171,MD,Charles
1172,MD,Dorchester


In [7]:
# Join key: state abbreviation followed by the county name with 'County' and
# spaces removed, lower-cased (e.g. MD + 'Anne Arundel' -> 'MDannearundel').
townhall['combined'] = townhall['state_abbr'] + townhall['county_name'].map(
    lambda name: name.replace('County','').replace(' ','').lower()
)
townhall.loc[townhall['state_abbr'] == 'MD']

Unnamed: 0,state_abbr,county_name,per_reporting,candidate,party,votes_total,% Won,combined
5269,MD,Allegany,100%,Donald Trump,GOP,20025,72.0%,MDallegany
5270,MD,Allegany,100%,Hillary Clinton,DEM,6665,24.0%,MDallegany
5271,MD,Allegany,100%,Gary Johnson,LIB,778,2.8%,MDallegany
5272,MD,Allegany,100%,Jill Stein,GRN,336,1.2%,MDallegany
5273,MD,Anne Arundel,100%,Hillary Clinton,DEM,116074,47.8%,MDannearundel
5274,MD,Anne Arundel,100%,Donald Trump,GOP,114509,47.1%,MDannearundel
5275,MD,Anne Arundel,100%,Gary Johnson,LIB,9365,3.9%,MDannearundel
5276,MD,Anne Arundel,100%,Jill Stein,GRN,2991,1.2%,MDannearundel
5277,MD,Baltimore City,99.3%,Hillary Clinton,DEM,178562,85.4%,MDbaltimorecity
5278,MD,Baltimore City,99.3%,Donald Trump,GOP,22726,10.9%,MDbaltimorecity


## Census data

In [8]:
# County FIPS reference file, see https://www.census.gov/geo/reference/codes/cou.html
# The file has no header row, so the column labels are supplied here; every
# column is read as a string.
census = pd.read_csv(
    'http://www2.census.gov/geo/docs/reference/codes/files/national_county.txt',
    sep=',',
    header=None,
    names=['state_abbr', 'state_fips', 'county_fips', 'county_name', 'fips_class_code'],
    dtype=str,
)
print(census.shape)
census.head()

(3235, 5)


Unnamed: 0,state_abbr,state_fips,county_fips,county_name,fips_class_code
0,AL,1,1,Autauga County,H1
1,AL,1,3,Baldwin County,H1
2,AL,1,5,Barbour County,H1
3,AL,1,7,Bibb County,H1
4,AL,1,9,Blount County,H1


In [9]:
# preview the census rows for Maryland
census.loc[census['state_abbr'] == 'MD']

Unnamed: 0,state_abbr,state_fips,county_fips,county_name,fips_class_code
1193,MD,24,1,Allegany County,H1
1194,MD,24,3,Anne Arundel County,H1
1195,MD,24,5,Baltimore County,H1
1196,MD,24,9,Calvert County,H1
1197,MD,24,11,Caroline County,H1
1198,MD,24,13,Carroll County,H1
1199,MD,24,15,Cecil County,H1
1200,MD,24,17,Charles County,H1
1201,MD,24,19,Dorchester County,H1
1202,MD,24,21,Frederick County,H1


In [10]:
# State and county FIPS codes: the census frame without its last column,
# the FIPS class code.
fips_codes_census = census.drop('fips_class_code', axis=1)
print(fips_codes_census['county_fips'].count())
fips_codes_census.head()

3235


Unnamed: 0,state_abbr,state_fips,county_fips,county_name
0,AL,1,1,Autauga County
1,AL,1,3,Baldwin County
2,AL,1,5,Barbour County
3,AL,1,7,Bibb County
4,AL,1,9,Blount County


In [11]:
# Unique (state abbreviation, state FIPS) pairs from the census file.
# reset_index(drop=True) replaces `.reset_index().drop('index', 1)`, whose
# positional `axis` argument recent pandas versions no longer accept.
census_states = census[['state_abbr','state_fips']].drop_duplicates().reset_index(drop=True)
# drop US territories
census_states = census_states[~census_states['state_abbr'].isin(['AS', 'GU', 'MP', 'PR', 'UM', 'VI'])]
print(str(census_states.shape[0]) + ' states')

51 states


In [12]:
# Unique county records (state abbreviation, state FIPS, county name, county FIPS)
# from the census file.
# reset_index(drop=True) replaces `.reset_index().drop('index', 1)`, whose
# positional `axis` argument recent pandas versions no longer accept.
census_counties = census[['state_abbr','state_fips','county_name','county_fips']].drop_duplicates().reset_index(drop=True)
# drop US territories
census_counties = census_counties[~census_counties['state_abbr'].isin(['AS', 'GU', 'MP', 'PR', 'UM', 'VI'])]
print('Census data has ' + str(census_counties.shape[0]) + ' counties')
census_counties[census_counties['state_abbr'] == 'MD']

Census data has 3143 counties


Unnamed: 0,state_abbr,state_fips,county_name,county_fips
1193,MD,24,Allegany County,1
1194,MD,24,Anne Arundel County,3
1195,MD,24,Baltimore County,5
1196,MD,24,Calvert County,9
1197,MD,24,Caroline County,11
1198,MD,24,Carroll County,13
1199,MD,24,Cecil County,15
1200,MD,24,Charles County,17
1201,MD,24,Dorchester County,19
1202,MD,24,Frederick County,21


In [13]:
# Join key: state abbreviation followed by the county name with 'County' and
# spaces removed, lower-cased (e.g. MD + 'Allegany County' -> 'MDallegany').
census_counties['combined'] = census_counties['state_abbr'] + census_counties['county_name'].map(
    lambda name: name.replace('County','').replace(' ','').lower()
)
census_counties.loc[census_counties['state_abbr'] == 'MD']

Unnamed: 0,state_abbr,state_fips,county_name,county_fips,combined
1193,MD,24,Allegany County,1,MDallegany
1194,MD,24,Anne Arundel County,3,MDannearundel
1195,MD,24,Baltimore County,5,MDbaltimore
1196,MD,24,Calvert County,9,MDcalvert
1197,MD,24,Caroline County,11,MDcaroline
1198,MD,24,Carroll County,13,MDcarroll
1199,MD,24,Cecil County,15,MDcecil
1200,MD,24,Charles County,17,MDcharles
1201,MD,24,Dorchester County,19,MDdorchester
1202,MD,24,Frederick County,21,MDfrederick


In [14]:
# Left-join the townhall results onto the census counties via the 'combined' key.
# Census counties without a townhall match are kept with empty result columns;
# columns present on both sides get the '_r' suffix on the townhall side.
right = townhall.set_index('combined')
left = census_counties.set_index('combined')

combined = left.join(right, lsuffix='', rsuffix='_r')
# discard the join key; reset_index(drop=True) replaces
# `.reset_index().drop('combined', 1)`, whose positional `axis` argument recent
# pandas versions no longer accept
combined = combined.reset_index(drop=True)
print(combined.shape[0])
combined

13724


Unnamed: 0,state_abbr,state_fips,county_name,county_fips,state_abbr_r,county_name_r,per_reporting,candidate,party,votes_total,% Won
0,AK,02,Aleutians East Borough,013,,,,,,,
1,AK,02,Aleutians West Census Area,016,,,,,,,
2,AK,02,Anchorage Municipality,020,,,,,,,
3,AK,02,Bethel Census Area,050,,,,,,,
4,AK,02,Bristol Bay Borough,060,,,,,,,
5,AK,02,Denali Borough,068,,,,,,,
6,AK,02,Dillingham Census Area,070,,,,,,,
7,AK,02,Fairbanks North Star Borough,090,,,,,,,
8,AK,02,Haines Borough,100,,,,,,,
9,AK,02,Hoonah-Angoon Census Area,105,,,,,,,


In [15]:
# The townhall copies of the state/county columns (suffixed '_r' by the join)
# duplicate the census ones, so remove them.
county_level_combined = combined.drop(['state_abbr_r', 'county_name_r'], axis=1)

# collapse fully identical rows
county_level_final = county_level_combined.drop_duplicates()
print(len(county_level_final))
county_level_final.loc[county_level_final['state_abbr'] == 'MD']

13724


Unnamed: 0,state_abbr,state_fips,county_name,county_fips,per_reporting,candidate,party,votes_total,% Won
4966,MD,24,Allegany County,001,100%,Donald Trump,GOP,20025,72.0%
4967,MD,24,Allegany County,001,100%,Hillary Clinton,DEM,6665,24.0%
4968,MD,24,Allegany County,001,100%,Gary Johnson,LIB,778,2.8%
4969,MD,24,Allegany County,001,100%,Jill Stein,GRN,336,1.2%
4970,MD,24,Anne Arundel County,003,100%,Hillary Clinton,DEM,116074,47.8%
4971,MD,24,Anne Arundel County,003,100%,Donald Trump,GOP,114509,47.1%
4972,MD,24,Anne Arundel County,003,100%,Gary Johnson,LIB,9365,3.9%
4973,MD,24,Anne Arundel County,003,100%,Jill Stein,GRN,2991,1.2%
4974,MD,24,Baltimore County,005,100%,Hillary Clinton,DEM,189437,56.5%
4975,MD,24,Baltimore County,005,100%,Donald Trump,GOP,131009,39.1%


In [16]:
# Export the de-duplicated frame (previously the pre-deduplication frame was
# written, so the drop_duplicates() step had no effect on the file).
county_level_final.to_csv('2016_US_County_Level_Presidential_Results.csv',sep=',')