In [1]:
import urllib
from contextlib import closing

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from IPython.display import display, display_pretty, Javascript, HTML

# Flow

- [Download and preprocess county-level results](#Townhall-data)
- [Download and preprocess county-level metadata](#Census-data)
- [Combine datasets](#Combine-data)
- [Export county-level results](#Export-data)
- [Visualize](#Visualize)

## Townhall data

In [2]:
# each page also carries a summary table that rolls results up to the state level;
# this predicate is what filters it out
def cond(x):
    """Decide whether a ``class`` attribute value marks a county results table.

    x -- the class value handed over by the BeautifulSoup attribute filter;
         may be None or empty.

    Returns True only when the value starts with "table ec-table" and does not
    contain the summary-table class string; a missing or empty value gives False.
    """
    if not x:
        return False
    is_results_table = x.startswith("table ec-table")
    is_summary_table = "table ec-table ec-table-summary" in x
    return is_results_table and not is_summary_table

In [3]:
# two-letter abbreviations (the states plus DC), in the order their pages are scraped
states = (
    "AL AK AZ AR CA CO CT DC DE FL GA HI ID IL IN IA KS "
    "KY LA ME MD MA MI MN MS MO MT NE NV NH NJ NM NY NC "
    "ND OH OK OR PA RI SC SD TN TX UT VT VA WA WV WI WY"
).split()

# scraped rows are appended to this list; it starts with the header row for the csv export
data = [['state_abbr', 'county_name', 'party', 'votes_total']]

In [4]:
# loop through each state's web page http://townhall.com/election/2016/president/%s/county, where %s is the state abbr
# and append one [state, county, party, votes] row to `data` per candidate row found
for state in states:
    url = 'http://townhall.com/election/2016/president/' + state + '/county'
    # `urlopen` is bound in the imports cell so it resolves on Python 2 and 3;
    # closing() releases the connection even if the read raises
    with closing(urlopen(url)) as response:
        r = response.read()
    soup = BeautifulSoup(r, "html.parser")

    # loop through each <table> tag with .ec-table class (state summary table excluded by cond)
    tables = soup.find_all('table', attrs={'class': cond})

    for table in tables:
        # only top-level tables; tables nested inside another table are skipped
        if table.find_parent("table") is None:
            table_body = table.find('tbody')

            rows = table_body.find_all('tr')
            for row in rows:
                cols = row.find_all('td')
                # first tbody tr has four td
                if len(cols) == 4:
                    # strip text from each td
                    divs = cols[0].find_all('div')
                    county = divs[0].text.strip()
                    party = cols[1]['class'][0]
                    # drop thousands separators; a '-' placeholder is read as 0
                    total_votes = int(cols[2].text.strip().replace(',','').replace('-','0'))
                # all other tbody tr have three td; `county` intentionally carries
                # over from the most recent four-td row
                else:
                    party = cols[1]['class'][0]
                    total_votes = int(cols[1].text.strip().replace(',','').replace('-','0'))

                #combine each row's results
                rowData = [state,county,party,total_votes]
                data.append(rowData)

In [5]:
# `data` holds the header as its first row followed by the scraped rows
townhall = pd.DataFrame(data) # throw results in dataframe
new_header = townhall.iloc[0] #grab the first row for the header
# take the data less the header row; copy so the column assignment below writes
# to an independent frame instead of a slice (avoids SettingWithCopyWarning)
townhall = townhall.iloc[1:].copy()
townhall.columns = new_header #set the header row as the df header
townhall['votes_total'] = townhall['votes_total'].astype('float64')
print(townhall.shape[0])
townhall[townhall['state_abbr'] == 'DC']

14188


Unnamed: 0,state_abbr,county_name,party,votes_total
1415,DC,Washington,DEM,260223.0
1416,DC,Washington,GOP,11553.0
1417,DC,Washington,LIB,4501.0
1418,DC,Washington,DCG,3995.0


In [6]:
# townhall names D.C.'s county differently from the census data; overwrite it so the two match
is_dc = townhall['state_abbr'] == 'DC'
townhall.loc[is_dc, 'county_name'] = 'District of Columbia'

In [7]:
# unique (state abbreviation, county name) pairs present in the townhall results
# reset_index(drop=True) renumbers the rows without creating an 'index' column
# (the positional axis in .drop('index', 1) is rejected by pandas >= 2.0)
townhall_counties = townhall[['state_abbr','county_name']].drop_duplicates().reset_index(drop=True)
print('Townhall data has ' + str(townhall_counties.shape[0]) + ' counties')
townhall_counties[townhall_counties['state_abbr'] == 'DC']

Townhall data has 3113 counties


Unnamed: 0,state_abbr,county_name
288,DC,District of Columbia


In [8]:
# join key: state abbreviation followed by the county name with 'County' and
# spaces removed, lower-cased
county_keys = townhall['county_name'].apply(lambda name: name.replace('County','').replace(' ','').lower())
townhall['combined'] = townhall['state_abbr'] + county_keys

townhall[townhall['state_abbr'] == 'DC']

Unnamed: 0,state_abbr,county_name,party,votes_total,combined
1415,DC,District of Columbia,DEM,260223.0,DCdistrictofcolumbia
1416,DC,District of Columbia,GOP,11553.0,DCdistrictofcolumbia
1417,DC,District of Columbia,LIB,4501.0,DCdistrictofcolumbia
1418,DC,District of Columbia,DCG,3995.0,DCdistrictofcolumbia


In [9]:
# give every row the vote total of its county (sum over all rows sharing the same key)
townhall['total_votes'] = townhall.groupby('combined')['votes_total'].transform('sum')
# same rows without the per-party vote count
townhall_counties = townhall.drop('votes_total', axis=1)

# view dataset by selected state
townhall_counties[townhall_counties['state_abbr'] == 'DC']

Unnamed: 0,state_abbr,county_name,party,combined,total_votes
1415,DC,District of Columbia,DEM,DCdistrictofcolumbia,280272.0
1416,DC,District of Columbia,GOP,DCdistrictofcolumbia,280272.0
1417,DC,District of Columbia,LIB,DCdistrictofcolumbia,280272.0
1418,DC,District of Columbia,DCG,DCdistrictofcolumbia,280272.0


## Census data

In [10]:
# county FIPS reference file, described at https://www.census.gov/geo/reference/codes/cou.html
census_url = 'http://www2.census.gov/geo/docs/reference/codes/files/national_county.txt'
census_columns = ['state_abbr', 'state_fips', 'county_fips', 'county_name', 'fips_class_code']

# the file is read without a header row and every field is kept as a string
census = pd.read_csv(census_url, sep=',', header=None, dtype=str)
census.columns = census_columns
print(census.shape)
census.head()

(3235, 5)


Unnamed: 0,state_abbr,state_fips,county_fips,county_name,fips_class_code
0,AL,1,1,Autauga County,H1
1,AL,1,3,Baldwin County,H1
2,AL,1,5,Barbour County,H1
3,AL,1,7,Bibb County,H1
4,AL,1,9,Blount County,H1


In [11]:
# inspect the census row(s) for D.C.
census.loc[census['state_abbr'] == 'DC']

Unnamed: 0,state_abbr,state_fips,county_fips,county_name,fips_class_code
319,DC,11,1,District of Columbia,H6


In [12]:
# keep the state and county identifiers; the fifth census column is dropped
fifth_column = census.columns[4]
fips_codes_census = census.drop(fifth_column, axis=1)
county_fips_count = fips_codes_census['county_fips'].count()
print(county_fips_count)
fips_codes_census.head()

3235


Unnamed: 0,state_abbr,state_fips,county_fips,county_name
0,AL,1,1,Autauga County
1,AL,1,3,Baldwin County
2,AL,1,5,Barbour County
3,AL,1,7,Bibb County
4,AL,1,9,Blount County


In [13]:
# unique (state abbreviation, state FIPS) pairs from census
# reset_index(drop=True) renumbers the rows without creating an 'index' column
# (the positional axis in .drop('index', 1) is rejected by pandas >= 2.0)
census_states = census[['state_abbr','state_fips']].drop_duplicates().reset_index(drop=True)
# drop US territories
territories = ['AS', 'GU', 'MP', 'PR', 'UM', 'VI']
census_states = census_states[~census_states['state_abbr'].isin(territories)]
print(str(census_states.shape[0]) + ' states')

51 states


In [14]:
# unique state/county identifier rows from census
# reset_index(drop=True) renumbers the rows without creating an 'index' column
# (the positional axis in .drop('index', 1) is rejected by pandas >= 2.0)
census_counties = census[['state_abbr','state_fips','county_name','county_fips']].drop_duplicates().reset_index(drop=True)
# drop US territories
territories = ['AS', 'GU', 'MP', 'PR', 'UM', 'VI']
census_counties = census_counties[~census_counties['state_abbr'].isin(territories)]
print('Census data has ' + str(census_counties.shape[0]) + ' counties')
census_counties[census_counties['state_abbr'] == 'DC']

Census data has 3143 counties


Unnamed: 0,state_abbr,state_fips,county_name,county_fips
319,DC,11,District of Columbia,1


In [15]:
# join key for census rows: state abbreviation followed by the county name with
# 'County' and spaces removed, lower-cased
census_county_keys = census['county_name'].apply(lambda name: name.replace('County','').replace(' ','').lower())
census['combined'] = census['state_abbr'] + census_county_keys
census[census['state_abbr'] == 'DC']

Unnamed: 0,state_abbr,state_fips,county_fips,county_name,fips_class_code,combined
319,DC,11,1,District of Columbia,H6,DCdistrictofcolumbia


## Combine data

In [16]:
# index both frames by the shared 'combined' key
left = census.set_index('combined')
right = townhall.set_index('combined')

# census rows on the left, townhall rows on the right; column names present in
# both frames get an '_r' suffix on the townhall side
combined = left.join(right, lsuffix='', rsuffix='_r').reset_index()
item_count = combined.shape[0]
print('Joined dataset has ' + str(item_count) + ' items')

# view data by selected state
combined[combined['state_abbr'] == 'DC']

Joined dataset has 13819 items


Unnamed: 0,combined,state_abbr,state_fips,county_fips,county_name,fips_class_code,state_abbr_r,county_name_r,party,votes_total,total_votes
1443,DCdistrictofcolumbia,DC,11,1,District of Columbia,H6,DC,District of Columbia,DEM,260223.0,280272.0
1444,DCdistrictofcolumbia,DC,11,1,District of Columbia,H6,DC,District of Columbia,GOP,11553.0,280272.0
1445,DCdistrictofcolumbia,DC,11,1,District of Columbia,H6,DC,District of Columbia,LIB,4501.0,280272.0
1446,DCdistrictofcolumbia,DC,11,1,District of Columbia,H6,DC,District of Columbia,DCG,3995.0,280272.0


In [17]:
# return unique dataset
county_level_combined = combined.drop_duplicates()
print('Combined dataset has ' + str(county_level_combined.shape[0]) + ' total items')

# return only D and R results; copy so the column assignments below write to an
# independent frame instead of a filtered slice (avoids SettingWithCopyWarning)
county_level_combined = county_level_combined[county_level_combined['party'].isin(['GOP', 'DEM'])].copy()
print('Filtered dataset has ' + str(county_level_combined.shape[0]) + ' D and R items')

# flatten dataset by adding votes by R and D columns:
# each row keeps its vote count in its own party's column and 0 in the other
county_level_combined['votes_dem'] = county_level_combined['votes_total'].where(county_level_combined['party'] == 'DEM',0).astype('float32')
county_level_combined['votes_gop'] = county_level_combined['votes_total'].where(county_level_combined['party'] == 'GOP',0).astype('float32')

# drop party and party-level totals and other columns
county_level_combined = county_level_combined.drop(['party','votes_total','fips_class_code','state_abbr_r','county_name_r'], axis=1)
county_level_combined[county_level_combined['state_abbr'] == 'DC']

Combined dataset has 13819 total items
Filtered dataset has 6012 D and R items


Unnamed: 0,combined,state_abbr,state_fips,county_fips,county_name,total_votes,votes_dem,votes_gop
1443,DCdistrictofcolumbia,DC,11,1,District of Columbia,280272.0,260223.0,0.0
1444,DCdistrictofcolumbia,DC,11,1,District of Columbia,280272.0,0.0,11553.0


In [18]:
# pivot data to consolidate into one row per county key:
# party votes are summed, the (repeated) county total is averaged.
# Aggregations are named by string; pandas 2.1+ warns when handed np.sum / np.mean.
party_pivot = pd.pivot_table(county_level_combined,index=["combined"],values=["votes_dem","votes_gop"],aggfunc='sum')
total_pivot = pd.pivot_table(county_level_combined,index=["combined"],values=["total_votes"],aggfunc='mean')

# join party and total pivots
combined_pivot = party_pivot.join(total_pivot, lsuffix='', rsuffix='_r')
print('Joined dataset has ' + str(combined_pivot.shape[0]) + ' items')

# calculate each major party's share of the county's total votes
combined_pivot['per_dem'] = combined_pivot['votes_dem'] / combined_pivot['total_votes']
combined_pivot['per_gop'] = combined_pivot['votes_gop'] / combined_pivot['total_votes']

# join pivotted data back to the census identifiers on the county key
right = census.set_index('combined')

county_level_final = combined_pivot.join(right, lsuffix='', rsuffix='_r')
county_level_final = county_level_final.reset_index()
print('Joined dataset has ' + str(county_level_final.shape[0]) + ' items')

# create FIPS column for visualizations: state code with leading zeros removed,
# followed by the county code
county_level_final['combined_fips'] = county_level_final['state_fips'].str.lstrip('0') + county_level_final['county_fips']

# drop irrelevant columns
county_level_final = county_level_final.drop(['combined','county_fips','state_fips','fips_class_code'], axis=1)
print('Final dataset has ' + str(county_level_final.shape[0]) + ' items')

# view data by selected state
county_level_final[county_level_final['state_abbr'] == 'DC']

Joined dataset has 3006 items
Joined dataset has 3006 items
Final dataset has 3006 items


Unnamed: 0,votes_dem,votes_gop,total_votes,per_dem,per_gop,state_abbr,county_name,combined_fips
287,260223.0,11553.0,280272.0,0.928466,0.041221,DC,District of Columbia,11001


## Export data

In [19]:
# write the county-level results to a comma-separated file in the working directory
output_csv = '2016_US_County_Level_Presidential_Results.csv'
county_level_final.to_csv(output_csv, sep=',')

## Visualize

In [20]:
%%javascript
// Register the CDN locations of the visualization libraries with RequireJS.
// Background: https://github.com/d3/d3/issues/1693#issuecomment-35556356
require.config({
    paths: {
        // https URLs: plain-http scripts are blocked as mixed content when the
        // notebook page itself is served over https
        "d3": "https://d3js.org/d3.v4",
        "d3scalechoromatic": "https://d3js.org/d3-scale-chromatic.v1.min",
        "topojson": "https://d3js.org/topojson.v1.min",
        "d3legend":"https://cdnjs.cloudflare.com/ajax/libs/d3-legend/2.11.0/d3-legend.min"
  },
    shim: {
        // these modules are declared to depend on "d3.global" (defined below)
        "d3scalechoromatic": ["d3.global"],
        "d3-interpolate":["d3.global"]
  }
});

// Helper module: once "d3" is loaded, store it as `d3` on `this`.
// NOTE(review): presumably `this` is the global object here, which would make
// d3 visible to the shimmed scripts above -- confirm.
define("d3.global", ["d3"], function(_) {
  this.d3 = _;
});

<IPython.core.display.Javascript object>

In [21]:
# Static markup for the map: the stylesheet, a heading, the empty <svg> element,
# and the author byline.
map_markup = """
<style>
  .tract {
    stroke: #777;
    stroke-width: 0.05px;
    pointer-events: all;
  }
  .tract:hover {
    stroke: orange;
    pointer-events: all;

  }
  .tract-border {
    fill: none;
    /*stroke: #777;*/
    stroke-width: 0.05px;
    pointer-events: none;
  }
  @media (max-width: 767px) {
    .label {
      transform: rotate(-45deg) translate(-10px, 25px);
    }
  }
  .tract-border-state {
    fill: none;
    stroke: #333;
    stroke-width: 0.5px;
    pointer-events: none;
  }
  .legend {
    font-family: sans-serif;
    font-size: 10pt;

  }
  .legendTitle {
    font-weight: bold;
    font-size:11pt;
  }
  .background {
    fill:#f5f5f5;
  }
  body {
    background: #f5f5f5;
    /*font-family: sans-serif;*/
  }
  .bar {
    font-family: sans-serif;
    font-size: 8pt;
  }
  #cityLegend {
    font-family: sans-serif;
    text-anchor: middle;
  }
  #details .background {
    fill: #f5f5f5;
    fill-opacity: 0.7;
  }
  svg {
    width:100%;
  }
</style>
<div class="row">
  <div class="col-md-1 col-xs-1"></div>
  <div class="col-md-10 col-xs-10">
    <h2>2016 Presidential General Election Results by County</h2>
    <h4>Click on the map to explore</h4>
  </div>
</div>
<div class="row">
  <div class="col-md-12 col-xs-12">
    <svg ></svg>
  </div>
</div>
<div class="row">
  <div class="col-md-1 col-xs-1"></div>
  <div class="col-md-10 col-xs-10">
    <p>
      By <a href="http://johnguerra.co">John Alexis Guerra Gómez</a>
    </p>

  </div> <!-- col-md-10 -->
  <div class="col-md-1">
  </div>

</div> <!-- .row -->
"""
display(HTML(map_markup))

In [22]:
%%javascript

// Draw the county-level choropleth into the page's <svg> element once d3 and
// topojson are available.
// NOTE(review): d3.interpolateRdBu and d3.legendColor are used below, but only
// "d3" and "topojson" are requested here -- confirm that the scale-chromatic
// and legend add-ons are attached to this d3 instance.
require(["d3", "topojson"], function(d3, topojson) {
    var svg = d3.select("svg"),
    width = $(document).width() * 10 / 12,
    height = $(document).height() - 200,
    margin = {
        top: 20,
        bottom: width > 767 ? 20 : 100,
        right: 20,
        left: 0
    },
    centered,                       // feature currently zoomed to, if any
    fmt = d3.format(" >5.2%"),      // percentage label format
    errorCount = 0;                 // map features with no matching csv row

svg.attr("width", width)
    .attr("height", height);

// Callback for d3.queue: `us` is the topojson document, `data` the csv rows.
function ready(error, us, data) {
    if (error) throw error;

    // Parse a vote share from the csv. Accepts a comma as decimal separator and
    // strips a trailing "%" only when one is present, so plain decimals such as
    // "0.928466" keep their last digit.
    function parseShare(value) {
        var text = String(value).trim().replace(",", ".");
        if (text.charAt(text.length - 1) === "%") {
            text = text.slice(0, -1);
        }
        return +text;
    }

    // csv rows keyed by numeric FIPS code, which is what the map features use as id
    var dictCities = {};
    data.forEach(function(d) {
        //Parse the percentages
        d["per_gop"] = parseShare(d["per_gop"]);
        d["per_dem"] = parseShare(d["per_dem"]);
        // positive when the Dem share is larger, negative when the GOP share is larger
        d.result = d["per_dem"] - d["per_gop"];
        d.combined_fips = +d.combined_fips;
        dictCities[d.combined_fips] = d;
    });

    var color = d3.scaleSequential(d3.interpolateRdBu)
        .domain([-1, 1]);

    // Add background
    svg.append("rect")
        .attr("class", "background")
        .attr("width", width)
        .attr("height", height);

    var zoom = d3.zoom()
        .scaleExtent([1, 15])
        .on("zoom", zoomed);

    svg.style("pointer-events", "all")
        .call(zoom);
    var g = svg.append("g");

    // Apply the current zoom transform to the map layer
    function zoomed() {
        g.attr("transform", d3.event.transform);
    }

    var projection = d3.geoAlbersUsa()
        .scale(1280)
        .translate([width / 2, height / 2]);

    var path = d3.geoPath()
        .projection(projection);

    // One path per county, filled by the Dem-minus-GOP difference
    g.selectAll("path")
        .data(topojson.feature(us, us.objects.counties).features)
        .enter().append("path")
        .attr("class", "tract")
        .on("click", clicked)
        .on("mouseover", updateDetails)
        .style("fill", function(d) {
            var city = dictCities[d.id];
            if (city)
                return color(city.result);
            else {
                // no csv row for this feature: log it and use the neutral color
                errorCount++;
                console.log(d.id + " Not found" + " errors = " + errorCount);
                return color(0);
            }
        })
        .attr("d", path)
        .append("title")
        .text(function(d) {
            var city = dictCities[d.id],
                county,
                state,
                msg;    // stays undefined when the county has no csv row

            if (city) {
                county = city.county_name;
                state = city.state_abbr;
                msg = county + ', ' + state + " Difference: " + fmt(city.result);
            }
            return msg;
        });

    // State borders: only arcs shared by two different states
    g.append("path")
        .datum(topojson.mesh(us, us.objects.states, function(a, b) {
            return a !== b;
        }))
        .attr("class", "tract-border-state")
        .attr("d", path);

    // The details
    // maps a share in [-1, 1] to a bar length in pixels
    var wScale = d3.scaleLinear()
        .domain([-1, 1])
        .range([-width / 3, width / 3]);
    var details_layer = svg.append("g")
        .attr("id", "details")
        .attr("transform", "translate(" + (width / 2 - 100) + ", 30)");
    details_layer.append("rect")
        .attr("class", "background")
        .attr("transform", "translate(" + (-wScale.range()[1] + 100) + ", -20)")
        .attr("width", wScale.range()[1] * 2 + 70)
        .attr("rx", 5)
        .attr("ry", 5)
        .attr("height", 60);
    details_layer.append("text")
        .attr("id", "cityLegend")
        .text("Difference")
        .attr("transform", "translate(100, 0)");

    // Two bars (positive = Dem, negative = Rep); these initial values are the
    // same defaults updateDetails falls back to when no county is selected
    var detailsBars = details_layer.selectAll(".bar")
        .data([0.4978, -0.5021])
        .enter()
        .append("g")
        .attr("class", "bar");
    detailsBars
        .append("rect")
        .attr("width", 0)
        .attr("height", width > 767 ? 20 : 10)
        .attr("x", 100)
        .attr("y", 10)
        .style("fill", color)
        .transition()
        .duration(500)
        .attr("x", function(d) {
            return d > 0 ? 100 : 100 - wScale(-d);
        })
        .attr("width", function(d) {
            return d > 0 ? wScale(d) : wScale(-d);
        });
    detailsBars.append("text")
        .text(function(d) {
            return (d > 0 ? "" : "Rep ") +
                fmt(d > 0 ? d : -d) +
                (d > 0 ? " Dem" : "");
        })
        .attr("dx", function(d) {
            return d > 0 ? 5 : -5;
        })
        .attr("dy", 24)
        .attr("x", 100)
        .style("text-anchor", function(d) {
            return d > 0 ? "start" : "end";
        })
        .transition()
        .duration(500)
        .attr("x", function(d) {
            return d > 0 ? 100 + wScale(d) : 100 - wScale(-d);
        });

    // The legend
    svg.append("g")
        .attr("class", "legend")
        .attr("transform",
            width > 767 ?
            "translate(" + (width - margin.right - 150) + ",100)" :
            "translate(" + (width / 2 - 100) + "," + (height - 120) + ")"
        );

    var legendLinear = d3.legendColor()
        .cells(7)
        .orient(width > 767 ? "vertical" : "horizontal")
        // title in English, consistent with the "Difference" labels used elsewhere
        .title("Difference")
        .labels([
            " 100.00% Dem",
            "  66.67%",
            "  33.33%",
            "   0.00%",
            "  33.33%",
            "  66.67%",
            " 100.00% Rep",
        ].reverse())
        .labelFormat(fmt)
        .ascending(true)
        .labelAlign("end")
        .scale(color);

    svg.select(".legend")
        .call(legendLinear);

    // When clicked, zoom in on the county; clicking the same county again zooms back out
    function clicked(d) {
        updateDetails(d);
        var x, y, k;

        // Compute centroid of the selected path
        if (d && centered !== d) {
            var centroid = path.centroid(d);
            x = centroid[0];
            y = centroid[1];
            k = 10;
            centered = d;
        }
        else {
            x = width / 2;
            y = height / 2;
            k = 1;
            centered = null;
        }

        // Manually Zoom
        svg.transition()
            .duration(750)
            .call(zoom.transform, d3.zoomIdentity
                .translate(width / 2, height / 2)
                .scale(k)
                .translate(-x, -y));
    }

    // Refresh the details bars and caption for map feature `d`; falls back to
    // the default values when `d` is missing or has no csv row
    function updateDetails(d) {

        var data = [0.4978, -0.5021],
            name = "Difference " + fmt(data[0] + data[1]),
            state,
            county,
            city;

        if (d) {
            city = dictCities[d.id];
            if (city) {
                county = city['county_name'];
                state = city['state_abbr'];
                data = [city["per_dem"], -city["per_gop"]];
                name = county + ', ' + state + " Difference: " + fmt(data[0] + data[1]);
            }
        }

        var detailsBars = details_layer
            .selectAll(".bar")
            .data(data);

        detailsBars.select("rect")
            .transition()
            .duration(500)
            .attr("x", function(d) {
                return d > 0 ? 100 : 100 - wScale(-d);
            })
            .attr("width", function(d) {
                return d > 0 ? wScale(d) : wScale(-d);
            })
            .style("fill", color);

        detailsBars.select("text")
            .text(function(d) {
                return (d > 0 ? "" : "Rep ") +
                    fmt(d > 0 ? d : -d) +
                    (d > 0 ? " Dem" : "");
            })
            .transition()
            .duration(500)
            .attr("x", function(d) {
                return d > 0 ? 100 + wScale(d) : 100 - wScale(-d);
            });

        details_layer.select("#cityLegend").text(name);

    }
}

// Fetch the map geometry and the county results, then draw
d3.queue()
    .defer(d3.json, "https://raw.githubusercontent.com/john-guerra/US_Elections_Results/master/us.json")
    .defer(d3.csv, "https://raw.githubusercontent.com/tonmcg/US_Elections_Results/master/2016_US_County_Level_Presidential_Results.csv")
    .await(ready);

});

<IPython.core.display.Javascript object>