# 2020 Election Data - Virginia

## Purpose:
* Scrape Virginia Department of Election for 2020 voter preferences by locality

## Dependencies

In [1]:
# Dependencies
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import numpy as np

## For Multiple Outputs 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
## Assign website url
vde_url = "https://results.elections.virginia.gov/vaelections/2020%20November%20General/Site/Locality/Index.html"

In [3]:
## Connect to url
vde_page = requests.get(vde_url)

## Confirm connection
vde_page.status_code

200

In [4]:
## Parse website
vde_page_parsed = BeautifulSoup(vde_page.content, "html.parser")

## Check parsed website
vde_page_parsed

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">

<html lang="en-US">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, maximum-scale=1, minimum-scale=1" name="viewport"/>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="en-us" http-equiv="Content-Language"/>
<meta content="en-US" name="language"/>
<meta content="max-age=0" http-equiv="cache-control">
<meta content="no-cache" http-equiv="cache-control">
<meta content="0" http-equiv="expires">
<meta content="Tue, 01 Jan 1980 1:00:00 GMT" http-equiv="expires"/>
<meta content="no-cache" http-equiv="pragma"/>
<!--[if IE 7]>
<html class="ie ie7 no-js" lang="en-US">
<![endif]-->
<!--[if IE 8]>
<html class="ie ie8 no-js" lang="en-US">
<style type="text/css">

   .header_container { 
       background-color:#FFFFFF;       
    } 
  .stickymenu {
 background-color:#FFFFFF; 
border-bottom:2px solid #CCCCCC;
}

    </style>
<![endif]-->
<!--[if !(I

In [5]:
## Check some "a" tags
vde_tag = vde_page_parsed.find_all("a")

vde_tag


[<a accesskey="c" href="#skipPoint" tabindex="1" title="Skip to Content">Skip to Content</a>,
 <a class="virginiaLogo" href="https://www.virginia.gov/" tabindex="2" title="Virginia.gov portal"><span class="hidden-alt-text">Virginia.gov portal</span></a>,
 <a href="https://www.virginia.gov/government/state-employees/agency-directory" tabindex="3" title="Virginia agency websites">Agencies</a>,
 <a href="https://www.governor.virginia.gov" tabindex="4" title="Virginia Governor">Governor</a>,
 <a accesskey="s" href="https://www.virginia.gov/search" tabindex="5" title="Search Virginia.Gov">Search Virginia.Gov</a>,
 <a href="https://www.elections.virginia.gov/">Home</a>,
 <a href="https://www.elections.virginia.gov/registration/">Registration</a>,
 <a href="https://www.elections.virginia.gov/registration/how-to-register/">How to Register</a>,
 <a href="https://www.elections.virginia.gov/registration/view-your-info/">View Your Info</a>,
 <a href="https://www.elections.virginia.gov/registration

In [6]:
## Extract relevant links
links = set()
for vde_tag in vde_page_parsed.find_all("a"):
    if vde_tag.attrs.get("href") is not None:
        href = vde_tag.attrs.get("href")
        ## Limit hyper links to 'COUNTY' or 'CITY'
        if ("COUNTY" in href or "CITY" in href) and "https:" not in href:
            links.update(["https://results.elections.virginia.gov/vaelections/2020%20November%20General/Site/Locality/" + href[2:]])

## Check links
links

{'https://results.elections.virginia.gov/vaelections/2020%20November%20General/Site/Locality/ACCOMACK_COUNTY/Index.html',
 'https://results.elections.virginia.gov/vaelections/2020%20November%20General/Site/Locality/ALBEMARLE_COUNTY/Index.html',
 'https://results.elections.virginia.gov/vaelections/2020%20November%20General/Site/Locality/ALEXANDRIA_CITY/Index.html',
 'https://results.elections.virginia.gov/vaelections/2020%20November%20General/Site/Locality/ALLEGHANY_COUNTY/Index.html',
 'https://results.elections.virginia.gov/vaelections/2020%20November%20General/Site/Locality/AMELIA_COUNTY/Index.html',
 'https://results.elections.virginia.gov/vaelections/2020%20November%20General/Site/Locality/AMHERST_COUNTY/Index.html',
 'https://results.elections.virginia.gov/vaelections/2020%20November%20General/Site/Locality/APPOMATTOX_COUNTY/Index.html',
 'https://results.elections.virginia.gov/vaelections/2020%20November%20General/Site/Locality/ARLINGTON_COUNTY/Index.html',
 'https://results.elec

In [7]:
test_url = list(links)[0]

In [8]:
## Connect to url
page = requests.get(test_url)

## Parse website
soup = BeautifulSoup(page.content, 'html.parser')

soup

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">

<html lang="en-US">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, maximum-scale=1, minimum-scale=1" name="viewport"/>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="en-us" http-equiv="Content-Language"/>
<meta content="en-US" name="language"/>
<meta content="max-age=0" http-equiv="cache-control">
<meta content="no-cache" http-equiv="cache-control">
<meta content="0" http-equiv="expires">
<meta content="Tue, 01 Jan 1980 1:00:00 GMT" http-equiv="expires"/>
<meta content="no-cache" http-equiv="pragma"/>
<!--[if IE 7]>
<html class="ie ie7 no-js" lang="en-US">
<![endif]-->
<!--[if IE 8]>
<html class="ie ie8 no-js" lang="en-US">
<style type="text/css">

   .header_container { 
       background-color:#FFFFFF;       
    } 
  .stickymenu {
 background-color:#FFFFFF; 
border-bottom:2px solid #CCCCCC;
}

    </style>
<![endif]-->
<!--[if !(I

In [9]:
## Get table information
content = [tr.get_text().split('\n') for tr in soup.find_all("tr")]

## Check content
content[2][1]    
## Check content
content[2][6]

'Donald J. Trump\r'

'80.45%'

In [10]:
def vde_scraper(url):
    '''
    Goal: Obtain all voter percentage of a candidate per locality

    Arguments: Website url (string)

    Returns: Election information per locality from websites (list of lists)
    '''
    ## Connect to url
    page = requests.get(url)

    ## Parse website
    soup = BeautifulSoup(page.content, 'html.parser')

    ## Get country name
    locality = url.replace("https://results.elections.virginia.gov/vaelections/2020%20November%20General/Site/Locality/","")
    locality = locality.replace("/Index.html","")
    locality = locality.replace("_"," ")
    
    ## Get table information
    content = [tr.get_text().split('\n') for tr in soup.find_all("tr")]

    ## Check if there is no "tr" elements first
    if content is not None:
        ## Create empty list to populate
        vde_content = []
        ## Loop through each object of content
        for i in range(len(content)):
            ## Remove table headers
            if content[i][1] != "Candidate" and content[i][1] != "Response":
                ## Append locality name to table
                content[i].append(locality)
                ## Append table information to empty list
                vde_content.append(content[i])

        ## Return list of list
        return vde_content

In [11]:
vde_scraper(test_url)

[['',
  'Joseph R. Biden\r',
  '                                                \t\t\t\t\t\t',
  'Democratic',
  '',
  '3,110',
  '18.72%',
  '',
  'WISE COUNTY'],
 ['',
  'Donald J. Trump\r',
  '                                                \t\t\t\t\t\t',
  'Republican',
  '',
  '13,366',
  '80.45%',
  '',
  'WISE COUNTY'],
 ['',
  'Jo Jorgensen\r',
  '                                                \t\t\t\t\t\t',
  'Libertarian',
  '',
  '108',
  '0.65%',
  '',
  'WISE COUNTY'],
 ['',
  'Write In\r',
  '                                                \t\t\t\t\t\t',
  'Write-In',
  '',
  '31',
  '0.19%',
  '',
  'WISE COUNTY'],
 ['',
  'Mark R. Warner\r',
  '                                                \t\t\t\t\t\t',
  'Democratic',
  '',
  '4,128',
  '25.41%',
  '',
  'WISE COUNTY'],
 ['',
  'Daniel M. Gade\r',
  '                                                \t\t\t\t\t\t',
  'Republican',
  '',
  '12,096',
  '74.46%',
  '',
  'WISE COUNTY'],
 ['',
  'Write In\r',
  '         

In [12]:
def link_scrape(urls, sleep=3):
    '''
    Goal: Apply vde_scraper function to all urls

    Arguments: Set or List containing website urls

    Returns: Data frame containing each locality page's voter preferences
    '''
    ## Create empty list
    scraped_data = []

    ## Loop through each url
    for url in urls:
        ## Apply function...
        scrape = vde_scraper(url)
        ## If function does not return None...
        if scrape is not None:
            ## Add to empty list
            scraped_data.extend(vde_scraper(url))
        ## Sleep
        time.sleep(random.uniform(0,sleep))
    ## Convert list of lists to data frame
    df = pd.DataFrame(scraped_data)

    return df

## Store data scraper
vde0 = link_scrape(links)

In [13]:
## Check returned data
vde0

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,,Joseph R. Biden\r,...,Democratic,,3110,18.72%,,WISE COUNTY
1,,Donald J. Trump\r,...,Republican,,13366,80.45%,,WISE COUNTY
2,,Jo Jorgensen\r,...,Libertarian,,108,0.65%,,WISE COUNTY
3,,Write In\r,...,Write-In,,31,0.19%,,WISE COUNTY
4,,Mark R. Warner\r,...,Democratic,,4128,25.41%,,WISE COUNTY
...,...,...,...,...,...,...,...,...,...
2860,,No\r,...,,,1983,34.92%,,LUNENBURG COUNTY
2861,,Yes\r,...,,,5099,87.40%,,LUNENBURG COUNTY
2862,,No\r,...,,,735,12.60%,,LUNENBURG COUNTY
2863,,Yes\r,...,,,4172,71.13%,,LUNENBURG COUNTY


In [14]:
## Rename variables
vde1 = vde0.rename(columns={0:"empty1",
                1:"Candidate",
                2:"empty2",
                3:"Affiliation",
                4:"empty3",
                5:"Count",
                6:"Percentage",
                7:"empty4",
                8:"Locality"})

## Confirm rename
vde1

Unnamed: 0,empty1,Candidate,empty2,Affiliation,empty3,Count,Percentage,empty4,Locality
0,,Joseph R. Biden\r,...,Democratic,,3110,18.72%,,WISE COUNTY
1,,Donald J. Trump\r,...,Republican,,13366,80.45%,,WISE COUNTY
2,,Jo Jorgensen\r,...,Libertarian,,108,0.65%,,WISE COUNTY
3,,Write In\r,...,Write-In,,31,0.19%,,WISE COUNTY
4,,Mark R. Warner\r,...,Democratic,,4128,25.41%,,WISE COUNTY
...,...,...,...,...,...,...,...,...,...
2860,,No\r,...,,,1983,34.92%,,LUNENBURG COUNTY
2861,,Yes\r,...,,,5099,87.40%,,LUNENBURG COUNTY
2862,,No\r,...,,,735,12.60%,,LUNENBURG COUNTY
2863,,Yes\r,...,,,4172,71.13%,,LUNENBURG COUNTY


In [15]:
## Check values insides empty
vde1['empty1'].unique()
vde1['empty2'].unique()
vde1['empty3'].unique()
vde1['empty4'].unique()

array([''], dtype=object)

array(['                                                \t\t\t\t\t\t'],
      dtype=object)

array([''], dtype=object)

array([''], dtype=object)

In [16]:
## Remove empty columns
vde2 = vde1.drop(columns= ['empty1','empty2','empty3','empty4'])

vde2

Unnamed: 0,Candidate,Affiliation,Count,Percentage,Locality
0,Joseph R. Biden\r,Democratic,3110,18.72%,WISE COUNTY
1,Donald J. Trump\r,Republican,13366,80.45%,WISE COUNTY
2,Jo Jorgensen\r,Libertarian,108,0.65%,WISE COUNTY
3,Write In\r,Write-In,31,0.19%,WISE COUNTY
4,Mark R. Warner\r,Democratic,4128,25.41%,WISE COUNTY
...,...,...,...,...,...
2860,No\r,,1983,34.92%,LUNENBURG COUNTY
2861,Yes\r,,5099,87.40%,LUNENBURG COUNTY
2862,No\r,,735,12.60%,LUNENBURG COUNTY
2863,Yes\r,,4172,71.13%,LUNENBURG COUNTY


In [17]:
## Remove extra characters in candidate value
vde2['Candidate'] = vde2['Candidate'].str.replace("\r","")

## Check value
vde2['Candidate'].unique()



array(['Joseph R. Biden', 'Donald J. Trump', 'Jo Jorgensen', 'Write In',
       'Mark R. Warner', 'Daniel M. Gade', 'H. Morgan Griffith', 'Yes',
       'No', 'Elaine G. Luria', 'Scott W. Taylor', 'David Bruce Foster',
       'Jacqueline V. Davis', 'Warren W. "Ski" Wisneski',
       'James C. Sturgis', 'Patricia S. Stith', 'Scott E. Berger',
       'Matthew Yancy III', 'Larry E. LeMond',
       'B.B. "Barry" Downing, Jr.', 'Gregory S. Hardesty',
       'Norma P. Spencer', 'Robert K. "Bo" Lewis, Sr.',
       'Eleanor C. Gordon', 'Mary Beth Briggs', 'John D. Crockett',
       'Barbara M. Thomas', 'Douglas J. Coburn Jr.',
       'David A. Philpot III', 'Glenn L. Purvis', 'John W. Hallett, Sr.',
       'Claude F. Jones', 'Elizabeth P. "Liz" Freund', 'Andrea D. Fox',
       'Kimberly R. Fitzpatrick', 'Lisa Lindberg', 'Nicholas A. Betts',
       'Ben L. Cline', 'Rachel A. Carton', 'R.A. "Tony" Robertson',
       'Sharon W. Turner', 'Kenneth G. Bunch', 'B. Cameron Webb',
       'Robert G. "Bob

In [18]:
## Convert percenmtage to float
vde2['percent'] = vde2['Percentage'].str.rstrip('%').astype('float') / 100.0

vde2['percent'].dtypes

dtype('float64')

In [19]:
## Create dummy for Democratic majority in Presidential election
vde2["P_Democrat"] = np.where(((vde2['Candidate'] == "Joseph R. Biden") & (vde2['percent'] > 0.5)), 1, 0)

## Check
vde2.loc[vde2['P_Democrat'] == 1,]

Unnamed: 0,Candidate,Affiliation,Count,Percentage,Locality,percent,P_Democrat
13,Joseph R. Biden,Democratic,3667,54.47%,NORTHAMPTON COUNTY,0.5447,1
108,Joseph R. Biden,Democratic,8517,66.22%,FREDERICKSBURG CITY,0.6622,1
122,Joseph R. Biden,Democratic,4790,69.59%,WILLIAMSBURG CITY,0.6959,1
214,Joseph R. Biden,Democratic,2397,53.61%,SURRY COUNTY,0.5361,1
228,Joseph R. Biden,Democratic,117393,51.59%,VIRGINIA BEACH CITY,0.5159,1
289,Joseph R. Biden,Democratic,6981,53.74%,STAUNTON CITY,0.5374,1
303,Joseph R. Biden,Democratic,4973,51.94%,PRINCE EDWARD COUNTY,0.5194,1
361,Joseph R. Biden,Democratic,23218,51.55%,MONTGOMERY COUNTY,0.5155,1
388,Joseph R. Biden,Democratic,3358,53.13%,RADFORD CITY,0.5313,1
401,Joseph R. Biden,Democratic,53099,65.39%,NEWPORT NEWS CITY,0.6539,1


In [20]:
## Create dummy for Republican majority in Presidential election
vde2["P_Republican"] = np.where(((vde2['Candidate'] == "Donald J. Trump") & (vde2['percent'] > 0.5)), 1, 0)

## Check
vde2.loc[vde2['P_Republican'] == 1,]

Unnamed: 0,Candidate,Affiliation,Count,Percentage,Locality,percent,P_Democrat,P_Republican
1,Donald J. Trump,Republican,13366,80.45%,WISE COUNTY,0.8045,0,1
61,Donald J. Trump,Republican,11041,64.93%,AMHERST COUNTY,0.6493,0,1
80,Donald J. Trump,Republican,9063,83.38%,SCOTT COUNTY,0.8338,0,1
93,Donald J. Trump,Republican,20895,70.35%,FRANKLIN COUNTY,0.7035,0,1
138,Donald J. Trump,Republican,4196,61.84%,MIDDLESEX COUNTY,0.6184,0,1
...,...,...,...,...,...,...,...,...
2671,Donald J. Trump,Republican,5300,65.20%,MADISON COUNTY,0.6520,0,1
2685,Donald J. Trump,Republican,23751,69.39%,PITTSYLVANIA COUNTY,0.6939,0,1
2787,Donald J. Trump,Republican,21245,71.07%,CAMPBELL COUNTY,0.7107,0,1
2813,Donald J. Trump,Republican,5318,53.54%,WESTMORELAND COUNTY,0.5354,0,1


In [21]:
## Create dummy for Democratic majority in Senate election
vde2["S_Democrat"] = np.where(((vde2['Candidate'] == "Mark R. Warner") & (vde2['percent'] > 0.5)), 1, 0)

## Check
vde2.loc[vde2['S_Democrat'] == 1,]

Unnamed: 0,Candidate,Affiliation,Count,Percentage,Locality,percent,P_Democrat,P_Republican,S_Democrat
17,Mark R. Warner,Democratic,3889,58.16%,NORTHAMPTON COUNTY,0.5816,0,0,1
112,Mark R. Warner,Democratic,8495,67.48%,FREDERICKSBURG CITY,0.6748,0,0,1
126,Mark R. Warner,Democratic,4811,70.33%,WILLIAMSBURG CITY,0.7033,0,0,1
218,Mark R. Warner,Democratic,2439,55.43%,SURRY COUNTY,0.5543,0,0,1
232,Mark R. Warner,Democratic,120753,53.91%,VIRGINIA BEACH CITY,0.5391,0,0,1
293,Mark R. Warner,Democratic,7289,56.72%,STAUNTON CITY,0.5672,0,0,1
307,Mark R. Warner,Democratic,5195,54.68%,PRINCE EDWARD COUNTY,0.5468,0,0,1
365,Mark R. Warner,Democratic,24589,55.09%,MONTGOMERY COUNTY,0.5509,0,0,1
392,Mark R. Warner,Democratic,3551,56.82%,RADFORD CITY,0.5682,0,0,1
405,Mark R. Warner,Democratic,53265,67.61%,NEWPORT NEWS CITY,0.6761,0,0,1


In [22]:
## Create dummy for Republican majority in Senate election
vde2["S_Republican"] = np.where(((vde2['Candidate'] == "Daniel M. Gade") & (vde2['percent'] > 0.5)), 1, 0)

## Check
vde2.loc[vde2['S_Republican'] == 1,]

Unnamed: 0,Candidate,Affiliation,Count,Percentage,Locality,percent,P_Democrat,P_Republican,S_Democrat,S_Republican
5,Daniel M. Gade,Republican,12096,74.46%,WISE COUNTY,0.7446,0,0,0,1
65,Daniel M. Gade,Republican,10624,62.85%,AMHERST COUNTY,0.6285,0,0,0,1
84,Daniel M. Gade,Republican,8543,79.46%,SCOTT COUNTY,0.7946,0,0,0,1
97,Daniel M. Gade,Republican,19505,66.26%,FRANKLIN COUNTY,0.6626,0,0,0,1
142,Daniel M. Gade,Republican,4080,60.53%,MIDDLESEX COUNTY,0.6053,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
2675,Daniel M. Gade,Republican,5156,63.75%,MADISON COUNTY,0.6375,0,0,0,1
2689,Daniel M. Gade,Republican,21993,65.60%,PITTSYLVANIA COUNTY,0.6560,0,0,0,1
2791,Daniel M. Gade,Republican,20337,68.60%,CAMPBELL COUNTY,0.6860,0,0,0,1
2817,Daniel M. Gade,Republican,5009,50.94%,WESTMORELAND COUNTY,0.5094,0,0,0,1


In [23]:
## Export for review
vde2.to_csv("data/build/build2_vde.csv", index=False)

In [24]:
## Aggregate to county level for merge to 'Base'
locality = pd.pivot_table(vde2, index= ['Locality'], values= ['P_Democrat','P_Republican','S_Democrat','S_Republican'], aggfunc= np.sum)

locality.head()

Unnamed: 0_level_0,P_Democrat,P_Republican,S_Democrat,S_Republican
Locality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ACCOMACK COUNTY,0,1,0,1
ALBEMARLE COUNTY,1,0,1,0
ALEXANDRIA CITY,1,0,1,0
ALLEGHANY COUNTY,0,1,0,1
AMELIA COUNTY,0,1,0,1


In [25]:
## Export for merge to base
locality.to_csv("data/build/build2_locality.csv")