# 2020 Election Data - Virginia

## Purpose:
* Scrape Virginia Department of Election for 2020 voter preferences by locality

## Dependencies

In [77]:
# Dependencies
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import numpy as np

## For Multiple Outputs 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [4]:
## Assign website url
vde_url = "https://results.elections.virginia.gov/vaelections/2020%20November%20General/Site/Locality/Index.html"

In [5]:
## Connect to url
vde_page = requests.get(vde_url)

## Confirm connection
vde_page.status_code

200

In [6]:
## Parse website
vde_page_parsed = BeautifulSoup(vde_page.content, "html.parser")

## Check parsed website
vde_page_parsed

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">

<html lang="en-US">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, maximum-scale=1, minimum-scale=1" name="viewport"/>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="en-us" http-equiv="Content-Language"/>
<meta content="en-US" name="language"/>
<meta content="max-age=0" http-equiv="cache-control">
<meta content="no-cache" http-equiv="cache-control">
<meta content="0" http-equiv="expires">
<meta content="Tue, 01 Jan 1980 1:00:00 GMT" http-equiv="expires"/>
<meta content="no-cache" http-equiv="pragma"/>
<!--[if IE 7]>
<html class="ie ie7 no-js" lang="en-US">
<![endif]-->
<!--[if IE 8]>
<html class="ie ie8 no-js" lang="en-US">
<style type="text/css">

   .header_container { 
       background-color:#FFFFFF;       
    } 
  .stickymenu {
 background-color:#FFFFFF; 
border-bottom:2px solid #CCCCCC;
}

    </style>
<![endif]-->
<!--[if !(I

In [26]:
## Check some "a" tags
vde_tag = vde_page_parsed.find_all("a")

vde_tag


[<a accesskey="c" href="#skipPoint" tabindex="1" title="Skip to Content">Skip to Content</a>,
 <a class="virginiaLogo" href="https://www.virginia.gov/" tabindex="2" title="Virginia.gov portal"><span class="hidden-alt-text">Virginia.gov portal</span></a>,
 <a href="https://www.virginia.gov/government/state-employees/agency-directory" tabindex="3" title="Virginia agency websites">Agencies</a>,
 <a href="https://www.governor.virginia.gov" tabindex="4" title="Virginia Governor">Governor</a>,
 <a accesskey="s" href="https://www.virginia.gov/search" tabindex="5" title="Search Virginia.Gov">Search Virginia.Gov</a>,
 <a href="https://www.elections.virginia.gov/">Home</a>,
 <a href="https://www.elections.virginia.gov/registration/">Registration</a>,
 <a href="https://www.elections.virginia.gov/registration/how-to-register/">How to Register</a>,
 <a href="https://www.elections.virginia.gov/registration/view-your-info/">View Your Info</a>,
 <a href="https://www.elections.virginia.gov/registration

In [29]:
## Extract relevant links
links = set()
for vde_tag in vde_page_parsed.find_all("a"):
    if vde_tag.attrs.get("href") is not None:
        href = vde_tag.attrs.get("href")
        ## Limit hyper links to 'COUNTY' or 'CITY'
        if ("COUNTY" in href or "CITY" in href) and "https:" not in href:
            links.update(["https://results.elections.virginia.gov/vaelections/2020%20November%20General/Site/Locality/" + href[2:]])

## Check links
links

{'https://results.elections.virginia.gov/vaelections/2020%20November%20General/Site/Locality/ACCOMACK_COUNTY/Index.html',
 'https://results.elections.virginia.gov/vaelections/2020%20November%20General/Site/Locality/ALBEMARLE_COUNTY/Index.html',
 'https://results.elections.virginia.gov/vaelections/2020%20November%20General/Site/Locality/ALEXANDRIA_CITY/Index.html',
 'https://results.elections.virginia.gov/vaelections/2020%20November%20General/Site/Locality/ALLEGHANY_COUNTY/Index.html',
 'https://results.elections.virginia.gov/vaelections/2020%20November%20General/Site/Locality/AMELIA_COUNTY/Index.html',
 'https://results.elections.virginia.gov/vaelections/2020%20November%20General/Site/Locality/AMHERST_COUNTY/Index.html',
 'https://results.elections.virginia.gov/vaelections/2020%20November%20General/Site/Locality/APPOMATTOX_COUNTY/Index.html',
 'https://results.elections.virginia.gov/vaelections/2020%20November%20General/Site/Locality/ARLINGTON_COUNTY/Index.html',
 'https://results.elec

In [31]:
test_url = list(links)[0]

In [32]:
## Connect to url
page = requests.get(test_url)

## Parse website
soup = BeautifulSoup(page.content, 'html.parser')

soup

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">

<html lang="en-US">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, maximum-scale=1, minimum-scale=1" name="viewport"/>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="en-us" http-equiv="Content-Language"/>
<meta content="en-US" name="language"/>
<meta content="max-age=0" http-equiv="cache-control">
<meta content="no-cache" http-equiv="cache-control">
<meta content="0" http-equiv="expires">
<meta content="Tue, 01 Jan 1980 1:00:00 GMT" http-equiv="expires"/>
<meta content="no-cache" http-equiv="pragma"/>
<!--[if IE 7]>
<html class="ie ie7 no-js" lang="en-US">
<![endif]-->
<!--[if IE 8]>
<html class="ie ie8 no-js" lang="en-US">
<style type="text/css">

   .header_container { 
       background-color:#FFFFFF;       
    } 
  .stickymenu {
 background-color:#FFFFFF; 
border-bottom:2px solid #CCCCCC;
}

    </style>
<![endif]-->
<!--[if !(I

In [78]:
## Get table information
content = [tr.get_text().split('\n') for tr in soup.find_all("tr")]

## Check content
content[2][1]    
## Check content
content[2][6]

'Donald J. Trump\r'

'53.54%'

In [63]:
def vde_scraper(url):
    '''
    Goal: Obtain all voter percentage of a candidate per locality

    Arguments: Website url (string)

    Returns: Election information per locality from websites (list of lists)
    '''
    ## Connect to url
    page = requests.get(url)

    ## Parse website
    soup = BeautifulSoup(page.content, 'html.parser')

    ## Get country name
    locality = url.replace("https://results.elections.virginia.gov/vaelections/2020%20November%20General/Site/Locality/","")
    locality = locality.replace("/Index.html","")
    locality = locality.replace("_"," ")
    
    ## Get table information
    content = [tr.get_text().split('\n') for tr in soup.find_all("tr")]

    ## Check if there is no "tr" elements first
    if content is not None:
        ## Create empty list to populate
        vde_content = []
        ## Loop through each object of content
        for i in range(len(content)):
            ## Remove table headers
            if content[i][1] != "Candidate" and content[i][1] != "Response":
                ## Append locality name to table
                content[i].append(locality)
                ## Append table information to empty list
                vde_content.append(content[i])

        ## Return list of list
        return vde_content

In [64]:
vde_scraper(test_url)

[['',
  'Joseph R. Biden\r',
  '                                                \t\t\t\t\t\t',
  'Democratic',
  '',
  '4,501',
  '45.31%',
  '',
  'WESTMORELAND COUNTY'],
 ['',
  'Donald J. Trump\r',
  '                                                \t\t\t\t\t\t',
  'Republican',
  '',
  '5,318',
  '53.54%',
  '',
  'WESTMORELAND COUNTY'],
 ['',
  'Jo Jorgensen\r',
  '                                                \t\t\t\t\t\t',
  'Libertarian',
  '',
  '90',
  '0.91%',
  '',
  'WESTMORELAND COUNTY'],
 ['',
  'Write In\r',
  '                                                \t\t\t\t\t\t',
  'Write-In',
  '',
  '24',
  '0.24%',
  '',
  'WESTMORELAND COUNTY'],
 ['',
  'Mark R. Warner\r',
  '                                                \t\t\t\t\t\t',
  'Democratic',
  '',
  '4,821',
  '49.02%',
  '',
  'WESTMORELAND COUNTY'],
 ['',
  'Daniel M. Gade\r',
  '                                                \t\t\t\t\t\t',
  'Republican',
  '',
  '5,009',
  '50.94%',
  '',
  'WESTMORELAND

In [66]:
def link_scrape(urls, sleep=3):
    '''
    Goal: Apply vde_scraper function to all urls

    Arguments: Set or List containing website urls

    Returns: Data frame containing each locality page's voter preferences
    '''
    ## Create empty list
    scraped_data = []

    ## Loop through each url
    for url in urls:
        ## Apply function...
        scrape = vde_scraper(url)
        ## If function does not return None...
        if scrape is not None:
            ## Add to empty list
            scraped_data.extend(vde_scraper(url))
        ## Sleep
        time.sleep(random.uniform(0,sleep))
    ## Convert list of lists to data frame
    df = pd.DataFrame(scraped_data)

    return df

## Store data scraper
vde0 = link_scrape(links)

In [67]:
vde0

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,,Joseph R. Biden\r,...,Democratic,,4501,45.31%,,WESTMORELAND COUNTY
1,,Donald J. Trump\r,...,Republican,,5318,53.54%,,WESTMORELAND COUNTY
2,,Jo Jorgensen\r,...,Libertarian,,90,0.91%,,WESTMORELAND COUNTY
3,,Write In\r,...,Write-In,,24,0.24%,,WESTMORELAND COUNTY
4,,Mark R. Warner\r,...,Democratic,,4821,49.02%,,WESTMORELAND COUNTY
...,...,...,...,...,...,...,...,...,...
2860,,Write In\r,...,,,18,0.70%,,NORTON CITY
2861,,Yes\r,...,,,1042,69.61%,,NORTON CITY
2862,,No\r,...,,,455,30.39%,,NORTON CITY
2863,,Yes\r,...,,,1417,91.36%,,NORTON CITY


In [72]:
vde1 = vde0.rename(columns={0:"empty1",
                1:"Candidate",
                2:"empty2",
                3:"Affiliation",
                4:"empty3",
                5:"Count",
                6:"Percentage",
                7:"empty4",
                8:"Locality"})

vde1

Unnamed: 0,empty1,Candidate,empty2,Affiliation,empty3,Count,Percentage,empty4,Locality
0,,Joseph R. Biden\r,...,Democratic,,4501,45.31%,,WESTMORELAND COUNTY
1,,Donald J. Trump\r,...,Republican,,5318,53.54%,,WESTMORELAND COUNTY
2,,Jo Jorgensen\r,...,Libertarian,,90,0.91%,,WESTMORELAND COUNTY
3,,Write In\r,...,Write-In,,24,0.24%,,WESTMORELAND COUNTY
4,,Mark R. Warner\r,...,Democratic,,4821,49.02%,,WESTMORELAND COUNTY
...,...,...,...,...,...,...,...,...,...
2860,,Write In\r,...,,,18,0.70%,,NORTON CITY
2861,,Yes\r,...,,,1042,69.61%,,NORTON CITY
2862,,No\r,...,,,455,30.39%,,NORTON CITY
2863,,Yes\r,...,,,1417,91.36%,,NORTON CITY


In [79]:
vde1['empty1'].unique()
vde1['empty2'].unique()
vde1['empty3'].unique()
vde1['empty4'].unique()

array([''], dtype=object)

array(['                                                \t\t\t\t\t\t'],
      dtype=object)

array([''], dtype=object)

array([''], dtype=object)

In [123]:
## Remove empty columns
vde2 = vde1.drop(columns= ['empty1','empty2','empty3','empty4'])

vde2

Unnamed: 0,Candidate,Affiliation,Count,Percentage,Locality
0,Joseph R. Biden\r,Democratic,4501,45.31%,WESTMORELAND COUNTY
1,Donald J. Trump\r,Republican,5318,53.54%,WESTMORELAND COUNTY
2,Jo Jorgensen\r,Libertarian,90,0.91%,WESTMORELAND COUNTY
3,Write In\r,Write-In,24,0.24%,WESTMORELAND COUNTY
4,Mark R. Warner\r,Democratic,4821,49.02%,WESTMORELAND COUNTY
...,...,...,...,...,...
2860,Write In\r,,18,0.70%,NORTON CITY
2861,Yes\r,,1042,69.61%,NORTON CITY
2862,No\r,,455,30.39%,NORTON CITY
2863,Yes\r,,1417,91.36%,NORTON CITY


In [124]:
## Remove extra characters in candidate value
vde2['Candidate'] = vde2['Candidate'].str.replace("\r","")

## Check value
vde2['Candidate'].unique()



array(['Joseph R. Biden', 'Donald J. Trump', 'Jo Jorgensen', 'Write In',
       'Mark R. Warner', 'Daniel M. Gade', 'Qasim Rashid',
       'Robert J. Wittman', 'Michael J. Fitzpatrick', 'Robin Schick',
       'Thomas M. Moncure Jr.', 'Richard M. "Mike" Cabrey',
       'Wayne DiRosario', 'Alan Darlington', 'Caryn Self Sullivan',
       'Dallas W. Leamon', 'Joseph "Joey" Paul King',
       'Kathryn S. Wittman', 'Bobby D. Greene', 'Terry A. Cosgrove',
       'Audra Lucas-Peyton', 'Michelle "Shelly" Payne',
       'Theresa "Terri" McClure', 'Brent T. Steffey', 'Yes', 'No',
       'Jennifer T. Wexton', 'Aliscia N. Andrews', 'April F. Wilkerson',
       'Mark J. Griffin', 'Matthew E. "Matt" Bass', 'B. Cameron Webb',
       'Robert G. "Bob" Good', 'Michael E. Mattox',
       'James A. "Champ" Nowlin', 'J. Scott Lowman',
       'James H. Higginbotham II', 'Tracy A. Emerson',
       'Joseph B. "Benny" David ', 'Richard T. Adams Jr.',
       'Scott V. Fisher', 'Roger L. Vance', 'C. B. "Kelly" Bu

In [125]:
## Convert percenmtage to float
vde2['percent'] = vde2['Percentage'].str.rstrip('%').astype('float') / 100.0

vde2['percent'].dtypes

dtype('float64')

In [126]:
## Create dummy for Democratic majority in Presidential election
vde2["P_Democrat"] = np.where(((vde2['Candidate'] == "Joseph R. Biden") & (vde2['percent'] > 0.5)), 1, 0)

## Check
vde2.loc[vde2['P_Democrat'] == 1,]

Unnamed: 0,Candidate,Affiliation,Count,Percentage,Locality,percent,P_Democrat
79,Joseph R. Biden,Democratic,138372,61.54%,LOUDOUN COUNTY,0.6154,1
146,Joseph R. Biden,Democratic,3667,54.47%,NORTHAMPTON COUNTY,0.5447,1
225,Joseph R. Biden,Democratic,11022,64.51%,HARRISONBURG CITY,0.6451,1
296,Joseph R. Biden,Democratic,11710,60.40%,DANVILLE CITY,0.604,1
336,Joseph R. Biden,Democratic,4552,57.27%,BRUNSWICK COUNTY,0.5727,1
370,Joseph R. Biden,Democratic,1612,67.70%,EMPORIA CITY,0.677,1
419,Joseph R. Biden,Democratic,7146,81.03%,FALLS CHURCH CITY,0.8103,1
465,Joseph R. Biden,Democratic,66240,80.28%,ALEXANDRIA CITY,0.8028,1
479,Joseph R. Biden,Democratic,6610,54.60%,WINCHESTER CITY,0.546,1
535,Joseph R. Biden,Democratic,9174,68.04%,FAIRFAX CITY,0.6804,1


In [127]:
## Create dummy for Republican majority in Presidential election
vde2["P_Republican"] = np.where(((vde2['Candidate'] == "Donald J. Trump") & (vde2['percent'] > 0.5)), 1, 0)

## Check
vde2.loc[vde2['P_Republican'] == 1,]

Unnamed: 0,Candidate,Affiliation,Count,Percentage,Locality,percent,P_Democrat,P_Republican
1,Donald J. Trump,Republican,5318,53.54%,WESTMORELAND COUNTY,0.5354,0,1
35,Donald J. Trump,Republican,5192,55.61%,CLARKE COUNTY,0.5561,0,1
54,Donald J. Trump,Republican,21245,71.07%,CAMPBELL COUNTY,0.7107,0,1
120,Donald J. Trump,Republican,8365,84.10%,LEE COUNTY,0.8410,0,1
133,Donald J. Trump,Republican,13294,60.66%,LOUISA COUNTY,0.6066,0,1
...,...,...,...,...,...,...,...,...
2609,Donald J. Trump,Republican,25106,57.50%,FAUQUIER COUNTY,0.5750,0,1
2639,Donald J. Trump,Republican,1863,67.11%,BUENA VISTA CITY,0.6711,0,1
2726,Donald J. Trump,Republican,14875,66.76%,GLOUCESTER COUNTY,0.6676,0,1
2756,Donald J. Trump,Republican,10418,57.09%,HALIFAX COUNTY,0.5709,0,1


In [128]:
## Create dummy for Democratic majority in Senate election
vde2["S_Democrat"] = np.where(((vde2['Candidate'] == "Mark R. Warner") & (vde2['percent'] > 0.5)), 1, 0)

## Check
vde2.loc[vde2['S_Democrat'] == 1,]

Unnamed: 0,Candidate,Affiliation,Count,Percentage,Locality,percent,P_Democrat,P_Republican,S_Democrat
83,Mark R. Warner,Democratic,137814,61.80%,LOUDOUN COUNTY,0.618,0,0,1
150,Mark R. Warner,Democratic,3889,58.16%,NORTHAMPTON COUNTY,0.5816,0,0,1
229,Mark R. Warner,Democratic,11116,66.30%,HARRISONBURG CITY,0.663,0,0,1
300,Mark R. Warner,Democratic,12519,65.16%,DANVILLE CITY,0.6516,0,0,1
340,Mark R. Warner,Democratic,4727,59.68%,BRUNSWICK COUNTY,0.5968,0,0,1
374,Mark R. Warner,Democratic,1618,69.38%,EMPORIA CITY,0.6938,0,0,1
423,Mark R. Warner,Democratic,7016,79.95%,FALLS CHURCH CITY,0.7995,0,0,1
469,Mark R. Warner,Democratic,65071,79.37%,ALEXANDRIA CITY,0.7937,0,0,1
483,Mark R. Warner,Democratic,6898,57.58%,WINCHESTER CITY,0.5758,0,0,1
539,Mark R. Warner,Democratic,9179,68.88%,FAIRFAX CITY,0.6888,0,0,1


In [129]:
## Create dummy for Republican majority in Senate election
vde2["S_Republican"] = np.where(((vde2['Candidate'] == "Daniel M. Gade") & (vde2['percent'] > 0.5)), 1, 0)

## Check
vde2.loc[vde2['S_Republican'] == 1,]

Unnamed: 0,Candidate,Affiliation,Count,Percentage,Locality,percent,P_Democrat,P_Republican,S_Democrat,S_Republican
5,Daniel M. Gade,Republican,5009,50.94%,WESTMORELAND COUNTY,0.5094,0,0,0,1
39,Daniel M. Gade,Republican,5214,56.22%,CLARKE COUNTY,0.5622,0,0,0,1
58,Daniel M. Gade,Republican,20337,68.60%,CAMPBELL COUNTY,0.6860,0,0,0,1
124,Daniel M. Gade,Republican,7600,77.16%,LEE COUNTY,0.7716,0,0,0,1
137,Daniel M. Gade,Republican,13039,59.92%,LOUISA COUNTY,0.5992,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
2613,Daniel M. Gade,Republican,24937,58.75%,FAUQUIER COUNTY,0.5875,0,0,0,1
2643,Daniel M. Gade,Republican,1724,62.76%,BUENA VISTA CITY,0.6276,0,0,0,1
2730,Daniel M. Gade,Republican,14605,65.96%,GLOUCESTER COUNTY,0.6596,0,0,0,1
2760,Daniel M. Gade,Republican,9824,54.31%,HALIFAX COUNTY,0.5431,0,0,0,1


In [130]:
## Export for review
vde2.to_csv("data/build/build2_vde.csv", index=False)

In [134]:
## Aggregate to county level for merge to 'Base'
locality = pd.pivot_table(vde2, index= ['Locality'], values= ['P_Democrat','P_Republican','S_Democrat','S_Republican'], aggfunc= np.sum)

locality.head()

Unnamed: 0_level_0,P_Democrat,P_Republican,S_Democrat,S_Republican
Locality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ACCOMACK COUNTY,0,1,0,1
ALBEMARLE COUNTY,1,0,1,0
ALEXANDRIA CITY,1,0,1,0
ALLEGHANY COUNTY,0,1,0,1
AMELIA COUNTY,0,1,0,1


In [135]:
## Export for merge to base
locality.to_csv("data/build/build2_locality.csv")