In [1]:
# imports and set API url
import requests
import pandas as pd
import numpy as np
import time
from bs4 import BeautifulSoup

# set base url and get homepage html
res = requests.get('https://www.sos.state.tx.us/elections/historical/counties.shtml')
url = 'https://www.sos.state.tx.us/elections/historical/'

In [2]:
# check good request
res.status_code

200

In [3]:
# scrape county link objects
soup = BeautifulSoup(res.content, 'lxml')
abc = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
counties = soup.find_all('a', {'href': lambda x: x and x.endswith('.shtml')})
counties = counties[26:-7]

In [4]:
# get list of relative urls for each county
links = []
for i, county in enumerate(counties):
    links.append(counties[i].attrs['href'])

# get only unique values
links = list(set(links))
# put in alphabetical order so debugging is easier
links = sorted(links)

In [5]:
# for each county link in our "links" list, create a dataframe off of the table
for link in links:
    # reset bool
    scraped_last = False
    
    # isolate county name
    county_name = link.split('.')[0]
    
    # initialize data
    data = []
    
    # request data
    res = requests.get(url+link)
    
    # print in case request doesn't work
    if res.status_code != 200:
        print('BAD REQUEST')
     
    # get content from page
    soup = BeautifulSoup(res.content, 'lxml')
    
    # isolate table html
    sub_soup = soup.find('table')
    
    # get all the rows of the table
    rows = sub_soup.find_all('tr')
    
    # iterate through rows in the table
    for i, row in enumerate(rows):
        # initialize empty dictionary
        table = {}
        # grab just the text (not tags) from each value in the row
        row_data = [value.text for value in rows[i].find_all(['th', 'td'])]
        # as long as there is actual values in the row_data, put the values in a dict
        if row_data != []:
            table['Year'] = row_data[0]
            if table['Year'] == '2020':
                scraped_last = True # if we capture 2020, set this to True to skip the if statement below
            table['Reg Voters'] = row_data[1]
            table['Voted'] = row_data[2]
            table['Voted %'] = row_data[3]
            table['Early Vote'] = row_data[4]
            table['EV %'] = row_data[5]
            # append this dictionary to data
            data.append(table)

    
    # 2020 does not have a 'tr' wrapper - have to grab it manually 
    # if we already scraped it in the for loop above this will not run
    if scraped_last == False:
        # get last 5 values
        last_year = sub_soup.find_all('td', {'class': 'align-c'})[-5:]
        # repeat process in the for loop
        last_year_data = [value.text for value in last_year]
        table = {}
        table['Year'] = 2020
        table['Reg Voters'] = last_year_data[0]
        table['Voted'] = last_year_data[1]
        table['Voted %'] = last_year_data[2]
        table['Early Vote'] = last_year_data[3]
        table['EV %'] = last_year_data[4]
        # append this dictionary to data
        data.append(table)
    # make dataframe and drop the first column which is a duplicate of our headers
    df = pd.DataFrame(data).drop(0)
    # add column to capture county
    df['county'] = county_name
    # save csv in data folder
    df.to_csv(f'voting_data/{county_name}.csv', index=False)
    # print county name to track progress
    print(f'{county_name} done!')

anderson done!
andrews done!
angelina done!
aransas done!
archer done!
armstrong done!
atascosa done!
austin done!
bailey done!
bandera done!
bastrop done!
baylor done!
bee done!
bell done!
bexar done!
blanco done!
borden done!
bosque done!
bowie done!
brazoria done!
brazos done!
brewster done!
briscoe done!
brooks done!
brown done!
burleson done!
burnet done!
caldwell done!
calhoun done!
callahan done!
cameron done!
camp done!
carson done!
cass done!
castro done!
chambers done!
cherokee done!
childress done!
clay done!
cochran done!
coke done!
coleman done!
collin done!
collingsworth done!
colorado done!
comal done!
comanche done!
concho done!
cooke done!
coryell done!
cottle done!
crane done!
crockett done!
crosby done!
culberson done!
dallam done!
dallas done!
dawson done!
deafsmith done!
delta done!
denton done!
dewitt done!
dickens done!
dimmit done!
donley done!
duval done!
eastland done!
ector done!
edwards done!
ellis done!
elpaso done!
erath done!
falls done!
fannin done!
faye

In [6]:
# code from stack overflow 
# https://stackoverflow.com/questions/20906474/import-multiple-csv-files-into-pandas-and-concatenate-into-one-dataframe
import glob
import os
path = 'voting_data/'                    
all_files = glob.glob(os.path.join(path, "*.csv"))     
df_from_each_file = (pd.read_csv(f) for f in all_files)
concatenated_df = pd.concat(df_from_each_file, ignore_index=True)

In [7]:
concatenated_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4311 entries, 0 to 4310
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Year        4311 non-null   int64 
 1   Reg Voters  4311 non-null   object
 2   Voted       4311 non-null   object
 3   Voted %     4311 non-null   object
 4   Early Vote  4310 non-null   object
 5   EV %        4311 non-null   object
 6   county      4311 non-null   object
dtypes: int64(1), object(6)
memory usage: 235.9+ KB


In [8]:
select_years = concatenated_df[(concatenated_df['Year'] == 2012) | \
                               (concatenated_df['Year'] == 2016) | \
                               (concatenated_df['Year'] == 2020)].reset_index(drop=True)



In [9]:
select_years

Unnamed: 0,Year,Reg Voters,Voted,Voted %,Early Vote,EV %,county
0,2012,11977,7343,61.31%,2563,21.40%,bosque
1,2016,12002,7823,65.18%,3898,32.48%,bosque
2,2020,12724,9094,71.47%,6535,71.86%,bosque
3,2012,22565,13944,61.79%,8713,38.61%,brown
4,2016,23424,14018,59.84%,9986,42.63%,brown
...,...,...,...,...,...,...,...
757,2016,3151,766,43.47%,396,22.47%,dallam
758,2020,3046,1605,52.69%,1029,64.11%,dallam
759,2012,11535,7316,63.42%,5554,48.15%,young
760,2016,11762,7689,65.37%,5455,46.38%,young


In [10]:
select_years.to_csv('output/voting_numbers.csv')