#### imports

In [13]:
import re
import os
import pandas as pd


### read data
**`detail.txt` file** downloaded directly from **https://results.enr.clarityelections.com/GA/105369**

In [14]:
# Load the raw export as a single string; `pop` holds the full file text.
with open('raw/detail.txt') as detail_file:
    pop = detail_file.read()

# Cells are separated by runs of three or more whitespace characters.
# Raw string: `\s` must reach the regex engine untouched, and a non-raw '\s'
# is an invalid string escape that newer Python versions warn about.
_CELL_SEP = re.compile(r'(?:\s){3,}')


def delim(x):
    """Split one line of the export into its cells (3+ whitespace chars apart)."""
    return _CELL_SEP.split(x)


# One list of cells per line of the file; an empty line becomes [''].
rows = [delim(r) for r in pop.split('\n')]

### split data for different races

In [15]:
def parse_cat(race):
    """Return the category label of a race (used to organize the repository).

    Titles containing 'Service' are labelled by their first three words.
    Every other title is labelled by its first two words, with
    'President of' rewritten to 'US President'.
    """
    words = race.split()
    if 'Service' not in race:
        label = ' '.join(words[:2])
        return label.replace('President of', 'US President')
    return ' '.join(words[:3])
    
    
# Find the consecutive batch of rows associated with each race.
#
# A blank row (parsed as ['']) is treated as the separator between races.
# Each entry appended to `data` is a dict with the keys:
#   'race'        - race title (cells of the row after the separator, joined)
#   'race_cat'    - category label from parse_cat()
#   'candidates'  - cells of the second row after the separator
#   'data_starts' - index into `rows` of the first row of the race's table
#   'data_ends'   - index into `rows` of the last row kept for the race
#   'data'        - the slice rows[data_starts:data_ends + 1]
#
# NOTE: `i` counts from 0 over rows[1:-2], so `row` is rows[i+1], not rows[i].
# Every index into `rows` below relies on that one-row offset.

data = [] # to compile info on all races

for i, row in enumerate(rows[1:-2]):
    row_data = {}
    if row==['']: # blank separator row: one race ends here and the next begins
        
        # Close and save the previous race. Blank rows within the first rows
        # of the file are skipped here.
        # NOTE(review): presumably those belong to the file preamble, before
        # any race has been opened - confirm. If the first blank row were at
        # i > 10, `last_row_data` would still be unbound at this point.
        if i>10:
            last_row_data['data_ends'] = i-1
            # The slice stops before rows[i], the row directly above the
            # separator (rows[i+1]), so that row is left out as well.
            # NOTE(review): presumably that row is a totals line - confirm.
            last_row_data['data'] = rows[ last_row_data['data_starts'] : i ] 
            data.append(last_row_data)         
        
        # Open the next race, unless the separator sits in the last rows of
        # the file.
        if i < (len(rows)-10):
            row_data['race'] = ''.join(rows[i+2]) # row right after the separator
            row_data['race_cat'] = parse_cat(row_data['race'])
            row_data['candidates'] = rows[i+3]
            row_data['data_starts'] = i+4 # parse_race_data uses this row as the column headers
            last_row_data = row_data.copy()
print(len(data), 'total races found.')

297 total races found.


### parse & clean data for each race
- Rename columns to include candidates
- Categorize race type
- Save each race as a CSV file inside a directory named after its race category

In [30]:
def parse_race_data(race_idx, target_dir='data'):
    """Clean one race's table and save it as a CSV file.

    Reads the race at position ``race_idx`` of the module-level ``data``
    list, promotes the first row of its table to column headers, indexes the
    table by 'County', prefixes each candidate's columns with the
    candidate's name, and writes the result to
    ``<target_dir>/<race_cat>/<race_name>.csv``.

    Args:
        race_idx: Position of the race in ``data``.
        target_dir: Root output directory. It is created if missing and may
            be a nested or absolute path.

    Returns:
        None. The CSV file is written as a side effect.
    """
    # Each candidate owns this many consecutive columns in the table.
    stats_per_candidate = 5

    race = data[race_idx]

    # The first row of the race's table holds the column headers.
    race_data = pd.DataFrame(race['data'])
    race_data.columns = race_data.loc[0]
    race_data.drop(0, inplace=True)
    race_data = race_data.set_index('County')

    # The candidates row contains empty cells; keep only the real names.
    candidates = [c for c in race['candidates'] if len(c) > 0]

    # Rename columns to include the candidate name. The first candidate's
    # header block is reused as the template for every candidate.
    stats = list(
        race_data.columns[:stats_per_candidate]
        .str.replace('Choice Total', 'TOTAL VOTES')
    )
    new_cols = [cand + '_' + stat for cand in candidates for stat in stats]

    # Columns after the per-candidate blocks (the totals) keep their names.
    cols_affected = stats_per_candidate * len(candidates)
    new_cols += list(race_data.columns[cols_affected:])
    race_data.columns = new_cols

    race_cat = race['race_cat']
    # Keep only the text before the first '/' so the title cannot introduce
    # extra path components into the file name.
    race_name = race['race'].split('/')[0]

    # exist_ok=True makes this safe to call repeatedly. Checking membership in
    # os.listdir() only works for a single-level name in the current
    # directory and raised FileExistsError on nested paths.
    out_dir = f'{target_dir}/{race_cat}'
    os.makedirs(out_dir, exist_ok=True)

    # Keep the index when saving: it is the county.
    race_data.to_csv(f'{out_dir}/{race_name}.csv')

### parse & save data for all races

In [17]:
# Write one CSV per race; parse_race_data looks each race up in `data` by position.
for race_idx, _race in enumerate(data):
    parse_race_data(race_idx)

#### verify 

In [31]:
# Spot-check one saved race: sum each candidate's total-votes column across counties.
df = pd.read_csv('data/US Senate/US Senate (Loeffler) - Special.csv')
total_vote_cols = [
    'Raphael Warnock (Dem)_TOTAL VOTES',
    'Doug Collins (Rep)_TOTAL VOTES',
    'Kelly Loeffler (I) (Rep)_TOTAL VOTES',
]
df[total_vote_cols].sum()

# VERIFIED AGAINST DATA FROM STATE ONLINE DASHBOARD - Nov. 20th
# results.enr.clarityelections.com/GA/105369

Raphael Warnock (Dem)_TOTAL VOTES       1613785
Doug Collins (Rep)_TOTAL VOTES           978668
Kelly Loeffler (I) (Rep)_TOTAL VOTES    1270732
dtype: int64