In [1]:
import pandas

In [69]:
def check_order_of_candidates(line):
    """Return True when the names appear in the order Trump, Clinton, Johnson.

    The line is split on whitespace and every token is lower-cased before
    the lookup, so each name has to appear as a standalone token.

    Raises:
        ValueError: if one of the three names is not a token of ``line``.
    """
    tokens = list(map(str.lower, line.split()))
    # Positions are looked up in the same order the columns are expected in.
    trump_pos, clinton_pos, johnson_pos = (
        tokens.index(name) for name in ("trump", "clinton", "johnson")
    )
    return trump_pos < clinton_pos < johnson_pos

def find_row_range(filename):
    """Locate the presidential results table inside a county results text file.

    The file is scanned line by line. The most recent line containing
    "President" marks the contest; the table's header row is taken to be two
    lines below it. The first line containing "Totals:" ends the table.
    Any line containing "TRUMP" must list the candidates in the order
    checked by ``check_order_of_candidates``.

    Args:
        filename: Path of the text file to scan.

    Returns:
        A ``(skiprows, nrows)`` tuple: the 0-based line index of the header
        row, and the number of lines between that row and the "Totals:"
        line (both exclusive).

    Raises:
        ValueError: if the candidates are not in the expected order, if
            "Totals:" appears before any "President" line, or if the file
            has no "Totals:" line at all. The message names the file.
        OSError: if the file cannot be opened.
    """
    start = None
    with open(filename) as f:
        for i, line in enumerate(f):
            if "TRUMP" in line:
                try:
                    in_expected_order = check_order_of_candidates(line)
                except ValueError as exc:
                    # A candidate name is missing from the line entirely.
                    raise ValueError(
                        f"{filename}: line {i} does not name all three candidates: {line!r}"
                    ) from exc
                if not in_expected_order:
                    raise ValueError(
                        f"{filename}: candidates are not in the expected order on line {i}: {line!r}"
                    )
            if "President" in line:
                start = i + 2
            if "Totals:" in line:
                if start is None:
                    raise ValueError(
                        f'{filename}: found "Totals:" on line {i} before any "President" line'
                    )
                stop = i - 1
                return start, stop - start
    # Falling out of the loop used to return None implicitly, which only
    # surfaced later as an unpacking error in the caller.
    raise ValueError(f'{filename}: no "Totals:" line found')

In [54]:
# Try the row-range detection on a single county file and show the
# resulting (skiprows, nrows) pair.
skiprows, nrows = find_row_range("./Appling/detail.txt")

print(skiprows, nrows)

24 14


In [None]:
# Suffix appended to a candidate's column prefix for each vote type.
# Both capitalizations of the absentee/advance headers are listed.
column_suffixes = {
    "Election Day": "_ED",
    "Absentee by Mail": "_AB",
    "Absentee By Mail": "_AB",
    "Advance in Person": "_AD",
    "Advance In Person": "_AD",
    "Provisional": "_PR",
    "Choice Total": "",
}

# Columns that are not tied to a candidate.
column_map = {"Registered Voters": "REG_VOTE", "Precinct": "PRECINCT"}

# Header suffix ("", ".1", ".2") -> column prefix for that candidate.
candidate_map = {"": "PRES16R", ".1": "PRES16D", ".2": "PRES16L"}

# Add one entry per (candidate, vote type) pair to column_map.
for identifier in candidate_map:
    candidate_col = candidate_map[identifier]
    for col in column_suffixes:
        suffix = column_suffixes[col]
        column_map[f"{col}{identifier}"] = f"{candidate_col}{suffix}"

column_map

In [65]:
import os
import shutil
from tqdm import tqdm

In [61]:
# Entries of the working directory that are not county folders.
ignore = {
    "data",
    "examples",
    ".git",
    ".ipynb_checkpoints",
    "ResolvingOverlaps.ipynb",
    "Untitled.ipynb",
}

# Every other entry of the working directory is treated as a county.
counties = {entry for entry in os.listdir(".") if entry not in ignore}

In [63]:
# Move every county entry from the working directory into ./data/.
# NOTE(review): this changes the file tree, so the cell is not safe to
# re-run blindly; it presumably also needs ./data to exist already — confirm.
for county in counties:
    shutil.move(county, f"./data/{county}")

In [147]:
def sum_advance_columns(df):
    """Collapse several "advance in person" columns into one per candidate.

    Base columns are those whose name contains "advance in person"
    (case-insensitive) and has no "." in it. For each of the suffixes "",
    ".1" and ".2", the base columns with that suffix appended are summed
    row-wise into a single "Advance in Person<suffix>" column, and the
    columns that were summed are dropped.

    Args:
        df: Frame holding the vote columns. It is not modified.

    Returns:
        ``df`` itself when it has at most one base column; otherwise a new
        frame with the summed columns in place of the originals.

    Raises:
        KeyError: if a suffixed counterpart of a base column is missing.
    """
    base_columns = [
        col
        for col in df.columns
        if "advance in person" in col.lower() and "." not in col
    ]
    if len(base_columns) <= 1:
        return df

    # Work on a copy so the caller's frame does not gain columns as a side effect.
    result = df.copy()
    columns_to_drop = []
    for suffix in ["", ".1", ".2"]:
        target = "Advance in Person" + suffix
        parts = [f"{col}{suffix}" for col in base_columns]
        result[target] = df[parts].sum(axis=1)
        # A source column that already carries the target name was just
        # overwritten with the sum, so it must be kept.
        columns_to_drop.extend(col for col in parts if col != target)
    return result.drop(columns_to_drop, axis="columns")

In [148]:
def read_county_txt(county):
    """Load one county's presidential precinct results into a DataFrame.

    Reads ``./data/<county>/detail.txt``, using ``find_row_range`` to find
    the table, merges multiple advance-voting columns with
    ``sum_advance_columns``, renames columns through ``column_map``, drops
    the "Total" column and adds a "COUNTY" column holding ``county``.

    Args:
        county: Name of the county folder under ``./data/``.

    Returns:
        The renamed results frame for that county.
    """
    filename = f"./data/{county}/detail.txt"
    skiprows, nrows = find_row_range(filename)
    # Raw string: "\s" in an ordinary string literal is an invalid escape
    # sequence. Fields are split on runs of two or more whitespace
    # characters, so single spaces inside a field are kept.
    df = pandas.read_csv(filename, sep=r"\s\s+", skiprows=skiprows, nrows=nrows, engine="python")
    df = sum_advance_columns(df)
    result = df.rename(column_map, axis="columns").drop("Total", axis="columns")
    result["COUNTY"] = county
    return result

In [149]:
# Every entry under ./data/ is treated as a county name and parsed.
# NOTE(review): os.listdir returns all entries, so a stray non-county file
# in ./data/ would presumably make read_county_txt fail — confirm.
counties = os.listdir("./data/")
county_dataframes = [read_county_txt(county) for county in counties]

In [150]:
# Use the first county's columns as the reference schema.
columns = set(county_dataframes[0].columns)

# Report every frame whose columns differ from the reference. Both
# directions are shown: a frame that only has extra columns would otherwise
# print an uninformative empty set.
for df in county_dataframes:
    df_columns = set(df.columns)
    if df_columns != columns:
        print("missing:", columns - df_columns, "unexpected:", df_columns - columns)


In [151]:
# Stack the per-county frames into one table. sort=True is passed
# explicitly: the frames' columns are not all in the same order, and
# alphabetical column sorting is the default only in older pandas, so this
# keeps the resulting column order the same across versions.
df = pandas.concat(county_dataframes, ignore_index=True, sort=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [155]:
# Summary statistics for the numeric columns of the combined precinct table.
df.describe()

Unnamed: 0,PRES16D,PRES16D_AB,PRES16D_AD,PRES16D_ED,PRES16D_PR,PRES16L,PRES16L_AB,PRES16L_AD,PRES16L_ED,PRES16L_PR,PRES16R,PRES16R_AB,PRES16R_AD,PRES16R_ED,PRES16R_PR,REG_VOTE
count,2692.0,2692.0,2692.0,2692.0,2692.0,2692.0,2692.0,2692.0,2692.0,2692.0,2692.0,2692.0,2692.0,2692.0,2692.0,2692.0
mean,682.837667,41.355126,373.660104,271.84101,17.320579,77.202823,2.608098,18.072808,25.001486,1.117756,769.384844,42.657504,410.819094,311.699851,16.031947,1987.051634
std,641.838698,56.116658,401.466021,239.518107,122.276381,223.432281,3.514558,22.815872,25.326189,7.726312,799.99898,59.449801,536.449488,280.533208,121.211804,1514.248351
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
25%,177.75,11.0,76.0,79.0,0.0,13.0,0.0,4.0,6.0,0.0,218.0,9.0,82.0,94.0,0.0,899.25
50%,498.0,28.0,251.0,206.0,0.0,33.0,1.0,12.0,17.0,0.0,579.5,26.0,260.0,242.0,0.0,1747.0
75%,987.0,53.0,533.0,411.25,2.0,71.0,4.0,25.0,36.25,0.0,1052.0,57.0,550.0,442.25,1.0,2698.25
max,7147.0,1110.0,4849.0,1888.0,2351.0,3743.0,39.0,386.0,185.0,129.0,9619.0,912.0,7056.0,2163.0,1604.0,21447.0


In [156]:
# Save the combined table. The index is written as well (no index=False),
# which is why the file reloads with an extra "Unnamed: 0" column.
df.to_csv("./GA_precincts_with_absentee.csv")

In [2]:
# Reload the saved precinct table from disk.
df = pandas.read_csv("./GA_precincts_with_absentee.csv")

In [3]:
import geopandas

In [4]:
# Load the GA_precincts16 shapefile archive from the mggg-states/GA-shapefiles
# GitHub repository (requires network access).
gdf = geopandas.read_file("https://github.com/mggg-states/GA-shapefiles/raw/master/GA_precincts16.zip")

In [10]:
from fuzzywuzzy import process

In [6]:
# Precinct names recorded for Echols county in the results table.
df.loc[df["COUNTY"] == "Echols", "PRECINCT"]

1056    Statenville
Name: PRECINCT, dtype: object

In [9]:
# Precinct names recorded for Echols county in the shapefile.
gdf.loc[gdf["CTYNAME"] == "Echols", "PRECINCT_N"]

825    STATENVILLE
Name: PRECINCT_N, dtype: object

In [16]:
def search_for_matches(county):
    """Fuzzy-match a county's results precinct names against the shapefile.

    For every ``PRECINCT`` value of ``df`` in the given county, the best
    candidate among that county's ``PRECINCT_N`` values in ``gdf`` is looked
    up with ``process.extractOne``.

    Args:
        county: County name, compared against ``df["COUNTY"]`` and
            ``gdf["CTYNAME"]``.

    Returns:
        A Series indexed like the county's rows of ``df`` holding whatever
        ``process.extractOne`` returns for each precinct name.
    """
    shapefile_names = gdf.loc[gdf["CTYNAME"] == county, "PRECINCT_N"]
    results_names = df.loc[df["COUNTY"] == county, "PRECINCT"]

    def best_match(name):
        return process.extractOne(name, shapefile_names)

    return results_names.apply(best_match)

In [17]:
# Distinct county names present in the results table. This rebinds
# `counties`, which earlier in the notebook held directory names.
counties = df["COUNTY"].unique()

In [25]:
# Inspect the column names of the reloaded table.
df.columns

Index(['Unnamed: 0', 'COUNTY', 'PRECINCT', 'PRES16D', 'PRES16D_AB',
       'PRES16D_AD', 'PRES16D_ED', 'PRES16D_PR', 'PRES16L', 'PRES16L_AB',
       'PRES16L_AD', 'PRES16L_ED', 'PRES16L_PR', 'PRES16R', 'PRES16R_AB',
       'PRES16R_AD', 'PRES16R_ED', 'PRES16R_PR', 'REG_VOTE'],
      dtype='object')

In [27]:
# Drop the index column that the CSV round trip added. errors="ignore"
# keeps this cell re-runnable once the CSV has been rewritten without that
# column; rebinding instead of inplace=True makes the step explicit.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [29]:
# Replace the underscore spellings of two county names with spaces
# (presumably the spelling the shapefile uses — confirm).
# The nested dict restricts the replacement to the COUNTY column. The
# earlier form, replace("COUNTY", {...}), treated the dict keys as column
# names and the string "COUNTY" as the value to replace, so it changed nothing.
df = df.replace({"COUNTY": {"Ben_Hill": "Ben Hill", "Jeff_Davis": "Jeff Davis"}})

In [30]:
# Write the cleaned table back to the same CSV, this time without the index.
df.to_csv("./GA_precincts_with_absentee.csv", index=False)

In [19]:
# Print every county of the results table that has no precinct rows in the
# shapefile.
for county in counties:
    choices = gdf.loc[gdf["CTYNAME"] == county, "PRECINCT_N"]
    if choices.empty:
        print(county)

Ben_Hill
Jeff_Davis


In [33]:
# Number of counties that have exactly one precinct row in the shapefile.
(gdf.groupby("CTYNAME").size() == 1).sum()

13

In [34]:
# Number of counties that have exactly one precinct row in the results table.
(df.groupby("COUNTY").size() == 1).sum()

12

In [36]:
# Distinct counties in the results table minus distinct counties in the
# shapefile; a negative value means the shapefile has more.
df["COUNTY"].nunique() - gdf["CTYNAME"].nunique()

-1

In [37]:
# NOTE(review): replace() is called with a value to look for but no
# replacement, and its result is not assigned. The array of county names
# displayed for this cell does not look like the output of this call —
# confirm what this cell was meant to do.
df.replace("Chattooga")

array(['Columbia', 'Clayton', 'Lincoln', 'McDuffie', 'Fayette', 'Fulton',
       'Taliaferro', 'Wilkes', 'Greene', 'Oconee', 'Banks', 'Morgan',
       'Walton', 'Newton', 'Union', 'Rockdale', 'Gwinnett', 'DeKalb',
       'Henry', 'Pickens', 'Gilmer', 'Troup', 'Heard', 'Carroll',
       'Paulding', 'Douglas', 'Whitfield', 'Coweta', 'Clay', 'Early',
       'Haralson', 'Spalding', 'Screven', 'Burke', 'Chattooga',
       'Richmond', 'Warren', 'Jefferson', 'Hancock', 'Twiggs', 'Jones',
       'Thomas', 'Brooks', 'Jasper', 'Butts', 'Monroe', 'Bibb', 'Sumter',
       'Dooly', 'Macon', 'Peach', 'Crawford', 'Houston', 'Tift', 'Worth',
       'Crisp', 'Brantley', 'Colquitt', 'Lowndes', 'Lee', 'Lamar', 'Pike',
       'Grady', 'Mitchell', 'Randolph', 'Calhoun', 'Terrell', 'Dougherty',
       'Marion', 'Seminole', 'Decatur', 'Miller', 'Baker', 'Webster',
       'Muscogee', 'Quitman', 'Stewart', 'Harris', 'Meriwether', 'Taylor',
       'Upson', 'Schley', 'Talbot', 'Charlton', 'Camden', 'Turner',
   