In [382]:
## IMPORTS
from sklearn.cluster import k_means
from scipy.spatial import KDTree
import pandas as pd
import csv
import json

def get_df(features):
    """Load county-level presidential results into one DataFrame.

    Reads the state list from ``../metadata/states.json`` and the year list
    from ``../metadata/years.json``, then parses
    ``../data/votes-<year>-<state>.csv`` for every (state, year) pair.
    Within a CSV, a row whose first cell is ``'Office'`` is treated as the
    header; only later rows whose ``Office`` cell equals ``'President'`` are
    kept.

    Parameters
    ----------
    features : iterable of str
        CSV column names to carry into the result.

    Returns
    -------
    pandas.DataFrame
        Columns: ``Year`` (int, taken from the file name), ``County``
        (lower-cased ``area#state``), ``TotalVotes`` (int, thousands
        separators removed) and the requested features. ``Area`` and
        ``State`` are dropped unless they were explicitly requested.

    Raises
    ------
    ValueError
        If a requested column is missing from a CSV header, or a numeric
        column cannot be parsed.
    """
    percent_cols = ('RepVotesTotalPercent', 'DemVotesTotalPercent', 'ThirdVotesTotalPercent')
    # 'Year' is taken from the file name rather than looked up in the CSV header.
    csv_cols = (set(features) | set(['State', 'Area', 'TotalVotes'])) - set(['Year'])

    # Read both metadata files once; the year list does not depend on the state.
    with open('../metadata/states.json') as states_file:
        states = json.load(states_file)
    with open('../metadata/years.json') as years_file:
        years = json.load(years_file)

    rows = []
    for state in states:
        for year in years:
            path = '../data/votes-' + str(year) + '-' + str(state) + '.csv'
            with open(path, 'r') as votes_file:
                header = []
                indices = None
                for line in csv.reader(votes_file):
                    if not line:
                        # Blank lines parse to an empty list; nothing to read.
                        continue
                    if line[0] == 'Office':
                        header = line
                        indices = None
                    elif header and line[header.index('Office')] == 'President':
                        if indices is None:
                            # Resolve column positions once per header, not once per row.
                            indices = dict((col, header.index(col)) for col in csv_cols)
                        row = {'Year': int(year)}
                        for col, idx in indices.items():
                            row[col] = line[idx]
                        rows.append(row)

    df = pd.DataFrame(rows)
    df['County'] = (df['Area'] + '#' + df['State']).str.lower()
    # drop() returns a new frame, so the result must be kept for the drop to take effect.
    df = df.drop([col for col in ('Area', 'State') if col not in features], axis=1)
    df['TotalVotes'] = df['TotalVotes'].apply(lambda x: int(x.replace(',', '')))
    # Only convert the percentage columns that were actually requested.
    for col in percent_cols:
        if col in df.columns:
            df[col] = df[col].astype(float)
    return df

def compute_clusters(df, num_clusters, features, start_year, end_year, write_file=True, random_state=None):
    """Cluster counties with K-Means on their per-year feature values.

    Rows of ``df`` whose ``Year`` lies in the inclusive range
    ``[start_year, end_year]`` are pivoted into one vector per county (one
    entry per feature/year pair, duplicates averaged). Counties with any
    missing entry are dropped before clustering.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Year``, ``County`` and every column in ``features``.
    num_clusters : int
        Number of clusters to form.
    features : iterable of str
        Columns of ``df`` used as clustering features.
    start_year, end_year : int
        Inclusive range of years contributing to the feature vectors.
    write_file : bool, optional
        When true, dump the result to ``../viz/assets/json/results.json``.
    random_state : int or None, optional
        Forwarded to ``k_means``; pass an int for reproducible clusters.
        The default ``None`` leaves seeding to scikit-learn.

    Returns
    -------
    dict
        ``clusters`` (list of clusters, each a list of ``[area, state]``),
        ``representatives`` (``[area, state]`` of the county nearest each
        cluster mean), ``skipped`` (counties listed in
        ``../metadata/counties.json`` that were not clustered),
        ``raw_clusters`` (clusters as ``area#state`` strings) and
        ``inertia`` (as returned by ``k_means``).

    Raises
    ------
    ValueError
        If fewer counties than ``num_clusters`` have complete data.
    """
    # Filter data and reshape into one feature vector per county
    in_range = df[(df.Year >= start_year) & (df.Year <= end_year)]
    feature_vectors = pd.pivot_table(in_range,
                                     index='County',
                                     columns='Year',
                                     values=features,
                                     aggfunc='mean').dropna()
    # .values replaces DataFrame.as_matrix(), which newer pandas no longer provides.
    matrix = feature_vectors.values

    # Identify counties used and skipped
    counties_used = list(feature_vectors.index)
    if len(counties_used) < num_clusters:
        raise ValueError('Only %d counties have complete data for %s-%s; cannot form %d clusters'
                         % (len(counties_used), start_year, end_year, num_clusters))
    with open('../metadata/counties.json') as counties_file:
        counties = [county[0].lower() + '#' + county[1].lower() for county in json.load(counties_file)]
    counties_skipped = list(set(counties) - set(counties_used))

    # Run K-Means to get clusters (list of county IDs)
    (means, cluster_assignments, inertia) = k_means(matrix, num_clusters, random_state=random_state)
    # Single pass over the assignments; preserves the county order within each cluster.
    clusters = [[] for _ in range(num_clusters)]
    for county, k in zip(counties_used, cluster_assignments):
        clusters[k].append(county)

    # Find the representative: the county whose vector is nearest to each cluster mean
    tree = KDTree(matrix)
    representatives = [counties_used[tree.query(means[k])[1]] for k in range(num_clusters)]

    # Write results
    results = {
        'clusters': [[county.split('#') for county in cluster] for cluster in clusters],
        'representatives': [county.split('#') for county in representatives],
        'skipped': [county.split('#') for county in counties_skipped],
        'raw_clusters': clusters,
        'inertia': inertia
    }
    if write_file:
        # Context manager guarantees the JSON is flushed and the handle closed.
        with open('../viz/assets/json/results.json', 'w') as results_file:
            json.dump(results, results_file)

    return results

def predict(df, features, year, num_clusters, num_elections, write_file=True):
    """Predict the national vote share for ``year`` from cluster representatives.

    Counties are clustered on the ``num_elections`` elections preceding
    ``year`` (years ``year - 4 * num_elections`` through ``year - 4``). For
    each cluster, the previous election's total turnout is scaled by the
    representative county's turnout change, and the representative's
    ``year`` vote percentages are applied to that scaled total.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``Year``, ``County``, ``TotalVotes``, ``RepVotesTotalPercent``,
        ``DemVotesTotalPercent`` and ``ThirdVotesTotalPercent`` columns.
    features : iterable of str
        Clustering features, forwarded to ``compute_clusters``.
    year : int
        Election year to predict.
    num_clusters : int
        Number of clusters to form.
    num_elections : int
        Number of preceding elections used for clustering.
    write_file : bool, optional
        Forwarded to ``compute_clusters``.

    Returns
    -------
    tuple
        ``(summary, error, prediction, actual, cluster_sizes)``: a printable
        summary string, the mean absolute error of the dem/rep shares, the
        prediction dict, the dict from ``get_actual`` and the cluster sizes
        in descending order.

    Raises
    ------
    ValueError
        If a representative county has no row in ``year`` or ``year - 4``.
    """
    # Forward write_file so callers can really suppress the results.json write.
    results = compute_clusters(df, num_clusters, features,
                               year - 4 * num_elections, year - 4,
                               write_file=write_file)
    clusters = results['raw_clusters']
    representatives = ['#'.join(county) for county in results['representatives']]
    df_last = df[df.Year == (year - 4)]
    df_next = df[df.Year == year]
    votes = []
    for cluster, representative in zip(clusters, representatives):
        representative_last = df_last[df_last['County'] == representative]
        representative_next = df_next[df_next['County'] == representative]
        if representative_last.empty or representative_next.empty:
            raise ValueError('Representative county %r has no data for %s and/or %s'
                             % (representative, year - 4, year))

        # take total from last and extrapolate next using change ratio of representative
        total = float(df_last[df_last['County'].isin(cluster)]['TotalVotes'].sum())
        # .mean() reduces to a scalar explicitly (same value for a single row) and
        # matches how the percentages below are reduced.
        total *= (float(representative_next['TotalVotes'].mean())
                  / float(representative_last['TotalVotes'].mean()))

        votes.append({
            'repCount': float(representative_next['RepVotesTotalPercent'].mean()) / 100 * total,
            'demCount': float(representative_next['DemVotesTotalPercent'].mean()) / 100 * total,
            'thirdCount': float(representative_next['ThirdVotesTotalPercent'].mean()) / 100 * total,
            'totalCount': total
        })

    prediction = {party: sum([v[party] for v in votes])
                  for party in ['repCount', 'demCount', 'thirdCount', 'totalCount']}
    prediction['rep'] = prediction['repCount'] / prediction['totalCount']
    prediction['dem'] = prediction['demCount'] / prediction['totalCount']

    actual = get_actual(year)

    error = 0.5 * (abs(prediction['dem'] - actual['dem']) + abs(prediction['rep'] - actual['rep']))

    summary = ('%s, %s clusters, %s elections (error %s):\n'
               'actual / prediction\n'
               'Rep: %s / %s\n'
               'Dem: %s / %s\n') % (year, num_clusters, num_elections, error,
                                    actual['rep'], prediction['rep'],
                                    actual['dem'], prediction['dem'])

    cluster_sizes = sorted([len(c) for c in clusters], reverse=True)

    return summary, error, prediction, actual, cluster_sizes

def get_actual(year):
    """Return the recorded popular-vote shares for ``year``.

    Reads ``../metadata/popularvote.json`` and looks ``str(year)`` up under
    its ``'dem'`` and ``'rep'`` keys.

    Returns
    -------
    dict
        ``{'dem': <value>, 'rep': <value>}`` as stored in the JSON file.

    Raises
    ------
    KeyError
        If the file has no entry for ``year`` under either party.
    """
    # Context manager closes the handle instead of leaving it to the garbage collector.
    with open('../metadata/popularvote.json') as popular_vote_file:
        popular_vote = json.load(popular_vote_file)
    key = str(year)
    return {
        'dem': popular_vote['dem'][key],
        'rep': popular_vote['rep'][key]
    }

In [369]:
# Pick the CSV columns used as clustering features and load the data once
features = {'RepVotesTotalPercent', 'DemVotesTotalPercent', 'ThirdVotesTotalPercent', 'TotalVotes'}
df = get_df(features)
# Election years to evaluate: every fourth year from 1940 through 2016, except 1988.
# NOTE(review): 1988 is absent from this list - confirm the omission is intentional.
all_years = list(range(1940, 1988, 4)) + list(range(1992, 2017, 4))

In [388]:
# Single trial: predict one election and show the cluster sizes plus the summary
year = 1980
features = {'RepVotesTotalPercent', 'DemVotesTotalPercent', 'ThirdVotesTotalPercent', 'TotalVotes'}
# Positional arguments after `year` are num_clusters=10 and num_elections=2.
summary, error, prediction, actual, cluster_sizes = predict(df, features, year, 10, 2)
# Single-argument parenthesised print gives the same output on Python 2 and 3.
print(cluster_sizes)
print(summary)

[2857, 192, 63, 23, 2]
1980, 5 clusters, 2 elections (error 0.0184178598891):
actual / prediction
Rep: 0.505 / 0.519259609045
Dem: 0.41 / 0.387423889266



In [395]:
# Check, for every election year, whether the predicted winner matches the actual one
features = {'RepVotesTotalPercent', 'DemVotesTotalPercent', 'ThirdVotesTotalPercent', 'TotalVotes'}
for year in all_years:
    summary, error, prediction, actual, cluster_sizes = predict(df, features, year, 30, 2)
    # The product of the two dem-minus-rep margins is positive only when both
    # margins have the same sign, i.e. the same party leads in both.
    print('%s %s' % (
        year,
        'CORRECT!'
        if (prediction['dem'] - prediction['rep']) * (actual['dem'] - actual['rep']) > 0
        else 'incorrect'))

1940 CORRECT!
1944 CORRECT!
1948 CORRECT!
1952 CORRECT!
1956 CORRECT!
1960 incorrect
1964 CORRECT!
1968 CORRECT!
1972 CORRECT!
1976 CORRECT!
1980 CORRECT!
1984 CORRECT!
1992 CORRECT!
1996 CORRECT!
2000 incorrect
2004 incorrect
2008 CORRECT!
2012 CORRECT!
2016 incorrect


In [373]:
# Sweep cluster counts and history lengths, recording the mean error over all years
features = {'RepVotesTotalPercent', 'DemVotesTotalPercent', 'ThirdVotesTotalPercent', 'TotalVotes'}
years = all_years
cluster_errors = {}
for num_clusters in (2, 3, 4, 6, 8, 10, 15, 20, 30):
    # Register the inner dict up front; it is filled in place below.
    election_errors = cluster_errors[num_clusters] = {}
    for num_elections in (1, 2, 3, 4, 6, 8, 10, 15):
        trial_error = 0
        for year in years:
            summary, error, prediction, actual, cluster_sizes = predict(
                df, features, year, num_clusters, num_elections, write_file=False)
            # Accumulate error / N per year so the total is the mean over `years`.
            trial_error += error / len(years)
        print('%s %s %s' % (num_clusters, num_elections, trial_error))
        election_errors[num_elections] = trial_error
# Outer keys (num_clusters) become columns, inner keys (num_elections) the index.
edf = pd.DataFrame(cluster_errors)
print(edf)

2 1 0.073043956755
2 2 0.0569159917358
2 3 0.0697489590105
2 4 0.0545457752093
2 6 0.0790772891417
2 8 0.0778298374048
2 10 0.0726855380605
2 15 0.0863017300484
3 1 0.0424365482221
3 2 0.0505050874849
3 3 0.0482807648694
3 4 0.0579875387279
3 6 0.0467638738291
3 8 0.0461400595029
3 10 0.061911980891
3 15 0.0634736918971
4 1 0.0347439331754
4 2 0.057268856627
4 3 0.0665506759594
4 4 0.0547931263136
4 6 0.0491616935252
4 8 0.0634451283216
4 10 0.0551536219114
4 15 0.0494043514641
6 1 0.0391591184691
6 2 0.0390004840982
6 3 0.041644775277
6 4 0.0294994229019
6 6 0.0294901445515
6 8 0.0310455238273
6 10 0.025989880706
6 15 0.0374462423559
8 1 0.0220745062232
8 2 0.0388044967121
8 3 0.040110594782
8 4 0.0228715677057
8 6 0.0408370053864
8 8 0.0338706526037
8 10 0.0362582976574
8 15 0.0372824539185
10 1 0.0242625697076
10 2 0.0239724382334
10 3 0.0322964513585
10 4 0.0245465763298
10 6 0.0284395562234
10 8 0.0285609876942
10 10 0.028488195227
10 15 0.0284062808574
15 1 0.0264373960536
15 2 0

In [375]:
# Display the mean-error table: one column per num_clusters, one row per num_elections.
edf

Unnamed: 0,2,3,4,6,8,10,15,20,30
1,0.073044,0.042437,0.034744,0.039159,0.022075,0.024263,0.026437,0.019374,0.013036
2,0.056916,0.050505,0.057269,0.039,0.038804,0.023972,0.022868,0.025526,0.014851
3,0.069749,0.048281,0.066551,0.041645,0.040111,0.032296,0.023113,0.023166,0.019445
4,0.054546,0.057988,0.054793,0.029499,0.022872,0.024547,0.033429,0.02169,0.019981
6,0.079077,0.046764,0.049162,0.02949,0.040837,0.02844,0.030741,0.013423,0.013832
8,0.07783,0.04614,0.063445,0.031046,0.033871,0.028561,0.025838,0.020281,0.019217
10,0.072686,0.061912,0.055154,0.02599,0.036258,0.028488,0.025947,0.025888,0.021061
15,0.086302,0.063474,0.049404,0.037446,0.037282,0.028406,0.034729,0.023229,0.015078
