In [65]:
import pandas as pd
from scipy import stats

In [75]:
# Load the election data from a URL, making sure to 
# declare that FIPS is a string!  What happens if you don't
# do this?

election_url = 'https://raw.githubusercontent.com/thomaspingel/geodata/master/election/county_election_data_2000-2016.csv'
election_df = pd.read_csv(election_url,dtype={'FIPS':str})
election_df.head()

Unnamed: 0,FIPS,gop_2000_votes,dem_2000_votes,totalvotes_2000,gop_2000_prc,dem_2000_prc,gop_minus_dem_prc_2000,gop_2004_votes,dem_2004_votes,totalvotes_2004,gop_2004_prc,dem_2004_prc,gop_minus_dem_prc_2004,gop_2008_votes,dem_2008_votes,totalvotes_2008,gop_2008_prc,dem_2008_prc,gop_minus_dem_prc_2008,gop_2012_votes,dem_2012_votes,totalvotes_2012,gop_2012_prc,dem_2012_prc,gop_minus_dem_prc_2012,gop_2016_votes,dem_2016_votes,totalvotes_2016,gop_2016_prc,dem_2016_prc,gop_minus_dem_prc_2016
0,1001,11993.0,4942.0,17208.0,69.69,28.72,40.97,15196.0,4758.0,20081.0,75.67,23.69,51.98,17403.0,6093.0,23641.0,73.61,25.77,47.84,17379.0,6363.0,23932.0,72.62,26.59,46.03,18172.0,5936.0,24973.0,72.77,23.77,49.0
1,1003,40872.0,13997.0,56480.0,72.37,24.78,47.59,52971.0,15599.0,69320.0,76.42,22.5,53.92,61271.0,19386.0,81413.0,75.26,23.81,51.45,66016.0,18424.0,85338.0,77.36,21.59,55.77,72883.0,18458.0,95215.0,76.55,19.39,57.16
2,1005,5096.0,5188.0,10395.0,49.02,49.91,-0.89,5899.0,4832.0,10777.0,54.74,44.84,9.9,5866.0,5697.0,11630.0,50.44,48.99,1.45,5550.0,5912.0,11509.0,48.22,51.37,-3.15,5454.0,4871.0,10469.0,52.1,46.53,5.57
3,1007,4273.0,2710.0,7101.0,60.17,38.16,22.01,5472.0,2089.0,7600.0,72.0,27.49,44.51,6262.0,2299.0,8644.0,72.44,26.6,45.84,6132.0,2202.0,8420.0,72.83,26.15,46.68,6738.0,1874.0,8819.0,76.4,21.25,55.15
4,1009,12667.0,4977.0,17973.0,70.48,27.69,42.79,17386.0,3938.0,21504.0,80.85,18.31,62.54,20389.0,3522.0,24267.0,84.02,14.51,69.51,20757.0,2970.0,24006.0,86.47,12.37,74.1,22859.0,2156.0,25588.0,89.33,8.43,80.9


In [88]:
# We can create a column for whether republicans won in 2016
# based on a simple boolean expression:

idx = election_df['gop_2016_prc'] > election_df['dem_2016_prc']
election_df['winner_2016'] = 'Democrat'
election_df.loc[idx,'winner_2016'] = 'Republican'
election_df['winner_2016'].value_counts()

Republican    2650
Democrat       504
Name: winner_2016, dtype: int64

In [89]:
# What's the average voter population for these, by r_wins_2016?

group = election_df.groupby(by='winner_2016')
group['totalvotes_2016'].mean()

winner_2016
Democrat      145001.021825
Republican     23937.530189
Name: totalvotes_2016, dtype: float64

In [92]:
# Is this statistically significant?

A = election_df.loc[election_df.winner_2016=='Republican','totalvotes_2016']
B = election_df.loc[election_df.winner_2016=='Democrat','totalvotes_2016']
stats.ttest_ind(A,B)

# Definitely!  The p-value here is nearly infinitesimal.  

Ttest_indResult(statistic=-21.31612826499807, pvalue=2.686587105417912e-94)

In [93]:
# What's the effect size?

def pooled_sd(A,B,axis=0):
    N1 = np.sum(np.isfinite(A),axis=axis)
    N2 = np.sum(np.isfinite(B),axis=axis)
    SD1 = np.nanstd(A,axis=axis)
    SD2 = np.nanstd(B,axis=axis)
    psd = np.sqrt((((N1-1)*(SD1**2) + (N2-1)*(SD2**2)) / (N1+N2-2)))
    return psd

def cohen_d(A,B,axis=0):
    M = np.nanmean(A,axis=axis) - np.nanmean(B,axis=axis)
    D = M / pooled_sd(A,B,axis=axis)
    return D

cohen_d(A,B)

-1.0367430169584944