In [16]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.special import logit as logit

In [2]:
# Load the politician -> cluster assignments and the voting records.
# '?' entries in the voting file are read as missing and those rows dropped.
cluster = pd.read_csv("../PoliticianCluster.csv")
voting = pd.read_csv("../VotingRecord.csv", na_values=['?']).dropna(axis=0)
# Columns are renamed by position, so this list must match the file's column order.
voting.columns = ['personID', 'vote', 'party', 'CID', 'BillID']
# Preview both tables; the voting preview is taken before the CID filter below.
for preview in (cluster.head(), voting.head(), voting.columns):
    print(preview)
# Keep only the votes whose CID also appears in the cluster table.
has_cluster = voting['CID'].isin(cluster['CID'])
voting = voting[has_cluster]

         CID  ClusterID
0  N00028958          0
1  N00027860          1
2  N00031681          0
3  N00035825          2
4  N00027509          1
   personID  vote party        CID  BillID
0    400004     0     R  N00003028       3
1    400018     0     R  N00005656       3
2    400021     1     D  N00009774       3
3    400029     2     R  N00025292       3
5    400032     0     R  N00003105       3
Index(['personID', 'vote', 'party', 'CID', 'BillID'], dtype='object')


In [14]:
# A list of all bill ids
bill_id_list = voting['BillID'].unique()
# A list of our distinct donation profile clusters
cluster_id_list = cluster['ClusterID'].unique()
# Lookup from politician CID to cluster id. Built with zip instead of
# iterrows(), which constructs a full Series object for every row.
cluster_id_dict = dict(zip(cluster['CID'], cluster['ClusterID']))
# Attach each vote's cluster id. assign() returns a new frame rather than
# writing a column into a filtered slice, so it avoids SettingWithCopyWarning
# and the cell can be re-run safely.
voting = voting.assign(ClusterID=voting['CID'].map(cluster_id_dict))
# Series.map yields NaN (instead of raising KeyError) for a CID that is missing
# from the lookup, so fail loudly here if that ever happens.
assert voting['ClusterID'].notna().all(), "vote with a CID that has no cluster"
# cluster_voting (built below) is a list of lists: entry [i][j] is the
# average vote of the ith cluster on the jth bill
# Running tallies maintained by nan_to_neutral. They are reset here so that
# re-running the cell starts the counts from zero.
nan_count = 0
zero_count = 0
total_count = 0

def nan_to_neutral(val):
    """Replace an undefined or zero mean vote with a usable stand-in value.

    Parameters
    ----------
    val : float
        A mean vote value; may be NaN.

    Returns
    -------
    float
        0.5 when ``val`` is NaN, 0.001 when ``val`` equals 0, otherwise
        ``val`` unchanged.

    Side effects
    ------------
    Increments the module-level ``total_count`` on every call, ``nan_count``
    when ``val`` is NaN and ``zero_count`` when ``val`` equals 0.
    """
    global total_count, nan_count, zero_count
    total_count += 1
    if np.isnan(val):
        nan_count += 1
        return 0.5
    if val == 0:
        zero_count += 1
        # Presumably nudged off zero so a later logit transform stays finite.
        return 0.001
    # NOTE(review): a mean of exactly 1 is returned unchanged, and logit(1) is
    # +inf; consider a symmetric clamp (e.g. 0.999) -- confirm intent.
    return val

# Mean vote for every (cluster, bill) pair, computed in one grouped pass
# instead of re-masking the whole frame once per pair. reindex() restores the
# cluster_id_list / bill_id_list ordering and leaves NaN wherever a cluster
# has no recorded vote on a bill.
mean_vote_table = (
    voting.groupby(['ClusterID', 'BillID'])['vote']
    .mean()
    .unstack('BillID')
    .reindex(index=cluster_id_list, columns=bill_id_list)
)
# cluster_voting[i][j] is the cleaned mean vote of cluster_id_list[i] on
# bill_id_list[j]. Iterating cluster-by-cluster, then bill-by-bill, keeps the
# nan_to_neutral counters identical to a per-pair computation.
cluster_voting = [[nan_to_neutral(mean_vote) for mean_vote in cluster_row]
                  for cluster_row in mean_vote_table.values]
print("Clusters without a vote: ", nan_count)
print("Clusters with all `NO` votes: ", zero_count)
print("Votes: ", total_count)


Clusters without a vote:  0
Clusters with all `NO` votes:  23
Votes:  2106


## Does Donation Profile have Predictive Potential on Voting Record?
Our central question is whether or not the pattern of donations received by a candidate has any measurable impact on, or correlation with, their voting record. We'll first try this with a paired t-test; basically what we're saying here is that our donation-profile clusters are "treatments", or our explanatory categorical variable, and we're asking if the difference in voting record between clusters could be due to chance. The paired t-test looks at each matchup (in this case each chance to vote) and asks how likely the observed differences would be if the two clusters had the same average vote (technically this assumes the paired differences are normally distributed, but hey, no test is perfect).

In [6]:
# Pairwise paired t-test p-values between the clusters' per-bill mean votes.
# The matrix is sized from the data rather than hard-coded to three clusters.
n_clusters = len(cluster_voting)
# The diagonal stays NaN: a paired test of a cluster against itself has all
# differences equal to zero, so its statistic is undefined.
t_test_pv = np.full((n_clusters, n_clusters), np.nan)
for i in range(n_clusters):
    for j in range(i + 1, n_clusters):
        # The two-sided p-value does not depend on the order of the two
        # samples, so compute it once and fill both halves of the matrix.
        p_value = stats.ttest_rel(cluster_voting[i], cluster_voting[j])[1]
        t_test_pv[i, j] = t_test_pv[j, i] = p_value

t_test_pv

array([[             nan,   5.10779906e-05,   4.11646508e-14],
       [  5.10779906e-05,              nan,   3.73281923e-12],
       [  4.11646508e-14,   3.73281923e-12,              nan]])

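Our highest p-value is ~0.00005108, which means that if any two clusters truly had the same average vote, differences as large as the ones we observe would be very unlikely to arise by chance. (The `nan` entries on the diagonal are each cluster compared with itself.)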

In [19]:
# Minimum absolute difference in log-odds for two clusters' mean votes on a
# bill to be counted as different. This is an effect-size cutoff on the logit
# scale, not a p-value or a significance level.
LOGIT_DIFF_THRESHOLD = 0.05

# Rows are clusters, columns are bills; sized from the data instead of
# assuming exactly three clusters.
vote_logits = logit(np.asarray(cluster_voting, dtype=float))
n_clusters, total = vote_logits.shape
# pairwise_gap[a, b, i] = |logit(vote of a on bill i) - logit(vote of b on bill i)|.
# Comparisons involving NaN (e.g. inf - inf) evaluate to False, so such pairs
# are simply not counted; the diagonal is therefore always zero.
with np.errstate(invalid='ignore'):
    pairwise_gap = np.abs(vote_logits[:, None, :] - vote_logits[None, :, :])
    exceeds_threshold = pairwise_gap > LOGIT_DIFF_THRESHOLD
# significant[a][b] = number of bills on which clusters a and b differ by more
# than the threshold (kept as floats to match the printed format).
significant = exceeds_threshold.sum(axis=2).astype(float)
print(significant)
print(significant / total)
print(total)
    
    
    

[[   0.  653.  634.]
 [ 653.    0.  675.]
 [ 634.  675.    0.]]
[[ 0.          0.93019943  0.9031339 ]
 [ 0.93019943  0.          0.96153846]
 [ 0.9031339   0.96153846  0.        ]]
702


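Here we can see that, at minimum, 90.3% of our votes show a difference of more than 0.05 in log-odds between a pair of clusters. Note that this 0.05 cutoff is an effect-size threshold on the logit scale, not an alpha value, so these proportions are not rates of statistical significance and cannot be compared against a 5% false-positive rate. What they do show is that the clusters' average votes differ on the large majority of bills, which is consistent with the paired t-test results above and makes us relatively confident that there are real differences.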