In [48]:
import pandas as pd
import numpy as np
import pickle
import os
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from collections import defaultdict
%matplotlib inline

# Vote History Model
This model focuses on predicting voting behavior from previous voting behavior. Specifically, it looks at:
1. The percent of the time that a member votes with their party in the current session (up to the vote in question)
2. The percent of the time that the member and the cosponsors vote together in the current session (up to the current vote)
3. Points 1 and 2 repeated, but restricted to a specific committee

## Feature Construction

In [5]:
def get_full_set(data_dir='../data/model/', datasets=('train', 'dev', 'test')):
    """Load the model data splits and stack them into a single DataFrame.

    Parameters
    ----------
    data_dir : str, optional
        Directory holding one ``<dataset>.csv`` file per split.
    datasets : sequence of str, optional
        Split names to load, in the order they should be stacked.

    Returns
    -------
    pandas.DataFrame
        The rows of every split, in ``datasets`` order. Each split keeps the
        index it was read with, so index labels repeat across splits.

    Raises
    ------
    ValueError
        If ``datasets`` is empty (raised by ``pd.concat``).
    """
    # Read everything first and concatenate once; concatenating inside the
    # loop re-copies the accumulated rows on every iteration.
    frames = [
        pd.read_csv(os.path.join(data_dir, dataset + '.csv'), encoding='latin1')
        for dataset in datasets
    ]
    return pd.concat(frames)
# Load every split, derive the helper columns, then order the votes and key
# them by full_set_id.
df_votes = (
    get_full_set()
    .assign(
        # session * 10000 + roll_call: a single sortable key per roll call
        # (assumes roll_call stays below 10000 -- TODO confirm)
        s_roll=lambda votes: votes['session'] * 10000 + votes['roll_call'],
        # constant 1 per row, so a cumulative sum of it counts rows
        voted=1,
    )
    .sort_values(['congress', 's_roll'])
    .set_index('full_set_id')
)

### Cumulative Average of Vote-with-Party Percent

#### Overall

In [35]:
# Running per-member tallies within each congress, in df_votes row order.
# Select the two numeric columns *before* cumsum: summing every column first
# is wasted work and fails on the string columns under pandas >= 2.0.
party_cols = ['broke_from_party', 'voted']
df_party = df_votes.groupby(['congress', 'member_id'])[party_cols].cumsum()

# Share of the member's votes so far that broke from the party.
# NOTE(review): cumsum is inclusive, so each row's rate already counts the
# vote on that row -- confirm this matches "up to the vote in question".
df_party['broke_cum_pct'] = df_party.broke_from_party / df_party.voted

# Prefix every column so the features are identifiable after a later join.
df_party = df_party.add_prefix('party_')

In [47]:
# Spot-check the last few rows of the cumulative party features.
df_party.tail()

Unnamed: 0_level_0,party_broke_from_party,party_voted,party_broke_cum_pct
full_set_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1933669,0,223,0.0
1933751,1,223,0.004484
1933912,5,216,0.023148
1933633,0,211,0.0
1933681,0,221,0.0


In [46]:
# Summary statistics of the cumulative party-break rate, scaled by 100 to read
# as percentages. The scaling applies to every row of the summary, so the
# `count` entry is also multiplied by 100.
df_party.party_broke_cum_pct.describe() * 100

count    1.940619e+08
mean     5.559150e+00
std      6.369770e+00
min      0.000000e+00
25%      1.098901e+00
50%      3.735632e+00
75%      7.926829e+00
max      1.000000e+02
Name: party_broke_cum_pct, dtype: float64

In [84]:
# Preview the first rows of the vote-level frame.
df_votes.head()

Unnamed: 0_level_0,member_id,party,vote_position,congress,roll_call,chamber,session,bill_id,majority_pos_rep,majority_pos_dem,broke_from_party,s_roll,voted
full_set_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
62,L000447,R,Yes,101.0,12.0,Senate,1.0,hjres129-101,Yes,Yes,0,10012.0,1
10,B000780,D,Yes,101.0,12.0,Senate,1.0,hjres129-101,Yes,Yes,0,10012.0,1
33,F000329,D,Yes,101.0,12.0,Senate,1.0,hjres129-101,Yes,Yes,0,10012.0,1
53,K000019,R,Yes,101.0,12.0,Senate,1.0,hjres129-101,Yes,Yes,0,10012.0,1
27,D000388,D,No,101.0,12.0,Senate,1.0,hjres129-101,Yes,Yes,1,10012.0,1


In [108]:
# Distinct s_roll keys for congress 112, in order of first appearance.
roll_calls = df_votes.loc[df_votes['congress'] == 112, 's_roll'].unique()

In [114]:
# Scratch: pick the roll calls sitting at fixed fractions of the way through
# the roll_calls array.
# NOTE(review): five fractions are listed but the saved output below shows only
# four values, so that output appears to predate this version of the cell.
percentiles = [.05, .1, .25,.50,.75]
# Positions of those fractions within roll_calls (truncated toward zero).
j = [int(len(roll_calls) * x) for x in percentiles]
roll_calls[j]

array([ 10039.,  10375.,  10852.,  20195.])

### Vote with Cosponsors

Construct the cumulative vote concurrence between every pair of members of Congress. I can't think of a great way to do this without looping :-(

In [149]:
# Sanity check: a tuple holding the float 1.0 compares equal to one holding the
# int 1, so float-valued and int-valued tuple keys are interchangeable.
(1.0, 'senate') == (1, 'senate')

True

In [211]:
def vote_concurrence(df, percentiles=(.05, .1, .25, .50, .75)):
    """Snapshot cumulative pairwise vote agreement at checkpoint roll calls.

    A running tally is kept for every ordered pair of members within a
    congress. To save memory, the tallies are only recorded at a handful of
    checkpoint roll calls per (congress, chamber) rather than at every vote.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per member vote, with the columns ``congress``, ``chamber``,
        ``s_roll``, ``bill_id``, ``member_id`` and ``vote_position``. Rows are
        processed in the frame's order, so the tallies are only "cumulative up
        to this vote" if the frame is sorted chronologically.
    percentiles : sequence of float, optional
        Fractions in ``[0, 1)``. For each (congress, chamber) the distinct
        ``s_roll`` values are sorted and the one at position
        ``int(n_roll_calls * fraction)`` becomes a checkpoint.

    Returns
    -------
    defaultdict
        ``result[int(congress)][(chamber, s_roll)][(member_a, member_b)]`` is
        a mapping with the keys ``'together'`` and ``'total'``: how often the
        pair cast the same ``vote_position`` and how often they were paired,
        counted *before* the vote being processed. Only checkpoint roll calls
        appear as ``(chamber, s_roll)`` keys.

    Raises
    ------
    IndexError
        If a fraction in ``percentiles`` is 1 or larger.
    """
    # Checkpoint roll calls per (congress, chamber), stored as sets so the
    # membership test in the hot loop is O(1).
    check_points = {}
    for key, rolls in df.groupby(['congress', 'chamber'])['s_roll']:
        ordered = np.sort(np.asarray(rolls.unique()))
        picks = [int(len(ordered) * fraction) for fraction in percentiles]
        check_points[key] = set(ordered[picks])

    # Index the votes of every bill once instead of re-scanning the whole
    # frame for each row. Rows without a bill_id never matched anything in an
    # equality filter, so they are left out of the index.
    # NOTE(review): pairs are formed from all rows sharing a bill_id. If a bill
    # has several roll calls in the data, votes cast on different roll calls
    # are compared with each other -- confirm this is intended.
    votes_by_bill = defaultdict(list)
    for bill, member, position in zip(df['bill_id'], df['member_id'], df['vote_position']):
        if not pd.isna(bill):
            votes_by_bill[bill].append((member, position))

    # cumulative[congress][(a, b)] -> {'total': n, 'together': n}
    cumulative = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    # vote_conc[congress][(chamber, s_roll)][(a, b)] -> {'together': n, 'total': n}
    vote_conc = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(int))))

    rows = zip(df['congress'], df['chamber'], df['s_roll'],
               df['bill_id'], df['member_id'], df['vote_position'])
    for congress, chamber, s_roll, bill, member, position in rows:
        for other, other_position in votes_by_bill.get(bill, ()):
            if member == other:
                continue
            pair = (member, other)
            tally = cumulative[int(congress)][pair]

            # Read the counts before updating them: the snapshot must describe
            # the pair's history prior to the current vote.
            total_before = tally['total']
            together_before = tally['together']
            tally['total'] += 1
            if position == other_position:
                tally['together'] += 1

            # Only checkpoint roll calls are written to the result.
            if s_roll in check_points[(congress, chamber)]:
                snapshot = vote_conc[int(congress)][(chamber, s_roll)][pair]
                snapshot['together'] = together_before
                snapshot['total'] = total_before
    return vote_conc
                    
            


In [212]:
# Pairwise concurrence over the full vote set. The saved output below shows
# this run ended in a KeyboardInterrupt, i.e. it was stopped by hand.
member_votes_incommon = vote_concurrence(df_votes)

KeyboardInterrupt: 

In [None]:
# Completion marker -- presumably queued behind a long-running cell so its
# output signals that the earlier work has finished.
print("im done")

In [210]:
# NOTE(review): the saved output below is a Series of roll-call sets indexed by
# (congress, chamber), not the nested mapping vote_concurrence returns, so it
# looks like a leftover from a debugging run.
member_votes_incommon

congress  chamber
101.0     Senate     {10051.0, 10044.0, 10024.0, 10025.0, 10057.0, ...
Name: s_roll, dtype: object

In [178]:
# List the (chamber, s_roll) keys recorded under congress 101.
member_votes_incommon[101].keys()

dict_keys([('Senate', 10012.0), ('Senate', 10024.0), ('Senate', 10025.0), ('Senate', 10044.0), ('Senate', 10062.0)])

In [181]:
# NOTE(review): vote_concurrence, as defined in this notebook, returns the
# snapshot mapping (`vote_conc`), not the checkpoint table. The variable name
# and the saved outputs of the following cells suggest they were produced by a
# debugging revision of the function that returned `check_points` instead.
check_points = vote_concurrence(df_votes)

In [175]:
# Is s_roll 10024.0 among the entries stored under the (101, 'Senate') key?
10024.0 in check_points[(101, 'Senate')]

False

In [187]:
# Inspect the whole check_points object (saved output: one list of s_roll
# values per (congress, chamber) key).
check_points

{(101.0, 'Senate'): [10044.0, 10065.0, 10205.0, 20002.0, 20204.0],
 (102.0, 'House'): [10021.0, 10025.0, 10072.0, 20092.0, 20122.0],
 (102.0, 'Senate'): [10020.0, 10045.0, 20062.0, 20067.0, 20100.0],
 (103.0, 'House'): [10158.0, 10217.0, 10439.0, 20007.0, 20259.0],
 (103.0, 'Senate'): [10114.0, 10158.0, 10281.0, 20034.0, 20133.0],
 (104.0, 'House'): [10145.0, 10218.0, 10499.0, 10836.0, 20182.0],
 (104.0, 'Senate'): [10064.0, 10115.0, 10295.0, 10588.0, 20158.0],
 (105.0, 'House'): [10048.0, 10078.0, 10248.0, 10533.0, 20083.0],
 (105.0, 'Senate'): [10024.0, 10031.0, 10071.0, 10201.0, 20037.0],
 (106.0, 'House'): [10020.0, 10033.0, 10120.0, 10362.0, 10608.0],
 (106.0, 'Senate'): [10026.0, 10028.0, 10067.0, 10168.0, 10325.0],
 (107.0, 'House'): [10031.0, 10063.0, 10196.0, 10498.0, 20229.0],
 (107.0, 'Senate'): [10012.0, 10047.0, 10275.0, 10313.0, 20130.0],
 (108.0, 'House'): [10030.0, 10047.0, 10085.0, 10214.0, 10288.0],
 (108.0, 'Senate'): [10028.0, 10030.0, 10043.0, 10142.0, 10220.0],
 (

In [98]:
# Scratch: round n down to the nearest multiple of 100.
n = 10901
rem = n % 100  # remainder above the last multiple of 100
n - rem

10900

In [51]:
# NOTE(review): `int` is callable, so this expression is valid; the TypeError in
# the saved output does not match this code and is probably left over from an
# earlier edit of the cell.
defaultdict(int)

TypeError: first argument must be callable or None

In [54]:
# Scratch: the defaultdict class itself is accepted as a factory. The inner
# dicts it creates are built with no factory of their own, so they do not
# supply defaults.
defaultdict(defaultdict)

defaultdict(collections.defaultdict, {})

In [71]:
# Three-level auto-creating counter: the first two levels create nested
# defaultdicts on first access and the innermost level defaults to 0,
# e.g. test2['115'][('mem1', 'mem2')]['total'].
test2 = defaultdict(lambda: defaultdict(lambda: (defaultdict(int))))

In [64]:
# Four-level auto-creating counter: three nested defaultdict levels with an
# innermost level defaulting to 0,
# e.g. test['115']['bill'][('mem1', 'mem2')]['votes'].
test = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: (defaultdict(int)))))

In [61]:
# Four-level auto-creating counter whose innermost values are ints starting at
# 0. The previous factory, `defaultdict(list(int, int))`, raised TypeError the
# first time a missing key was accessed, because list() accepts at most one
# argument.
test = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(int))))

In [65]:
# Scratch: increment a counter through keys that may not exist yet, relying on
# the nested default factories to create the missing levels.
test['115']['bill'][('mem1','mem2')]['votes'] += 1

In [91]:
# Scratch: same auto-creating increment on the three-level structure.
test2['115'][('mem1','mem2')]['total'] +=1

In [93]:
# Read the pair's current 'total' counter into a plain variable.
a = test2['115'][('mem1','mem2')]['total']

In [94]:
# The saved output is 3 although the increment cell adds 1 per run, so that
# cell was apparently executed several times (execution counts are out of
# order in this notebook).
a

3

In [73]:
# Show the nested structure created so far by the auto-creating lookups.
test2

defaultdict(<function __main__.<lambda>>,
            {'115': defaultdict(<function __main__.<lambda>.<locals>.<lambda>>,
                         {('mem1', 'mem2'): defaultdict(int, {'total': 1})})})

In [67]:
# Scratch: a second counter key on the same member pair.
test['115']['bill'][('mem1','mem2')]['together'] += 1

In [82]:
# Scratch: add a previously read value (`a`) to a counter for a new pair,
# rather than incrementing by 1.
test['115']['bill'][('mem2','mem3')]['together'] += a

In [83]:
# Show the four-level structure after the scratch updates above.
test

defaultdict(<function __main__.<lambda>>,
            {'115': defaultdict(<function __main__.<lambda>.<locals>.<lambda>>,
                         {'bill': defaultdict(<function __main__.<lambda>.<locals>.<lambda>.<locals>.<lambda>>,
                                      {('mem1', 'mem2'): defaultdict(int,
                                                   {'together': 1,
                                                    'votes': 1}),
                                       ('mem2', 'mem3'): defaultdict(int,
                                                   {'together': 4})})})})