# DF Construction and EDA

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

### Helper functions

In [7]:
def party_code_to_letter(party_code):
    '''
    Translate a Voteview numeric party code into a one-letter party label.

    Recognised codes are 100 -> 'D', 200 -> 'R' and 328 -> 'I'.
    Any other value yields NaN.
    '''
    known_codes = ((100, 'D'), (200, 'R'), (328, 'I'))
    for code, letter in known_codes:
        if party_code == code:
            return letter
    # unrecognised party code
    return np.nan

In [8]:
def parse_senator_names(name, name_type):
    '''
    Extract a senator's last or first name from a "LAST, First ..." style name.

    Parameters
    ----------
    name : str
        Full name with the surname before the first ", ",
        e.g. "McCAIN, John Sidney, III".
    name_type : str
        'last' to get the surname, 'first' to get the first given name.

    Returns
    -------
    str or float
        The requested name part passed through str.capitalize(), so every
        letter after the first is lower-cased ("McCAIN" -> "Mccain").
        For a multi-word surname only the final word is returned.
        NaN is returned when name_type is not 'last'/'first', when name is
        not a string, or when the first name is requested but name has no
        ", " separator.
    '''
    if name_type not in ('last', 'first'):
        return np.nan
    if not isinstance(name, str):
        # e.g. a missing value (NaN) in the source column
        return np.nan

    surname_part, separator, given_part = name.partition(', ')

    if name_type == 'last':
        return surname_part.split(' ')[-1].capitalize()

    if not separator:
        # no ", " in the name, so there is nothing to take a first name from
        return np.nan
    # strip remaining commas (e.g. before a suffix) and keep the first given name
    return given_part.replace(',', '').split(' ')[0].capitalize()

### Making the DataFrame

In [4]:
# first and last Congress to load; the loading loops treat this range as inclusive
start_congress, current_congress = 113, 116

In [22]:
#DataFrame for all votes cast in 113th through current congress
# Read every per-congress file first and concatenate once; growing a DataFrame
# with pd.concat inside the loop re-copies all previously loaded rows each pass.
votes_frames = [
    pd.read_csv('votes/S{}_votes.csv'.format(congress))
    for congress in range(start_congress, current_congress + 1)
]
votes_df = pd.concat(votes_frames, ignore_index=True).drop('prob', axis=1)

#drop votes belonging to the vice president
votes_to_drop = votes_df[votes_df['icpsr'].isin([99911, 99912])].index
votes_df = votes_df.drop(index=votes_to_drop)

display(votes_df.head())
display(votes_df.tail())
# .info() prints its own report and returns None, so wrapping it in display()
# only adds a stray "None" to the cell output
votes_df.info()

Unnamed: 0,congress,chamber,rollnumber,icpsr,cast_code
0,113,Senate,1,14009,1
1,113,Senate,1,14203,1
2,113,Senate,1,14226,1
3,113,Senate,1,14230,1
4,113,Senate,1,14307,1


Unnamed: 0,congress,chamber,rollnumber,icpsr,cast_code
227354,116,Senate,508,49300,1
227355,116,Senate,508,49308,1
227356,116,Senate,508,49703,1
227357,116,Senate,508,49706,1
227358,116,Senate,508,94659,1


<class 'pandas.core.frame.DataFrame'>
Int64Index: 226585 entries, 0 to 227358
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   congress    226585 non-null  int64 
 1   chamber     226585 non-null  object
 2   rollnumber  226585 non-null  int64 
 3   icpsr       226585 non-null  int64 
 4   cast_code   226585 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 10.4+ MB


None

In [23]:
#make DataFrame of all senators from starting congress to current congress
# Read every per-congress file first and concatenate once; growing a DataFrame
# with pd.concat inside the loop re-copies all previously loaded rows each pass.
member_frames = [
    pd.read_csv('members/S{}_members.csv'.format(congress))
    for congress in range(start_congress, current_congress + 1)
]
member_df = pd.concat(member_frames, ignore_index=True)

#get rid of presidents from senate list
president_indexes = member_df[member_df['chamber'] == 'President'].index.values
member_df = member_df.drop(index=president_indexes)

#change the party code to a letter (Republican - R, Democrat - D, Independent - I)
member_df['party'] = member_df['party_code'].apply(party_code_to_letter)

#separate out last name and first name (middle names/suffixes are discarded)
member_df['last_name'] = member_df['bioname'].apply(lambda x: parse_senator_names(x, 'last'))
member_df['first_name'] = member_df['bioname'].apply(lambda x: parse_senator_names(x, 'first'))

cols_to_drop = ['chamber', 'state_icpsr', 'district_code', 'party_code', 'occupancy', 'last_means', 'died', 
                'nominate_log_likelihood', 'nominate_geo_mean_probability', 'nominate_number_of_votes',
                'nominate_number_of_errors', 'conditional', 'nokken_poole_dim1', 'nokken_poole_dim2']
member_df = member_df.drop(cols_to_drop, axis=1)

display(member_df.head())
display(member_df.tail())
# .info() prints its own report and returns None, so wrapping it in display()
# only adds a stray "None" to the cell output
member_df.info()

Unnamed: 0,congress,icpsr,state_abbrev,bioname,bioguide_id,born,nominate_dim1,nominate_dim2,party,last_name,first_name
1,113,49700,AL,"SESSIONS, Jefferson Beauregard III (Jeff)",S001141,1946,0.549,0.13,R,Sessions,Jefferson
2,113,94659,AL,"SHELBY, Richard C.",S000320,1934,0.428,0.514,R,Shelby,Richard
3,113,40300,AK,"MURKOWSKI, Lisa",M001153,1957,0.21,-0.289,R,Murkowski,Lisa
4,113,40900,AK,"BEGICH, Mark",B001265,1962,-0.235,0.15,D,Begich,Mark
5,113,15039,AZ,"McCAIN, John Sidney, III",M000303,1936,0.381,-0.626,R,Mccain,John


Unnamed: 0,congress,icpsr,state_abbrev,bioname,bioguide_id,born,nominate_dim1,nominate_dim2,party,last_name,first_name
408,116,40915,WV,"MANCHIN, Joe, III",M001183,1947,-0.055,0.444,D,Manchin,Joe
409,116,29940,WI,"BALDWIN, Tammy",B001230,1962,-0.498,-0.179,D,Baldwin,Tammy
410,116,41111,WI,"JOHNSON, Ron",J000293,1955,0.603,-0.288,R,Johnson,Ron
411,116,40707,WY,"BARRASSO, John A.",B001261,1952,0.539,0.237,R,Barrasso,John
412,116,49706,WY,"ENZI, Michael B.",E000285,1944,0.544,0.192,R,Enzi,Michael


<class 'pandas.core.frame.DataFrame'>
Int64Index: 411 entries, 1 to 412
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   congress       411 non-null    int64  
 1   icpsr          411 non-null    int64  
 2   state_abbrev   411 non-null    object 
 3   bioname        411 non-null    object 
 4   bioguide_id    411 non-null    object 
 5   born           411 non-null    int64  
 6   nominate_dim1  411 non-null    float64
 7   nominate_dim2  411 non-null    float64
 8   party          411 non-null    object 
 9   last_name      411 non-null    object 
 10  first_name     411 non-null    object 
dtypes: float64(2), int64(3), object(6)
memory usage: 38.5+ KB


None

In [24]:
# Attach each senator's details to every vote they cast. A (congress, icpsr)
# pair is expected to match at most one member row; validate makes pandas raise
# a MergeError instead of silently duplicating vote rows if that stops holding.
votes_and_members_df = votes_df.merge(
    member_df,
    how='left',
    on=['congress', 'icpsr'],
    validate='many_to_one',
)

display(votes_and_members_df.head())
# .info() prints its own report and returns None, so wrapping it in display()
# only adds a stray "None" to the cell output
votes_and_members_df.info()

Unnamed: 0,congress,chamber,rollnumber,icpsr,cast_code,state_abbrev,bioname,bioguide_id,born,nominate_dim1,nominate_dim2,party,last_name,first_name
0,113,Senate,1,14009,1,MS,"COCHRAN, William Thad",C000567,1937,0.287,0.051,R,Cochran,William
1,113,Senate,1,14203,1,MT,"BAUCUS, Max Sieben",B000243,1941,-0.212,0.017,D,Baucus,Max
2,113,Senate,1,14226,1,IA,"GRASSLEY, Charles Ernest",G000386,1933,0.346,-0.065,R,Grassley,Charles
3,113,Senate,1,14230,1,IA,"HARKIN, Thomas Richard (Tom)",H000206,1939,-0.351,-0.546,D,Harkin,Thomas
4,113,Senate,1,14307,1,VT,"LEAHY, Patrick Joseph",L000174,1940,-0.361,-0.129,D,Leahy,Patrick


<class 'pandas.core.frame.DataFrame'>
Int64Index: 226585 entries, 0 to 226584
Data columns (total 14 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   congress       226585 non-null  int64  
 1   chamber        226585 non-null  object 
 2   rollnumber     226585 non-null  int64  
 3   icpsr          226585 non-null  int64  
 4   cast_code      226585 non-null  int64  
 5   state_abbrev   226585 non-null  object 
 6   bioname        226585 non-null  object 
 7   bioguide_id    226585 non-null  object 
 8   born           226585 non-null  int64  
 9   nominate_dim1  226585 non-null  float64
 10  nominate_dim2  226585 non-null  float64
 11  party          226585 non-null  object 
 12  last_name      226585 non-null  object 
 13  first_name     226585 non-null  object 
dtypes: float64(2), int64(5), object(7)
memory usage: 25.9+ MB


None