# Data Engineering for Databases

In [30]:
import numpy as np
import pandas as pd
import pickle

In [91]:
# Load the roll-call vote records.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('all_votes.p', 'rb') as votes_file:
    votes = pickle.load(votes_file)

# Load the senator records.
with open('temp_senators.p', 'rb') as senators_file:
    senators = pickle.load(senators_file)

## Preparing data for database insertion

### Table for senate members
- SEN_ID TEXT PRIMARY KEY NOT NULL
- F_NAME TEXT NOT NULL
- L_NAME TEXT NOT NULL
- PARTY TEXT NOT NULL
- GENDER TEXT NOT NULL
- STATE TEXT NOT NULL

In [92]:
# Build the senators frame and expose the 'id' column under the name 'sen_id'.
s_df = pd.DataFrame(senators).rename(columns={'id': 'sen_id'})

In [93]:
# Preview the first rows: the frame is already in the format for insertion into
# the SQL database (missing gender values are dealt with below).
s_df.head()

Unnamed: 0,sen_id,first_name,last_name,party,gender,state
0,A000031,Brockman,Adams,D,,WA
1,A000069,Daniel,Akaka,D,M,HI
2,A000219,William,Armstrong,R,M,CO
3,B000243,Max,Baucus,D,M,MT
4,B000401,Lloyd,Bentsen,D,,TX


#### Check gender self-identification
Determine senator gender self-identification to replace missing values: F - Female, M - Male, N - Non-Binary.

In [94]:
# List the senators whose gender value is missing (NaN) so they can be filled in
s_df[s_df['gender'].isnull()]

Unnamed: 0,sen_id,first_name,last_name,party,gender,state
0,A000031,Brockman,Adams,D,,WA
4,B000401,Lloyd,Bentsen,D,,TX
9,B000647,Rudolph,Boschwitz,R,,MN
14,B001077,Quentin,Burdick,D,,ND
22,C000877,Alan,Cranston,D,,CA
27,D000366,Alan,Dixon,D,,IL
34,F000329,Wyche,Fowler,D,,GA
49,H000951,Gordon,Humphrey,R,,NH
66,M000250,Spark,Matsunaga,D,,HI
79,P000513,Larry,Pressler,R,,SD


In [95]:
# Row position in s_df -> gender code to fill in for that senator.
# NOTE(review): position 95 is not in the missing-gender listing displayed above - confirm it is intended.
gender = dict.fromkeys((0, 4, 9, 14, 22, 27, 34, 49, 66, 79, 95), 'M')

In [96]:
# Correct data: write each gender code into s_df at the given row position.
# The previous form, `s_df.iloc[k]['gender'] = v`, is chained indexing: the
# assignment targets the intermediate row object returned by `.iloc[k]`, which
# may be a copy, so s_df is not guaranteed to change (and never does under
# pandas Copy-on-Write). A single `.iloc[row, column]` assignment always
# writes into the frame itself.
gender_col = s_df.columns.get_loc('gender')
for row_pos, value in gender.items():
    s_df.iloc[row_pos, gender_col] = value

### Table for bills and table for votes
#### Bills
- CSR_ID TEXT PRIMARY KEY NOT NULL (unique primary key constructed as 'congress.session.roll_call')
- CONGRESS INT NOT NULL
- SESSION INT NOT NULL
- DATE DATE NOT NULL
- ROLL_CALL INT NOT NULL

#### Votes
- ID INT PRIMARY KEY NOT NULL AUTO INCREMENT
- SEN_ID TEXT FOREIGN KEY NOT NULL
- CSR_ID TEXT FOREIGN KEY NOT NULL
- POSITION TEXT NOT NULL

In [99]:
# Some 'bills' are not bills but confirmations, treaty votes, etc. and will be dropped.
# Look at the bill_id of the first vote record as an example.
votes[0]['bill_id']

'-101'

In [100]:
# Integer bill_ids are codes for special votes as mentioned above
def _is_integer_code(bill_id):
    """Return True when ``bill_id`` can be parsed by ``int()``.

    Only the errors ``int()`` raises for unparseable input (ValueError for a
    non-numeric string, TypeError for a non-string/non-number such as None)
    are treated as "not an integer code". The earlier bare ``except:`` also
    swallowed unrelated errors such as KeyboardInterrupt or a missing key.
    """
    try:
        int(bill_id)
    except (TypeError, ValueError):
        return False
    return True


# Keep only the votes whose bill_id is NOT an integer code, preserving order.
new_votes = [vote for vote in votes if not _is_integer_code(vote['bill_id'])]

In [101]:
# Number of vote records before and after dropping the special votes
print(len(votes), len(new_votes), sep='\n')

8949
8401


In [102]:
# Rows for the bills table: one dict per vote record in new_votes
bills = []

# Rows for the votes table: one dict per entry in each vote's 'positions'
sen_votes = []

for vote in new_votes:
    # Key of the form 'congress.session.roll_call'; also stored back on the vote record
    csr_id = f'{vote["congress"]}.{vote["session"]}.{vote["roll_call"]}'
    vote['csr_id'] = csr_id

    # Copy the bill-level fields, in table column order
    bills.append({
        field: vote[field]
        for field in ('csr_id', 'congress', 'session', 'roll_call', 'bill_id', 'date')
    })

    # One row per member position, linked to the bill through csr_id
    sen_votes.extend(
        {
            'sen_id': position['member_id'],
            'csr_id': csr_id,
            'position': position['vote_position'],
        }
        for position in vote['positions']
    )

In [103]:
# Preview the votes-table rows (sen_id, csr_id, position) as a DataFrame
pd.DataFrame(sen_votes)

Unnamed: 0,sen_id,csr_id,position
0,A000031,101.1.11,Yes
1,A000219,101.1.11,Yes
2,B000243,101.1.11,Yes
3,B000401,101.1.11,Yes
4,B000444,101.1.11,Yes
...,...,...,...
839168,W000817,116.2.140,No
839169,W000802,116.2.140,Yes
839170,W000437,116.2.140,Yes
839171,W000779,116.2.140,No


In [106]:
# Preview the bills-table rows (csr_id, congress, session, roll_call, bill_id, date) as a DataFrame
pd.DataFrame(bills)

Unnamed: 0,csr_id,congress,session,roll_call,bill_id,date
0,101.1.11,101,1,11,s.j.res.7-101,1989-02-02
1,101.1.12,101,1,12,h.j.res.129-101,1989-02-07
2,101.1.14,101,1,14,s.res.66-101,1989-02-28
3,101.1.15,101,1,15,s.res.66-101,1989-02-28
4,101.1.24,101,1,24,s.20-101,1989-03-16
...,...,...,...,...,...,...
8396,116.2.136,116,2,136,s4049-116,2020-07-22
8397,116.2.137,116,2,137,s4049-116,2020-07-22
8398,116.2.138,116,2,138,s4049-116,2020-07-23
8399,116.2.139,116,2,139,s4049-116,2020-07-23
