In [52]:
# Data Visualization
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

# Config Reader
import configparser

# Database Connection
import firebase_admin
from firebase_admin import credentials, firestore

In [2]:
# Read config.ini file
CONFIG_PATH = './auth/config.ini'
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so an empty result means the config file was not found.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError('Could not read config file: ' + CONFIG_PATH)

# Get Google Firebase Auth
GCP_AUTH_PATH = config.get('firebase', 'GCP_AUTH_PATH')
cred = credentials.Certificate(GCP_AUTH_PATH)

# initialize_app() raises ValueError when the default app already exists, which
# happens whenever this cell is re-run in a live kernel. Reuse the existing app
# in that case so the cell is safe to execute more than once.
try:
    app = firebase_admin.get_app()
except ValueError:
    app = firebase_admin.initialize_app(cred)

# Instantiate connection to database
db = firestore.client()

In [3]:
# Handles to the three Firestore collections used by this notebook
reps_ref, edu_ref, votes_ref = (
    db.collection(name) for name in ("reps", "edu", "votes")
)

In [267]:
# Download every document of the education and representative collections,
# turning each document into a dict and each collection into a DataFrame
degrees = pd.DataFrame(
    [snapshot.to_dict() for snapshot in edu_ref.get()]
)
reps = pd.DataFrame(
    [snapshot.to_dict() for snapshot in reps_ref.get()]
)

In [268]:
# Clean degree strings (ex. 'J.D.' -> 'JD')
# The vectorised .str accessor strips the periods and leaves missing values as
# NaN; the previous per-element x.split('.') raised AttributeError on them.
# regex=False makes '.' a literal period rather than the "any character" pattern.
degrees['degree'] = degrees['degree'].str.replace('.', '', regex=False)

In [269]:
# Lookup table used to bin degrees: each key is a degree group and each value
# lists the cleaned degree codes (periods already removed) that belong to it.
cc_dict = {
    # Secondary and undergraduate
    'Associates': ['AAS', 'AS', 'AA'],
    'Bachelors': [
        'BS', 'BA', 'SB', 'AB', 'BDiv', 'BBA', 'BEng',
        'BM', 'ALB', 'BSN', 'BGS', 'BPA', 'BSBA', 'LLB',
    ],
    'High School': ['HS'],

    # Law
    'JD': ['JD'],

    # Masters level
    'Masters - General': ['MA', 'MS', 'SM', 'MSc', 'MFA', 'MAcc'],
    'Masters - Public': [
        'MIA', 'MPA', 'MUP', 'MPP',
        'MSW', 'MSS', 'MPH', 'MHS',
    ],
    'Masters - Education': ['MEd', 'SYC'],
    'Masters - Law': ['LLM'],
    'Masters - Theology': ['MDiv', 'ThM'],
    'MBA': ['MBA', 'MSEM'],

    # Doctoral and professional
    'PHD': ['PhD'],
    'Veterinary': ['DVM'],
    'Dental': ['DDS', 'DMD'],
    'MD': ['MD', 'DPM'],
    'PHD - Education': ['EdD'],
    'PHD - Theology': ['DMin'],
    'PHD - Public': ['DPA'],
    'Nursing': ['MSN'],
}

In [270]:
# Bin degrees
# Invert cc_dict into a flat {degree code: group name} lookup. setdefault keeps
# the first group that lists a code, matching np.select's first-match rule.
degree_to_group = {}
for group, codes in cc_dict.items():
    for code in codes:
        degree_to_group.setdefault(code, group)

# Series.map performs the lookup in a single vectorised pass. Codes that are
# missing from cc_dict (or missing altogether) are labelled 'Other' explicitly:
# np.select's implicit default of 0 mixed an integer into a column of strings,
# which NumPy either renders as '0' or rejects with a TypeError, depending on
# the version.
degrees['degree_group'] = degrees['degree'].map(degree_to_group).fillna('Other')

In [388]:
# Collapse the one-row-per-degree table into one list per representative
# (_id) for each of the three degree attributes
institutions, deg_groups, degs = (
    degrees.groupby('_id')[column].apply(list)
    for column in ('institution', 'degree_group', 'degree')
)

In [402]:
# Build the per-representative frame: selected representative columns plus
# the three per-_id list Series, each attached with a left join so that
# representatives without degree records are kept
df = (
    reps[[
        '_id', 'current_party', 'first_name', 'middle_name',
        'last_name', 'dob', 'gender', 'congresses',
    ]]
    .merge(institutions, how='left', on='_id')
    .merge(deg_groups, how='left', on='_id')
    .merge(degs, how='left', on='_id')
)

# Missing middle names become empty strings
df['middle_name'] = np.where(df['middle_name'].notna(), df['middle_name'], '')

# Flag representatives whose 'congresses' include the 117th Congress
df['in_office'] = df['congresses'].map({'117'}.issubset)

In [405]:
# Rows whose 'degree' value is missing
df.loc[df['degree'].isna()]

Unnamed: 0,_id,current_party,first_name,middle_name,last_name,dob,gender,congresses,institution,degree_group,degree
19,B001224,D,Cori,,Bush,1976-07-21 00:00:00+00:00,F,[117],,,
49,B001306,R,Troy,,Balderson,1962-01-16 00:00:00+00:00,M,"[117, 116, 115]",,,
209,H001084,R,Yvette,,Herrell,1964-03-16 00:00:00+00:00,F,[117],,,
273,L000576,R,Billy,,Long,1955-08-11 00:00:00+00:00,M,"[117, 116, 115, 114, 113, 112]",,,
339,N000179,D,Grace,F.,Napolitano,1936-12-04 00:00:00+00:00,F,"[117, 116, 115, 114, 113, 112, 111, 110, 109, ...",,,
435,S001199,R,Lloyd,,Smucker,1964-01-23 00:00:00+00:00,M,"[117, 116, 115]",,,


In [381]:

# NOTE(review): this cell originally began with `df = .merge(reps, ...)`, which
# is a syntax error (the left operand is missing). `df` is already built from
# `reps` in the "Create merged DataFrame" cell, so merging `reps` in again would
# only duplicate its columns. The merge is dropped and only the sort is kept --
# confirm nothing else was intended here.
df = df.sort_values('_id')



In [380]:
# One row per representative, with the 'degree_group' values of each group
# collected into a list.
# NOTE(review): df1 is not referenced by any later cell visible here, and the
# rename below targets `df`, not `df1` -- confirm which frame was intended.
df1 = (
    df.groupby(by=[
        '_id', 'first_name', 'middle_name', 'last_name',
        'current_party', 'dob', 'gender', 'in_office',
    ])['degree_group']
    .apply(list)
    .reset_index()
)

# Expose the binned degrees on `df` under the column name 'degrees'
df.rename(columns={'degree_group': 'degrees'}, inplace=True)

In [376]:
# Check results
# Sums the boolean 'in_office' flags, i.e. counts the rows flagged as in office.
# Number of US Reps in office: 435
df['in_office'].sum()

435

In [377]:
# Display the current contents of df
df

Unnamed: 0,_id,first_name,middle_name,last_name,current_party,dob,gender,in_office,0
0,A000055,Robert,B.,Aderholt,R,1965-07-22 00:00:00+00:00,M,True,"[degree_group, institution]"
1,A000148,Jake,,Auchincloss,D,1988-01-29 00:00:00+00:00,M,True,"[degree_group, institution]"
2,A000367,Justin,,Amash,I,1980-04-18 00:00:00+00:00,M,False,"[degree_group, institution]"
3,A000369,Mark,,Amodei,R,1958-06-12 00:00:00+00:00,M,True,"[degree_group, institution]"
4,A000370,Alma,,Adams,D,1946-05-27 00:00:00+00:00,F,True,"[degree_group, institution]"
...,...,...,...,...,...,...,...,...,...
497,W000827,Ron,,Wright,R,1953-04-08 00:00:00+00:00,M,True,"[degree_group, institution]"
498,Y000033,Don,,Young,R,1933-06-09 00:00:00+00:00,M,True,"[degree_group, institution]"
499,Y000062,John,,Yarmuth,D,1947-11-04 00:00:00+00:00,M,True,"[degree_group, institution]"
500,Y000065,Ted,,Yoho,R,1955-04-13 00:00:00+00:00,M,False,"[degree_group, institution]"


In [369]:
class QueryData():
    '''
    Query DataFrame Object, allowing list object queries

    A column is treated as a "list column" when at least one of its cells
    holds a Python list. Cells of a list column that are not lists (for
    example NaN left behind by a left merge) are treated as empty lists.

    Parameters
    ----------
    df - Pandas DataFrame
    '''

    def __init__(self, df):
        # Work on a 0..n-1 index regardless of how the caller's frame is indexed
        self.data = df.reset_index(drop=True)

    def _is_list_column(self, column):
        '''
        Tell whether a column stores lists

        Every cell is inspected rather than only the first row, so the check
        also works on an empty frame and on columns whose first cell is missing.

        Parameters
        ----------
        column - column name

        Returns
        -------
        True if any cell of the column is a list, else False
        '''

        return bool(self.data[column].map(lambda cell: isinstance(cell, list)).any())

    def query_pipeline(self, pipeline=None):
        '''
        Query data with pipeline

        Filters are applied one after another, so a row is returned only if
        it satisfies every (field, value) pair.

        For a list column the row matches when every queried value is
        contained in the cell's list; a single value is treated as a
        one-element list. For any other column the row matches when the cell
        equals the value.

        Parameters
        ----------
        pipeline - list of comparison tuples (field, value); None or an empty
                   list returns the data unfiltered

        Returns
        -------
        DataFrame with query results
        '''

        result = self.data
        # `pipeline or []` replaces the former mutable default argument
        for c, v in (pipeline or []):
            if self._is_list_column(c):
                wanted = set(v) if isinstance(v, list) else {v}
                # Non-list cells (e.g. NaN) count as empty lists so that
                # issubset never receives a non-iterable value
                mask = result[c].map(
                    lambda cell: wanted.issubset(cell if isinstance(cell, list) else ())
                )
            else:
                mask = result[c].map(lambda cell: cell == v)

            # astype(bool) keeps the mask boolean even when `result` is empty;
            # a non-boolean mask would be read as a list of column labels
            result = result[mask.astype(bool)].reset_index(drop=True)

        return result

    def get_col_vals(self, columns=None, sort=False):
        '''
        Get list of possible values for query

        For a list column the values are the distinct elements found inside
        the lists, in first-seen order; for any other column they are the
        column's unique values.

        Parameters
        ----------
        columns - list of query columns; None or an empty list returns {}
        sort - bool to return sorted lists

        Returns
        -------
        Dictionary of field and list of values
        '''

        values = {}
        for c in (columns or []):
            if self._is_list_column(c):
                # dict keys de-duplicate in constant time per element while
                # preserving first-seen order
                seen = {}
                for cell in self.data[c]:
                    if isinstance(cell, list):
                        seen.update(dict.fromkeys(cell))
                vals = list(seen)
            else:
                vals = list(self.data[c].unique())

            values[c] = sorted(vals) if sort else vals

        return values

In [370]:
# Wrap df in a QueryData helper for the queries below
query = QueryData(df)

In [371]:
# List the sorted values available for filtering on each of these columns
query.get_col_vals(['degrees', 'in_office', 'gender'], sort=True)

{'degrees': ['Associates',
  'Bachelors',
  'Dental',
  'High School',
  'JD',
  'MBA',
  'MD',
  'Masters - Education',
  'Masters - General',
  'Masters - Law',
  'Masters - Public',
  'Masters - Theology',
  'Nursing',
  'PHD',
  'PHD - Education',
  'PHD - Public',
  'PHD - Theology',
  'Veterinary'],
 'in_office': [False, True],
 'gender': ['F', 'M']}

In [328]:
# Male representatives whose degrees include a PhD.
# The degree filter was an empty string, which only matches rows whose
# 'degrees' list contains '' -- the recorded output of this cell (every row
# contains 'PHD') shows that the 'PHD' group was the intended filter.
query.query_pipeline([('degrees', 'PHD'), ('gender', 'M')])

Unnamed: 0,_id,first_name,middle_name,last_name,current_party,dob,gender,in_office,degrees
0,B001307,James,,Baird,R,1945-06-04 00:00:00+00:00,M,True,"[PHD, Bachelors, Masters - General]"
1,C001053,Tom,,Cole,R,1949-04-28 00:00:00+00:00,M,True,"[PHD, Bachelors, Masters - General]"
2,C001063,Henry,,Cuellar,D,1955-09-19 00:00:00+00:00,M,True,"[Masters - General, Bachelors, Associates, PHD..."
3,D000096,Danny,K.,Davis,D,1941-09-06 00:00:00+00:00,M,True,"[Masters - General, PHD, Bachelors]"
4,F000454,Bill,,Foster,D,1955-10-07 00:00:00+00:00,M,True,"[Masters - General, Bachelors, PHD]"
5,G000579,Mike,,Gallagher,R,1984-03-03 00:00:00+00:00,M,True,"[PHD, Masters - General, Bachelors, Masters - ..."
6,K000381,Derek,,Kilmer,D,1974-01-01 00:00:00+00:00,M,True,"[Masters - General, PHD, Bachelors]"
7,L000563,Daniel,,Lipinski,D,1966-07-15 00:00:00+00:00,M,False,"[Bachelors, PHD, Masters - General]"
8,L000565,Dave,,Loebsack,D,1952-12-23 00:00:00+00:00,M,False,"[Bachelors, Masters - General, PHD]"
9,L000579,Alan,,Lowenthal,D,1941-03-08 00:00:00+00:00,M,True,"[Masters - General, PHD, Bachelors]"
