# EDA for Dashboard

In [2]:
import numpy as np
import pandas as pd
import pickle
import re

import psycopg2
import configparser
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

import plotly.graph_objects as go
import plotly.express as px

In [None]:
# Set up connection to AWS RDS
# Connection settings (including credentials) are read from the [aws] section
# of ../config.ini rather than being hard-coded in the notebook.
config1 = configparser.ConfigParser()
config1.read('../config.ini')
ENDPOINT = config1.get('aws', 'ENDPOINT')
PORT = config1.get('aws', 'PORT')  # configparser returns this as a string
USR = config1.get('aws', 'USER')
PWD = config1.get('aws', 'PASSWORD')
DB = config1.get('aws', 'DATABASE')

In [None]:
# Read csv file to DataFrame; the first CSV column becomes the index
# NOTE(review): rows appear to be senators and columns bills (that is how
# df.index and df.columns are used further down) — confirm against votes.csv
df = pd.read_csv('votes.csv', index_col=0)

# Convert 'not voting' and 'present' votes to NaN values
# (every cell equal to 0 becomes NaN; all other values are kept unchanged)
df = pd.DataFrame(np.where(df == 0, np.nan, df), index=df.index, columns=df.columns)

## Processing Data for Analysis
Metrics for voting similarity (how often 2 senators are in agreement) and principal component analysis visualizations will be created.  In this section, the EC2 instance will process the information and then pass the data in the form of csv files to be read by the Heroku app.

### Senator similarity by agreement/(disagree+agree) over the past 20 years

In [None]:
# Function to return agreement/total votes
def vote_sim(v1, v2):
    """Return the fraction of compared votes on which two records agree.

    Parameters
    ----------
    v1, v2 : array-like of numbers, same length
        Vote records compared position by position.
        NOTE(review): the formula only yields "agreements / total" when votes
        are encoded as +1 / -1 and missing votes were removed beforehand —
        confirm against the vote data.

    Returns
    -------
    float
        Agreements divided by the number of votes compared, or NaN when there
        are no votes to compare.
    """
    first = np.asarray(v1, dtype=float)
    second = np.asarray(v2, dtype=float)

    # Two records without any common votes have no defined similarity; report
    # NaN instead of failing with a division by zero.
    if len(first) == 0:
        return np.nan

    # |a - b| / 2 is 0 for equal votes and 1 for opposite (+1 / -1) votes, so
    # |that - 1| scores 1 per agreement and 0 per disagreement.
    agreement = np.abs(np.abs(first - second) / 2 - 1)
    return float(agreement.sum() / len(first))

In [None]:
# Create matrix for voting similarity between senators
def similarity_matrix(df):
    """Build the pairwise voting-similarity matrix for the rows of *df*.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per senator and one column per bill; NaN marks a bill the
        senator has no vote on.

    Returns
    -------
    numpy.ndarray
        Square array ordered like ``df.index``.  Entry ``[i][j]`` is
        ``vote_sim`` over the bills both senators have a vote on, rounded to
        two decimals.  The diagonal, and every pair without a bill in common,
        is NaN.
    """
    count = len(df.index)

    # Start from all-NaN so the diagonal and pairs without overlap need no
    # separate assignment.
    sim_mat = np.full((count, count), np.nan)
    for i in range(count):
        # vote_sim(a, b) == vote_sim(b, a), so score each pair once and mirror.
        for j in range(i + 1, count):
            # Positional row access stays correct even if two rows share an
            # index label.
            shared = df.iloc[[i, j]].dropna(axis=1)
            if shared.shape[1] == 0:
                # No common bills: leave NaN rather than asking vote_sim to
                # score an empty record.
                continue
            sim = vote_sim(shared.iloc[0], shared.iloc[1])
            sim_mat[i][j] = sim_mat[j][i] = round(sim, 2)
    return sim_mat

In [None]:
# Pairwise similarity for every senator in df, labelled by senator on both axes
sim_mat = similarity_matrix(df)
sim_df = pd.DataFrame(sim_mat, index=df.index, columns=df.index)

In [None]:
# Preview the first rows of the similarity table
sim_df.head()

In [None]:
# Write voting similarity matrix as csv
# (this file is read back as sim_df in the selection-functions section)
sim_df.to_csv('voting_sim.csv')

In [None]:
# Retrieve most recent congress number
def get_congress_number():
    """Return the highest ``congress`` value stored in the ``bills`` table.

    Opens a new database connection from the module-level ENDPOINT, USR, PWD,
    PORT and DB settings, runs the query, and closes the connection again.

    Returns
    -------
    The ``MAX(congress)`` value as delivered by the database driver.
    """
    conn = psycopg2.connect(
        host=ENDPOINT,
        user=USR,
        password=PWD,
        port=PORT,
        database=DB
    )

    # try/finally guarantees the cursor and connection are released even when
    # the query raises.
    try:
        conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
        cursor = conn.cursor()
        try:
            cursor.execute(
                """
        SELECT MAX(congress) FROM bills
        ;
        """
            )
            return cursor.fetchone()[0]
        finally:
            cursor.close()
    finally:
        conn.close()

In [None]:
# Get list of bills from congress (current congress or a given congress number)
def get_bills_list(cong_number=None, current=True, votes=None):
    """Return the vote-table columns that belong to one congress.

    Parameters
    ----------
    cong_number : int or str, optional
        Congress to select.  Only used when ``current`` is false.
    current : bool, default True
        When true, ignore ``cong_number`` and use ``get_congress_number()``.
    votes : pandas.DataFrame, optional
        Table whose column labels are searched.  Defaults to the module-level
        ``df``.

    Returns
    -------
    list
        Column labels that start with the congress number, in column order.
        NOTE(review): this is a plain prefix test, so e.g. ``11`` would also
        match columns of congress ``116`` — confirm the column naming scheme.
    """
    congress = get_congress_number() if current else cong_number
    if votes is None:
        votes = df  # fall back to the notebook's global vote table

    prefix = str(congress)
    return [col for col in votes.columns if str(col).startswith(prefix)]

In [None]:
# `_` only exists inside IPython (it holds the previous cell's output), so pass
# an explicit placeholder instead; the argument is ignored while current=True.
bills = get_bills_list(None)
# Independent copy of just those bills, so later edits do not touch df
df_116 = df[bills].copy(deep=True)

### Dealing with NaN values for EDA and Data Viz
In the current congress (116th), 179 bills have been voted on as of August 6th, 2020.  NaN values can be handled either by dropping the few bills that had non-participating members or by replacing the NaN values with 0.  However, one senator only started in the 2nd session of the current congress (with 72 total votes thus far) and must be dropped in order to perform meaningful principal component analysis (PCA) and similarity comparisons for this particular congress.

In [None]:
# Per-senator summary statistics (transposed so that each senator is a column);
# the count row shows how many non-NaN votes each senator has.
df_116.T.describe() # Drop Kelly Loeffler, only started in 2nd session of 116th Congress

In [None]:
# Remove Kelly Loeffler's row in place (her record covers only part of this congress)
df_116.drop(index='Kelly Loeffler', inplace=True)

In [None]:
# Voting similarity restricted to the selected congress's bills.  Computed while
# missing votes are still NaN, so they are excluded pair by pair.
sim_116 = pd.DataFrame(similarity_matrix(df_116), index=df_116.index, columns=df_116.index)

In [None]:
# Convert NaN values to 0
# (PCA and KMeans below cannot be fitted on data that contains NaN)
df_116 = df_116.fillna(0)

### Getting Senator Info

In [None]:
# Function to get senator information
def senator_info():
    """Return every row of the ``senators`` table.

    Opens a new database connection from the module-level ENDPOINT, USR, PWD,
    PORT and DB settings, runs the query, and closes the connection again.

    Returns
    -------
    list
        All rows returned by ``SELECT * FROM senators``, as delivered by
        ``cursor.fetchall()``.
    """
    conn = psycopg2.connect(
        host=ENDPOINT,
        user=USR,
        password=PWD,
        port=PORT,
        database=DB
    )

    # try/finally guarantees the cursor and connection are released even when
    # the query raises.
    try:
        conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
        cursor = conn.cursor()
        try:
            cursor.execute(
                """
        SELECT * FROM senators
        ;
        """
            )
            # Named `rows` so the local does not shadow this function's name.
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        conn.close()
    return rows

In [None]:
# Fetch all senator rows once; fields come in the column order listed below
sen_info = senator_info() # sen_id, f_name, l_name, party, gender, state

## Clustering and PCA for visualization

In [None]:
# Number of bills voted on per senator
# Counted on the full vote table `df` (non-NaN entries in the senator's row),
# for the senators that remain in df_116 and in df_116.index order.
sen_length = pd.Series(
    [len(df.loc[senator].dropna()) for senator in df_116.index],
    name='voting_length',
)

In [None]:
# 2 component dimensionality
# Project each senator's vote vector (one row of df_116) onto two principal
# components; X holds one (x, y) row per senator, in df_116.index order.
pca = PCA(2)
X = pca.fit_transform(df_116)

In [None]:
# Clustering algorithm
# KMeans with 5 clusters, fitted on the full vote vectors in df_116 (not on the
# 2-D PCA output X).
# NOTE(review): no random_state is passed, so cluster numbering may differ
# between runs — confirm whether reproducibility matters here.
cl_algo = KMeans(5)
labels = cl_algo.fit_predict(df_116)
# Senator names in the same order as `labels` and `X`
sen_name = pd.Series(df_116.index)

In [None]:
# DataFrame for x, y data
# Naming the Series makes join() add it as a 'name' column; rows line up by
# position because X and sen_name both follow df_116.index order.
sen_name.name = 'name'
df_xy = pd.DataFrame(X, columns=['x', 'y']).join(sen_name)
df_xy.set_index('name', inplace=True)

In [None]:
# DataFrame for labels
# Two rows (names, cluster labels) transposed into two columns, then indexed
# by senator name.
df_lbs = pd.DataFrame([sen_name, labels]).T
df_lbs.columns = ['name', 'label']
df_lbs.set_index('name', inplace=True)

In [None]:
# DataFrame for senator info
# sen_info rows are (sen_id, f_name, l_name, party, gender, state), so the
# integer column labels 0..5 follow that order.
df_plot = pd.DataFrame(sen_info)
df_plot['name'] = df_plot[1] + ' ' + df_plot[2]
df_plot.drop(columns=[0, 1, 2], inplace=True)
df_plot.columns = ['party', 'gender', 'state', 'name']
df_plot.set_index('name', inplace=True)

# Drop Kelly Loeffler, change Richard Shelby to Republican
df_plot.drop('Kelly Loeffler', inplace=True)
# Assign through a single .loc call: the chained form df.loc[row][col] = value
# may write to a temporary copy and leave df_plot unchanged.  A boolean row
# mask is used so that a missing name can never add a new row.
df_plot.loc[df_plot.index == 'Richard Shelby', 'party'] = 'R'

In [None]:
# Merge x, y data
# Both joins match on the senator name; reset_index then turns 'name' back into
# an ordinary column.
df_plot = df_plot.join(df_xy, on='name').join(df_lbs, on='name')
df_plot.reset_index(inplace=True)

In [None]:
# Merge length of voting record
# sen_length is ordered like df_116.index, whereas df_plot follows the row
# order of the senators table.  Attach the senator names and join on 'name' so
# each length lands on the right senator regardless of row order.
voting_length = pd.Series(
    sen_length.to_numpy(), index=df_116.index, name='voting_length'
)
df_plot = df_plot.join(voting_length, on='name')

In [None]:
# Create cluster names
# Each cluster is named after its member with the longest voting record.
cluster_names = {}
for label in df_plot['label'].dropna().unique():
    members = df_plot.loc[df_plot['label'] == label]
    longest = members['voting_length'].max()
    # Take the first match so that a tie on voting_length still gives exactly
    # one name per cluster.
    leader = members.loc[members['voting_length'] == longest, 'name'].iloc[0]
    cluster_names[label] = leader + ' Cluster'

# Rows without a label keep NaN in the new column.
df_plot['cluster'] = df_plot['label'].map(cluster_names)

## Selection functions for Data Viz
In this section we will be selecting senators based on party, gender, and state.  These functions will be implemented in the Heroku app.

In [3]:
# Load the precomputed inputs for the selection and plotting functions.
# voting_sim.csv is written earlier in this notebook.
# NOTE(review): vs_current.csv, sen_data.csv and sen_info.p are not written by
# any cell visible here — confirm where they are produced.
sim_df = pd.read_csv('voting_sim.csv', index_col=0)
cur_sim_df = pd.read_csv('vs_current.csv', index_col=0)
data_df = pd.read_csv('sen_data.csv', index_col=0)
# pickle.load can execute arbitrary code contained in the file; only load
# sen_info.p from a trusted source.
with open('sen_info.p', 'rb') as f:
    sen_info = pickle.load(f)

In [4]:
# Function for filtering senators
def _as_filter(value):
    """Wrap a single value in a list; pass collections through unchanged."""
    if isinstance(value, (list, tuple, set, frozenset)):
        return value
    return [value]


def sen_by_q(senator_info, party=None, gender=None, state=None):
    """Return the names of the senators matching the given filters.

    Parameters
    ----------
    senator_info : iterable of sequences
        Rows laid out as (sen_id, f_name, l_name, party, gender, state).
    party : str or collection of str, optional
        Accepted parties.  Defaults to ``['R', 'D', 'ID']``.
    gender : str or collection of str, optional
        Accepted genders.  Defaults to ``['M', 'F', 'N']``.
    state : str or collection of str, optional
        Accepted states.  ``None`` (the default) applies no state filter.

    Returns
    -------
    list of str
        ``'<f_name> <l_name>'`` for every matching row, in input order.
    """
    parties = ['R', 'D', 'ID'] if party is None else _as_filter(party)
    genders = ['M', 'F', 'N'] if gender is None else _as_filter(gender)
    states = None if state is None else _as_filter(state)

    return [
        f'{senator[1]} {senator[2]}'
        for senator in senator_info
        if senator[3] in parties
        and senator[4] in genders
        and (states is None or senator[5] in states)
    ]

In [5]:
# Function for selected senator similarity
def selected_senator_sim(sim_df, senator):
    """Return the 10 least and 10 most similar senators to *senator*.

    Parameters
    ----------
    sim_df : pandas.DataFrame
        Square similarity table indexed and labelled by senator name.
    senator : str
        Row label of the senator to compare against.

    Returns
    -------
    (least, most) : tuple of lists
        Each list holds up to ten ``(name, similarity)`` pairs.  ``least`` is
        ordered from lowest similarity upwards, ``most`` from highest
        downwards.  The senator's own entry and NaN similarities are left out.
    """
    row = sim_df.loc[senator]
    # Iterate label/value pairs directly: integer indexing on a label-indexed
    # Series is positional only by a deprecated fallback.  NaN scores are
    # skipped because they have no defined sort position.
    scored = [
        (name, score)
        for name, score in row.items()
        if name != senator and not pd.isna(score)
    ]
    ranked = sorted(scored, key=lambda pair: pair[1])
    least = ranked[:10]
    most = ranked[::-1][:10]
    return least, most

In [6]:
# Similarity over the past 20 years
# (10 least and 10 most similar senators to Bernard Sanders)
least, most = selected_senator_sim(sim_df, 'Bernard Sanders')

In [9]:
# Similarity in the 116th Congress
# (same ranking, using the table loaded from vs_current.csv)
least, most = selected_senator_sim(cur_sim_df, 'Bernard Sanders')

## Data Viz

### Functions for PCA and Similarity Plots

In [12]:
def pca_plot(df):
    """Return a scatter plot of senators in the 2-D PCA space.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'x', 'y', 'party', 'name', 'state',
        'voting_length' and 'cluster'.

    Returns
    -------
    plotly figure
        One marker per row: position from x/y, colour by party, size by
        voting_length, symbol by cluster; hover shows name, party and state.
    """
    # CSS colour names contain no spaces: 'lightgreen', not 'light green'.
    party_colors = {'R': 'red', 'D': 'blue', 'ID': 'lightgreen'}
    fig = px.scatter(
        df,
        x='x',
        y='y',
        color='party',
        hover_name='name',
        hover_data=['party', 'state'],
        color_discrete_map=party_colors,
        size='voting_length',
        symbol='cluster',
        width=800,
        height=650
    )
    return fig

In [95]:
def _similarity_bar(pairs, title, category_order):
    """Build one horizontal bar chart from (senator, similarity) pairs."""
    fig = go.Figure(go.Bar(
        x=[score for _, score in pairs],
        y=[name for name, _ in pairs],
        orientation='h',
        # Similarities are fractions (0-1); the d3 '.0%' format multiplies by
        # 100 and appends '%', so 0.87 is shown as '87%' rather than '0.87%'.
        hovertemplate=
        'Senator: %{y}' +
        '<br>Percent Agreement: %{x:.0%}<extra></extra>',
    ))
    fig.update_layout(
        yaxis={'categoryorder': category_order},
        height=600,
        width=700,
        title=title,
        hoverlabel_align='right',
    )
    return fig


def sim_plot(df, senator):
    """Return bar charts of the least and most similar senators.

    Parameters
    ----------
    df : pandas.DataFrame
        Similarity table passed on to ``selected_senator_sim``.
    senator : str
        Senator to compare everyone else against.

    Returns
    -------
    (fig1, fig2) : tuple of plotly figures
        ``fig1`` shows the least similar senators, ``fig2`` the most similar.
    """
    least, most = selected_senator_sim(df, senator)
    fig1 = _similarity_bar(
        least, 'Bottom 10: Least Similar Senators', 'total descending'
    )
    fig2 = _similarity_bar(
        most, 'Top 10: Most Similar Senators', 'total ascending'
    )
    return fig1, fig2

In [96]:
# Build both bar charts for Bernard Sanders from the vs_current.csv similarities
fig1, fig2 = sim_plot(cur_sim_df, 'Bernard Sanders')

In [98]:
# Display the "least similar" chart
fig1