# EDA for Dashboard

In [48]:
import numpy as np
import pandas as pd
import re

import psycopg2
import configparser
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA

import plotly.graph_objects as go
import plotly.express as px

In [47]:
# Load the AWS RDS connection settings from the [aws] section of ../config.ini
config1 = configparser.ConfigParser()
config1.read('../config.ini')

# Options are fetched in the same order as the names on the left-hand side
ENDPOINT, PORT, USR, PWD, DB = (
    config1.get('aws', option)
    for option in ('ENDPOINT', 'PORT', 'USER', 'PASSWORD', 'DATABASE')
)

In [2]:
# Load the vote matrix from csv, using the first column as the row index
df = pd.read_csv('votes.csv', index_col=0)

# Entries equal to 0 ('not voting' and 'present' votes) are replaced by NaN;
# the row and column labels are carried over unchanged
df = pd.DataFrame(
    data=np.where(df.eq(0), np.nan, df),
    index=df.index,
    columns=df.columns,
)

## Processing Data for Analysis
Metrics for voting similarity (how often two senators are in agreement) and principal component analysis (PCA) visualizations will be created.  In this section, the EC2 instance processes the data and then passes the results, as CSV files, to be read by the Heroku app.

### Senator similarity by agreement/(disagree+agree)

In [3]:
# Function to return agreement/total votes
def vote_sim(v1, v2):
    """Return the fraction of positions at which two vote vectors agree.

    With votes coded as +1 / -1 (the coding this notebook appears to use),
    ``abs(v1 - v2) / 2`` is 0 on agreement and 1 on disagreement, so
    ``abs(abs(v1 - v2) / 2 - 1)`` is 1 exactly where the two vectors agree.

    Args:
        v1: numeric vote vector (pandas Series or NumPy array).
        v2: numeric vote vector of the same length as ``v1``.

    Returns:
        The summed agreement score divided by ``len(v1)``.  NaN is returned
        when ``v1`` is empty (no votes to compare) rather than raising
        ``ZeroDivisionError``.
    """
    n_votes = len(v1)
    if n_votes == 0:
        # No votes in common: similarity is undefined, not an error.
        return np.nan

    # Subtract first so pandas inputs keep their own alignment semantics,
    # then do the arithmetic on a plain float array.
    diff = np.asarray(v1 - v2, dtype=float)
    agreement = np.abs(np.abs(diff) / 2 - 1)
    return agreement.sum() / n_votes

In [94]:
# Create matrix for voting similarity between senators
def similarity_matrix(df):
    """Build the pairwise voting-similarity matrix for the rows of ``df``.

    For every pair of rows, only the columns where *both* rows have a value
    (neither is NaN) are compared.  The per-column agreement score is
    ``abs(abs(a - b) / 2 - 1)`` (1 on agreement, 0 on disagreement for votes
    coded +1 / -1) and the similarity is the mean of that score over the
    shared columns, rounded to 2 decimals.

    Rows are addressed by position, so duplicate index labels are handled.

    Args:
        df: DataFrame with one row per senator and one numeric column per
            vote; NaN marks a vote the senator did not cast.

    Returns:
        Square ``numpy.ndarray`` of shape ``(len(df), len(df))``.  The
        diagonal is NaN, and so is any pair that has no votes in common.
    """
    votes = df.to_numpy(dtype=float)
    n_senators = votes.shape[0]

    sim_mat = np.full((n_senators, n_senators), np.nan)
    for i in range(n_senators):
        # Row i against every row at once; NaN wherever either vote is missing.
        diff = votes[i] - votes
        shared = ~np.isnan(diff)
        agreement = np.abs(np.abs(diff) / 2 - 1)

        agreed = np.where(shared, agreement, 0.0).sum(axis=1)
        n_shared = shared.sum(axis=1)

        # 0 / 0 (no shared votes) is deliberately left as NaN.
        with np.errstate(divide='ignore', invalid='ignore'):
            sim_mat[i] = np.round(agreed / n_shared, 2)

        # A senator's similarity with themselves is not meaningful.
        sim_mat[i, i] = np.nan
    return sim_mat

In [64]:
# Pairwise similarity over every vote in `df`, labelled by the row index of `df` on both axes
sim_mat = similarity_matrix(df)
sim_df = pd.DataFrame(
    data=sim_mat,
    index=df.index,
    columns=df.index,
)

In [66]:
# Preview the first rows of the similarity matrix (the diagonal is NaN by construction)
sim_df.head()

Unnamed: 0,Charles Grassley,Patrick Leahy,Mitch McConnell,Richard Shelby,Dianne Feinstein,Patty Murray,James Inhofe,Ron Wyden,Susan Collins,Richard Durbin,...,Marsha Blackburn,Mike Braun,Kevin Cramer,Joshua Hawley,Martha McSally,Mitt Romney,Jacky Rosen,Rick Scott,Kyrsten Sinema,Kelly Loeffler
Charles Grassley,,0.38,0.88,0.81,0.41,0.39,0.87,0.39,0.73,0.37,...,0.82,0.81,0.95,0.85,0.91,0.82,0.53,0.85,0.58,0.94
Patrick Leahy,0.38,,0.37,0.42,0.88,0.92,0.3,0.9,0.58,0.92,...,0.32,0.31,0.51,0.42,0.51,0.4,0.93,0.39,0.88,0.38
Mitch McConnell,0.88,0.37,,0.85,0.4,0.37,0.89,0.37,0.72,0.35,...,0.75,0.73,0.94,0.8,0.9,0.76,0.61,0.83,0.63,0.88
Richard Shelby,0.81,0.42,0.85,,0.43,0.4,0.88,0.38,0.72,0.36,...,0.8,0.75,0.92,0.83,0.87,0.83,0.52,0.85,0.57,0.86
Dianne Feinstein,0.41,0.88,0.4,0.43,,0.9,0.34,0.9,0.6,0.91,...,0.37,0.32,0.54,0.45,0.54,0.44,0.96,0.43,0.91,0.4


In [67]:
# Write voting similarity matrix as csv (row index and header included);
# per the section notes above, the csv output is what the Heroku app reads
sim_df.to_csv('voting_sim.csv')

In [69]:
# Retrieve most recent congress number
def get_congress_number():
    """Return the largest ``congress`` value stored in the ``bills`` table.

    Opens a new connection using the notebook-level ENDPOINT / USR / PWD /
    PORT / DB settings, runs a single ``SELECT MAX(congress)`` query, and
    closes the cursor and the connection again -- also when the query raises.

    Returns:
        The first column of the single result row, i.e. ``MAX(congress)``.
        SQL ``MAX`` over an empty table is NULL, which arrives here as None.

    Raises:
        Whatever ``psycopg2`` raises when connecting or executing fails; the
        exception propagates after the connection has been closed.
    """
    conn = psycopg2.connect(
        host=ENDPOINT,
        user=USR,
        password=PWD,
        port=PORT,
        database=DB
    )

    # try/finally guarantees the connection is released even if the query fails
    try:
        conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
        cursor = conn.cursor()
        try:
            cursor.execute(
                """
        SELECT MAX(congress) FROM bills
        ;
        """
            )
            current_congress = cursor.fetchone()[0]
        finally:
            cursor.close()
    finally:
        conn.close()
    return current_congress

In [74]:
# Get list of bills from congress (by current congress or by congress number)
def get_bills_list(cong_number=116, current=True, votes=None):
    """Return the vote columns whose name starts with a given congress number.

    Args:
        cong_number: congress number to filter on; only used when ``current``
            is falsy.
        current: when truthy, ignore ``cong_number`` and use the value
            returned by ``get_congress_number()`` (a database round-trip).
        votes: DataFrame whose column labels are searched.  Defaults to the
            notebook-level ``df`` so existing calls keep working.

    Returns:
        List of column labels, in their original order, whose string form
        begins with the congress number.  The number is matched as a literal
        prefix, so characters that are special in a regular expression are
        not interpreted.
    """
    if votes is None:
        votes = df  # fall back to the notebook-level vote matrix

    congress = get_congress_number() if current else cong_number
    prefix = str(congress)
    return [col for col in votes.columns if str(col).startswith(prefix)]

In [86]:
# Columns belonging to the most recent congress (looked up in the database).
# No argument is passed: with current=True the congress number argument is
# ignored, and IPython's `_` (last cell output) must not be relied on because
# it is hidden kernel state.
bills = get_bills_list()

# Independent copy so later edits to df_116 never touch the full vote matrix
df_116 = df[bills].copy(deep=True)

### Dealing with NaN values for EDA and Data Viz
In the current congress (the 116th), 179 bills have been voted on as of August 6th, 2020.  NaN values could be handled either by dropping the few bills that have non-participating members or by replacing the NaN values with 0s.  However, one senator only started in the 2nd session of the current congress (with 72 total votes thus far) and must be dropped in order to perform PCA or make meaningful similarity comparisons for this particular congress.

In [87]:
# Summary statistics per senator (the transpose turns senators into columns);
# the `count` row is the number of non-NaN votes each senator has in df_116
df_116.T.describe() # Drop Kelly Loeffler, only started in most recent session of congress (far fewer votes than the rest)

Unnamed: 0,Charles Grassley,Patrick Leahy,Mitch McConnell,Richard Shelby,Dianne Feinstein,Patty Murray,James Inhofe,Ron Wyden,Susan Collins,Richard Durbin,...,Marsha Blackburn,Mike Braun,Kevin Cramer,Joshua Hawley,Martha McSally,Mitt Romney,Jacky Rosen,Rick Scott,Kyrsten Sinema,Kelly Loeffler
count,179.0,178.0,179.0,176.0,178.0,173.0,172.0,177.0,179.0,173.0,...,175.0,177.0,175.0,178.0,173.0,175.0,175.0,172.0,177.0,72.0
mean,0.441341,0.325843,0.284916,0.363636,0.359551,0.398844,0.197674,0.186441,0.631285,0.32948,...,0.2,0.152542,0.405714,0.258427,0.410405,0.234286,0.44,0.232558,0.457627,0.5
std,0.899857,0.948091,0.961241,0.934199,0.935758,0.919681,0.98313,0.985253,0.777726,0.946903,...,0.982607,0.991101,0.916623,0.968756,0.914551,0.974957,0.900575,0.975422,0.891667,0.872103
min,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.5
50%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
75%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [88]:
# Remove Kelly Loeffler from the 116th-congress subset.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it no longer raises KeyError once the row
# is already gone.
df_116 = df_116.drop(index='Kelly Loeffler', errors='ignore')

In [95]:
# Pairwise similarity restricted to the df_116 columns, labelled by the row index on both axes
sim_116 = pd.DataFrame(
    data=similarity_matrix(df_116),
    index=df_116.index,
    columns=df_116.index,
)

In [105]:
# Display the similarity matrix for df_116 (pandas elides the middle rows/columns when rendering)
sim_116

Unnamed: 0,Charles Grassley,Patrick Leahy,Mitch McConnell,Richard Shelby,Dianne Feinstein,Patty Murray,James Inhofe,Ron Wyden,Susan Collins,Richard Durbin,...,Cindy Hyde-Smith,Marsha Blackburn,Mike Braun,Kevin Cramer,Joshua Hawley,Martha McSally,Mitt Romney,Jacky Rosen,Rick Scott,Kyrsten Sinema
Charles Grassley,,0.46,0.89,0.91,0.48,0.49,0.87,0.38,0.74,0.46,...,0.93,0.82,0.81,0.95,0.85,0.91,0.82,0.53,0.85,0.58
Patrick Leahy,0.46,,0.53,0.45,0.95,0.94,0.35,0.86,0.71,0.97,...,0.48,0.32,0.31,0.51,0.42,0.51,0.40,0.93,0.39,0.88
Mitch McConnell,0.89,0.53,,0.89,0.58,0.60,0.83,0.47,0.74,0.54,...,0.90,0.75,0.73,0.94,0.80,0.90,0.76,0.61,0.83,0.63
Richard Shelby,0.91,0.45,0.89,,0.50,0.52,0.88,0.39,0.75,0.46,...,0.97,0.80,0.75,0.92,0.83,0.87,0.83,0.52,0.85,0.57
Dianne Feinstein,0.48,0.95,0.58,0.50,,0.97,0.40,0.85,0.75,0.97,...,0.51,0.37,0.32,0.54,0.45,0.54,0.44,0.96,0.43,0.91
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Martha McSally,0.91,0.51,0.90,0.87,0.54,0.56,0.80,0.44,0.79,0.53,...,0.90,0.78,0.76,0.95,0.88,,0.78,0.59,0.83,0.64
Mitt Romney,0.82,0.40,0.76,0.83,0.44,0.45,0.83,0.36,0.67,0.41,...,0.83,0.80,0.79,0.79,0.78,0.78,,0.46,0.82,0.51
Jacky Rosen,0.53,0.93,0.61,0.52,0.96,0.98,0.42,0.86,0.78,0.94,...,0.55,0.39,0.35,0.58,0.49,0.59,0.46,,0.46,0.95
Rick Scott,0.85,0.39,0.83,0.85,0.43,0.44,0.89,0.32,0.66,0.39,...,0.86,0.88,0.85,0.87,0.88,0.83,0.82,0.46,,0.50


In [162]:
# Spot check one pair; a single .loc[row, column] lookup replaces the chained
# .loc[row][column] indexing, which builds an intermediate Series first
sim_116.loc['Elizabeth Warren', 'Edward Markey']

0.97