# Data Functions for EDA and App
This notebook walks through the data functions used for the EDA and the app, with an example of each.

In [123]:
# Standard imports
import numpy as np
import pandas as pd
import pickle

# sklearn imports
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

# Plotting imports
import plotly.graph_objects as go
import plotly.express as px
import networkx as nx

In [2]:
# Binary
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only load
# files from a trusted source.
with open('./app/data/adj_df.p', 'rb') as f:
    df1 = pickle.load(f)

# Multiclass
# df2 is the frame every later cell in this notebook works with
# (presumably justices as rows and cases as columns -- confirm against the data).
with open('./app/data/mul_df.p', 'rb') as f:
    df2 = pickle.load(f)

## Functions for persistent variables
Below are functions to create a list of the different courts between 1999 and 2019.

In [26]:
# Helper function to get sorted list of justices
def justice_terms(df):
    '''
    Returns the index labels of df (the justices) as a list, ordered by the smallest
    and then by the largest column label at which each justice has a non-NaN entry.
    Justices with identical spans keep their original relative order.
    '''
    def case_span(justice):
        # Column labels of the cases this justice actually has a value for
        cases = df.loc[justice].dropna().index
        return (min(cases), max(cases))

    return sorted(df.index, key=case_span)

def get_courts(df, court_size=9):
    '''
    Returns list of different court compositions

    The justices are ordered with justice_terms and a window of court_size
    consecutive justices is slid over that ordering one position at a time, so
    every returned court differs from the previous one by a single justice.
    This assumes justices leave in the same order in which they are sorted.

    df: DataFrame indexed by justice
    court_size: number of justices per court (defaults to 9)

    Returns a list of lists of justice names; empty when there are fewer than
    court_size justices. Raises ValueError if court_size is smaller than 1.
    '''
    if court_size < 1:
        raise ValueError('court_size must be at least 1')

    justices = justice_terms(df)
    n_courts = len(justices) - court_size + 1
    return [justices[start:start + court_size] for start in range(max(n_courts, 0))]

In [27]:
# All courts between 1999 and 2019
all_courts = get_courts(df2)
# Print each court (a list of justice names) on its own line
for court in all_courts:
    print(court)

['Rehnquist', "O'Connor", 'Souter', 'Stevens', 'Scalia', 'Kennedy', 'Breyer', 'Ginsburg', 'Thomas']
["O'Connor", 'Souter', 'Stevens', 'Scalia', 'Kennedy', 'Breyer', 'Ginsburg', 'Thomas', 'Roberts']
['Souter', 'Stevens', 'Scalia', 'Kennedy', 'Breyer', 'Ginsburg', 'Thomas', 'Roberts', 'Alito']
['Stevens', 'Scalia', 'Kennedy', 'Breyer', 'Ginsburg', 'Thomas', 'Roberts', 'Alito', 'Sotomayor']
['Scalia', 'Kennedy', 'Breyer', 'Ginsburg', 'Thomas', 'Roberts', 'Alito', 'Sotomayor', 'Kagan']
['Kennedy', 'Breyer', 'Ginsburg', 'Thomas', 'Roberts', 'Alito', 'Sotomayor', 'Kagan', 'Gorsuch']
['Breyer', 'Ginsburg', 'Thomas', 'Roberts', 'Alito', 'Sotomayor', 'Kagan', 'Gorsuch', 'Kavanaugh']


## Functions to get data for courts

In [34]:
def get_opinions(df, court):
    '''
    Picks the rows of df labelled by court, discards every column in which any of
    those rows is NaN, and returns the remaining values as a numpy array
    (one row per justice, in the order given by court)
    '''
    court_rows = df.loc[court]
    complete_cases = court_rows.dropna(axis=1)
    return np.array(complete_cases)

In [36]:
# Opinions as a numpy array
# Uses the first court in all_courts; rows follow the order of all_courts[0]
first = get_opinions(df2, all_courts[0])
first

array([[ 2.,  2.,  2., ..., -2.,  2.,  2.],
       [ 2.,  2.,  2., ...,  2.,  2.,  2.],
       [ 2.,  2.,  2., ...,  2.,  2., -2.],
       ...,
       [ 2.,  2.,  2., ...,  2.,  2.,  2.],
       [ 2.,  2.,  2., ...,  2.,  2., -2.],
       [ 2.,  2.,  2., ..., -2.,  2.,  2.]])

In [37]:
# Pandas DataFrame of opinions
# Rows are relabelled with the justices of the first court; no column labels are
# passed, so the columns are numbered positionally.
pd.DataFrame(first, index=all_courts[0])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,455,456,457,458,459,460,461,462,463,464
Rehnquist,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,-2.0,2.0,2.0,2.0,2.0,-2.0,2.0,2.0
O'Connor,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,-2.0,2.0,2.0,2.0,-2.0,2.0,2.0,2.0,2.0,2.0
Souter,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,-2.0,-2.0,2.0,-2.0,2.0,2.0,-2.0
Stevens,-2.0,-2.0,-2.0,-2.0,-2.0,2.0,-2.0,2.0,2.0,-2.0,...,-2.0,2.0,2.0,-2.0,-2.0,-2.0,-2.0,2.0,2.0,2.0
Scalia,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,-2.0,2.0,2.0,2.0,2.0,-2.0,2.0,-2.0
Kennedy,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,-2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,-2.0,2.0,2.0
Breyer,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,-2.0,2.0,2.0,2.0,1.0,2.0,-2.0,2.0,2.0,2.0
Ginsburg,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,-2.0,2.0,2.0,2.0,-2.0,-2.0,-2.0,2.0,2.0,-2.0
Thomas,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,-2.0,2.0,2.0,2.0,2.0,-2.0,2.0,2.0


## Function for cosine similarities between justices and plotting

In [38]:
def get_sim(court_op):
    '''
    Returns numpy cosine similarity matrix from an array of opinions

    court_op: 2-D array with one row of opinions per justice

    Returns an (n, n) array, n being the number of rows in court_op, whose entry
    [i][j] is the cosine similarity between rows i and j rounded to 4 decimals.
    '''
    court_op = np.asarray(court_op)
    # With a single argument cosine_similarity compares every row with every other
    # row, so the matrix is sized by the input instead of a hard-coded 9x9.
    return np.round(cosine_similarity(court_op), 4)

In [40]:
# Similarity matrix of first court
sim_mat = get_sim(first)
# Shown as a DataFrame labelled with the first court's justices on both axes
pd.DataFrame(sim_mat, index=all_courts[0], columns=all_courts[0])

Unnamed: 0,Rehnquist,O'Connor,Souter,Stevens,Scalia,Kennedy,Breyer,Ginsburg,Thomas
Rehnquist,1.0,0.7231,0.3325,0.1907,0.701,0.7834,0.4081,0.3192,0.7334
O'Connor,0.7231,1.0,0.5241,0.374,0.5413,0.6784,0.5838,0.4838,0.5362
Souter,0.3325,0.5241,1.0,0.7143,0.2545,0.4295,0.7499,0.8395,0.2656
Stevens,0.1907,0.374,0.7143,1.0,0.0951,0.3187,0.6389,0.716,0.1318
Scalia,0.701,0.5413,0.2545,0.0951,1.0,0.656,0.2561,0.2228,0.8595
Kennedy,0.7834,0.6784,0.4295,0.3187,0.656,1.0,0.4471,0.3876,0.6708
Breyer,0.4081,0.5838,0.7499,0.6389,0.2561,0.4471,1.0,0.8005,0.2192
Ginsburg,0.3192,0.4838,0.8395,0.716,0.2228,0.3876,0.8005,1.0,0.2228
Thomas,0.7334,0.5362,0.2656,0.1318,0.8595,0.6708,0.2192,0.2228,1.0


In [41]:
def most_similar(df, j1):
    '''
    Returns a dict mapping every other justice in df to their cosine similarity
    with j1, rounded to 3 decimals

    For each pair only the columns where both justices have a value are compared;
    justices sharing no such column with j1 are left out of the dict.
    The dict is not sorted by similarity.

    df: DataFrame indexed by justice
    j1: index label of the justice to compare against

    Raises ValueError if j1 is not in df.index.
    '''
    other_justices = list(df.index)
    other_justices.remove(j1)

    similarity = {}
    for j2 in other_justices:
        shared = df.loc[[j1, j2]].dropna(axis=1)
        if shared.shape[1] == 0:
            continue
        X1 = shared.loc[j1].to_numpy().reshape(1, -1)
        X2 = shared.loc[j2].to_numpy().reshape(1, -1)
        # cosine_similarity returns a (1, 1) array; take the element explicitly
        # because float() on a non-0-d array is deprecated in recent NumPy.
        score = cosine_similarity(X1, X2)[0, 0]
        similarity[j2] = round(float(score), 3)
    return similarity

In [42]:
# Cosine similarity between Alito and every other justice in df2
most_similar(df2, 'Alito')

{'Breyer': 0.354,
 'Ginsburg': 0.261,
 'Gorsuch': 0.607,
 'Kagan': 0.347,
 'Kavanaugh': 0.707,
 'Kennedy': 0.669,
 'Roberts': 0.762,
 'Scalia': 0.703,
 'Sotomayor': 0.26,
 'Souter': 0.236,
 'Stevens': 0.132,
 'Thomas': 0.748}

In [316]:
def sim_heatmap(sim_mat, justices):
    '''
    Builds a 500x500 Plotly heatmap of sim_mat using the Inferno colorscale, with
    justices as the labels on both axes, and returns the figure
    '''
    heatmap = go.Heatmap(z=sim_mat, x=justices, y=justices, colorscale='Inferno')
    fig = go.Figure(data=heatmap)

    layout_options = dict(
        title='Heatmap of Cosine Similarity Between Justices',
        height=500,
        width=500,
    )
    fig.update_layout(**layout_options)
    return fig

In [317]:
# Heatmap of the first court; sim_mat was computed from all_courts[0], so those
# justices are the axis labels.
sim_heatmap(sim_mat, all_courts[0])

## Functions for PCA and plotting PCA

In [28]:
def get_pca(court_op, court):
    '''
    Projects the justices of one court onto 2 principal components and returns a
    DataFrame with columns justice, pc1 and pc2 (one row per justice, in the order
    of court)

    court_op: array of opinions, one row per justice
    court: justice names matching the rows of court_op; must contain 'Ginsburg'
           and 'Thomas', which are used to orient the axes
    '''
    components = PCA(n_components=2).fit_transform(court_op)
    comp = pd.DataFrame(components, index=court, columns=['pc1', 'pc2'])

    # Orient the axes the same way for every court: flip pc1 when Ginsburg is on
    # the non-negative side and flip pc2 when Thomas is on the non-negative side
    if comp.loc['Ginsburg', 'pc1'] >= 0:
        comp['pc1'] *= -1
    if comp.loc['Thomas', 'pc2'] >= 0:
        comp['pc2'] *= -1

    # Min-max scale both components so that courts are comparable
    scaled = MinMaxScaler().fit_transform(comp)

    result = pd.DataFrame(scaled, columns=['pc1', 'pc2'])
    result.insert(0, 'justice', court)
    return result

def all_pca_df(df, court_opinions, courts):
    '''
    Runs get_pca on every court and stacks the results into one DataFrame, adding a
    court column holding the position of the court in court_opinions

    df: not used by this function
    court_opinions: list of opinion arrays, one per court
    courts: list of justice-name lists, aligned with court_opinions
    '''
    frames = []
    for court_id, opinions in enumerate(court_opinions):
        court_frame = get_pca(opinions, courts[court_id])
        court_frame['court'] = court_id
        frames.append(court_frame)

    return pd.concat(frames).reset_index(drop=True)

In [32]:
# Opinion array for every court, then the 2-component coordinates of all courts
# merged into a single DataFrame
all_court_opinions = [ get_opinions(df2, court) for court in all_courts ]
pcas = all_pca_df(df2, all_court_opinions, all_courts)

In [33]:
# Display the merged PCA table (columns: justice, pc1, pc2, court)
pcas

Unnamed: 0,justice,pc1,pc2,court
0,Rehnquist,0.908512,0.737965,0
1,O'Connor,0.620810,1.000000,0
2,Souter,0.074027,0.310760,0
3,Stevens,0.000000,0.123450,0
4,Scalia,0.998957,0.024039,0
...,...,...,...,...
58,Alito,0.962074,0.688246,6
59,Sotomayor,0.000000,0.394951,6
60,Kagan,0.102127,0.541335,6
61,Gorsuch,0.753805,0.000000,6


In [18]:
def animated_2comp(df):
    '''
    Builds and returns a 500x500 animated Plotly scatter plot of pc1 against pc2,
    with one animation frame per court and one labelled point per justice

    df: DataFrame with columns pc1, pc2, justice and court
    '''
    axis_range = (-0.2, 1.2)
    fig = px.scatter(
        df,
        x='pc1',
        y='pc2',
        animation_frame='court',
        animation_group='justice',
        text='justice',
        title='Justices Along 2 Components (PCA)',
        labels={'pc1': 'PC1', 'pc2': 'PC2'},
        width=500,
        height=500,
        range_x=axis_range,
        range_y=axis_range,
    )
    fig.update_traces(textposition='top center')

    # Timing (in ms) of the first button of the first update menu
    # (presumably the play button -- confirm against the Plotly figure)
    first_button = fig.layout.updatemenus[0].buttons[0]
    first_button.args[1]["transition"]["duration"] = 2000
    first_button.args[1]["frame"]["duration"] = 3000
    return fig

In [19]:
# Animated scatter plot of the PCA coordinates, one frame per court
animated_2comp(pcas)

## Function to create similarity matrix for network graph

In [308]:
def all_sim(df):
    '''
    Returns a similarity DataFrame for all justices (justices who did not serve together are NaN values)

    For every pair of justices the cosine similarity is computed over the columns
    in which both have a value, rounded to 4 decimals. Pairs without any such
    column get NaN.

    df: DataFrame indexed by justice, with NaN where a justice has no value

    Returns a square DataFrame labelled by justice on both axes.
    '''
    jus = list(df.index)
    n = len(jus)
    opinions = df.to_numpy(dtype=float)
    has_opinion = ~np.isnan(opinions)
    sim_mat = np.full((n, n), np.nan)

    for i in range(n):
        # Cosine similarity is symmetric, so each pair is computed once and mirrored
        for j in range(i, n):
            shared = has_opinion[i] & has_opinion[j]
            if not shared.any():
                continue
            j_a = opinions[i, shared].reshape(1, -1)
            j_b = opinions[j, shared].reshape(1, -1)
            # Take the single element of the (1, 1) result before storing it
            sim = np.round(cosine_similarity(j_a, j_b)[0, 0], 4)
            sim_mat[i, j] = sim
            sim_mat[j, i] = sim

    return pd.DataFrame(sim_mat, index=jus, columns=jus)

In [309]:
# Pairwise similarity between all justices in df2 (NaN where a pair shares no case)
sim_df = all_sim(df2)
sim_df

Unnamed: 0,Alito,Breyer,Ginsburg,Gorsuch,Kagan,Kavanaugh,Kennedy,O'Connor,Rehnquist,Roberts,Scalia,Sotomayor,Souter,Stevens,Thomas
Alito,1.0,0.3545,0.2612,0.6074,0.3474,0.707,0.669,,,0.7618,0.703,0.2597,0.2359,0.1316,0.7484
Breyer,0.3545,1.0,0.7557,0.2098,0.8081,0.4586,0.5388,0.6103,0.4175,0.4845,0.2892,0.7509,0.729,0.6521,0.242
Ginsburg,0.2612,0.7557,1.0,0.2429,0.8099,0.3594,0.4692,0.5027,0.3493,0.3958,0.2807,0.7947,0.8063,0.7033,0.2216
Gorsuch,0.6074,0.2098,0.2429,1.0,0.3237,0.5792,0.6885,,,0.5919,,0.2255,,,0.6606
Kagan,0.3474,0.8081,0.8099,0.3237,1.0,0.4566,0.6331,,,0.5144,0.4247,0.7942,,,0.2892
Kavanaugh,0.707,0.4586,0.3594,0.5792,0.4566,1.0,,,,0.8845,,0.33,,,0.5741
Kennedy,0.669,0.5388,0.4692,0.6885,0.6331,,1.0,0.6871,0.7926,0.7266,0.6201,0.5501,0.4499,0.3548,0.582
O'Connor,,0.6103,0.5027,,,,0.6871,1.0,0.7249,0.8333,0.5619,,0.5455,0.4015,0.5489
Rehnquist,,0.4175,0.3493,,,,0.7926,0.7249,1.0,,0.7076,,0.3586,0.2193,0.741
Roberts,0.7618,0.4845,0.3958,0.5919,0.5144,0.8845,0.7266,0.8333,,1.0,0.7784,0.4542,0.3755,0.263,0.6574


## Functions to build network graph
The functions and helper functions below build the network graph step by step: the edges of the network, the graph itself, one trace per edge, a single trace for the nodes, and finally the plot of the graph.

In [310]:
# Scale data helper function
def scale_data(df):
    '''
    Returns a copy of df with its values passed through MinMaxScaler

    The result is labelled with the index of df on both rows and columns, so df
    is expected to be square (e.g. a similarity matrix).
    '''
    labels = list(df.index)
    scaled_values = MinMaxScaler().fit_transform(df)
    return pd.DataFrame(scaled_values, index=labels, columns=labels)

# Edge builder helper function
def build_edges(df):
    '''
    Returns the list of edges found in the upper triangle of a square similarity
    DataFrame

    Every pair (i, j) with i < j whose value is not missing yields one tuple
    (justice A, justice B, sim), where justice A is the row label, justice B the
    column label and sim the value stored in df.

    df: square DataFrame of similarities, missing where two justices share no edge
    '''
    n_nodes = len(df.index)
    edges = []
    for i in range(n_nodes):
        for j in range(i + 1, n_nodes):
            # Purely positional access; integer keys on a label-indexed Series
            # (df.iloc[i][j]) are deprecated in recent pandas
            sim = df.iat[i, j]
            if pd.notna(sim):
                edges.append((df.index[i], df.columns[j], sim))
    return edges

In [311]:
def build_network(df, seed=None):
    '''
    Build network function, returns the nx.Graph object and its node positions

    The similarities are min-max scaled with scale_data, turned into edges with
    build_edges and added to the graph with the scaled similarity as weight.
    Justices without any edge are not added to the graph.

    df: square similarity DataFrame labelled by justice on both axes
    seed: optional seed forwarded to nx.spring_layout; pass a fixed value to get
          the same layout on every run (the default None keeps it random)

    Returns a tuple (G, pos), where pos maps every node to its coordinates.
    '''
    G = nx.Graph()
    new_df = scale_data(df)
    edges = build_edges(new_df)

    # Add edges to graph, each tuple being (justice A, justice B, weight)
    G.add_weighted_edges_from(edges)

    # Positions of nodes with Fruchterman-Reingold force-directed algorithm
    pos = nx.spring_layout(G, seed=seed)

    return G, pos

In [312]:
# Helper function to treat each edge as a separate trace
def get_edge_traces(G, pos, df):
    '''
    Returns a list with one red Plotly line trace per edge of G

    G: graph whose edges are drawn
    pos: mapping from node to its (x, y) position
    df: DataFrame of similarities looked up by the two node names of an edge;
        the line width is 10 times the cube of that similarity
    '''
    def trace_for(source, target):
        x0, y0 = pos[source]
        x1, y1 = pos[target]

        # Exponent and multiplier applied to weight of edges
        width = 10 * df.loc[source][target]**3
        return go.Scatter(
            x=[x0, x1, None],
            y=[y0, y1, None],
            line=dict(width=width, color='red'),
            hoverinfo='none',
            mode='lines')

    return [trace_for(source, target) for source, target in G.edges()]

In [313]:
# Helper function to get node trace
def get_node_trace(G, pos, sim_df, cases_df):
    '''
    Returns a single Plotly scatter trace holding every node of G

    G: graph whose nodes are drawn
    pos: mapping from node to its (x, y) position
    sim_df: not used by this function
    cases_df: DataFrame indexed by node name; the number of non-NaN entries in a
              node's row is shown on hover and, divided by 50, sets the marker size
    '''
    nodes = list(G.nodes())
    coords = [pos[node] for node in nodes]
    case_counts = [len(cases_df.loc[node].dropna()) for node in nodes]

    return go.Scatter(
        x=[x for x, _ in coords],
        y=[y for _, y in coords],
        text=nodes,
        customdata=case_counts,
        mode='markers+text',
        marker=dict(size=[count / 50 for count in case_counts]),
        hovertemplate='Justice %{text}<br>Cases: %{customdata}')

In [314]:
def plot_network(sim_df, cases_df):
    '''
    Builds the similarity network from sim_df and returns it as a Plotly figure,
    with the edge traces drawn first and the node trace on top

    sim_df: square similarity DataFrame labelled by justice on both axes
    cases_df: DataFrame indexed by justice, passed on to get_node_trace
    '''
    G, pos = build_network(sim_df)
    traces = get_edge_traces(G, pos, sim_df)
    traces = traces + [get_node_trace(G, pos, sim_df, cases_df)]

    # Both axes are hidden: no grid, no zero line, no tick labels
    hidden_axis = dict(showgrid=False, zeroline=False, showticklabels=False)
    layout = go.Layout(
        title='SCOTUS Similarity as Network Graph',
        showlegend=False,
        xaxis=dict(hidden_axis),
        yaxis=dict(hidden_axis),
        template='plotly_white',
    )
    return go.Figure(data=traces, layout=layout)

In [315]:
# Network graph of all justices; df2 supplies the per-justice case counts
plot_network(sim_df, df2)