# Data Functions for EDA and App
This notebook walks through the data functions used for the EDA and the app, with an example of each.

In [41]:
# Standard imports
import numpy as np
import pandas as pd
import pickle

# sklearn imports
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

# Plotting imports
import plotly.graph_objects as go
import plotly.express as px

In [2]:
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load these files if they come from a trusted source.

# Binary (presumably the binary-encoded opinions table — TODO confirm)
with open('./app/data/adj_df.p', 'rb') as f:
    df1 = pickle.load(f)

# Multiclass (the table used by every example below)
with open('./app/data/mul_df.p', 'rb') as f:
    df2 = pickle.load(f)

## Functions for persistent variables
Below are functions to create a list of the different courts between 1999 and 2019.

In [3]:
# Helper function to get sorted list of justices
def justice_terms(df):
    '''
    Returns the justices (row labels of df) ordered by the first case they
    have a value for, ties broken by the last case they have a value for
    '''
    def case_span(name):
        # Column labels of the cases where this justice has a non-NaN entry
        cases = df.loc[name].dropna().index
        return (min(cases), max(cases))

    spans = [(name, case_span(name)) for name in df.index]
    # sorted() is stable, so justices with identical spans keep df's row order
    ordered = sorted(spans, key=lambda pair: pair[1])
    return [name for name, _ in ordered]

def get_courts(df, court_size=9):
    '''
    Returns list of different court compositions

    The justices are ordered with justice_terms(df) and every run of
    `court_size` consecutive justices in that ordering is one court, so each
    court differs from the previous one by a single departure and arrival.

    df: DataFrame indexed by justice, as expected by justice_terms
    court_size: number of justices per court (default 9)

    Returns a list of lists of justice names; empty when there are fewer
    than `court_size` justices. Raises ValueError if court_size < 1.
    '''
    if court_size < 1:
        raise ValueError('court_size must be at least 1')

    justices = justice_terms(df)
    # Sliding window of width court_size over the ordered justices
    return [
        justices[start:start + court_size]
        for start in range(len(justices) - court_size + 1)
    ]

In [4]:
# All courts between 1999 and 2019
# all_courts is a notebook-level global: select_justice reads it directly,
# so this cell must run before any cell that calls select_justice.
all_courts = get_courts(df2)
# Show one court (a list of justice names) per line
for court in all_courts:
    print(court)

['Rehnquist', "O'Connor", 'Souter', 'Stevens', 'Scalia', 'Kennedy', 'Breyer', 'Ginsburg', 'Thomas']
["O'Connor", 'Souter', 'Stevens', 'Scalia', 'Kennedy', 'Breyer', 'Ginsburg', 'Thomas', 'Roberts']
['Souter', 'Stevens', 'Scalia', 'Kennedy', 'Breyer', 'Ginsburg', 'Thomas', 'Roberts', 'Alito']
['Stevens', 'Scalia', 'Kennedy', 'Breyer', 'Ginsburg', 'Thomas', 'Roberts', 'Alito', 'Sotomayor']
['Scalia', 'Kennedy', 'Breyer', 'Ginsburg', 'Thomas', 'Roberts', 'Alito', 'Sotomayor', 'Kagan']
['Kennedy', 'Breyer', 'Ginsburg', 'Thomas', 'Roberts', 'Alito', 'Sotomayor', 'Kagan', 'Gorsuch']
['Breyer', 'Ginsburg', 'Thomas', 'Roberts', 'Alito', 'Sotomayor', 'Kagan', 'Gorsuch', 'Kavanaugh']


## Functions to get data for individual justice or court

In [5]:
def select_justice(justice, courts=None):
    '''
    Returns list of courts a justice has participated in

    justice: name of the justice to look for
    courts: list of courts (each a list of justice names) to search; when
            omitted, the global variable all_courts is used

    Courts are returned in the order they appear in `courts`; the result is
    an empty list if the justice is in none of them.
    '''
    if courts is None:
        # Backward-compatible default: fall back to the notebook-level list
        courts = all_courts
    return [ court for court in courts if justice in court ]

def get_opinions(df, court):
    '''
    Returns the opinions of one court as a numpy array, one row per justice
    in `court`, keeping only the columns where no justice has a NaN
    '''
    court_rows = df.loc[court]
    # axis=1 drops whole columns that contain a NaN for any selected justice
    complete_cases = court_rows.dropna(axis=1)
    return np.array(complete_cases)

def get_justice_all(df, courts):
    '''
    Returns one numpy opinions array per court, in the same order as `courts`
    '''
    opinions_per_court = []
    for members in courts:
        opinions_per_court.append(get_opinions(df, members))
    return opinions_per_court

In [6]:
# Get list of courts Justice Souter has participated in
# (searched in the global all_courts)
souter = select_justice('Souter')

# Get opinions from those courts: one numpy array per court,
# in the same order as the courts in `souter`
souter_courts = get_justice_all(df2, souter)

In [7]:
# Numpy array of opinions for the first court in `souter`
# (one row per justice, in the order that court list names them)
souter_courts[0]

array([[1., 1., 1., ..., 4., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 4.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 4.],
       [1., 1., 1., ..., 4., 1., 1.]])

In [8]:
# Pandas DataFrame of opinions, rows labelled with the justices of that court.
# Columns are positional (0, 1, ...) because get_opinions returns a bare array.
pd.DataFrame(souter_courts[0], index=souter[0])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,455,456,457,458,459,460,461,462,463,464
Rehnquist,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,4.0,1.0,1.0,1.0,1.0,4.0,1.0,1.0
O'Connor,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,4.0,1.0,1.0,1.0,4.0,1.0,1.0,1.0,1.0,1.0
Souter,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,4.0,4.0,1.0,4.0,1.0,1.0,4.0
Stevens,4.0,4.0,4.0,4.0,4.0,1.0,4.0,1.0,1.0,4.0,...,4.0,1.0,1.0,4.0,4.0,4.0,4.0,1.0,1.0,1.0
Scalia,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,4.0,1.0,1.0,1.0,1.0,4.0,1.0,4.0
Kennedy,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,1.0,1.0
Breyer,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,4.0,1.0,1.0,1.0,2.0,1.0,4.0,1.0,1.0,1.0
Ginsburg,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,4.0,1.0,1.0,1.0,4.0,4.0,4.0,1.0,1.0,4.0
Thomas,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,4.0,1.0,1.0,1.0,1.0,4.0,1.0,1.0


## Function to get similarities between justices

In [9]:
def get_sim(court_op):
    '''
    Returns numpy cosine similarity matrix from an array of opinions

    court_op: 2-D array with one row per justice and one column per case

    Returns an (n_justices, n_justices) array, rounded to 4 decimals, whose
    entry [i][j] is the cosine similarity between rows i and j of court_op.
    Works for any number of justices, not only nine.
    '''
    # With a single argument cosine_similarity compares every row with every
    # other row, which replaces the former pair-by-pair double loop and avoids
    # storing a (1, 1) array into a scalar cell.
    return np.round(cosine_similarity(court_op), 4)

In [10]:
# Similarity matrix as a DataFrame, with rows and columns both labelled
# by the justices of the first court Souter sat on
first_court = pd.DataFrame(get_sim(souter_courts[0]), index=souter[0], columns=souter[0])
first_court

Unnamed: 0,Rehnquist,O'Connor,Souter,Stevens,Scalia,Kennedy,Breyer,Ginsburg,Thomas
Rehnquist,1.0,0.7867,0.5844,0.5945,0.8308,0.8422,0.633,0.5998,0.8467
O'Connor,0.7867,1.0,0.6845,0.6905,0.7252,0.7246,0.7298,0.6838,0.7139
Souter,0.5844,0.6845,1.0,0.8749,0.6014,0.632,0.8603,0.9157,0.6019
Stevens,0.5945,0.6905,0.8749,1.0,0.5917,0.6607,0.8394,0.8768,0.6064
Scalia,0.8308,0.7252,0.6014,0.5917,1.0,0.8037,0.6013,0.6015,0.9289
Kennedy,0.8422,0.7246,0.632,0.6607,0.8037,1.0,0.6438,0.63,0.8071
Breyer,0.633,0.7298,0.8603,0.8394,0.6013,0.6438,1.0,0.8945,0.5767
Ginsburg,0.5998,0.6838,0.9157,0.8768,0.6015,0.63,0.8945,1.0,0.5961
Thomas,0.8467,0.7139,0.6019,0.6064,0.9289,0.8071,0.5767,0.5961,1.0


## Functions for PCA and plotting PCA

In [103]:
def get_pca(court_op, court):
    '''
    2-component representation of justices within a particular court (using PCA), returns DataFrame

    court_op: 2-D array of opinions, one row per justice in `court`
    court: list of justice names, in the same order as the rows of court_op

    Returns a DataFrame with columns 'justice', 'pc1' and 'pc2' and a default
    integer index. Each component is min-max scaled to [0, 1] within the court.
    '''
    pca = PCA(n_components=2)
    comp = pd.DataFrame(pca.fit_transform(court_op), index=court, columns=['pc1', 'pc2'])

    # The sign of a principal component is arbitrary, so keep the axes
    # consistent between courts: Ginsburg ends up on the negative side of pc1
    # and Thomas on the negative side of pc2. A court without one of these
    # anchors keeps the orientation PCA produced instead of raising KeyError.
    for anchor, axis in (('Ginsburg', 'pc1'), ('Thomas', 'pc2')):
        if anchor in comp.index and comp.loc[anchor, axis] >= 0:
            comp[axis] = -comp[axis]

    # Scale for consistency
    scaled = np.asarray(MinMaxScaler().fit_transform(comp))

    return pd.DataFrame({
        'justice': list(court),
        'pc1': scaled[:, 0],
        'pc2': scaled[:, 1],
    })

def all_pca_df(df, courts):
    '''
    Returns a merged 2-component DataFrame

    df: DataFrame of opinions indexed by justice (passed on to get_opinions)
    courts: list of courts, each a list of justice names

    Returns the get_pca result of every court stacked into one DataFrame with
    a fresh integer index and an extra 'court' column holding the position of
    the court in `courts`. An empty `courts` gives an empty DataFrame with the
    same four columns.
    '''
    if len(courts) == 0:
        # pd.concat raises on an empty list, so return the empty shape directly
        return pd.DataFrame(columns=['justice', 'pc1', 'pc2', 'court'])

    pca_dfs = []
    for n, court in enumerate(courts):
        # Use the frame that was passed in, not a notebook-level global
        court_op = get_opinions(df, court)
        court_pca = get_pca(court_op, court)
        court_pca['court'] = n
        pca_dfs.append(court_pca)

    return pd.concat(pca_dfs).reset_index(drop=True)

In [109]:
# PCA coordinates for every court, stacked into one frame; the `court`
# column holds each court's position in all_courts
pcas = all_pca_df(df2, all_courts)

In [121]:
def animated_2comp(df, transition_ms=2000, frame_ms=3000):
    '''
    Returns an animated plotly scatter of the justices along 2 components

    df: DataFrame with columns 'pc1', 'pc2', 'justice' and 'court'; each
        distinct 'court' value becomes one animation frame and points are
        matched between frames by 'justice'
    transition_ms: duration of the transition between frames when playing,
                   in milliseconds (default 2000)
    frame_ms: time each frame is shown when playing, in milliseconds
              (default 3000)
    '''
    fig = px.scatter(df, x='pc1', y='pc2',
                     animation_frame='court',
                     animation_group='justice',
                     text='justice',
                     title='Justices Along 2 Components (PCA)',
                     labels={'pc1': 'PC1', 'pc2': 'PC2'},
                     width=700,
                     height=700,
                    )
    fig.update_traces(textposition='top center')

    # Slow down playback. The first button of the first menu is assumed to be
    # the play button created by plotly express (TODO confirm). The figure may
    # have no menu at all when there is nothing to animate, so only adjust the
    # timings when a menu exists instead of raising IndexError.
    if fig.layout.updatemenus:
        fig.layout.updatemenus[0].buttons[0].args[1]["transition"]["duration"] = transition_ms
        fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = frame_ms
    return fig

In [122]:
# Build the animated scatter; the returned figure is this cell's output
animated_2comp(pcas)