# Data Cleaning, Feature Engineering Notebook

In [1]:
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA

from scipy.spatial.distance import correlation, cosine
from sklearn.metrics import pairwise_distances
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import NearestNeighbors

import matplotlib.pyplot as plt
import plotly.express as px

In [2]:
# Load the raw vote codes; the first CSV column is used as the row index.
RULINGS_CSV = 'scotus_rulings.csv'
df = pd.read_csv(RULINGS_CSV, index_col=0)

## Clean data and engineer metric
As seen in the sample rows, a justice can hold multiple opinions on a case.  They can simultaneously agree with the decision of the majority opinion but give separate reasoning for the ruling (filing a concurrence).  The codes are: 1 for agreement with the majority, 2 for filing a concurrence, 3 for filing a concurrence/dissent, and 4 for dissent.  The metric used here tries to capture the difference between those opinions by taking the mean of the distinct codes a justice has on a case (so filing multiple concurrences or dissents in a specific case is counted only once).

In [3]:
# Map code string to metric function
def string_2_ints(s):
    '''
    Convert one raw vote-code cell into a single numeric metric.

    Every character of `s` that parses as an integer is treated as one
    opinion code.  Repeated codes are collapsed (the same digit appearing
    several times counts once) and the mean of the distinct codes is returned.

    Parameters
    ----------
    s : str, float NaN or None
        Raw cell value, e.g. '1', '12' or 'X'.

    Returns
    -------
    float
        Mean of the distinct digits in `s`; np.nan when the cell is missing,
        is 'X' (no vote), or contains no digit at all.
    '''
    # Missing cell.  Test by value rather than identity: a NaN read from a
    # file is not guaranteed to be the very same object as np.nan.
    if s is None or (isinstance(s, float) and np.isnan(s)):
        return np.nan

    # Special case 'X' is no vote
    if s == 'X':
        return np.nan

    # Keep each distinct character that is a digit; anything else is skipped.
    codes = []
    for ch in set(s):
        try:
            codes.append(int(ch))
        except ValueError:
            continue

    # No digits found: return NaN explicitly instead of letting np.mean([])
    # emit a RuntimeWarning.
    if not codes:
        return np.nan

    return np.mean(codes)

In [61]:
# Apply the vote-code metric to every cell of the raw table.
# `otypes` pins the result dtype to float so it is not inferred from the
# first cell, and lets the call succeed on an empty frame.
adj_df = pd.DataFrame(
    np.vectorize(string_2_ints, otypes=[float])(df),
    index=df.index,
    columns=df.columns,
)

In [84]:
# Case names where both Alito and Breyer have a value; NaN in every other slot.
alito_has_vote = adj_df.loc['Alito'].notna()
breyer_has_vote = adj_df.loc['Breyer'].notna()
a_n_b = np.where(alito_has_vote & breyer_has_vote, adj_df.columns, np.nan)

In [133]:
# Drop the NaN placeholders, then show both justices side by side on the shared cases.
shared_cases = [case for case in a_n_b if str(case) != 'nan']
adj_df.loc[['Alito', 'Breyer'], shared_cases]

Unnamed: 0,"Ministry of Defense of Iran v. Elahi, 546 U.S. 450","Ash v. Tyson Foods, Inc., 546 U.S. 454","Lance v. Dennis, 546 U.S. 459","Gonzales v. Thomas, 547 U.S. 183","Salinas v. United States, 547 U.S. 188","Northern Ins. Co. of N.Y. v. Chatham County, 547 U.S. 189","Day v. McDonough, 547 U.S. 198","Ark. Dep't of Human Servs. v. Ahlborn, 547 U.S. 268","Marshall v. Marshall, 547 U.S. 293","Holmes v. South Carolina, 547 U.S. 319",...,"Bostock v. Clayton County, 590 U.S. ___","Andrus v. Texas, 590 U.S. ___","Department of Homeland Security v. Regents of Univ. of Cal., 591 U.S. ___","Liu v. SEC, 591 U.S. ___","Department of Homeland Security v. Thuraissigiam, 591 U.S. ___","Seila Law v. Consumer Financial Protection Bureau, 591 U.S. ___","June Medical Services, LLC v. Russo, 591 U.S. ___","Agency for Int’l Development v. Alliance for Open Society, 591 U.S. ___","Espinoza v. Montana Dept. of Revenue, 591 U.S. ___","Patent and Trademark Office v. Booking.com B. V., 591 U.S. ___"
Alito,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,4.0,4.0,3.0,1.0,1.0,1.0,2.0,1.0,1.5,1.0
Breyer,1.0,1.0,1.0,1.0,1.0,1.0,4.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,2.0,3.0,1.0,4.0,4.0,4.0


In [109]:
# Alito's metric on every case where he has a value.
alito_cases = adj_df.loc['Alito'].dropna().index
adj_df.loc['Alito', alito_cases]

Ministry of Defense of Iran v. Elahi, 546 U.S. 450                         1.0
Ash v. Tyson Foods, Inc., 546 U.S. 454                                     1.0
Lance v. Dennis, 546 U.S. 459                                              1.0
Gonzales v. Thomas, 547 U.S. 183                                           1.0
Salinas v. United States, 547 U.S. 188                                     1.0
                                                                          ... 
Seila Law v. Consumer Financial Protection Bureau, 591 U.S. ___            1.0
June Medical Services, LLC v. Russo, 591 U.S. ___                          2.0
Agency for Int’l Development v. Alliance for Open Society, 591 U.S. ___    1.0
Espinoza v. Montana Dept. of Revenue, 591 U.S. ___                         1.5
Patent and Trademark Office v. Booking.com B. V., 591 U.S. ___             1.0
Name: Alito, Length: 1100, dtype: float64

In [172]:
class scotus_rulings(object):
    '''
    Wrapper around a justice-by-case table of vote metrics.

    The frame has one row per justice (the row labels are used as the justice
    names) and one column per case.  A missing value means the justice has no
    vote recorded for that case.
    '''

    def __init__(self, df):
        '''Store the table and remember its row labels as the list of justices.'''
        self.df = df
        self.justices = list(df.index)

    def __len__(self):
        '''Number of cases (columns) in the table.'''
        return len(self.df.columns)

    def __str__(self):
        '''Two-line summary: number of justices and number of cases.'''
        return f'Justices: {len(self.justices)}\nCases: {len(self.df.columns)}'

    def _shared_votes(self, justice_a, justice_b):
        '''Return both justices' rows restricted to the cases where neither value is missing.'''
        votes_a = self.df.loc[justice_a]
        votes_b = self.df.loc[justice_b]
        # Positional boolean mask, so selection does not depend on column labels.
        both_voted = (votes_a.notna() & votes_b.notna()).to_numpy()
        return votes_a[both_voted], votes_b[both_voted]

    def compare_justices(self, justice_a, justice_b):
        '''
        Compare cosine similarity of justices

        Only cases where both justices have a value are used.

        Parameters
        ----------
        justice_a, justice_b : row labels of the table

        Returns
        -------
        float
            1 - cosine distance between the two justices' shared votes.

        Raises
        ------
        AssertionError
            If either name is not a row of the table, or the two justices
            have no case in common.
        '''
        assert justice_a in self.justices, f'\n{justice_a} not a justice between 1999-2019.\nChoose one of {self.justices}'
        assert justice_b in self.justices, f'\n{justice_b} not a justice between 1999-2019.\nChoose one of {self.justices}'

        votes_a, votes_b = self._shared_votes(justice_a, justice_b)
        assert len(votes_a) != 0, 'Justices have no rulings together'

        return 1 - cosine(votes_a, votes_b)

    def justice_term(self, justice):
        '''
        Print the cosine similarity between `justice` and every other justice.

        Justices who share no case with `justice` are listed afterwards,
        one line each.

        Raises
        ------
        AssertionError
            If `justice` is not a row of the table.
        '''
        assert justice in self.justices, f'Not a justice between 1999-2019.\nChoose one of {self.justices}'
        others = list(self.justices)
        others.remove(justice)

        no_sim = []
        for other in others:
            votes_a, votes_b = self._shared_votes(justice, other)
            if len(votes_a) != 0:
                print(f'{justice} and {other}:', 1 - cosine(votes_a, votes_b))
            else:
                no_sim.append(other)

        # These pairs have no overlapping cases, so the message must say "no rulings".
        for other in no_sim:
            print(f'{justice} has no rulings with {other}')

In [173]:
# Wrap the numeric vote table so justices can be compared.
rulings = scotus_rulings(df=adj_df)

In [174]:
# Cosine similarity between two justices over the cases they share.
rulings.compare_justices(justice_a='Sotomayor', justice_b='Ginsburg')

0.8918936001703518

In [176]:
# Print Breyer's similarity to every other justice in the table.
rulings.justice_term(justice='Breyer')

Breyer and Alito: 0.6433383015848825
Breyer and Ginsburg: 0.875795062644884
Breyer and Gorsuch: 0.6124863898976299
Breyer and Kagan: 0.8789712873715966
Breyer and Kavanaugh: 0.6849185723168632
Breyer and Kennedy: 0.7030884163405637
Breyer and O'Connor: 0.7573244557202456
Breyer and Rehnquist: 0.6495090553306012
Breyer and Roberts: 0.6608209550736741
Breyer and Scalia: 0.6210925640100434
Breyer and Sotomayor: 0.8652402601786241
Breyer and Souter: 0.8618637586285807
Breyer and Stevens: 0.8539002396401306
Breyer and Thomas: 0.618518267966187


In [5]:
# Restrict to the last N case columns, remove the listed justices' rows, then
# drop every case where one of the remaining justices has no value.
# NOTE(review): 117 is presumably the number of cases heard by the current
# nine justices — confirm against the data.
N_RECENT_CASES = 117
DEPARTED_JUSTICES = ['Kennedy', "O'Connor", 'Rehnquist', 'Scalia', 'Souter', 'Stevens']

recent_cases = adj_df.iloc[:, -N_RECENT_CASES:]
current = recent_cases.drop(DEPARTED_JUSTICES).dropna(axis=1)

In [8]:
# Plain array of the vote metrics (rows: justices, columns: cases) for PCA.
X = current.to_numpy()

In [9]:
# Cumulative share of variance explained by the first n principal components.
# One fit with all components is enough: the running total of
# explained_variance_ratio_ gives the figure for each n, so PCA does not need
# to be refit for every n.  Keeping all components also avoids the error
# raised when a fixed n exceeds min(n_samples, n_features).
MAX_COMPONENTS_SHOWN = 9

pca = PCA()
pca.fit(X)
cumulative_pct = np.cumsum(pca.explained_variance_ratio_) * 100

print('Explained Variance:')
print('-------------------')
for n, pct in enumerate(cumulative_pct[:MAX_COMPONENTS_SHOWN], start=1):
    print(f'{n} components:', round(pct, 2))

Explained Variance:
-------------------
1 components: 52.8
2 components: 69.2
3 components: 80.03
4 components: 85.83
5 components: 90.6
6 components: 94.81
7 components: 98.0
8 components: 100.0
9 components: 100.0


In [14]:
# Project each justice's votes onto the first three principal components.
pca = PCA(n_components=3)
projected = pca.fit_transform(X)
comps = pd.DataFrame(projected, index=current.index)

In [15]:
# Show the three principal-component coordinates for each justice.
comps

Unnamed: 0,0,1,2
Alito,9.642828,-1.827207,-3.781748
Breyer,-7.898764,-2.551288,-2.697548
Ginsburg,-8.747425,1.365728,-1.515708
Gorsuch,4.934267,7.376875,6.444388
Kagan,-7.506348,0.124858,-0.054139
Kavanaugh,4.825547,-5.271002,2.846693
Roberts,4.695101,-6.466081,2.992429
Sotomayor,-9.440348,2.063625,0.944497
Thomas,9.495141,5.184492,-5.178864


In [19]:
# Interactive 3-D scatter of the justices in principal-component space,
# each point labelled with the justice's name.  A title is set so the figure
# can be read on its own.
fig = px.scatter_3d(
    comps,
    x=0,
    y=1,
    z=2,
    text=comps.index,
    title='Justices projected onto the first three principal components',
)
fig.show()