# Data Cleaning, Feature Engineering Notebook

In [1]:
# Standard imports
import numpy as np
import pandas as pd

# Export data
import pickle

# Scotus class object
from app.scotus_class import scotus

In [2]:
# Load the rulings table; the first CSV column is used as the row index
df = pd.read_csv(
    'scotus_rulings.csv',
    index_col=0,
)

## Clean data and engineer metric
Two ways of engineering/cleaning the case features are shown below.  The first treats each vote as either siding with the majority or dissenting.  The second maintains the level of concurrence/dissent of a justice.  In either case, a justice can hold multiple opinions on a single case: they can simultaneously agree with the decision of the majority opinion but hold separate reasoning on the ruling (filing a concurrence).  In the second encoding, 2 means agreement and 1 means filing a concurrence (and similarly -2 means dissent and -1 means filing a concurrence/dissent).  The metric used here tries to capture the difference between those opinions by taking the mean (rounded down) of a justice's opinion codes in a case, after removing multiple concurrences/dissents filed in that specific case.

Note: since cosine similarity will be used, 0 and 1 cannot be used to encode siding with the majority and dissenting against the majority.
- Example: if \\(A = [0, 1, 0]\\) and \\(B = [0, 0, 0]\\) then \\(similarity = A \cdot B/(||A||*||B||) = (0*0 + 1*0 + 0*0)/(||A||*||B||)\\), whose numerator is 0 (and, since \\(||B|| = 0\\), the quotient is strictly undefined)
    - So despite "agreeing" on 2 of 3 cases, justice A and justice B will be perceived as having zero similarity

In [17]:
# Map code string to metric function (binary)
def string_2_ints(s):
    """Collapse one vote code into a binary majority/dissent metric.

    Parameters
    ----------
    s : str, float or None
        Vote code for one justice on one case.  Every decimal digit in the
        string is read as one opinion code; all other characters are
        ignored.  ``'X'`` (no vote), ``None`` and NaN mean "no vote".

    Returns
    -------
    int or float
        ``-1`` if any digit is greater than 2 (any dissent is assigned as a
        dissent against the majority opinion), ``1`` if every digit is 2 or
        lower, and ``np.nan`` when there is no vote or the code contains no
        digit at all.
    """
    # Missing value or the special no-vote marker.  NaN is detected by value
    # rather than by identity so that any float NaN object is recognised.
    if s is None or (isinstance(s, float) and np.isnan(s)) or s == 'X':
        return np.nan

    # Keep only the decimal digits; duplicates carry no extra information.
    digits = {int(ch) for ch in s if ch.isdecimal()}
    if not digits:
        return np.nan

    # The result must not depend on the order in which the codes appear:
    # a single dissent code is enough to classify the vote as a dissent.
    return -1 if max(digits) > 2 else 1

# Map code string to metric function (multi-class)
def string_2_multi(s):
    """Map one vote code to a graded agreement/dissent metric.

    The distinct decimal digits in the code are averaged, the mean is
    rounded down, and the resulting level is translated as
    ``1 -> 2``, ``2 -> 1``, ``3 -> -1``, ``4 -> -2``.

    Parameters
    ----------
    s : str, float or None
        Vote code for one justice on one case.  Non-digit characters are
        ignored.  ``'X'`` (no vote), ``None`` and NaN mean "no vote".

    Returns
    -------
    int or float
        One of ``2``, ``1``, ``-1``, ``-2``, or ``np.nan`` when there is no
        vote, the code contains no digit, or the rounded-down mean is not
        one of the levels 1-4.
    """
    # Missing value or the special no-vote marker.  NaN is detected by value
    # rather than by identity so that any float NaN object is recognised.
    if s is None or (isinstance(s, float) and np.isnan(s)) or s == 'X':
        return np.nan

    # Distinct digits only: repeated concurrences/dissents count once.
    digits = {int(ch) for ch in s if ch.isdecimal()}
    if not digits:
        # Nothing to average; avoids taking the mean of an empty sequence.
        return np.nan

    # Floor of the mean; integer floor division is exact for these
    # non-negative integers.
    level = sum(digits) // len(digits)

    # Unknown levels fall back to NaN instead of leaving the result unset.
    return {1: 2, 2: 1, 3: -1, 4: -2}.get(level, np.nan)

In [18]:
# Apply functions to scraped data.
# otypes=[float] pins the output dtype.  Without it np.vectorize infers the
# dtype from the result for the first cell only, so an integer first result
# would make later NaN results impossible to store.
adj_df = pd.DataFrame(np.vectorize(string_2_ints, otypes=[float])(df), index=df.index)
mul_df = pd.DataFrame(np.vectorize(string_2_multi, otypes=[float])(df), index=df.index)

In [19]:
# Preview of the binary (majority / dissent) encoding
adj_df.head(n=3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1638,1639,1640,1641,1642,1643,1644,1645,1646,1647
Alito,,,,,,,,,,,...,-1.0,-1.0,-1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Breyer,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,-1.0,1.0,-1.0,-1.0,-1.0
Ginsburg,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,-1.0,1.0,-1.0,-1.0,1.0


In [20]:
# Preview of the multi-class (graded concurrence / dissent) encoding
mul_df.head(n=3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1638,1639,1640,1641,1642,1643,1644,1645,1646,1647
Alito,,,,,,,,,,,...,-2.0,-2.0,-1.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0
Breyer,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,1.0,-1.0,2.0,-2.0,-2.0,-2.0
Ginsburg,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,1.0,-1.0,2.0,-2.0,-2.0,2.0
