In [38]:
import os
import json
import pandas as pd

In [39]:
def flatten_case_info(input_file):
    '''
    Flattens json from https://convokit.cornell.edu/documentation/supreme.html

    Note: use only with case information json

    The file is read as JSON Lines: every non-blank line is parsed as one
    JSON object (one case) and turned into a single flat dictionary.

    Per case, the flat dictionary contains:
        * the scalar fields listed in ``single_issues_to_keep``;
        * ``advocates_<n>_id`` / ``advocates_<n>_side`` for the n-th advocate
          (numbered from 1 in the order they appear; the advocate's name,
          which is the key in the source json, is dropped);
        * ``votes_side_<judge>`` for every entry of ``votes_side``.
    Everything else (e.g. ``transcripts``) is discarded.

    Input: case information json file (path; passed through ``str()``)

    Returns: list of flattened dictionaries, one per case, in file order
    '''
    advocate_info_to_keep = ['id', 'side']
    single_issues_to_keep = ['id', 'year', 'title', 'petitioner', 'respondent', 'adv_sides_inferred', 'known_respondent_adv', 'win_side', 'is_eq_divided']
    output = []
    # JSON text is UTF-8; do not depend on the platform default encoding
    # (which is a legacy code page on Windows).
    with open(str(input_file), 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which json.loads would otherwise reject.
            if not line.strip():
                continue
            case_info = json.loads(line)  # Make sure to load the line in as a dict not a str
            output_d = {}
            for k, v in case_info.items():
                if not isinstance(v, (dict, list)):
                    # Scalar field: keep only the whitelisted ones
                    if k in single_issues_to_keep:
                        output_d[k] = v
                elif k == "transcripts":
                    continue  # Don't need
                elif k == "advocates":
                    # Ignore actual name of attorney (key); number advocates from 1
                    for j, person_info in enumerate(v.values(), start=1):
                        for key, identifier in person_info.items():
                            if key in advocate_info_to_keep:
                                long_key = k + "_" + str(j) + "_" + key
                                output_d[long_key] = identifier
                elif k == 'votes_side':  # Let us know with petitioner only
                    for judge, vote in v.items():
                        long_key = k + "_" + judge
                        output_d[long_key] = vote
            output.append(output_d)
    return output

In [40]:
def count_columns(df_cols, prefix, divisor=1):
    '''
    Count the column names that begin with a prefix, scaled down by a divisor.

    Inputs:
        df_cols (list of str): col names to check
        prefix (str): prefix each col name is tested against
        divisor (int): the number of matches is floor-divided by this value,
            e.g. pass 2 when every logical item owns two columns

    Returns:
        cnt (int): number of matching columns // divisor
    '''
    matches = sum(1 for name in df_cols if name.startswith(prefix))
    return matches // divisor

In [41]:
def filter_columns(df_cols, prefix):
    '''
    Select the column names that begin with a prefix, in sorted order.

    Inputs:
        df_cols (list of str): col names to check
        prefix (str): prefix each col name is tested against

    Returns:
        cols (list): matching col names, sorted ascending
    '''
    return sorted(name for name in df_cols if name.startswith(prefix))

In [42]:
# Locate cases.jsonl in the current working directory.
# os.path.join picks the right separator for the platform and avoids the
# invalid "\c" escape sequence that a hand-written "\cases.jsonl" relies on.
curr_path = os.getcwd()
input_file = os.path.join(curr_path, "cases.jsonl")
print(input_file, type(input_file))

c:\Users\matth\OneDrive\Documents\Harris\2nd_year\3rd_quarter\Machine_Learning_(NLP)\final_project\supreme_court_nlp\case_info_parsing\cases.jsonl <class 'str'>


In [43]:
# Flatten every case record in input_file into one dict per case,
# then build a DataFrame with one row per case.
list_of_dict = flatten_case_info(input_file)
df = pd.DataFrame(list_of_dict)
# Bare expression so the notebook renders the frame as this cell's output.
df

Unnamed: 0,id,year,title,petitioner,respondent,adv_sides_inferred,known_respondent_adv,advocates_1_id,advocates_1_side,advocates_2_id,...,votes_side_j__david_h_souter,votes_side_j__clarence_thomas,votes_side_j__ruth_bader_ginsburg,votes_side_j__stephen_g_breyer,votes_side_j__john_g_roberts_jr,votes_side_j__samuel_a_alito_jr,votes_side_j__sonia_sotomayor,votes_side_j__elena_kagan,votes_side_j__neil_gorsuch,votes_side_j__brett_m_kavanaugh
0,1955_71,1955,Affronti v. United States,Affronti,United States,True,True,harry_f_murphy,1.0,john_v_lindsay,...,,,,,,,,,,
1,1955_410,1955,"American Airlines, Inc. v. North American Airl...","American Airlines, Inc.","North American Airlines, Inc.",True,True,howard_c_westwood,1.0,walter_j_derenberg,...,,,,,,,,,,
2,1955_351,1955,Archawski v. Hanioti,Archawski,Hanioti,True,False,harry_d_graham,3.0,israel_convisser,...,,,,,,,,,,
3,1955_38,1955,Armstrong v. Armstrong,Armstrong,Armstrong,True,False,robert_n_gorman,3.0,walter_k_sibbald,...,,,,,,,,,,
4,1955_49,1955,"Bernhardt v. Polygraphic Company of America, Inc.",Bernhardt,"Polygraphic Company of America, Inc.",True,False,manfred_w_ehrich_jr,3.0,joseph_a_mcnamara,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7743,2019_19-46,2019,U.S. Patent and Trademark Office v. Booking.co...,United States Patent and Trademark Office,Booking.com B.V.,False,True,erica_l_ross,1.0,lisa_s_blatt,...,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7744,2019_19-177,2019,United States Agency for International Develop...,United States Agency for International Develop...,"Alliance for Open Society International, Inc.,...",False,True,christopher_g_michel,1.0,david_w_bowker,...,,1.0,0.0,0.0,1.0,1.0,0.0,,1.0,1.0
7745,2019_18-1584,2019,United States Forest Service v. Cowpasture Riv...,"United States Forest Service, et al.","Cowpasture River Association, et al.",False,True,anthony_a_yang,1.0,paul_d_clement,...,,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
7746,2019_19-67,2019,United States v. Sineneng-Smith,United States of America,Evelyn Sineneng-Smith,False,True,eric_j_feigin,1.0,mark_c_fleming,...,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [44]:
cols = df.columns
# Each advocate contributes two columns (id, side), so halve the number of
# 'advocates_' columns to get the number of advocate slots.
max_advocates = count_columns(cols, prefix='advocates_', divisor=2)

In [45]:
# Advocate column names in numeric order: advocates_1_id, advocates_1_side,
# advocates_2_id, ... up to max_advocates.
advocate_subcols = ['id', 'side']
advocate_cols = [
    f"advocates_{advocate_num}_{subcol}"
    for advocate_num in range(1, max_advocates + 1)
    for subcol in advocate_subcols
]

In [46]:
# Case-level columns to keep (win_side_detail is intentionally left out).
init_cols = [
    'id',
    'year',
    'title',
    'petitioner',
    'respondent',
    'adv_sides_inferred',
    'known_respondent_adv',
    'win_side',
    'is_eq_divided',
]

# Per-justice vote columns, sorted by name.
# Some cases have no votes such as https://www.oyez.org/cases/1964/17-orig
votes_side = filter_columns(cols, "votes_side_j")

# Final column order: case info, then votes, then advocates.
cols_to_keep = [*init_cols, *votes_side, *advocate_cols]

In [47]:
# Keep only the selected columns, in the order given by cols_to_keep.
df = df.loc[:, cols_to_keep]

In [48]:
# Write the selected columns to a CSV at this relative path;
# index=False leaves the DataFrame's row index out of the file.
df.to_csv('case_info_relevant_cols_only.csv', index=False)