In [7]:
import os
import json
import pandas as pd

In [8]:
def flatten_case_info(input_file):
    '''
    Flattens the case information JSON Lines file from
    https://convokit.cornell.edu/documentation/supreme.html

    Note: use only with the case information json (one JSON object per line).

    For every case the flattened dictionary keeps:
        * the scalar fields listed in single_issues_to_keep;
        * for each advocate, the fields listed in advocate_info_to_keep,
          stored as 'advocates_<n>_<field>' where <n> is the 1-based position
          of the advocate within the case (the advocate's name is dropped);
        * each entry of 'votes_side', stored as 'votes_side_<judge>'.
    Everything else (including 'transcripts') is discarded.

    Input:
        input_file (str or path-like): case information json file

    Returns: list of flattened dictionaries, one per case
    '''
    advocate_info_to_keep = ['id', 'side']
    single_issues_to_keep = ['id', 'year', 'title', 'petitioner', 'respondent', 'adv_sides_inferred', 'known_respondent_adv', 'win_side', 'is_eq_divided']
    output = []
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(str(input_file), 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing empty line), which
            # json.loads would otherwise reject.
            if not line.strip():
                continue
            case_info = json.loads(line)  # Make sure to load the line in as a dict not a str
            output_d = {}
            for k, v in case_info.items():
                if not isinstance(v, (dict, list)):
                    # Scalar value (including None): keep only whitelisted fields
                    if k in single_issues_to_keep:
                        output_d[k] = v
                elif k == "transcripts":
                    continue  # Don't need
                elif k == "advocates":
                    # Ignore actual name of attorney (key); number advocates from 1
                    for j, person_info in enumerate(v.values(), start=1):
                        for key, identifier in person_info.items():
                            if key in advocate_info_to_keep:
                                long_key = k + "_" + str(j) + "_" + key
                                output_d[long_key] = identifier
                elif k == 'votes_side':
                    # One column per judge. Original note: "Let us know with
                    # petitioner only" -- presumably the value encodes whether
                    # the judge sided with the petitioner; confirm against the
                    # ConvoKit documentation.
                    for judge, vote in v.items():
                        long_key = k + "_" + judge
                        output_d[long_key] = vote
            output.append(output_d)
    return output

In [9]:
def count_columns(df_cols, prefix, divisor=1):
    '''
    Count the column names that start with a prefix, floor-divided by divisor.

    Inputs:
        df_cols (list of str): col names to check
        prefix (str): prefix each col name is tested against
        divisor (int): the raw count is floor-divided by this before being
            returned; the default of 1 returns the raw count

    Returns:
        cnt (int): number of matching col names // divisor
    '''
    matches = sum(1 for name in df_cols if name.startswith(prefix))
    return matches // divisor

In [10]:
def filter_columns(df_cols, prefix):
    '''
    Select the column names that start with a prefix, in alphabetical order.

    Inputs:
        df_cols (list of str): col names to check
        prefix (str): prefix each col name is tested against

    Returns:
        cols (list): the matching col names, sorted ascending
    '''
    matching = [name for name in df_cols if name.startswith(prefix)]
    matching.sort()
    return matching

In [None]:
# The case file is looked up in the directory the notebook is run from.
curr_path = os.getcwd()
input_file = os.path.join(curr_path, "cases.jsonl")
# Sanity check: show the resolved path and its type.
print(f"{input_file} {type(input_file)}")

In [None]:
# Flatten every case into a dict, then build a table with one row per case.
# Keys that are absent from a given case become missing values in its row.
list_of_dict = flatten_case_info(input_file)
df = pd.DataFrame(list_of_dict)
df  # last expression of the cell: displays the DataFrame

In [13]:
cols = df.columns
# Each advocate slot contributes two columns (advocates_<n>_id and
# advocates_<n>_side), so halving the number of "advocates_" columns gives the
# number of advocate slots -- assuming every slot has both columns.
max_advocates = count_columns(df_cols=cols, prefix='advocates_', divisor=2)

In [14]:
# Build the advocate column names in numeric order (1, 2, ..., max_advocates),
# with the id column before the side column for each advocate. Generating the
# names avoids alphabetical sorting, which would place 10 before 2.
advocate_subcols = ['id', 'side']
advocate_cols = [
    "advocates_" + str(advocate_num) + "_" + subcol
    for advocate_num in range(1, max_advocates + 1)
    for subcol in advocate_subcols
]

In [15]:
# Case-level columns to keep; the win side detail field is deliberately left out.
init_cols = [
    'id',
    'year',
    'title',
    'petitioner',
    'respondent',
    'adv_sides_inferred',
    'known_respondent_adv',
    'win_side',
    'is_eq_divided',
]

# Per-judge vote columns, sorted alphabetically.
# Some cases have no votes, such as https://www.oyez.org/cases/1964/17-orig
votes_side = filter_columns(cols, "votes_side_j")

# Final column order: case fields, then judge votes, then advocates.
cols_to_keep = [*init_cols, *votes_side, *advocate_cols]

In [16]:
# Keep only the columns in cols_to_keep, in that order. This rebinds df, and
# raises KeyError if any listed column is missing from the table.
df = df[cols_to_keep]

In [17]:
# Save the trimmed table as CSV (relative path, so it lands in the current
# working directory); index=False leaves the row index out of the file.
df.to_csv('case_info_relevant_cols_only.csv', index=False)