In [1]:
import os
import json
import pandas as pd

In [2]:
def flatten_case_info(input_file):
    '''
    Flattens the case information JSON Lines file from
    https://convokit.cornell.edu/documentation/supreme.html

    Note: use only with the case information file (one JSON object per line).

    Flattening rules, applied to each top-level key of a case:
        - value that is neither a dict nor a list: copied under the same key
        - "transcripts" (list of dicts): stored as "transcripts_<n>_<field>",
          with n counting from 1 in list order
        - "advocates" (dict of dicts): stored as "advocates_<n>_<field>",
          with n counting from 1 in dict order; the outer key (the
          attorney's name) is dropped
        - any other dict (e.g. the per-justice vote mappings): stored as
          "<key>_<inner key>"

    Input:
        input_file: path to the case information file; passed through str()
            before opening. Blank lines in the file are skipped.

    Returns:
        output (list of dict): one flattened dictionary per case, in file order

    Side effect:
        Prints the largest number of advocates and of transcripts seen on any
        single case, separated by a space.
    '''
    max_num_advocates = 0
    max_num_transcripts = 0
    output = []
    # JSON text is UTF-8; do not rely on the platform's default codec
    # (which is not UTF-8 on Windows) when decoding case titles and names.
    with open(str(input_file), 'r', encoding='utf-8') as f:
        for line in f:
            # A trailing newline or stray blank line is not a record.
            if not line.strip():
                continue
            case_info = json.loads(line)  # Load the line as a dict, not a str
            output_d = {}
            for k, v in case_info.items():
                if not isinstance(v, (dict, list)):
                    output_d[k] = v
                elif k == "transcripts":
                    for i, transcript_info in enumerate(v, start=1):
                        for k2, v2 in transcript_info.items():
                            output_d[f"{k}_{i}_{k2}"] = v2
                    max_num_transcripts = max(max_num_transcripts, len(v))
                elif k == "advocates":
                    # Ignore actual name of attorney (the outer key)
                    for j, person_info in enumerate(v.values(), start=1):
                        for key, identifier in person_info.items():
                            output_d[f"{k}_{j}_{key}"] = identifier
                    max_num_advocates = max(max_num_advocates, len(v))
                else:
                    # Remaining dict-valued fields are keyed by judge.
                    for judge, vote in v.items():
                        output_d[f"{k}_{judge}"] = vote
            output.append(output_d)
    print(max_num_advocates, max_num_transcripts)
    return output

In [3]:
def count_columns(df_cols, prefix, divisor=1):
    '''
    Count the column names that begin with a prefix, scaled down by a divisor.

    Useful when one logical entity spans several columns: pass the number of
    columns per entity as `divisor` to get the number of entities.

    Inputs:
        df_cols (iterable of str): col names to check
        prefix (str): prefix each col name is tested against
        divisor (int): the match count is floor-divided by this (default 1)

    Returns:
        (int): number of matching col names // divisor
    '''
    matching = [name for name in df_cols if name.startswith(prefix)]
    return len(matching) // divisor

In [4]:
def filter_columns(df_cols, prefix):
    '''
    Select the column names that begin with a prefix, in sorted order.

    Inputs:
        df_cols (iterable of str): col names to check
        prefix (str): prefix each col name is tested against

    Returns:
        (list of str): the matching col names, sorted ascending
    '''
    selected = [name for name in df_cols if name.startswith(prefix)]
    selected.sort()
    return selected

In [5]:
# Locate the case file next to the notebook's working directory.
curr_path = os.getcwd()
# os.path.join inserts the separator for the current OS. A hand-written
# "\cases.jsonl" suffix only works on Windows and depends on "\c" being an
# unrecognised escape sequence, which Python warns about.
input_file = os.path.join(curr_path, "cases.jsonl")
print(input_file, type(input_file))

c:\Users\matth\OneDrive\Documents\Harris\2nd_year\3rd_quarter\Machine_Learning_(NLP)\final_project\cases.jsonl <class 'str'>


In [6]:
# Flatten every case record into one dict per case; the call also prints the
# largest advocate and transcript counts it encountered.
list_of_dict = flatten_case_info(input_file)
# One row per case. Column set is the union of keys across all the dicts, so
# a key a case does not have is left missing in that row.
df = pd.DataFrame(list_of_dict)
# Display the frame as the cell output.
df

21 9


Unnamed: 0,id,year,citation,title,petitioner,respondent,docket_no,court,decided_date,url,...,votes_side_j__sonia_sotomayor,votes_j__elena_kagan,votes_detail_j__elena_kagan,votes_side_j__elena_kagan,votes_j__neil_gorsuch,votes_detail_j__neil_gorsuch,votes_side_j__neil_gorsuch,votes_j__brett_m_kavanaugh,votes_detail_j__brett_m_kavanaugh,votes_side_j__brett_m_kavanaugh
0,1955_71,1955,350 US 79,Affronti v. United States,Affronti,United States,71,Warren Court,"Dec 5, 1955",https://www.oyez.org/cases/1955/71,...,,,,,,,,,,
1,1955_410,1955,351 US 79,"American Airlines, Inc. v. North American Airl...","American Airlines, Inc.","North American Airlines, Inc.",410,Warren Court,"Apr 23, 1956",https://www.oyez.org/cases/1955/410,...,,,,,,,,,,
2,1955_351,1955,350 US 532,Archawski v. Hanioti,Archawski,Hanioti,351,Warren Court,"Apr 9, 1956",https://www.oyez.org/cases/1955/351,...,,,,,,,,,,
3,1955_38,1955,350 US 568,Armstrong v. Armstrong,Armstrong,Armstrong,38,Warren Court,"Apr 9, 1956",https://www.oyez.org/cases/1955/38,...,,,,,,,,,,
4,1955_49,1955,350 US 198,"Bernhardt v. Polygraphic Company of America, Inc.",Bernhardt,"Polygraphic Company of America, Inc.",49,Warren Court,"Jan 16, 1956",https://www.oyez.org/cases/1955/49,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7743,2019_19-46,2019,591 US _,U.S. Patent and Trademark Office v. Booking.co...,United States Patent and Trademark Office,Booking.com B.V.,19-46,Roberts Court,"Jun 30, 2020",https://www.oyez.org/cases/2019/19-46,...,0.0,2.0,1.0,0.0,2.0,1.0,0.0,2.0,1.0,0.0
7744,2019_19-177,2019,591 US _,United States Agency for International Develop...,United States Agency for International Develop...,"Alliance for Open Society International, Inc.,...",19-177,Roberts Court,"Jun 29, 2020",https://www.oyez.org/cases/2019/19-177,...,0.0,-1.0,-1.0,,2.0,1.0,1.0,2.0,1.0,1.0
7745,2019_18-1584,2019,590 US _,United States Forest Service v. Cowpasture Riv...,"United States Forest Service, et al.","Cowpasture River Association, et al.",18-1584,Roberts Court,"Jun 15, 2020",https://www.oyez.org/cases/2019/18-1584,...,0.0,1.0,2.0,0.0,2.0,1.0,1.0,2.0,1.0,1.0
7746,2019_19-67,2019,590 US _,United States v. Sineneng-Smith,United States of America,Evelyn Sineneng-Smith,19-67,Roberts Court,"May 7, 2020",https://www.oyez.org/cases/2019/19-67,...,1.0,2.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0


In [7]:
cols = df.columns

# Every advocate slot contributes 4 columns (id, name, side, role), so the
# number of slots is the number of 'advocates_' columns divided by 4.
max_advocates = count_columns(cols, 'advocates_', divisor=4)

# Every transcript slot contributes 4 columns (name, url, id, case_id).
max_transcripts = count_columns(cols, 'transcripts_', divisor=4)

# The three vote families are counted column-by-column (no divisor).
max_votes = count_columns(cols, 'votes_j')
max_votes_detail = count_columns(cols, 'votes_detail_')
max_votes_side = count_columns(cols, 'votes_side_j')

In [8]:
# Sanity check: the three vote column families should each have the same
# number of columns. Displays True when the counts agree.
max_votes_side == max_votes == max_votes_detail # Same number of judges in each family

True

In [9]:
# Fields recorded for each advocate, in the order they should appear.
advocate_subcols = ['id', 'name', 'side', 'role']

# "advocates_<slot>_<field>" for every slot 1..max_advocates, slot-major.
advocate_cols = [
    f"advocates_{slot}_{field}"
    for slot in range(1, max_advocates + 1)
    for field in advocate_subcols
]

In [10]:
# Fields recorded for each transcript, in the order they should appear.
transcript_subcols = ['id', 'case_id', 'name', 'url']

# "transcripts_<slot>_<field>" for every slot 1..max_transcripts, slot-major.
transcript_cols = [
    f"transcripts_{slot}_{field}"
    for slot in range(1, max_transcripts + 1)
    for field in transcript_subcols
]

In [11]:
# Column groups, named by where they sit in the final column order.
init_cols = [
    "id",
    "year",
    "citation",
    "title",
    "petitioner",
    "respondent",
    "docket_no",
    "court",
    "decided_date",
    "url",
]
middle_cols_1 = ["adv_sides_inferred", "known_respondent_adv"]
middle_cols_2 = ["win_side", "win_side_detail", "scdb_docket_id"]
end_cols = ["is_eq_divided"]

# Each vote family is its un-suffixed column followed by the per-justice
# columns in alphabetical order. Some cases have no votes, such as
# https://www.oyez.org/cases/1964/17-orig
votes = ["votes", *filter_columns(cols, "votes_j")]
votes_detail = ["votes_detail", *filter_columns(cols, "votes_detail_")]
votes_side = ["votes_side", *filter_columns(cols, "votes_side_j")]


In [12]:
# Final column order for the exported frame.
output_cols = [
    *init_cols,
    *transcript_cols,
    *middle_cols_1,
    *advocate_cols,
    *middle_cols_2,
    *votes,
    *votes_detail,
    *end_cols,
    *votes_side,
]
# The reordering must name exactly the columns the frame has; displays True
# when nothing is missing and nothing is extra.
set(cols) == set(output_cols)

True

In [13]:
# Reorder the columns into the layout given by output_cols. This rebinds df,
# so the frame displayed by an earlier cell no longer reflects this order.
df = df[output_cols]

In [14]:
# Write the flattened cases to case_info.csv in the current working directory.
# With to_csv's default settings the row index is written as the first
# (unnamed) column.
df.to_csv('case_info.csv')