In [1]:
import os
import pandas as pd

# NOTE(review): machine-specific absolute Windows path -- this cell fails on any
# other machine. The relative 'data/...' and 'eda/...' paths used by the
# functions below are resolved against this working directory.
# TODO: make the project root configurable instead of hard-coding it.
os.chdir('C:\\Users\\Jonas\\Desktop\\UChicago\\term_6\\AdvancedMachineLearning\\supreme_court_nlp')

In [2]:
def get_judge_baseline(year_lb, year_ub, show_counts=True):
    """
    Compute the baseline accuracy by judge for a given range of years.

    The baseline of a judge is the share of that judge's votes that fall on
    the judge's most frequent vote value, i.e. the accuracy obtained by always
    predicting that value.

    Inputs:
        - year_lb (int): Lower bound year (part of the input file name)
        - year_ub (int): Upper bound year (part of the input file name)
        - show_counts (bool): If True (default, the historical behaviour),
            display the table of vote counts per vote value and judge

    Outputs:
        - baselines (pd.DataFrame): One row per judge with at least one
            recorded vote, indexed by the 'votes_side_*' column name, with
            columns 'baseline_side' (most frequent vote value) and
            'baseline' (share of the judge's votes with that value)
    """
    # Load the data
    ut = pd.read_csv(f'data/utterances_clean{year_lb}-{year_ub}.csv')

    vote_cols = [col for col in ut.columns if col.startswith('votes_side_')]

    # Collapse to one row per case_id (first non-null value of each column)
    case_votes = ut.groupby('case_id').first()[vote_cols]

    # Rows: vote values, columns: judges, cells: number of cases
    vote_counts = case_votes.apply(
        lambda vote: vote.value_counts(dropna=True)
    )
    if show_counts:
        # `display` is the IPython/Jupyter builtin
        display(vote_counts)

    # Judges without any recorded vote in this file have all-NaN columns.
    # Drop them before calling idxmax: idxmax on all-NaN data is deprecated
    # in pandas 2.x and slated to raise.
    voted = vote_counts.dropna(axis=1, how='all')
    if voted.empty:
        return pd.DataFrame(columns=['baseline_side', 'baseline'])

    # Compute the baseline
    return pd.DataFrame(
        {
            'baseline_side': voted.idxmax(axis=0),
            'baseline': voted.max(axis=0) / voted.sum(axis=0)
        }
    )

In [3]:
def get_year_baseline(year_lb, year_ub):
    """
    Compute the baseline accuracy at the case level
    for a given range of years

    Inputs:
        - year_lb (int): Lower bound year (part of the input file name)
        - year_ub (int): Upper bound year (part of the input file name)

    Outputs:
        - baseline_side (int): Most frequent 'win_side' value across cases
        - baseline (float): Share of the cases with a non-missing
            'win_side' that have that value

    Raises:
        - ValueError: If the file holds no non-missing 'win_side' value
    """
    # Load the data and keep one 'win_side' value per case
    # (first non-null value within each case_id)
    case_sides = pd.read_csv(
        f'data/utterances_clean{year_lb}-{year_ub}.csv',
        usecols=['case_id', 'win_side']
    ).groupby(
        'case_id'
    )['win_side'].first()

    # Count on the Series rather than on a one-column DataFrame so that
    # idxmax() yields the plain value instead of a one-element tuple
    counts = case_sides.value_counts()
    if counts.empty:
        raise ValueError(
            f'No win_side values found for years {year_lb}-{year_ub}'
        )
    return int(counts.idxmax()), counts.max() / counts.sum()

In [4]:
def get_all_judge_baselines(year_bounds=None):
    """
    Get the judge baselines for all year ranges and save them
    to 'eda/judge_baselines.csv'

    Input:
        - year_bounds (list of (int, int) tuples, optional): The
            (lower bound year, upper bound year) ranges to process.
            Defaults to the nine ranges this notebook has always used.

    Output:
        - baselines (pd.DataFrame): baselines
            for the different judges across the
            years, with columns 'lb_year', 'ub_year',
            'justice', 'baseline_side' and 'baseline'
    """
    if year_bounds is None:
        year_bounds = [
            (1956, 1956),
            (1957, 1960),
            (1961, 1961),
            (1993, 1993),
            (1994, 1998),
            (1999, 1999),
            (2013, 2013),
            (2014, 2018),
            (2019, 2019)
        ]

    prefix = 'votes_side_'
    columns = ['lb_year', 'ub_year', 'justice', 'baseline_side', 'baseline']

    frames = []
    for year_lb, year_ub in year_bounds:
        yr_baseline = get_judge_baseline(
            year_lb,
            year_ub
        )
        # Remove the column prefix only. str.strip(prefix) would treat the
        # prefix as a set of characters and also eat matching characters at
        # the end of the name (e.g. '..._white' -> '..._wh').
        justices = [
            name[len(prefix):] if name.startswith(prefix) else name
            for name in yr_baseline.index
        ]
        frames.append(
            yr_baseline.assign(
                justice=justices,
                lb_year=year_lb,
                ub_year=year_ub
            )
        )

    # Concatenate once instead of growing a frame inside the loop
    if frames:
        baselines = pd.concat(frames, axis=0).reset_index(drop=True)[columns]
    else:
        baselines = pd.DataFrame(columns=columns)

    # Make sure the output folder exists before writing
    os.makedirs('eda', exist_ok=True)
    baselines.to_csv('eda/judge_baselines.csv', index=False)
    return baselines

# Build the per-justice baselines for every year range (also written to
# eda/judge_baselines.csv); the returned frame is the cell's output.
get_all_judge_baselines()

  ut = pd.read_csv(f'data/utterances_clean{year_lb}-{year_ub}.csv')


Unnamed: 0,votes_side_j__abe_fortas,votes_side_j__anthony_m_kennedy,votes_side_j__antonin_scalia,votes_side_j__arthur_j_goldberg,votes_side_j__brett_m_kavanaugh,votes_side_j__byron_r_white,votes_side_j__charles_e_whittaker,votes_side_j__clarence_thomas,votes_side_j__david_h_souter,votes_side_j__earl_warren,...,votes_side_j__sherman_minton,votes_side_j__sonia_sotomayor,votes_side_j__stanley_reed,votes_side_j__stephen_g_breyer,votes_side_j__thurgood_marshall,votes_side_j__tom_c_clark,votes_side_j__warren_e_burger,votes_side_j__william_h_rehnquist,votes_side_j__william_j_brennan_jr,votes_side_j__william_o_douglas
-1.0,,,,,,,,,,1,...,,,,,,1,,,1,1
0.0,,,,,,,18.0,,,35,...,1.0,,24.0,,,53,,,36,35
1.0,,,,,,,21.0,,,76,...,,,17.0,,,56,,,73,78


  ut = pd.read_csv(f'data/utterances_clean{year_lb}-{year_ub}.csv')


Unnamed: 0,votes_side_j__abe_fortas,votes_side_j__anthony_m_kennedy,votes_side_j__antonin_scalia,votes_side_j__arthur_j_goldberg,votes_side_j__brett_m_kavanaugh,votes_side_j__byron_r_white,votes_side_j__charles_e_whittaker,votes_side_j__clarence_thomas,votes_side_j__david_h_souter,votes_side_j__earl_warren,...,votes_side_j__sherman_minton,votes_side_j__sonia_sotomayor,votes_side_j__stanley_reed,votes_side_j__stephen_g_breyer,votes_side_j__thurgood_marshall,votes_side_j__tom_c_clark,votes_side_j__warren_e_burger,votes_side_j__william_h_rehnquist,votes_side_j__william_j_brennan_jr,votes_side_j__william_o_douglas
-1.0,,,,,,,8,,,8,...,,,,,,8,,,8,8
0.0,,,,,,,239,,,151,...,,,,,,236,,,148,143
1.0,,,,,,,233,,,316,...,,,,,,231,,,324,329


  ut = pd.read_csv(f'data/utterances_clean{year_lb}-{year_ub}.csv')


Unnamed: 0,votes_side_j__abe_fortas,votes_side_j__anthony_m_kennedy,votes_side_j__antonin_scalia,votes_side_j__arthur_j_goldberg,votes_side_j__brett_m_kavanaugh,votes_side_j__byron_r_white,votes_side_j__charles_e_whittaker,votes_side_j__clarence_thomas,votes_side_j__david_h_souter,votes_side_j__earl_warren,...,votes_side_j__sherman_minton,votes_side_j__sonia_sotomayor,votes_side_j__stanley_reed,votes_side_j__stephen_g_breyer,votes_side_j__thurgood_marshall,votes_side_j__tom_c_clark,votes_side_j__warren_e_burger,votes_side_j__william_h_rehnquist,votes_side_j__william_j_brennan_jr,votes_side_j__william_o_douglas
0.0,,,,2,,5,18,,,22,...,,,,,,45,,,29,24
1.0,,,,4,,13,16,,,79,...,,,,,,57,,,73,78


  ut = pd.read_csv(f'data/utterances_clean{year_lb}-{year_ub}.csv')


Unnamed: 0,votes_side_j__abe_fortas,votes_side_j__anthony_m_kennedy,votes_side_j__antonin_scalia,votes_side_j__arthur_j_goldberg,votes_side_j__brett_m_kavanaugh,votes_side_j__byron_r_white,votes_side_j__charles_e_whittaker,votes_side_j__clarence_thomas,votes_side_j__david_h_souter,votes_side_j__earl_warren,...,votes_side_j__sherman_minton,votes_side_j__sonia_sotomayor,votes_side_j__stanley_reed,votes_side_j__stephen_g_breyer,votes_side_j__thurgood_marshall,votes_side_j__tom_c_clark,votes_side_j__warren_e_burger,votes_side_j__william_h_rehnquist,votes_side_j__william_j_brennan_jr,votes_side_j__william_o_douglas
-1.0,,1,1,,,,,1,1,,...,,,,,,,,1,,
0.0,,34,39,,,,,33,36,,...,,,,,,,,42,,
1.0,,46,41,,,,,47,44,,...,,,,,,,,38,,


  ut = pd.read_csv(f'data/utterances_clean{year_lb}-{year_ub}.csv')


Unnamed: 0,votes_side_j__abe_fortas,votes_side_j__anthony_m_kennedy,votes_side_j__antonin_scalia,votes_side_j__arthur_j_goldberg,votes_side_j__brett_m_kavanaugh,votes_side_j__byron_r_white,votes_side_j__charles_e_whittaker,votes_side_j__clarence_thomas,votes_side_j__david_h_souter,votes_side_j__earl_warren,...,votes_side_j__sherman_minton,votes_side_j__sonia_sotomayor,votes_side_j__stanley_reed,votes_side_j__stephen_g_breyer,votes_side_j__thurgood_marshall,votes_side_j__tom_c_clark,votes_side_j__warren_e_burger,votes_side_j__william_h_rehnquist,votes_side_j__william_j_brennan_jr,votes_side_j__william_o_douglas
1.0,,245,233,,,,,221,235,,...,,,,238,,,,230,,
0.0,,162,173,,,,,185,172,,...,,,,164,,,,177,,
-1.0,,2,2,,,,,2,2,,...,,,,2,,,,2,,


  ut = pd.read_csv(f'data/utterances_clean{year_lb}-{year_ub}.csv')


Unnamed: 0,votes_side_j__abe_fortas,votes_side_j__anthony_m_kennedy,votes_side_j__antonin_scalia,votes_side_j__arthur_j_goldberg,votes_side_j__brett_m_kavanaugh,votes_side_j__byron_r_white,votes_side_j__charles_e_whittaker,votes_side_j__clarence_thomas,votes_side_j__david_h_souter,votes_side_j__earl_warren,...,votes_side_j__sherman_minton,votes_side_j__sonia_sotomayor,votes_side_j__stanley_reed,votes_side_j__stephen_g_breyer,votes_side_j__thurgood_marshall,votes_side_j__tom_c_clark,votes_side_j__warren_e_burger,votes_side_j__william_h_rehnquist,votes_side_j__william_j_brennan_jr,votes_side_j__william_o_douglas
1.0,,41,43,,,,,43,43,,...,,,,45,,,,44,,
0.0,,32,30,,,,,30,29,,...,,,,28,,,,29,,
-1.0,,1,1,,,,,1,1,,...,,,,1,,,,1,,


Unnamed: 0,votes_side_j__abe_fortas,votes_side_j__anthony_m_kennedy,votes_side_j__antonin_scalia,votes_side_j__arthur_j_goldberg,votes_side_j__brett_m_kavanaugh,votes_side_j__byron_r_white,votes_side_j__charles_e_whittaker,votes_side_j__clarence_thomas,votes_side_j__david_h_souter,votes_side_j__earl_warren,...,votes_side_j__sherman_minton,votes_side_j__sonia_sotomayor,votes_side_j__stanley_reed,votes_side_j__stephen_g_breyer,votes_side_j__thurgood_marshall,votes_side_j__tom_c_clark,votes_side_j__warren_e_burger,votes_side_j__william_h_rehnquist,votes_side_j__william_j_brennan_jr,votes_side_j__william_o_douglas
1.0,,43,43,,,,,41,,,...,,41,,40,,,,,,
0.0,,23,23,,,,,25,,,...,,24,,25,,,,,,


  ut = pd.read_csv(f'data/utterances_clean{year_lb}-{year_ub}.csv')


Unnamed: 0,votes_side_j__abe_fortas,votes_side_j__anthony_m_kennedy,votes_side_j__antonin_scalia,votes_side_j__arthur_j_goldberg,votes_side_j__brett_m_kavanaugh,votes_side_j__byron_r_white,votes_side_j__charles_e_whittaker,votes_side_j__clarence_thomas,votes_side_j__david_h_souter,votes_side_j__earl_warren,...,votes_side_j__sherman_minton,votes_side_j__sonia_sotomayor,votes_side_j__stanley_reed,votes_side_j__stephen_g_breyer,votes_side_j__thurgood_marshall,votes_side_j__tom_c_clark,votes_side_j__warren_e_burger,votes_side_j__william_h_rehnquist,votes_side_j__william_j_brennan_jr,votes_side_j__william_o_douglas
-1.0,,6,,,,,,6,,,...,,6,,6,,,,,,
0.0,,82,23.0,,21.0,,,143,,,...,,120,,119,,,,,,
1.0,,158,43.0,,40.0,,,166,,,...,,187,,190,,,,,,


  ut = pd.read_csv(f'data/utterances_clean{year_lb}-{year_ub}.csv')


Unnamed: 0,votes_side_j__abe_fortas,votes_side_j__anthony_m_kennedy,votes_side_j__antonin_scalia,votes_side_j__arthur_j_goldberg,votes_side_j__brett_m_kavanaugh,votes_side_j__byron_r_white,votes_side_j__charles_e_whittaker,votes_side_j__clarence_thomas,votes_side_j__david_h_souter,votes_side_j__earl_warren,...,votes_side_j__sherman_minton,votes_side_j__sonia_sotomayor,votes_side_j__stanley_reed,votes_side_j__stephen_g_breyer,votes_side_j__thurgood_marshall,votes_side_j__tom_c_clark,votes_side_j__warren_e_burger,votes_side_j__william_h_rehnquist,votes_side_j__william_j_brennan_jr,votes_side_j__william_o_douglas
1.0,,,,,32,,,28,,,...,,28,,38,,,,,,
0.0,,,,,22,,,27,,,...,,26,,17,,,,,,
-1.0,,,,,2,,,2,,,...,,2,,2,,,,,,


Unnamed: 0,lb_year,ub_year,justice,baseline_side,baseline
0,1956,1956,j__charles_e_whittaker,1.0,0.538462
1,1956,1956,j__earl_warren,1.0,0.678571
2,1956,1956,j__felix_frankfurter,1.0,0.530435
3,1956,1956,j__harold_burton,0.0,0.526316
4,1956,1956,j__hugo_l_black,1.0,0.672897
...,...,...,...,...,...
83,2019,2019,j__neil_gorsuch,1.0,0.543860
84,2019,2019,j__ruth_bader_ginsburg,1.0,0.631579
85,2019,2019,j__samuel_a_alito_jr,1.0,0.508772
86,2019,2019,j__sonia_sotomayor,1.0,0.500000


In [5]:
def get_all_year_baselines(year_bounds=None):
    """
    Get the case-level baselines for all year ranges and save them
    to 'eda/year_baselines.csv'

    Input:
        - year_bounds (list of (int, int) tuples, optional): The
            (lower bound year, upper bound year) ranges to process.
            Defaults to the nine ranges this notebook has always used.

    Output:
        - baselines (pd.DataFrame): one row per year range with
            columns 'lb_year', 'ub_year', 'baseline_side' and
            'baseline', as returned by get_year_baseline
    """
    if year_bounds is None:
        year_bounds = [
            (1956, 1956),
            (1957, 1960),
            (1961, 1961),
            (1993, 1993),
            (1994, 1998),
            (1999, 1999),
            (2013, 2013),
            (2014, 2018),
            (2019, 2019)
        ]

    # Collect plain records and build the frame once instead of
    # concatenating a one-row frame per iteration
    records = []
    for year_lb, year_ub in year_bounds:
        baseline_side, baseline = get_year_baseline(
            year_lb,
            year_ub
        )
        records.append(
            {
                'lb_year': year_lb,
                'ub_year': year_ub,
                'baseline_side': baseline_side,
                'baseline': baseline
            }
        )

    baselines = pd.DataFrame(
        records,
        columns=['lb_year', 'ub_year', 'baseline_side', 'baseline']
    )

    # Make sure the output folder exists before writing
    os.makedirs('eda', exist_ok=True)
    baselines.to_csv('eda/year_baselines.csv', index=False)
    return baselines

# Build the case-level baselines for every year range (also written to
# eda/year_baselines.csv); the returned frame is the cell's output.
get_all_year_baselines()

Unnamed: 0,lb_year,ub_year,baseline_side,baseline
0,1956,1956,1,0.626087
1,1957,1960,1,0.582988
2,1961,1961,1,0.705882
3,1993,1993,1,0.506173
4,1994,1998,1,0.601467
5,1999,1999,1,0.594595
6,2013,2013,1,0.681818
7,2014,2018,1,0.64127
8,2019,2019,1,0.596491
