In [1]:
import os
import pandas as pd

# Project root. Set the SUPREME_COURT_NLP_DIR environment variable to run the
# notebook on another machine; otherwise the original local checkout is used.
# All later 'data/...' and 'eda/...' paths are relative to this directory.
os.chdir(os.environ.get(
    'SUPREME_COURT_NLP_DIR',
    'C:\\Users\\Jonas\\Desktop\\UChicago\\term_6\\AdvancedMachineLearning\\supreme_court_nlp'
))

In [2]:
def get_judge_baseline(year_lb, year_ub):
    """
    Compute the majority-side baseline accuracy of every justice for the
    data file covering a given range of years.

    Each case contributes one vote per justice (the first non-null value of
    the justice's ``votes_side_*`` column within the case). A justice's
    baseline is the share of their votes cast for their most frequent side.

    Inputs:
        - year_lb (int): Lower bound year (part of the file name)
        - year_ub (int): Upper bound year (part of the file name)

    Outputs:
        - baselines (pd.DataFrame): One row per justice with at least one
            vote, indexed by the ``votes_side_*`` column name, with columns
            ``baseline_side`` (most frequent side) and ``baseline`` (share
            of votes for that side)
    """
    # Load the data
    ut = pd.read_csv(f'data/utterances_clean{year_lb}-{year_ub}.csv')
    vote_cols = [col for col in ut.columns if col.startswith('votes_side_')]

    # Collapse to one row per case, then count the votes by side per justice
    vote_counts = ut.groupby('case_id').first()[vote_cols].apply(
        lambda vote: vote.value_counts(dropna=True)
    )

    # Share of the most frequent side; justices without any vote give NaN
    # (NaN / 0) and are dropped here
    shares = vote_counts.apply(
        lambda col: col.max() / col.sum(),
        axis=0
    ).dropna()
    if shares.empty:
        # Nobody voted in this file: return an empty, correctly shaped frame
        return pd.DataFrame(columns=['baseline_side', 'baseline'])

    baselines = pd.DataFrame({'baseline': shares})

    # Only look up the winning side for justices that actually voted:
    # idxmax on an all-NaN column is deprecated (pandas >= 2.1) and is
    # slated to raise instead of returning NaN.
    baselines['baseline_side'] = vote_counts[list(shares.index)].apply(
        lambda col: col.idxmax(),
        axis=0
    )

    return baselines[['baseline_side', 'baseline']]

In [3]:
def get_year_baseline(year_lb, year_ub):
    """
    Case-level majority baseline for the data file covering a range of years.

    Inputs:
        - year_lb (int): Lower bound year (part of the file name)
        - year_ub (int): Upper bound year (part of the file name)

    Outputs:
        - baseline_side (int): the most frequent winning side across cases
        - baseline (float): share of cases won by that side
    """
    source = f'data/utterances_clean{year_lb}-{year_ub}.csv'
    outcomes = pd.read_csv(source, usecols=['case_id', 'win_side'])

    # Collapse the utterance-level rows to a single row per case
    per_case = outcomes.groupby('case_id').first()

    # Tally the cases by winning side; the tally is keyed by tuples,
    # hence the [0] when extracting the side
    tally = per_case.value_counts()
    top_key = tally.idxmax()
    majority_share = tally.max() / tally.sum()
    return int(top_key[0]), majority_share

In [4]:
def get_all_judge_baselines():
    """
    Get the judge baselines for all year ranges and save them to
    eda/judge_baselines.csv

    Input:
        - None

    Output:
        - baselines (pd.DataFrame): one row per justice and year range,
            with columns lb_year, ub_year, justice, baseline_side
            and baseline
    """
    year_bounds = [
        (1955, 1955),
        (1956, 1960),
        (1961, 1961),
        (1993, 1993),
        (1994, 1998),
        (1999, 1999),
        (2013, 2013),
        (2014, 2018),
        (2019, 2019)
    ]
    prefix = 'votes_side_'

    frames = []
    for year_lb, year_ub in year_bounds:
        yr_baseline = get_judge_baseline(
                year_lb,
                year_ub
            )
        # Remove the column prefix to get the justice id. str.strip() must
        # not be used here: it strips a *set of characters* from both ends,
        # so names ending in v/o/t/e/s/_/i/d were truncated
        # (e.g. 'j__clarence_thomas' -> 'j__clarence_thoma').
        yr_baseline['justice'] = [
            name[len(prefix):] if name.startswith(prefix) else name
            for name in yr_baseline.index
        ]
        yr_baseline['lb_year'], yr_baseline['ub_year'] = year_lb, year_ub
        frames.append(yr_baseline)

    # Concatenate once instead of growing a frame inside the loop
    baselines = pd.concat(frames, axis=0).reset_index(drop=True)
    baselines = baselines[['lb_year', 'ub_year', 'justice', 'baseline_side', 'baseline']]
    baselines.to_csv('eda/judge_baselines.csv', index=False)
    return baselines

# Build the per-period justice baselines (also written to
# eda/judge_baselines.csv) and display the resulting table.
get_all_judge_baselines()

  ut = pd.read_csv(f'data/utterances_clean{year_lb}-{year_ub}.csv')
  ut = pd.read_csv(f'data/utterances_clean{year_lb}-{year_ub}.csv')
  ut = pd.read_csv(f'data/utterances_clean{year_lb}-{year_ub}.csv')
  ut = pd.read_csv(f'data/utterances_clean{year_lb}-{year_ub}.csv')
  ut = pd.read_csv(f'data/utterances_clean{year_lb}-{year_ub}.csv')
  ut = pd.read_csv(f'data/utterances_clean{year_lb}-{year_ub}.csv')
  ut = pd.read_csv(f'data/utterances_clean{year_lb}-{year_ub}.csv')
  ut = pd.read_csv(f'data/utterances_clean{year_lb}-{year_ub}.csv')
  ut = pd.read_csv(f'data/utterances_clean{year_lb}-{year_ub}.csv')


Unnamed: 0,lb_year,ub_year,justice,baseline_side,baseline
0,1955,1955,j__earl_warren,1.0,0.604396
1,1955,1955,j__felix_frankfurter,0.0,0.505376
2,1955,1955,j__harold_burton,0.0,0.548387
3,1955,1955,j__hugo_l_black,1.0,0.634409
4,1955,1955,j__john_m_harlan2,0.0,0.520000
...,...,...,...,...,...
84,2019,2019,j__neil_gorsuch,1.0,0.543860
85,2019,2019,j__ruth_bader_ginsburg,1.0,0.631579
86,2019,2019,j__samuel_a_alito_jr,1.0,0.508772
87,2019,2019,j__sonia_sotomayor,1.0,0.500000


In [5]:
def get_all_year_baselines():
    """
    Get the case-level baselines for all year ranges and save them to
    eda/year_baselines.csv

    Expects get_year_baseline to return a (baseline_side, baseline) pair.

    Input:
        - None

    Output:
        - baselines (pd.DataFrame): one row per year range, with columns
            lb_year, ub_year, baseline_side and baseline
    """
    year_bounds = [
        (1955, 1955),
        (1956, 1960),
        (1961, 1961),
        (1993, 1993),
        (1994, 1998),
        (1999, 1999),
        (2013, 2013),
        (2014, 2018),
        (2019, 2019)
    ]

    # Collect plain records and build the frame once, rather than
    # concatenating one-row frames onto an empty DataFrame in the loop
    records = []
    for year_lb, year_ub in year_bounds:
        baseline_side, baseline = get_year_baseline(
            year_lb,
            year_ub
        )
        records.append(
            {
                'lb_year': year_lb,
                'ub_year': year_ub,
                'baseline_side': baseline_side,
                'baseline': baseline
            }
        )

    baselines = pd.DataFrame(
        records,
        columns=['lb_year', 'ub_year', 'baseline_side', 'baseline']
    )
    baselines.to_csv('eda/year_baselines.csv', index=False)
    return baselines

# Build the per-period case-level baselines (also written to
# eda/year_baselines.csv) and display the resulting table.
get_all_year_baselines()

Unnamed: 0,lb_year,ub_year,baseline_side,baseline
0,1955,1955,1,0.516129
1,1956,1960,1,0.59129
2,1961,1961,1,0.705882
3,1993,1993,1,0.506173
4,1994,1998,1,0.601467
5,1999,1999,1,0.594595
6,2013,2013,1,0.681818
7,2014,2018,1,0.64127
8,2019,2019,1,0.596491


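# Different Format: One Baseline per Year

The cells below redefine the functions above (same names) so that the baselines are computed for each individual year instead of once per year range.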

In [6]:
def get_judge_baseline(year_lb, year_ub):
    """
    Compute the majority-side baseline accuracy of every justice for each
    individual year in a given range of years.

    Within a year, each case contributes one vote per justice (the first
    non-null value of the justice's ``votes_side_*`` column within the
    case). A justice's baseline is the share of their votes in that year
    cast for their most frequent side.

    Inputs:
        - year_lb (int): Lower bound year (inclusive)
        - year_ub (int): Upper bound year (inclusive)

    Outputs:
        - baselines (pd.DataFrame): One row per (year, justice) with at
            least one vote, indexed by the ``votes_side_*`` column name,
            with columns ``year``, ``baseline_side`` and ``baseline``.
            Years without any case or vote contribute no rows.
    """
    # Load the data
    ut = pd.read_csv(f'data/utterances_clean{year_lb}-{year_ub}.csv')
    vote_cols = [col for col in ut.columns if col.startswith('votes_side_')]
    out_cols = ['year', 'baseline_side', 'baseline']

    yearly = []
    for year in range(year_lb, year_ub + 1):
        year_rows = ut[ut['year'] == year]
        if year_rows.empty:
            # No case recorded for this year in the file
            continue

        # Collapse to one row per case, then count the votes by side
        vote_counts = year_rows.groupby('case_id').first()[vote_cols].apply(
            lambda vote: vote.value_counts(dropna=True)
        )

        # Share of the most frequent side; justices without any vote give
        # NaN (NaN / 0) and are dropped here
        shares = vote_counts.apply(
            lambda col: col.max() / col.sum(),
            axis=0
        ).dropna()
        if shares.empty:
            continue

        baseline = pd.DataFrame({'baseline': shares})

        # Only look up the winning side for justices that actually voted:
        # idxmax on an all-NaN column is deprecated (pandas >= 2.1) and is
        # slated to raise instead of returning NaN.
        baseline['baseline_side'] = vote_counts[list(shares.index)].apply(
            lambda col: col.idxmax(),
            axis=0
        )
        baseline['year'] = year
        yearly.append(baseline[out_cols])

    if not yearly:
        return pd.DataFrame(columns=out_cols)

    # Concatenate once instead of growing a frame inside the loop
    return pd.concat(yearly)

In [7]:
def get_all_judge_baselines():
    """
    Get the per-year judge baselines for all year ranges and save them to
    eda/judge_baselines.csv

    Input:
        - None

    Output:
        - baselines (pd.DataFrame): one row per justice and year, with
            columns year, justice, baseline_side and baseline
    """
    year_bounds = [
        (1955, 1955),
        (1956, 1960),
        (1961, 1961),
        (1993, 1993),
        (1994, 1998),
        (1999, 1999),
        (2013, 2013),
        (2014, 2018),
        (2019, 2019)
    ]
    prefix = 'votes_side_'

    frames = []
    for year_lb, year_ub in year_bounds:
        yr_baseline = get_judge_baseline(
                year_lb,
                year_ub
            )
        # Remove the column prefix to get the justice id. str.strip() must
        # not be used here: it strips a *set of characters* from both ends,
        # so names ending in v/o/t/e/s/_/i/d were truncated
        # (e.g. 'j__clarence_thomas' -> 'j__clarence_thoma').
        yr_baseline['justice'] = [
            name[len(prefix):] if name.startswith(prefix) else name
            for name in yr_baseline.index
        ]
        frames.append(yr_baseline)

    # Concatenate once instead of growing a frame inside the loop
    baselines = pd.concat(frames, axis=0).reset_index(drop=True)
    baselines = baselines[['year', 'justice', 'baseline_side', 'baseline']]
    baselines.to_csv('eda/judge_baselines.csv', index=False)
    return baselines

# Build the per-year justice baselines and display the table.
# NOTE: this writes to eda/judge_baselines.csv, the same path used by the
# per-period version earlier in the notebook, so that file is overwritten.
get_all_judge_baselines()

  ut = pd.read_csv(f'data/utterances_clean{year_lb}-{year_ub}.csv')
  ut = pd.read_csv(f'data/utterances_clean{year_lb}-{year_ub}.csv')
  ut = pd.read_csv(f'data/utterances_clean{year_lb}-{year_ub}.csv')
  ut = pd.read_csv(f'data/utterances_clean{year_lb}-{year_ub}.csv')
  ut = pd.read_csv(f'data/utterances_clean{year_lb}-{year_ub}.csv')
  ut = pd.read_csv(f'data/utterances_clean{year_lb}-{year_ub}.csv')
  ut = pd.read_csv(f'data/utterances_clean{year_lb}-{year_ub}.csv')
  ut = pd.read_csv(f'data/utterances_clean{year_lb}-{year_ub}.csv')
  ut = pd.read_csv(f'data/utterances_clean{year_lb}-{year_ub}.csv')


Unnamed: 0,year,justice,baseline_side,baseline
0,1955,j__earl_warren,1.0,0.604396
1,1955,j__felix_frankfurter,0.0,0.505376
2,1955,j__harold_burton,0.0,0.548387
3,1955,j__hugo_l_black,1.0,0.634409
4,1955,j__john_m_harlan2,0.0,0.520000
...,...,...,...,...
189,2019,j__neil_gorsuch,1.0,0.543860
190,2019,j__ruth_bader_ginsburg,1.0,0.631579
191,2019,j__samuel_a_alito_jr,1.0,0.508772
192,2019,j__sonia_sotomayor,1.0,0.500000


In [8]:
def get_year_baseline(year_lb, year_ub):
    """
    Compute the case-level majority baseline for each individual year
    in a given range of years.

    Inputs:
        - year_lb (int): Lower bound year (inclusive)
        - year_ub (int): Upper bound year (inclusive)

    Outputs:
        - baselines (pd.DataFrame): one row per year that has at least one
            case with a recorded win side, with columns ``year``,
            ``baseline_side`` (most frequent winning side) and ``baseline``
            (share of cases won by that side). The index is not reset
            (every row keeps label 0); callers reset it.
    """
    # Load the data
    ut = pd.read_csv(
        f'data/utterances_clean{year_lb}-{year_ub}.csv',
        usecols=['year', 'case_id', 'win_side']
    )

    frames = []
    for year in range(year_lb, year_ub + 1):
        # One row per case decided in this year
        per_case = ut[ut['year'] == year][['case_id', 'win_side']].groupby(
            'case_id'
        ).first()

        # A year without any decided case has an empty tally, on which
        # idxmax raises; skip it instead of aborting the whole range
        if not per_case['win_side'].notna().any():
            continue

        counts = per_case.value_counts()
        frames.append(
            pd.DataFrame({
                'year': [year],
                'baseline_side': [int(counts.idxmax()[0])],
                'baseline': [counts.max() / counts.sum()]
            })
        )

    if not frames:
        return pd.DataFrame(columns=['year', 'baseline_side', 'baseline'])

    # Concatenate once instead of growing a frame inside the loop
    return pd.concat(frames, axis=0)

In [9]:
def get_all_year_baselines():
    """
    Get the per-year case-level baselines for all year ranges and save
    them to eda/year_baselines.csv

    Expects get_year_baseline to return a DataFrame (one row per year).

    Input:
        - None

    Output:
        - baselines (pd.DataFrame): one row per year across all the
            year ranges, as returned by get_year_baseline, with a
            fresh 0..n-1 index
    """
    year_bounds = [
        (1955, 1955),
        (1956, 1960),
        (1961, 1961),
        (1993, 1993),
        (1994, 1998),
        (1999, 1999),
        (2013, 2013),
        (2014, 2018),
        (2019, 2019)
    ]

    # Gather the per-range frames and concatenate once, rather than
    # concatenating onto an empty DataFrame inside the loop
    frames = [
        get_year_baseline(year_lb, year_ub)
        for year_lb, year_ub in year_bounds
    ]
    baselines = pd.concat(frames, axis=0).reset_index(drop=True)
    baselines.to_csv('eda/year_baselines.csv', index=False)
    return baselines

# Build the per-year case-level baselines, then show summary statistics.
# NOTE: this writes to eda/year_baselines.csv, the same path used by the
# per-period version earlier in the notebook, so that file is overwritten.
baselines = get_all_year_baselines()
baselines.describe()

Unnamed: 0,year,baseline_side,baseline
count,21.0,21.0,21.0
mean,1990.0,1.0,0.609801
std,24.734591,0.0,0.066152
min,1955.0,1.0,0.506173
25%,1960.0,1.0,0.565789
50%,1996.0,1.0,0.596491
75%,2014.0,1.0,0.641975
max,2019.0,1.0,0.758065
