In [1]:
import os
import shutil
import numpy as np
import pandas as pd
from convokit import Utterance, Corpus, Coordination, download

import ssl
# NOTE(review): this replaces the default HTTPS context factory with the
# unverified one, so HTTPS requests that rely on the default context in this
# kernel skip certificate verification. Presumably a workaround to get the
# corpus download running -- confirm it is still needed before keeping it.
ssl._create_default_https_context = ssl._create_unverified_context

# Set working directory
# NOTE(review): machine-specific absolute path; the notebook only runs as-is
# where this directory exists. Relative paths used later (downloads into
# os.getcwd(), CSV input/output) resolve against this directory.
os.chdir('C:\\Users\\Jonas\\Desktop\\UChicago\\term_6\\AdvancedMachineLearning\\project')

In [2]:
def get_clean_utterances(corpus):
    """
    Build a cleaned utterance table from a corpus.

    The text is flattened to a single line, bookkeeping columns are
    dropped, the 'meta.' prefix is stripped from the column names and an
    'addressing' column is added. 'addressing' is the "lead" counterpart
    of 'reply_to': for every utterance it holds the id of the utterance
    that replies to it (None if nothing replies to it).

    Input:
        corpus: Corpus object (usually from a given year); only its
            get_utterances_dataframe() method is used

    Output:
        utterances (pd.DataFrame): clean dataframe containing utterances,
            indexed like the frame returned by the corpus
    """

    # Fetch the utterances
    utterances = corpus.get_utterances_dataframe()

    # Replace line breaks by spaces so every utterance sits on one line
    utterances['text'] = utterances['text'].str.replace(
        '\n', ' ', regex=False
    )

    # Drop "useless" columns. This has to happen before the rename below,
    # otherwise a 'meta.timestamp' column would collide with 'timestamp'.
    # errors='ignore' tolerates frames that lack some of these columns.
    utterances = utterances.drop(
        columns=[
            'timestamp', 'meta.start_times', 'meta.stop_times', 'vectors'
        ],
        errors='ignore'
    )

    # Rename columns for clarity: 'meta.xyz' -> 'xyz'
    META_PREFIX = 'meta.'
    utterances = utterances.rename(
        columns={
            col: col[len(META_PREFIX):]
            for col in utterances.columns
            if col.startswith(META_PREFIX)
        }
    )

    # Create addressing ("lead" of index): map every replied-to id to the
    # id of the utterance replying to it. If several utterances reply to
    # the same id, the last one in frame order wins. The previous chained
    # assignment (`.loc[reply_to]['addressing'] = idx`) wrote to a
    # temporary row object and could silently leave the column empty.
    replied_by = {}
    for idx, reply_to in utterances['reply_to'].items():
        if pd.notna(reply_to) and reply_to:
            replied_by[reply_to] = idx

    # dtype=object keeps None (rather than NaN) for utterances without a
    # reply; ids in 'reply_to' that are not part of the frame are ignored.
    utterances['addressing'] = pd.Series(
        [replied_by.get(idx) for idx in utterances.index],
        index=utterances.index,
        dtype=object
    )

    return utterances

In [3]:
def aggregate_years(lb_year, ub_year, case_info=True, same_resp_addr=False):
    """
    For a given range of years, the corpus of utterances of every year
    is downloaded and cleaned, and the yearly tables are stacked. If
    desired, information regarding the cases is added. The speakers of
    the utterance replied to and of the utterance addressed are added as
    'speaker_replied_to' and 'speaker_addressed'.

    Side effects: the downloaded 'supreme-<year>' folder and zip file
    are deleted from the current directory again, and the result is
    written to 'utterances_clean<lb_year>-<ub_year>.csv' in the current
    directory.

    Inputs:
        - lb_year (int): Lower bound year (inclusive)
        - ub_year (int): Upper bound year (inclusive)
        - case_info (bool): Whether case information should
            be included (read from 'case_info_relevant_cols_only.csv'
            in the current directory)
        - same_resp_addr (bool): Whether the person responded to
            must also be the person addressed

    Returns:
        - clean_corpus (pd.DataFrame): The clean dataset

    Raises:
        - ValueError: if lb_year is greater than ub_year
    """
    # An empty range would leave nothing to concatenate; fail early with
    # a clear message instead of an unrelated error further down.
    if lb_year > ub_year:
        raise ValueError(
            f'lb_year ({lb_year}) must not be greater than ub_year ({ub_year})'
        )

    yearly_frames = []
    for year in range(lb_year, ub_year + 1):
        corpus_name = f'supreme-{year}'

        # Download the data
        corpus_dir = download(
            corpus_name,
            data_dir=os.getcwd()
        )

        try:
            # Clean a single year; all years are concatenated after the loop
            yearly_frames.append(get_clean_utterances(Corpus(corpus_dir)))
        finally:
            # Delete the downloaded files, even if cleaning failed
            zip_path = f'{corpus_name}.zip'
            if os.path.exists(zip_path):
                os.remove(zip_path)
            if os.path.isdir(corpus_name):
                shutil.rmtree(corpus_name)

    # One concat for all years instead of growing the frame in the loop
    clean_corpus = pd.concat(yearly_frames)

    # Join the case info
    if case_info:
        ci = pd.read_csv(
            'case_info_relevant_cols_only.csv',
            index_col='id'
        )
        clean_corpus = clean_corpus.join(
            ci,
            on='case_id',
            how='left'
        )

    # Add names of those replied and spoken to: look up the speaker of the
    # utterance referenced by 'reply_to' and by 'addressing' respectively
    speakers = clean_corpus[['speaker']]
    clean_corpus = clean_corpus.merge(
        speakers,
        left_on='reply_to',
        right_index=True,
        how='left',
        suffixes=('', '_replied_to')
    ).merge(
        speakers,
        left_on='addressing',
        right_index=True,
        how='left',
        suffixes=('', '_addressed')
    )

    # Replied to be the same as the person addressed
    if same_resp_addr:
        clean_corpus = clean_corpus[
            clean_corpus['speaker_replied_to'] == clean_corpus['speaker_addressed']
        ]

    # Save the resulting datasets in the current directory
    clean_corpus.to_csv(f'utterances_clean{lb_year}-{ub_year}.csv')

    return clean_corpus

# Prerequisites for this cell:
#   * 'case_info_relevant_cols_only.csv' has to be saved in the current
#     directory before running.
#
# ############### VERY IMPORTANT ###############
#   * For the code to work, open util.py of the installed convokit
#     package and set needs_download = True on line 118.
ut = aggregate_years(
    lb_year=1975,
    ub_year=1975,
    case_info=True,
    same_resp_addr=False,
)
display(ut)

Downloading supreme-1975 to C:\Users\Jonas\Desktop\UChicago\term_6\AdvancedMachineLearning\project\supreme-1975
Downloading supreme-1975 from http://zissou.infosci.cornell.edu/convokit/datasets/supreme-corpus/supreme-1975.zip (15.0MB)... Done


Unnamed: 0_level_0,text,speaker,reply_to,conversation_id,case_id,speaker_type,side,timestamp,addressing,year,...,advocates_18_id,advocates_18_side,advocates_19_id,advocates_19_side,advocates_20_id,advocates_20_side,advocates_21_id,advocates_21_side,speaker_replied_to,speaker_addressed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
16360__0_000,We will hear arguments next in Abbott Laborato...,j__warren_e_burger,,16360,1975_74-1274,J,,0.0,16360__0_001,1975,...,,,,,,,,,,james_h_clarke
16360__0_001,Mr. Chief Justice and may it please the Court....,james_h_clarke,16360__0_000,16360,1975_74-1274,A,3,65.016,16360__0_002,1975,...,,,,,,,,,j__warren_e_burger,j__warren_e_burger
16360__0_002,But laying aside all of the others categories ...,j__warren_e_burger,16360__0_001,16360,1975_74-1274,J,,197.289,16360__0_003,1975,...,,,,,,,,,james_h_clarke,james_h_clarke
16360__0_003,This would depend upon the circumstances of a ...,james_h_clarke,16360__0_002,16360,1975_74-1274,A,3,214.563,16360__0_004,1975,...,,,,,,,,,j__warren_e_burger,j__byron_r_white
16360__0_004,What about this refilling prescription in hosp...,j__byron_r_white,16360__0_003,16360,1975_74-1274,J,,253.915,16360__0_005,1975,...,,,,,,,,,james_h_clarke,james_h_clarke
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17904__3_025,"Mrs. Reilly, I have, it we can get it straight...",j__potter_stewart,17904__3_024,17904,1975_75-312,J,,4016.385,17904__3_026,1975,...,,,,,,,,,maureen_pulte_reilly,maureen_pulte_reilly
17904__3_026,"That’s correct, Your Honor.",maureen_pulte_reilly,17904__3_025,17904,1975_75-312,A,1,4055.447,17904__3_027,1975,...,,,,,,,,,j__potter_stewart,j__potter_stewart
17904__3_027,So that this is true but it’s also -- this rec...,j__potter_stewart,17904__3_026,17904,1975_75-312,J,,4057.546,17904__3_028,1975,...,,,,,,,,,maureen_pulte_reilly,maureen_pulte_reilly
17904__3_028,That’s correct. It did not go far enough to ex...,maureen_pulte_reilly,17904__3_027,17904,1975_75-312,A,1,4063.696,17904__3_029,1975,...,,,,,,,,,j__potter_stewart,j__warren_e_burger


In [4]:
# Year ranges to export as (lower bound, upper bound) pairs, both
# inclusive; each pair triggers one aggregate_years call.
year_bounds = [
    (1956, 1956), (1957, 1960), (1961, 1961),
    (1993, 1993), (1994, 1998), (1999, 1999),
    (2013, 2013), (2014, 2018), (2019, 2019),
]

for year_lb, year_ub in year_bounds:
    aggregate_years(
        lb_year=year_lb,
        ub_year=year_ub,
        case_info=True,
        same_resp_addr=False,
    )

Downloading supreme-1956 to C:\Users\Jonas\Desktop\UChicago\term_6\AdvancedMachineLearning\project\supreme-1956
Downloading supreme-1956 from http://zissou.infosci.cornell.edu/convokit/datasets/supreme-corpus/supreme-1956.zip (18.2MB)... Done
Downloading supreme-1957 to C:\Users\Jonas\Desktop\UChicago\term_6\AdvancedMachineLearning\project\supreme-1957
Downloading supreme-1957 from http://zissou.infosci.cornell.edu/convokit/datasets/supreme-corpus/supreme-1957.zip (20.0MB)... Done
Downloading supreme-1958 to C:\Users\Jonas\Desktop\UChicago\term_6\AdvancedMachineLearning\project\supreme-1958
Downloading supreme-1958 from http://zissou.infosci.cornell.edu/convokit/datasets/supreme-corpus/supreme-1958.zip (18.8MB)... Done
Downloading supreme-1959 to C:\Users\Jonas\Desktop\UChicago\term_6\AdvancedMachineLearning\project\supreme-1959
Downloading supreme-1959 from http://zissou.infosci.cornell.edu/convokit/datasets/supreme-corpus/supreme-1959.zip (19.0MB)... Done
Downloading supreme-1960 to 