In [1]:
import os
import shutil
import numpy as np
import pandas as pd
from convokit import Utterance, Corpus, Coordination, download

import ssl
# NOTE(review): this swaps the default HTTPS context factory for the
# unverified one, i.e. certificate verification is switched off for code
# that relies on the default context. Presumably a workaround for
# certificate errors during the corpus download -- confirm it is still needed.
ssl._create_default_https_context = ssl._create_unverified_context

# Set working directory
# NOTE(review): absolute, machine-specific path; the relative paths used in
# the cells below are resolved against it.
os.chdir('C:\\Users\\Jonas\\Desktop\\UChicago\\term_6\\AdvancedMachineLearning\\supreme_court_nlp')

In [2]:
def get_clean_utterances(corpus):
    """
    Builds a cleaned utterance table for a single corpus.

    The text of every utterance is flattened onto one line, bookkeeping
    columns are dropped, the 'meta.' prefix is stripped from metadata
    column names, and an 'addressing' column is added.

    Input:
        corpus: Corpus object (usually from a given year); only its
            get_utterances_dataframe() method is used here

    Output:
        utterances (pd.DataFrame): clean dataframe containing utterances,
            indexed as returned by the corpus. For each utterance,
            'addressing' holds the index label of the utterance that
            replies to it (the last one, if several do), or None when
            no utterance in this frame replies to it.
    """
    META_PREFIX = 'meta.'

    # Fetch the utterances
    utterances = corpus.get_utterances_dataframe()

    # Put every utterance on a single line by turning newlines into spaces
    utterances['text'] = utterances['text'].str.replace('\n', ' ', regex=False)

    # Drop "useless" columns (tolerating frames that lack some of them)
    utterances = utterances.drop(
        columns=['timestamp', 'meta.start_times', 'meta.stop_times', 'vectors'],
        errors='ignore'
    )

    # Rename columns for clarity: 'meta.xyz' becomes 'xyz'
    utterances = utterances.rename(
        columns=lambda col: (
            col[len(META_PREFIX):] if col.startswith(META_PREFIX) else col
        )
    )

    # Create addressing ("lag" of index): invert the reply_to relation.
    # The lookup is built first and the column is assigned in one go;
    # writing through utterances.loc[label][column] would only modify a
    # temporary copy of the row and leave the frame untouched.
    known_ids = set(utterances.index)
    replied_by = {}
    for idx, reply_to in utterances['reply_to'].items():
        # Skip missing targets and targets that are not part of this frame
        if pd.notna(reply_to) and reply_to in known_ids:
            replied_by[reply_to] = idx

    utterances['addressing'] = pd.Series(
        [replied_by.get(idx) for idx in utterances.index],
        index=utterances.index,
        dtype=object  # keep None (instead of NaN) for "nobody replied"
    )

    return utterances

In [3]:
def aggregate_years(lb_year, ub_year, case_info=True, same_resp_addr=False):
    """
    For a given range of years, the corpus of utterances is
    downloaded and cleaned. If desired, information regarding
    the cases is added.

    Each yearly corpus is downloaded into the current working
    directory and deleted again once it has been cleaned. The result
    is also written to 'data/utterances_clean<lb_year>-<ub_year>.csv'.

    Inputs:
        - lb_year (int): Lower bound year (inclusive)
        - ub_year (int): Upper bound year (inclusive)
        - case_info (bool): Whether case information should
            be included; it is read from
            'case_info_parsing/case_info_relevant_cols_only.csv'
        - same_resp_addr (bool): Whether the person responded to
            must also be the person addressed; rows where the two
            differ (or where either is missing) are dropped

    Returns:
        - clean_corpus (pd.DataFrame): The clean dataset

    Raises:
        - ValueError: If lb_year is greater than ub_year
    """
    if lb_year > ub_year:
        raise ValueError(
            f'lb_year ({lb_year}) must not exceed ub_year ({ub_year})'
        )

    # Pin the download location once so that the download and the
    # cleanup below always refer to the same directory
    data_dir = os.getcwd()

    yearly_frames = []
    for year in range(lb_year, ub_year + 1):
        corpus_name = f'supreme-{year}'

        # Download the data
        root_dir = download(corpus_name, data_dir=data_dir)

        try:
            # Clean a single year; all years are concatenated after the loop
            yearly_frames.append(get_clean_utterances(Corpus(root_dir)))
        finally:
            # Delete the downloaded files, even if cleaning failed
            zip_path = os.path.join(data_dir, f'{corpus_name}.zip')
            if os.path.exists(zip_path):
                os.remove(zip_path)
            extracted_dir = os.path.join(data_dir, corpus_name)
            if os.path.isdir(extracted_dir):
                shutil.rmtree(extracted_dir)

    # A single concat avoids re-copying the growing frame for every year
    clean_corpus = pd.concat(yearly_frames)

    # Join the case info
    if case_info:
        ci = pd.read_csv(
            'case_info_parsing/case_info_relevant_cols_only.csv',
            index_col='id'
        )
        clean_corpus = clean_corpus.join(
            ci,
            on='case_id',
            how='left'
        )

    # Add names of those replied and spoken to: look up the speaker
    # columns of the utterance referenced by each key column
    speakers = clean_corpus[['speaker', 'speaker_type']]
    for key_col, suffix in (
        ('reply_to', '_replied_to'),
        ('addressing', '_addressed')
    ):
        clean_corpus = pd.merge(
            clean_corpus,
            speakers,
            left_on=key_col,
            right_index=True,
            how='left',
            suffixes=('', suffix)
        )

    # Replied to be the same as the person addressed
    if same_resp_addr:
        clean_corpus = clean_corpus[
            clean_corpus['speaker_replied_to'] == clean_corpus['speaker_addressed']
        ]

    # Save the resulting dataset in the 'data' sub-directory of the
    # current directory, creating it if necessary
    output_path = f'data/utterances_clean{lb_year}-{ub_year}.csv'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    clean_corpus.to_csv(output_path)
    # clean_corpus.to_json(f'utterances_clean{lb_year}-{ub_year}.json')

    return clean_corpus

# Make sure 'case_info_parsing/case_info_relevant_cols_only.csv' exists
# (relative to the current working directory) prior to running

############### VERY IMPORTANT ###############
# For the code to work, go to convokit and on line 118 of util.py,
# set needs_download = True.
# ut = aggregate_years(1975, 1975, case_info=True, same_resp_addr=False)
# display(ut)

In [5]:
# Inclusive (first, last) year pairs: four consecutive five-year
# windows covering 2000 through 2019.
year_bounds = [(start, start + 4) for start in range(2000, 2020, 5)]

# Build and save one cleaned dataset per window, keeping only utterances
# whose replied-to speaker is also the addressed speaker.
for year_lb, year_ub in year_bounds:
    aggregate_years(
        year_lb,
        year_ub,
        case_info=True,
        same_resp_addr=True
    )

Downloading supreme-2010 to C:\Users\Jonas\Desktop\UChicago\term_6\AdvancedMachineLearning\supreme_court_nlp\supreme-2010
Downloading supreme-2010 from http://zissou.infosci.cornell.edu/convokit/datasets/supreme-corpus/supreme-2010.zip (8.8MB)... Done
Downloading supreme-2011 to C:\Users\Jonas\Desktop\UChicago\term_6\AdvancedMachineLearning\supreme_court_nlp\supreme-2011
Downloading supreme-2011 from http://zissou.infosci.cornell.edu/convokit/datasets/supreme-corpus/supreme-2011.zip (8.6MB)... Done
Downloading supreme-2012 to C:\Users\Jonas\Desktop\UChicago\term_6\AdvancedMachineLearning\supreme_court_nlp\supreme-2012
Downloading supreme-2012 from http://zissou.infosci.cornell.edu/convokit/datasets/supreme-corpus/supreme-2012.zip (8.9MB)... Done
Downloading supreme-2013 to C:\Users\Jonas\Desktop\UChicago\term_6\AdvancedMachineLearning\supreme_court_nlp\supreme-2013
Downloading supreme-2013 from http://zissou.infosci.cornell.edu/convokit/datasets/supreme-corpus/supreme-2013.zip (8.1MB).