In [1]:
import os
import numpy as np
import pandas as pd
from convokit import Utterance, Corpus, Coordination, download

import ssl
# NOTE(review): security - this replaces ssl's default HTTPS context factory
# with the *unverified* one, so certificate verification is switched off for
# code that relies on that default, for as long as this kernel is running.
# Presumably a workaround for certificate errors during convokit's download();
# confirm, and prefer fixing the local CA bundle over disabling verification.
ssl._create_default_https_context = ssl._create_unverified_context

# Set working directory.
# The project location can be overridden with the SCOTUS_PROJECT_DIR
# environment variable so the notebook also runs on other machines; when the
# variable is unset, the original local path is used unchanged.
PROJECT_DIR = os.environ.get(
    'SCOTUS_PROJECT_DIR',
    'C:\\Users\\Jonas\\Desktop\\UChicago\\term_6\\AdvancedMachineLearning\\project'
)
os.chdir(PROJECT_DIR)

In [2]:
def get_clean_utterances(corpus):
    """
    Build a cleaned utterance table from a corpus.

    The cleaning steps are, in order:
        1. replace newlines in the ``text`` column with spaces,
        2. drop columns that are not used downstream,
        3. strip the ``meta.`` prefix from metadata column names,
        4. add an ``addressing`` column holding, for each utterance, the
           id of the utterance that replies to it (the inverse of
           ``reply_to``).

    Input:
        corpus: Corpus object (usually from a given year); only its
            ``get_utterances_dataframe()`` method is used

    Output:
        utterances (pd.DataFrame): clean dataframe containing utterances,
            with the same index as the frame returned by the corpus
    """
    META_PREFIX = 'meta.'
    UNUSED_COLUMNS = [
        'timestamp', 'meta.start_times', 'meta.stop_times', 'vectors'
    ]

    # Fetch the utterances
    utterances = corpus.get_utterances_dataframe()

    # Flatten multi-line utterances onto a single line
    utterances['text'] = utterances['text'].str.replace(
        '\n', ' ', regex=False
    )

    # Drop unused columns. This must happen BEFORE the rename below:
    # dropping the top-level 'timestamp' first lets a 'meta.timestamp'
    # column (if present) take over the plain name without a clash.
    utterances = utterances.drop(columns=UNUSED_COLUMNS)

    # Rename metadata columns for clarity ('meta.side' -> 'side', ...)
    utterances = utterances.rename(
        columns={
            col: col[len(META_PREFIX):]
            for col in utterances.columns
            if col.startswith(META_PREFIX)
        }
    )

    # Create addressing ("lag" of index): row X gets the id of the
    # utterance whose reply_to is X.
    # Only real replies count; a reply whose target is not part of this
    # frame is skipped (looking it up would otherwise raise a KeyError).
    replies = utterances['reply_to'].dropna()
    replies = replies[(replies != '') & replies.isin(utterances.index)]
    # If several utterances reply to the same target, the last one wins,
    # which matches a row-by-row overwrite in frame order.
    replies = replies[~replies.duplicated(keep='last')]

    utterances['addressing'] = None
    if not replies.empty:
        # Single .loc[rows, column] assignment: unlike the chained form
        # df.loc[row]['col'] = value, this always writes into the frame
        # itself rather than into a temporary copy of the row.
        # Assumes utterance ids in the index are unique.
        utterances.loc[replies.to_numpy(), 'addressing'] = (
            replies.index.to_numpy()
        )

    return utterances

In [3]:
def aggregate_years(lb_year, ub_year, case_info=True):
    """
    For a given range of years, the corpus of utterances is
    downloaded and cleaned, one year at a time, and the yearly
    tables are stacked into a single dataset. If desired,
    information regarding the cases is added.

    The result is also written to the current directory as
    'utterances_clean{lb_year}-{ub_year}.csv' and '.json'.

    Inputs:
        - lb_year (int): Lower bound year (inclusive)
        - ub_year (int): Upper bound year (inclusive)
        - case_info (bool): Whether case information should
            be included; requires 'case_info_relevant_cols_only.csv'
            in the current directory

    Returns:
        - clean_corpus (pd.DataFrame): The clean dataset

    Raises:
        - ValueError: If lb_year is greater than ub_year (there
            would be no year to process)
    """
    # Fail early, before any download or file access, on an empty range
    if lb_year > ub_year:
        raise ValueError(
            f'lb_year ({lb_year}) must not be greater than ub_year ({ub_year})'
        )

    # Clean every year separately and collect the results; they are
    # concatenated once at the end instead of re-copying the growing
    # table on every iteration.
    yearly_frames = []
    for year in range(lb_year, ub_year + 1):
        # Download the data (into the current working directory)
        corpus_dir = download(
            f'supreme-{year}',
            data_dir=os.getcwd()
        )
        yearly_frames.append(get_clean_utterances(Corpus(corpus_dir)))

    clean_corpus = pd.concat(yearly_frames)

    # Join the case info: the 'case_id' column of the utterances is
    # matched against the 'id' index of the case table
    if case_info:
        ci = pd.read_csv(
            'case_info_relevant_cols_only.csv',
            index_col='id'
        )
        clean_corpus = clean_corpus.join(
            ci,
            on='case_id',
            how='left'
        )

    # Save the resulting datasets in the current directory
    clean_corpus.to_csv(f'utterances_clean{lb_year}-{ub_year}.csv')
    clean_corpus.to_json(f'utterances_clean{lb_year}-{ub_year}.json')

    return clean_corpus

# Prerequisite: 'case_info_relevant_cols_only.csv' has to be present in the
# current directory before this cell is run, since case info is requested.
ut = aggregate_years(lb_year=2014, ub_year=2018, case_info=True)
display(ut)

Dataset already exists at C:\Users\Jonas\Desktop\UChicago\term_6\AdvancedMachineLearning\project\supreme-2014
Dataset already exists at C:\Users\Jonas\Desktop\UChicago\term_6\AdvancedMachineLearning\project\supreme-2015
Dataset already exists at C:\Users\Jonas\Desktop\UChicago\term_6\AdvancedMachineLearning\project\supreme-2016
Dataset already exists at C:\Users\Jonas\Desktop\UChicago\term_6\AdvancedMachineLearning\project\supreme-2017
Dataset already exists at C:\Users\Jonas\Desktop\UChicago\term_6\AdvancedMachineLearning\project\supreme-2018


Unnamed: 0_level_0,text,speaker,reply_to,conversation_id,case_id,speaker_type,side,timestamp,addressing,year,...,advocates_17_id,advocates_17_side,advocates_18_id,advocates_18_side,advocates_19_id,advocates_19_side,advocates_20_id,advocates_20_side,advocates_21_id,advocates_21_side
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
23291__0_000,"We'll hear argument next in Case No. 13-553, t...",j__john_g_roberts_jr,,23291,2014_13-553,J,,0.0,23291__0_001,2014,...,,,,,,,,,,
23291__0_001,"Thank you, Mr. Chief Justice, and may it pleas...",andrew_l_brasher,23291__0_000,23291,2014_13-553,A,1,8.72,23291__0_002,2014,...,,,,,,,,,,
23291__0_002,"Well, is said that -- it said that in -- in (b...",j__antonin_scalia,23291__0_001,23291,2014_13-553,J,,41.872,23291__0_003,2014,...,,,,,,,,,,
23291__0_003,Right. I -- but I think--,andrew_l_brasher,23291__0_002,23291,2014_13-553,A,1,51.144,23291__0_004,2014,...,,,,,,,,,,
23291__0_004,"Another tax that discriminates is all it says,...",j__antonin_scalia,23291__0_003,23291,2014_13-553,J,,54.263,23291__0_005,2014,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24615__2_009,-- they can designate it and then a deliberate...,j__sonia_sotomayor,24615__2_008,24615,2018_17-71,J,,3648.16,24615__2_010,2018,...,,,,,,,,,,
24615__2_010,"Could I answer that question, Chief Justice?",timothy_s_bishop,24615__2_009,24615,2018_17-71,A,1,3657.96,24615__2_011,2018,...,,,,,,,,,,
24615__2_011,Briefly.,j__john_g_roberts_jr,24615__2_010,24615,2018_17-71,J,,3659.48,24615__2_012,2018,...,,,,,,,,,,
24615__2_012,"I mean, you know, the government has made abso...",timothy_s_bishop,24615__2_011,24615,2018_17-71,A,1,3660.12,24615__2_013,2018,...,,,,,,,,,,
