In [1]:
import os
import numpy as np
import pandas as pd
from convokit import Utterance, Corpus, Coordination, download

# Set working directory
# NOTE(review): this is an absolute, machine-specific Windows path; the
# notebook will fail at this line on any machine where the directory does
# not exist. The relative output filenames used when saving the cleaned
# utterances resolve against this directory.
os.chdir('C:\\Users\\Jonas\\Desktop\\UChicago\\term_6\\AdvancedMachineLearning\\project')

In [2]:
# Inclusive lower / upper bounds of the years to fetch
LB_YEAR, UB_YEAR = 1994, 1998

# Every year from LB_YEAR through UB_YEAR (the upper bound is included)
years = [*range(LB_YEAR, UB_YEAR + 1)]
years

[1994, 1995, 1996, 1997, 1998]

In [3]:
def get_clean_utterances(corpus):
    """
    Builds a cleaned utterance table from a corpus: newlines in the text
    are replaced by spaces, the year is extracted from the case id, the
    'j__' marker is removed from speaker names, bookkeeping columns are
    dropped, and the 'meta.' prefix is stripped from the remaining
    metadata column names.

    Input:
        corpus: Corpus object (usually from a given year); only its
            get_utterances_dataframe() method is used. The returned
            frame must contain the columns 'text', 'speaker' and
            'meta.case_id'.

    Output:
        utterances (pd.DataFrame): clean dataframe containing utterances,
            with the same index as the frame returned by the corpus. The
            'year' column holds strings (the part of 'meta.case_id'
            before the first underscore).
    """

    meta_prefix = 'meta.'

    # Columns that are not needed downstream. 'meta.case_id' is only
    # required to derive the year and is discarded afterwards.
    unused_columns = [
        'timestamp', 'reply_to', 'meta.case_id',
        'meta.start_times', 'meta.stop_times', 'vectors'
    ]

    # Fetch the utterances
    raw = corpus.get_utterances_dataframe()

    # Vectorised string methods are used instead of row-wise lambdas; they
    # propagate missing values rather than raising on a None/NaN entry.
    # regex=False makes both replacements literal substring replacements.
    cleaned = raw.assign(
        # Flatten multi-line text onto a single line
        text=raw['text'].str.replace('\n', ' ', regex=False),
        # The year is whatever precedes the first underscore of the case id
        year=raw['meta.case_id'].str.split('_').str[0],
        # Remove every occurrence of the 'j__' marker from the speaker name
        speaker=raw['speaker'].str.replace('j__', '', regex=False),
    )

    # errors='ignore' tolerates frames that lack some of the optional
    # columns (for instance when no 'vectors' column is present).
    cleaned = cleaned.drop(columns=unused_columns, errors='ignore')

    # Rename columns for clarity: 'meta.side' -> 'side', etc.
    return cleaned.rename(
        columns={
            col: col[len(meta_prefix):]
            for col in cleaned.columns
            if col.startswith(meta_prefix)
        }
    )

In [4]:
# Directory the yearly corpora are downloaded into. The path is
# machine-specific; it is named once here so that this is the only line to
# edit when running the notebook elsewhere.
DATA_DIR = 'C:/Users/Jonas/Desktop/UChicago/term_6/AdvancedMachineLearning/project'

# Clean each year's corpus on its own and collect the resulting frames, so
# that they are concatenated a single time after the loop instead of
# re-copying the accumulated frame on every iteration.
yearly_utterances = []
for year in years:
    # Download the data; download() returns the local path of the dataset
    ROOT_DIR = download(f'supreme-{year}', data_dir=DATA_DIR)
    yearly_utterances.append(get_clean_utterances(Corpus(ROOT_DIR)))

# One table holding the cleaned utterances of every requested year
clean_corpus = pd.concat(yearly_utterances)

# Persist the combined table; the relative filenames resolve against the
# current working directory.
clean_corpus.to_csv(f'utterances_clean{LB_YEAR}-{UB_YEAR}.csv')
clean_corpus.to_json(f'utterances_clean{LB_YEAR}-{UB_YEAR}.json')
clean_corpus.head()

Dataset already exists at C:\Users\Jonas\Desktop\UChicago\term_6\AdvancedMachineLearning\project\supreme-1994
Dataset already exists at C:\Users\Jonas\Desktop\UChicago\term_6\AdvancedMachineLearning\project\supreme-1995
Dataset already exists at C:\Users\Jonas\Desktop\UChicago\term_6\AdvancedMachineLearning\project\supreme-1996
Dataset already exists at C:\Users\Jonas\Desktop\UChicago\term_6\AdvancedMachineLearning\project\supreme-1997
Dataset already exists at C:\Users\Jonas\Desktop\UChicago\term_6\AdvancedMachineLearning\project\supreme-1998


Unnamed: 0_level_0,text,speaker,conversation_id,speaker_type,side,timestamp,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
20706__0_000,We'll hear argument first this morning in Numb...,william_h_rehnquist,20706,J,,0.0,1994
20706__0_001,Mr. Chief Justice and may it please the Court:...,william_perry_pendley,20706,A,1.0,12.724,1994
20706__0_002,Do we know that that was the reason for the re...,antonin_scalia,20706,J,,87.706,1994
20706__0_003,"Absolutely, Justice Scalia. When we look at th...",william_perry_pendley,20706,A,1.0,90.141,1994
20706__0_004,Does that clarify that it was the presumption ...,david_h_souter,20706,J,,108.819,1994
