## Packages

In [9]:
import json
import os
import re
import shlex

import pandas as pd
import requests

## Arguments and User-Defined Functions

In [10]:
years = ("2013", "2020")  # (first, last) terms of cases to collect; inclusive, as strings
n_limit = 5  # Debugging cap on the number of cases; set to `None` to process all of them.
cluster_shell_start = None  # Optional header lines for the download script; specific to the working environment.
# cluster_shell_start = [
#         "#!/bin/bash \n",
#         "#SBATCH --nodes=1 \n",
#         "#SBATCH --ntasks-per-node=1 \n",
#         "#SBATCH --cpus-per-task=1 \n",
#         "#SBATCH --time=5:00:00 \n",
#         "#SBATCH --mem=2GB \n",
#         "#SBATCH --job-name=get_oyez_mp3s \n",
#         "\n",
#     ]
mp3_output_dir = "../outputs/mp3s/"  # Include the trailing slash
wav_output_dir = "../outputs/wavs/"  # Include the trailing slash
# `exist_ok=True` creates a missing directory and leaves an existing one alone,
# without a separate existence check.
for output_dir in (mp3_output_dir, wav_output_dir):
    os.makedirs(output_dir, exist_ok=True)

In [11]:
def get_http_json(url, timeout=30):
    """Fetch `url` with an HTTP GET and return the decoded JSON body.

    Args:
        url: Address to request.
        timeout: Seconds to wait on the server before `requests` gives up.
            Without a timeout a stalled connection can block the notebook
            indefinitely.

    Returns:
        The parsed JSON payload of the response.

    Raises:
        requests.exceptions.Timeout: If the server does not respond in time.
    """
    response = requests.get(url, timeout=timeout)
    return response.json()


def get_case(term, docket):
    """Download one case record and every transcript its audio links point to.

    Returns a `(docket_data, transcripts)` pair. `transcripts` is an empty
    list when the case record has no (or an empty) "oral_argument_audio"
    entry, in which case a notice is printed.
    """
    docket_data = get_http_json(f"https://api.oyez.org/cases/{term}/{docket}")

    audio_links = None
    if "oral_argument_audio" in docket_data:
        audio_links = docket_data["oral_argument_audio"]

    if not audio_links:
        # Nothing to fetch for this case (yet); report it and hand back the bare record.
        print(f"No oral arguments for docket {docket}")
        return (docket_data, [])

    # One request per linked oral argument, in the order the record lists them.
    transcripts = [get_http_json(link["href"]) for link in audio_links]
    return docket_data, transcripts


def getAudio(transcripts):
    """Collect the audio links referenced by a list of transcript records.

    Args:
        transcripts: Sequence of transcript dicts. Each may carry a
            "media_file" list whose entries are dicts with an "href" key.

    Returns:
        `[num_files, audio_list]`: the number of transcript records and the
        "href" values gathered from their "media_file" lists.
    """
    audio_list = []
    for t in transcripts:
        media_dicts = t.get("media_file")
        # Skip records whose media list is missing, empty, or starts with a
        # `None` placeholder; indexing such a list used to raise.
        if not media_dicts or media_dicts[0] is None:
            continue
        # More than one entry is not expected, but the API wraps them in a
        # list; ignore any `None` placeholders mixed in.
        audio_list.extend(
            media_dict["href"] for media_dict in media_dicts if media_dict is not None
        )
    return [len(transcripts), audio_list]


# Collects transcript text together with speaker and timing metadata.
def getTranscript(transcripts):
    """Flatten transcript records into four parallel, per-turn lists.

    Args:
        transcripts: Transcript dicts shaped like
            `{"transcript": {"sections": [{"turns": [...]}]}}`. Every turn
            needs a "text_blocks" list of dicts with "text", "start" and
            "stop" keys; speaker details are optional.

    Returns:
        `(transcript_list, speaker_list, speaker_type_list, time_list)`, each
        holding one entry per turn:
          * the list of text-block strings,
          * the speaker name, or "<UNK>" when it cannot be read,
          * the list of role types, or `["Other"]` when roles are missing or
            malformed,
          * the list of `(start, stop)` pairs of the text blocks.
    """
    transcript_list = []
    speaker_list = []
    speaker_type_list = []
    time_list = []

    # Walk transcript -> sections -> turns.
    for t in transcripts:
        for section in t["transcript"]["sections"]:
            for turn in section["turns"]:

                # Speaker name. Only missing keys / null speakers are treated
                # as "unknown"; anything else (e.g. KeyboardInterrupt) propagates.
                try:
                    speaker = turn["speaker"]["name"]
                except (KeyError, TypeError):
                    speaker = "<UNK>"
                speaker_list.append(speaker)

                # Speaker role types; "Other" is most likely a lawyer.
                try:
                    roles = turn["speaker"]["roles"]
                    if isinstance(roles, list):
                        role_types = [role["type"] for role in roles]
                    else:
                        role_types = ["Other"]
                except (KeyError, TypeError):
                    role_types = ["Other"]
                speaker_type_list.append(role_types)

                # Text and timing of every block in the turn.
                texts_out = []
                times_out = []
                for text in turn["text_blocks"]:
                    texts_out.append(text["text"])
                    times_out.append((text["start"], text["stop"]))

                transcript_list.append(texts_out)
                time_list.append(times_out)

    return transcript_list, speaker_list, speaker_type_list, time_list


def getMeta(docket, data):
    """Return one docket's metadata rearranged for building a transcript table.

    Args:
        docket: Key of the case to read from `data`.
        data: Mapping from docket to a 4-item sequence
            `(transcript, speakers, speaker_roles, times)`.

    Returns:
        `(transcript, speakers, speaker_roles_clean, times_new)` where
          * `transcript` is passed through unchanged,
          * `speakers` has punctuation removed and whitespace runs replaced
            by underscores,
          * `speaker_roles_clean` keeps the first role of each turn, or
            "Other" when the turn has none,
          * `times_new` holds, per turn, the flattened start/stop values with
            the very last value removed.

    Raises:
        KeyError: If `docket` is not in `data`.
    """
    transcript, speakers, speaker_roles, times = data[docket]

    # Flatten each turn's [start, stop] pairs into one list of time stamps.
    times_new = [
        [stamp for block in turn_times for stamp in block] for turn_times in times
    ]
    # The original author notes that the final value is a 0 placeholder, so it
    # is dropped. Guarded so a docket without turns, or whose last turn has no
    # text blocks, no longer raises IndexError.
    if times_new and times_new[-1]:
        del times_new[-1][-1]

    # Keep the first role of each turn; turns without roles become "Other".
    speaker_roles_clean = [roles[0] if roles else "Other" for roles in speaker_roles]

    # Strip punctuation from the names, then turn whitespace runs into underscores.
    speakers = [re.sub(r"\s+", "_", re.sub(r"[^\w\s]", "", s)) for s in speakers]

    return transcript, speakers, speaker_roles_clean, times_new

## Build Query List

Get all case information

In [5]:
# The full case index can also be pulled live from the API: https://api.oyez.org/cases?per_page=0
# case_summaries = requests.get("https://api.oyez.org/cases?per_page=0")
# case_summaries = pd.DataFrame(case_summaries.json())

# Read the saved case index and keep only the two columns used to query each case.
case_summaries = pd.read_json("../outputs/case_summaries.json")
case_summaries = case_summaries[["term", "docket_number"]]

# Keep the cases whose term lies inside the inclusive `years` window.
first_term, last_term = years[0], years[1]
term_in_window = (case_summaries["term"] >= first_term) & (
    case_summaries["term"] <= last_term
)
case_summaries_filtered = case_summaries[term_in_window]

# A truthy `n_limit` keeps only the first `n_limit` rows.
if n_limit:
    case_summaries_filtered = case_summaries_filtered.head(n=n_limit)

In [6]:
# (rows, columns) left after the term filter and the optional `n_limit` cap.
case_summaries_filtered.shape

(5, 2)

Get audio download links for filtered cases

In [7]:
# Map each docket number to the list of transcripts fetched for that case.
data = {}

case_rows = case_summaries_filtered.itertuples(index=False, name=None)
for term, docket_number in case_rows:
    docket_data, transcripts = get_case(term, docket_number)
    data[docket_number] = transcripts

Get audio data

In [8]:
audio_data = {}

missing_transcripts = []
missing_audio = []
for docket, docket_transcripts in data.items():
    # A docket is usable only when at least one transcript record was fetched
    # and the first record carries a parsed transcript.
    has_transcript = bool(docket_transcripts) and isinstance(
        docket_transcripts[0]["transcript"], dict
    )
    if not has_transcript:
        missing_transcripts.append(docket)
        continue

    # Call getAudio once and reuse both parts of its result.
    num_files, audio_links = getAudio(docket_transcripts)
    # Keep only dockets with exactly one transcript record and a non-empty
    # link list (empty results have been observed).
    if num_files == 1 and audio_links:
        audio_data[docket] = audio_links[0]  # s3 link
    else:
        missing_audio.append(docket)

In [9]:
# Report how many dockets landed in each bucket while collecting audio links.
for label, count in (
    ("Dockets with no transcript information:", len(missing_transcripts)),
    ("Docket with no audio files:", len(missing_audio)),
    ("Collected Audio Data Links:", len(audio_data)),
):
    print(label, count)

Dockets with no transcript information: 0
Docket with no audio files: 0
Collected Audio Data Links: 5


## Converting to CURL commands

In [10]:
# `with` closes (and flushes) the script even if one of the writes fails.
with open("../scripts/mp3_curl_cmds.sh", "w") as curl_script:
    if cluster_shell_start:
        curl_script.writelines(cluster_shell_start)

    for docket, s3_link in audio_data.items():
        # Links and docket numbers come from the API, so quote them before
        # they are interpreted by the shell (e.g. URLs containing `&` or `?`).
        link_arg = shlex.quote(str(s3_link))
        mp3_arg = shlex.quote(f"{mp3_output_dir}{docket}.mp3")
        curl_script.write(f"curl -L {link_arg} -o {mp3_arg} \n")

Run as `bash` script

In [11]:
%%capture
# Runs the download script written by the previous cell.
# Remove the `%%capture` magic above to see the output.
!bash ../scripts/mp3_curl_cmds.sh

## Generating Metadata

In [12]:
# Every docket in `audio_data` passed the earlier checks:
#  1. it has a transcript
#  2. it has exactly one mp3 link
mp3_meta_data = {}
for docket in audio_data:
    # getTranscript returns (texts, speakers, speaker role types, times);
    # the four lists are stored together as one tuple.
    mp3_meta_data[docket] = getTranscript(data[docket])

# Pass `indent=4` to json.dump to pretty-print with four spaces per indent.
with open("../outputs/oyez_metadata.json", "w+") as f:
    json.dump(mp3_meta_data, f)

## Convert to WAVs
Bash Script

In [13]:
# If windows, run the following before: `dos2unix ../outputs/mp3_to_wav_win.sh`
# A little buggy. Might need to reset it. Requires `ffmpeg` installed on system.
!bash ../scripts/mp3_to_wav.sh ../outputs/mp3s/ ../outputs/wavs/

^C


## Make Transcripts

In [15]:
# Read the saved metadata into its own name: reusing `data` would overwrite the
# dict of API transcripts that the earlier cells read when they are re-run.
with open("../outputs/oyez_metadata.json") as f:
    oyez_metadata = json.load(f)

# Dockets that have a converted wav. `splitext` strips only the final ".wav",
# and sorting makes the row order independent of the directory listing order.
saved_dockets = sorted(
    os.path.splitext(file)[0]
    for file in os.listdir(wav_output_dir)
    if file.endswith(".wav")
)

infos = []
# Build one transcript table per saved wav, provided its metadata is usable.
for docket in saved_dockets:
    if docket not in oyez_metadata:
        # e.g. a wav left over from an earlier run with different settings
        print(f"No metadata for {docket}; skipping")
        continue

    transcript, speakers, speaker_roles, times_new = getMeta(docket, oyez_metadata)
    if not (len(transcript) == len(speakers) == len(speaker_roles) == len(times_new)):
        # The per-turn lists must line up to form table rows.
        continue

    case_info = pd.DataFrame(
        data={
            "times": times_new,
            "speaker": speakers,
            "speaker_role": speaker_roles,
            "text": transcript,
        }
    )
    case_info["file"] = docket
    case_info["line"] = case_info.index
    # First and last time stamp of each turn.
    case_info["start"] = case_info["times"].apply(lambda x: x[0])
    case_info["end"] = case_info["times"].apply(lambda x: x[-1])
    case_info["duration"] = case_info["end"] - case_info["start"]
    case_info["duration"] = case_info["duration"].apply(lambda x: round(x, 3))
    case_info["text"] = case_info["text"].apply(lambda x: " ".join(x))
    # NOTE(review): splitting on a single space counts empty strings produced
    # by repeated spaces (and gives 1 for empty text); kept as in the original.
    case_info["word_count"] = case_info["text"].apply(lambda x: len(x.split(" ")))

    case_info = case_info[
        [
            "file",
            "line",
            "start",
            "end",
            "speaker",
            "speaker_role",
            "word_count",
            "duration",
            "text",
        ]
    ]
    infos.append(case_info)

if not infos:
    # pd.concat would raise a less helpful ValueError on an empty list.
    raise ValueError(f"No transcripts could be built from the wav files in {wav_output_dir}")

all_info = pd.concat(infos)
# Keep only the turns spoken by justices.
all_info = all_info.loc[all_info["speaker_role"] == "scotus_justice"].reset_index(
    drop=True
)

# "year" is the text before the first "-" of the docket (e.g. "12" for "12-1036").
all_info["year"] = all_info["file"].apply(lambda x: x.split("-")[0])

all_info.to_csv("../outputs/all_transcripts.csv", index=False)

In [14]:
# Fixed seed so the displayed sample is the same on every run.
all_info.sample(n=4, random_state=0)

Unnamed: 0,file,line,start,end,speaker,speaker_role,word_count,duration,text,year
64,12-1036,128,2172.987,2173.456,Elena_Kagan,scotus_justice,2,0.469,--consolidation case--,12
17,12-1036,34,825.443,827.177,John_G_Roberts_Jr,scotus_justice,6,1.734,"Well, I know but I'm trying--",12
12,12-1036,24,508.195,523.704,John_G_Roberts_Jr,scotus_justice,46,15.509,What if you have an executor and he's administ...,12
499,12-682,75,1713.7,1723.223,John_G_Roberts_Jr,scotus_justice,32,9.523,"And they did, and then after several years the...",12
