## Packages

In [2]:
import os
import os.path as osp
from pathlib import Path
from time import time
import matplotlib.pyplot as plt
import math

import numpy as np
import pandas as pd
import librosa
import re
import scipy

from IPython.display import Audio, clear_output, display

## Arguments & User Defined Functions

In [6]:
# --- Input tables -----------------------------------------------------------
# Transcript lines (one row per file/line) produced by an earlier pipeline step.
transcript_path = "../outputs/all_transcripts_v2.csv"
transcripts = pd.read_csv(transcript_path)

# # Only for sample purposes:
# file_path = "142-orig.wav"
# file_transcripts = transcripts.loc[transcripts["file"] == file_path]

# Per-line scores, joined onto the transcripts by (file, line).
bert_scores_path = "../outputs/bert_scores.csv"
bert_scores = pd.read_csv(bert_scores_path)

# Inner join: transcript lines without a score row are dropped here.
transcripts = transcripts.merge(bert_scores, on=["file", "line"])
# `line` is compared against ids parsed from .npy file names later on, so it
# must be a string from this point forward.
transcripts["line"] = transcripts["line"].astype(str)
# Drop the stray index column left by a CSV round-trip, if it is present.
# `errors="ignore"` skips only the "column is missing" case instead of hiding
# every possible exception the way a bare `except: pass` does.
transcripts = transcripts.drop(columns="Unnamed: 0", errors="ignore")

# Directory holding the per-segment feature arrays (<file>_<line>.npy).
data_path = "../outputs/npy"

case_summaries = pd.read_json("../outputs/case_summaries.json")

# --- Run settings -----------------------------------------------------------
batch_size = 32
num_workers = 1

sequence_len = 2048  # np.max(np.load("../outputs/npy/142-orig.wav_shapes.npy"))
write_dir = "../outputs/splits/"
# exist_ok avoids the check-then-create race and makes the cell re-runnable.
os.makedirs(write_dir, exist_ok=True)

# Transcript Reduction
Remove transcript lines whose audio feature extraction failed, i.e. lines that have no matching `.npy` sequence array in the feature directory.

In [7]:
# Every feature file on disk; names look like "<file>_<line>.npy", plus
# auxiliary arrays whose names contain "shape".
npys = list(Path(data_path).rglob("*.npy"))

npys_name = pd.DataFrame([p.name for p in npys], columns=["name"])
is_shape_file = npys_name["name"].str.contains("shape", regex=False)
sequences = npys_name.loc[~is_shape_file].copy()
shapes = npys_name.loc[is_shape_file].copy()

print("Sequence Arrays:", len(sequences))
# Split on the LAST underscore of the stem so that a file id which itself
# contains "_" is not truncated (a plain split("_") would mis-assign both the
# file id and the line id in that case).
sequences["splits"] = sequences["name"].apply(lambda x: Path(x).stem.rsplit("_", 1))
sequences["file"] = sequences["splits"].apply(lambda x: x[0])
sequences["line"] = sequences["splits"].apply(lambda x: x[1])

# Keep only transcript lines that have a feature array on disk (inner join).
transcripts = transcripts.merge(
    sequences[["file", "line"]], on=["file", "line"]
).reset_index(drop=True)

print("Matching Sequences:", len(transcripts))

# Re-running this cell would otherwise merge `term` / `docket_number` a second
# time and produce `_x` / `_y` suffixed columns, breaking later `["term"]`
# lookups. Dropping any earlier copy first keeps the cell idempotent.
transcripts = transcripts.drop(columns=["docket_number", "term"], errors="ignore")

# Attach the term of each case; the transcript `file` id is matched against
# the case summary's `docket_number`. Left join: unmatched files get NaN term.
transcripts = transcripts.merge(
    case_summaries[["docket_number", "term"]],
    left_on="file",
    right_on="docket_number",
    how="left",
)

transcripts.head()

Sequence Arrays: 564
Matching Sequences: 564


Unnamed: 0,file,line,start,end,speaker,speaker_role,word_count,duration,text,gs_score,docket_number,term
0,12-1036,0,0.0,8.951,John_G_Roberts_Jr,scotus_justice,15,8.951,We'll hear argument next today in Case 12-1036...,-1.815591,12-1036,2013
1,12-1036,2,117.344,137.275,Ruth_Bader_Ginsburg,scotus_justice,37,19.931,"Mr. Massey, with respect to that, there is a c...",-0.965072,12-1036,2013
2,12-1036,4,211.401,212.291,Anthony_M_Kennedy,scotus_justice,8,0.89,"And you're asking for $10,000 for each one?",-1.092182,12-1036,2013
3,12-1036,6,213.338,216.259,Anthony_M_Kennedy,scotus_justice,7,2.921,"You're asking for $10,000 for each one.",-0.686814,12-1036,2013
4,12-1036,8,231.831,247.106,Sonia_Sotomayor,scotus_justice,28,15.275,Let's assume a case where -- forget about that...,-0.2547,12-1036,2013


# Collect Summary Statistics
For each of the five feature dimensions (three pitch-related, two onset-related), the mean and standard deviation are calculated.

The statistics are computed at three levels:
- Segment level
- Speaker & year level
- Speaker level

In [8]:
# Feature names in the column order of each per-segment array. They are the
# stems of the summary keys this cell emits ("<level>_<feature>_<mean|std>").
FEATURE_NAMES = [
    "f0",
    "voiced_flag",
    "voiced_prob",
    "onset_strength",
    "onset_flag",
]


def _select_rows(frame, column, value):
    """Return the rows of `frame` whose `column` equals `value`.

    A missing `value` (NaN/None) selects the rows where the column is missing,
    because `NaN == NaN` is False and would otherwise select nothing.
    The mask is always built from `frame` itself, so no index alignment
    against another DataFrame is involved.
    """
    if pd.isna(value):
        return frame.loc[frame[column].isna()]
    return frame.loc[frame[column] == value]


def _summarize(arr, prefix):
    """Compute NaN-aware per-column mean and standard deviation of `arr`.

    Parameters
    ----------
    arr : np.ndarray
        Array reduced along axis 0; columns are paired with FEATURE_NAMES in
        order (extra columns, if any, are ignored by `zip`).
    prefix : str
        Level tag prepended to every key ("seg", "spkr_yr" or "speaker").

    Returns
    -------
    dict
        `{prefix}_{feature}_mean` entries followed by `{prefix}_{feature}_std`
        entries, in FEATURE_NAMES order.
    """
    means = np.nanmean(arr, axis=0)
    stds = np.nanstd(arr, axis=0)
    summary = {
        f"{prefix}_{name}_mean": value for name, value in zip(FEATURE_NAMES, means)
    }
    summary.update(
        {f"{prefix}_{name}_std": value for name, value in zip(FEATURE_NAMES, stds)}
    )
    return summary


speakers = transcripts["speaker"].unique()
seg_infos = []
speaker_infos = []
spkr_yr_infos = []

tracker = 0
for s in speakers:
    s_transcripts = _select_rows(transcripts, "speaker", s)
    speaker_arrays = []
    years = s_transcripts["term"].unique()

    for y in years:
        # Filter the speaker's own rows with a mask built from those rows
        # (previously the mask came from the full `transcripts` frame), and
        # keep rows with a missing term together as one group instead of
        # ending up with an empty selection.
        s_y_transcripts = _select_rows(s_transcripts, "term", y)
        speaker_year_arrays = []

        for _, row in s_y_transcripts.iterrows():
            tracker += 1

            clear_output(wait=True)
            print(
                "Item:",
                tracker,
                " |  Progress:",
                f"{round(100*(tracker/transcripts.shape[0]))}%",
                " |  File:",
                row["file"],
                " |  Line:",
                row["line"],
            )

            # Load Data
            seg_arr = np.load(osp.join(data_path, f"{row['file']}_{row['line']}.npy"))
            speaker_year_arrays.append(seg_arr)
            speaker_arrays.append(seg_arr)

            # Segment Level Summary
            seg_info = {
                "file": row["file"],
                "line": row["line"],
                "speaker": s,
                "year": y,
            }
            seg_info.update(_summarize(seg_arr, "seg"))
            seg_infos.append(seg_info)

        # Speaker-Year Level Summary: pool all frames of this speaker in this year.
        spkr_yr_info = {"speaker": s, "year": y}
        spkr_yr_info.update(
            _summarize(np.concatenate(speaker_year_arrays, axis=0), "spkr_yr")
        )
        spkr_yr_infos.append(spkr_yr_info)

    # Speaker Level Summary: pool all frames of this speaker across years.
    speaker_info = {"speaker": s}
    speaker_info.update(_summarize(np.concatenate(speaker_arrays, axis=0), "speaker"))
    speaker_infos.append(speaker_info)

Item: 564  |  Progress: 100%  |  File: 12-682  |  Line: 203


In [9]:
# Turn the three lists of summary records into one table per level.
segment_df, speaker_year_df, speaker_df = (
    pd.DataFrame(records) for records in (seg_infos, spkr_yr_infos, speaker_infos)
)
# One row per segment is expected here.
segment_df.shape

(564, 14)

In [10]:
# Attach the speaker-year statistics first, then the speaker statistics, so
# that every segment row carries the summaries of all three levels.
with_speaker_year = segment_df.merge(speaker_year_df, on=["speaker", "year"])
segment_df = with_speaker_year.merge(speaker_df, on="speaker")
segment_df.shape

(564, 34)

In [11]:
# Persist the combined segment / speaker-year / speaker summary table.
summary_csv_path = "../outputs/nn_summary_info.csv"
segment_df.to_csv(summary_csv_path, index=False)

In [12]:
# Preview the first four rows of the combined summary table.
segment_df.head(4)

Unnamed: 0,file,line,speaker,year,seg_f0_mean,seg_voiced_flag_mean,seg_voiced_prob_mean,seg_onset_strength_mean,seg_onset_flag_mean,seg_f0_std,...,speaker_f0_mean,speaker_voiced_flag_mean,speaker_voiced_prob_mean,speaker_onset_strength_mean,speaker_onset_flag_mean,speaker_f0_std,speaker_voiced_flag_std,speaker_voiced_prob_std,speaker_onset_strength_std,speaker_onset_flag_std
0,12-1036,0,John_G_Roberts_Jr,2013,4.363524,0.582143,0.183272,0.801759,0.275,3.699426,...,4.002801,0.538862,0.124359,0.878949,0.244959,3.71431,0.498487,0.227845,1.228745,0.430063
1,12-1036,24,John_G_Roberts_Jr,2013,2.920594,0.404124,0.087015,0.82452,0.243299,3.548565,...,4.002801,0.538862,0.124359,0.878949,0.244959,3.71431,0.498487,0.227845,1.228745,0.430063
2,12-1036,30,John_G_Roberts_Jr,2013,3.36865,0.466392,0.079881,0.958205,0.262003,3.609395,...,4.002801,0.538862,0.124359,0.878949,0.244959,3.71431,0.498487,0.227845,1.228745,0.430063
3,12-1036,32,John_G_Roberts_Jr,2013,3.526608,0.490033,0.109389,0.89785,0.229236,3.601802,...,4.002801,0.538862,0.124359,0.878949,0.244959,3.71431,0.498487,0.227845,1.228745,0.430063
