## Packages

In [2]:
import os
import os.path as osp
from pathlib import Path
from time import time
import matplotlib.pyplot as plt
import math

import numpy as np
import pandas as pd
import librosa
import re

from IPython.display import Audio, clear_output, display

## Arguments & User Defined Functions

In [3]:
# --- Inputs ---------------------------------------------------------------
# Transcript lines and their scores live in two CSVs that share the
# (file, line) key.
transcript_path = "../outputs/all_transcripts_v2.csv"
transcripts = pd.read_csv(transcript_path)

# # Only for sample purposes:
# file_path = "142-orig.wav"
# file_transcripts = transcripts.loc[transcripts["file"] == file_path]

bert_scores_path = "../outputs/bert_scores_v2.csv"
bert_scores = pd.read_csv(bert_scores_path)

# Default (inner) merge: transcript lines without a score row are dropped.
transcripts = transcripts.merge(bert_scores, on=["file", "line"])
# Line ids parsed from the .npy file names are strings, so the key is cast to
# str here to make the later merge on ["file", "line"] line up.
transcripts["line"] = transcripts["line"].astype(str)
data_path = "../outputs/npy/"

# --- Settings -------------------------------------------------------------
batch_size = 32
num_workers = 1

sequence_len = 2048  # np.max(np.load("../outputs/npy/142-orig.wav_shapes.npy"))

# --- Outputs --------------------------------------------------------------
write_dir = "../outputs/splits/"
# exist_ok=True makes the cell safely re-runnable without a separate
# check-then-create step.
os.makedirs(write_dir, exist_ok=True)

# Summary Statistics
First, drop any transcript lines for which no matching NumPy feature array was constructed.

In [9]:
# Collect every .npy file below data_path (the search is recursive).
npys = list(Path(data_path).rglob("*.npy"))

npys_name = [n.name for n in npys]
npys_name = pd.DataFrame(npys_name, columns=["name"])

# Files with "shape" in their name are kept apart from the sequence arrays.
is_shape_file = npys_name["name"].str.contains("shape")
sequences = npys_name.loc[~is_shape_file].copy()
shapes = npys_name.loc[is_shape_file].copy()

print("Sequence Arrays:", len(sequences))

# Kept for backward compatibility with any later cell that reads "splits".
sequences["splits"] = sequences["name"].apply(lambda x: x.split("_"))

# Arrays are loaded later as f"{file}_{line}.npy", so the line id is whatever
# follows the LAST underscore of the stem. Splitting there (instead of on every
# underscore) keeps file ids that themselves contain "_" intact.
name_parts = [Path(name).stem.rpartition("_") for name in sequences["name"]]
sequences["file"] = [parts[0] for parts in name_parts]
sequences["line"] = [parts[2] for parts in name_parts]

# Names without any underscore cannot encode a (file, line) pair; skip them
# rather than failing on the whole directory.
has_separator = pd.Series(
    [parts[1] == "_" for parts in name_parts], index=sequences.index, dtype=bool
)
sequences = sequences.loc[has_separator]

# The recursive search can return the same file name from several folders;
# de-duplicate the keys so the inner merge cannot multiply transcript rows.
sequence_keys = sequences[["file", "line"]].drop_duplicates()

transcripts = transcripts.merge(sequence_keys, on=["file", "line"]).reset_index(
    drop=True
)

print("Matching Sequences:", len(transcripts))
transcripts.head()

Sequence Arrays: 564
Matching Sequences: 177


Unnamed: 0,file,line,start,end,speaker,speaker_role,word_count,duration,text,gs_score
0,12-1036,22,410.1,434.178,Samuel_A_Alito_Jr,scotus_justice,57,24.078,"The persons could be here, the citizens of Mis...",-0.830128
1,12-1036,24,508.195,523.704,John_G_Roberts_Jr,scotus_justice,46,15.509,What if you have an executor and he's administ...,-1.061946
2,12-1036,26,571.779,614.527,Ruth_Bader_Ginsburg,scotus_justice,77,42.748,"Mr. Massey, you envision one proceeding, it co...",-0.404201
3,12-1036,28,615.464,635.004,Ruth_Bader_Ginsburg,scotus_justice,44,19.54,But now we have the consumers who were affecte...,-1.630183
4,12-1036,30,752.926,776.246,John_G_Roberts_Jr,scotus_justice,92,23.32,Then it would make -- it would make no sense f...,0.598863


In [10]:
# Remove the "Unnamed: 0" column when it exists (presumably a saved index from
# an earlier CSV export -- confirm). errors="ignore" turns a missing column
# into a no-op without hiding unrelated failures the way a bare `except` would.
transcripts = transcripts.drop(columns=["Unnamed: 0"], errors="ignore")

Generate summary statistics for each transcript line's feature array

In [11]:
# Sample rate handed to librosa when converting onset frames to seconds.
# NOTE(review): confirm this (and librosa's default hop_length) matches the
# settings used when the .npy arrays were produced.
ONSET_SAMPLE_RATE = 16000
HISTOGRAM_BINS = 50
PITCH_QUANTILES = [0.02, 0.25, 0.5, 0.75, 0.98]


def _histogram_entropy_score(values, bins=HISTOGRAM_BINS):
    """Return -sum(d * log1p(|d|)) over the density histogram of ``values``.

    Non-finite entries are ignored so this behaves like the nan-aware
    statistics computed next to it. Returns NaN when no finite value remains.

    NOTE(review): this is the score the notebook has always exported under the
    name "entropy", but it is not Shannon entropy (it uses densities and
    log1p, and is typically negative). It is kept numerically unchanged here
    so downstream features stay comparable -- confirm whether that is intended.
    """
    values = np.asarray(values)
    values = values[np.isfinite(values)]
    if values.size == 0:
        return np.nan
    density = np.histogram(values, bins=bins, density=True)[0]
    return -np.sum(density * np.log1p(np.abs(density)))


st = time()

n_rows = transcripts.shape[0]
all_rows = []
for position, (index, transcript_row) in enumerate(transcripts.iterrows(), start=1):
    row = dict(transcript_row)
    clear_output(wait=True)
    print(
        "Item:",
        index,
        " |  File:",
        row["file"],
        " |  Line:",
        row["line"],
        " |  Progress:",
        # 1-based position so the final item reports 100%.
        f"{round(100*(position/n_rows))}%",
    )
    data_arr = np.load(osp.join(data_path, f"{row['file']}_{row['line']}.npy"))

    # Column 0 is treated as log-pitch (per the variable name) -- TODO confirm
    # against the feature-extraction code.
    f0_log = data_arr[:, 0]
    row["pitch_log_diff_variance"] = np.nanvar(np.diff(f0_log))
    row["pitch_log_mean"] = np.nanmean(f0_log)
    row["pitch_log_stdev"] = np.nanstd(f0_log)
    (
        row["pitch_log_2pct"],
        row["pitch_log_25pct"],
        row["pitch_log_50pct"],
        row["pitch_log_75pct"],
        row["pitch_log_98pct"],
    ) = np.nanquantile(f0_log, PITCH_QUANTILES)

    # Columns 3 and 4 are read as onset strength and onset flags respectively
    # (per the variable names) -- TODO confirm.
    onset_strengths = data_arr[:, 3]
    onset_flags = data_arr[:, 4]
    row["onset_count"] = np.nansum(onset_flags)
    row["onset_str_mean"] = np.nanmean(onset_strengths)
    row["onset_str_stddev"] = np.nanstd(onset_strengths)
    row["onset_str_entropy"] = _histogram_entropy_score(onset_strengths)

    # Gaps between consecutive detected onsets, in seconds.
    onset_times = librosa.onset.onset_detect(
        onset_envelope=onset_strengths, sr=ONSET_SAMPLE_RATE, units="time"
    )
    onset_times_diff = np.diff(onset_times)
    row["onset_time_diff_mean"] = np.nanmean(onset_times_diff)
    row["onset_time_diff_stddev"] = np.nanstd(onset_times_diff)
    row["onset_time_diff_entropy"] = _histogram_entropy_score(onset_times_diff)

    all_rows.append(row)
print(f"Total Time: {round(time() - st, 2)}s")

Item: 176  |  File: 12-682  |  Line: 203  |  Progress: 99%
Total Time: 8.01s


In [12]:
# Turn the per-line records into one frame and append the two rate columns
# (counts divided by each line's `duration`).
all_rows_df = pd.DataFrame(all_rows).assign(
    word_rate=lambda frame: frame["word_count"] / frame["duration"],
    onset_rate=lambda frame: frame["onset_count"] / frame["duration"],
)
print(all_rows_df.shape)
all_rows_df.sample(n=5)

(177, 27)


Unnamed: 0,file,line,start,end,speaker,speaker_role,word_count,duration,text,gs_score,...,pitch_log_98pct,onset_count,onset_str_mean,onset_str_stddev,onset_str_entropy,onset_time_diff_mean,onset_time_diff_stddev,onset_time_diff_entropy,word_rate,onset_rate
68,12-138,18,543.499,574.402,John_G_Roberts_Jr,scotus_justice,109,30.903,"Well, that's not true. There are numerous stat...",1.09685,...,8.323026,248.0,0.954409,1.235236,-2.716248,0.161016,0.128292,-150.894411,3.527166,8.025111
135,12-515,245,2887.542,2917.577,Stephen_G_Breyer,scotus_justice,69,30.035,So what the opposition says in your view is ab...,0.237366,...,7.881026,184.0,0.877934,1.382676,-2.436806,0.214493,0.193325,-97.01673,2.29732,6.126186
73,12-138,30,936.529,954.324,John_G_Roberts_Jr,scotus_justice,59,17.795,Nothing about arbitration even in the backgrou...,0.06471,...,8.55636,132.0,0.965244,1.349046,-2.484642,0.175515,0.150943,-100.593257,3.315538,7.417814
149,12-682,33,596.321,618.152,Sonia_Sotomayor,scotus_justice,62,21.831,But I don't see how the argument would be any ...,1.167652,...,8.013026,135.0,0.794371,1.203265,-2.81296,0.200972,0.196425,-99.836006,2.839998,6.183867
175,12-682,201,3632.0,3645.658,Stephen_G_Breyer,scotus_justice,45,13.658,That's the benefit to a minority group. But wh...,0.140065,...,7.76836,104.0,0.929897,1.236713,-3.239477,0.144696,0.106077,-267.891016,3.294772,7.614585


In [13]:
# Numeric columns that are summarised per speaker.
features = [
    "word_count",
    "duration",
    "gs_score",
    "pitch_log_diff_variance",
    "pitch_log_mean",
    "pitch_log_stdev",
    "pitch_log_2pct",
    "pitch_log_25pct",
    "pitch_log_50pct",
    "pitch_log_75pct",
    "pitch_log_98pct",
    "onset_count",
    "onset_str_mean",
    "onset_str_stddev",
    "onset_str_entropy",
    "onset_time_diff_mean",
    "onset_time_diff_stddev",
    "onset_time_diff_entropy",
    "word_rate",
    "onset_rate",
]

# Mean and standard deviation of every feature for each speaker.
justice_level = (
    all_rows_df[features + ["speaker"]].groupby(["speaker"]).agg(["mean", "std"])
)
# Flatten the (feature, statistic) column MultiIndex into names such as
# "justice_duration_mean". to_flat_index() yields the same tuples as the
# deprecated Index.ravel() without emitting a FutureWarning.
justice_level.columns = [
    "justice_" + "_".join(column) for column in justice_level.columns.to_flat_index()
]
# Left merge keeps every transcript line and attaches its speaker's aggregates.
data_with_speaker = all_rows_df.merge(justice_level, how="left", on="speaker")
print(data_with_speaker.shape)

(177, 67)


  "justice_" + "_".join(x) for x in justice_level.columns.ravel()


In [36]:
# NOTE(review): none of the earlier cells shown in this notebook creates a
# "year" column, and this cell's execution count is out of sequence -- confirm
# where "year" is supposed to come from so Restart & Run All works.
missing_keys = [key for key in ("speaker", "year") if key not in all_rows_df.columns]
if missing_keys:
    raise KeyError(
        f"all_rows_df is missing grouping column(s) {missing_keys}; "
        "add them before computing justice-year aggregates."
    )

# Mean and standard deviation of every feature for each (speaker, year) pair.
justice_year_level = (
    all_rows_df[features + ["speaker", "year"]]
    .groupby(["speaker", "year"])
    .agg(["mean", "std"])
)
# Flatten the (feature, statistic) column MultiIndex into names such as
# "justice_year_duration_mean"; to_flat_index() replaces the deprecated
# Index.ravel() and yields the same tuples.
justice_year_level.columns = [
    "justice_year_" + "_".join(column)
    for column in justice_year_level.columns.to_flat_index()
]
# Left merge keeps every line and attaches its (speaker, year) aggregates.
data_with_speaker_year = data_with_speaker.merge(
    justice_year_level, how="left", on=["speaker", "year"]
)


print(data_with_speaker_year.shape)

(17110, 110)


  justice_year_level.columns = ["justice_year_"+"_".join(x) for x in justice_year_level.columns.ravel()]


In [37]:
# Persist the line-level data with speaker and speaker-year aggregates
# (the row index is not written).
summary_csv_path = "../outputs/summary_data.csv"
data_with_speaker_year.to_csv(summary_csv_path, index=False)