## Packages

In [2]:
import os
import os.path as osp
from pathlib import Path
from time import time
import matplotlib.pyplot as plt
import math

import numpy as np
import pandas as pd
import librosa
import re

from IPython.display import Audio, clear_output, display

## Arguments & User Defined Functions

In [6]:
transcript_path = "../outputs/all_transcripts.csv"
transcripts = pd.read_csv(transcript_path)

# # Only for sample purposes:
# file_path = "142-orig.wav"
# file_transcripts = transcripts.loc[transcripts["file"] == file_path]

bert_scores_path = "../outputs/bert_scores.csv"
bert_scores = pd.read_csv(bert_scores_path)

transcripts = transcripts.merge(bert_scores, on=["file", "line"])
transcripts["line"] = transcripts["line"].astype(str)
data_path = "../outputs/npy/"

batch_size = 32
num_workers = 1

sequence_len = 2048  # np.max(np.load("../outputs/npy/142-orig.wav_shapes.npy"))
write_dir = "../outputs/splits/"
if not osp.exists(write_dir):
    os.makedirs(write_dir)

# Summary Statistics
First, remove any transcript lines that did not get a matching numpy feature array constructed

In [7]:
npys = [path for path in Path(data_path).rglob("*.npy")]

npys_name = [n.name for n in npys]
npys_name = pd.DataFrame(npys_name, columns=["name"])
sequences = npys_name.loc[~npys_name["name"].str.contains("shape")].copy()
shapes = npys_name.loc[npys_name["name"].str.contains("shape")].copy()

print("Sequence Arrays:", len(sequences))
sequences["splits"] = sequences["name"].apply(lambda x: x.split("_"))
sequences["file"] = sequences["splits"].apply(lambda x: x[0])
sequences["line"] = sequences["splits"].apply(lambda x: x[1].split(".")[0])

transcripts = transcripts.merge(
    sequences[["file", "line"]], on=["file", "line"]
).reset_index(drop=True)

print("Matching Sequences:", len(transcripts))
transcripts.head()

Sequence Arrays: 564
Matching Sequences: 564


Unnamed: 0.1,file,line,start,end,speaker,speaker_role,word_count,duration,text,Unnamed: 0,gs_score
0,12-1036,0,0.0,8.951,John_G_Roberts_Jr,scotus_justice,15,8.951,We'll hear argument next today in Case 12-1036...,0,-1.815591
1,12-1036,2,117.344,137.275,Ruth_Bader_Ginsburg,scotus_justice,37,19.931,"Mr. Massey, with respect to that, there is a c...",1,-0.965072
2,12-1036,4,211.401,212.291,Anthony_M_Kennedy,scotus_justice,8,0.89,"And you're asking for $10,000 for each one?",2,-1.092182
3,12-1036,6,213.338,216.259,Anthony_M_Kennedy,scotus_justice,7,2.921,"You're asking for $10,000 for each one.",3,-0.686814
4,12-1036,8,231.831,247.106,Sonia_Sotomayor,scotus_justice,28,15.275,Let's assume a case where -- forget about that...,4,-0.2547


In [8]:
try:
    transcripts = transcripts.drop("Unnamed: 0", axis=1)
except:
    pass

Generate summary statistics of arrays

In [9]:
st = time()

all_rows = []
for x in transcripts.iterrows():
    row = dict(x[1])
    clear_output(wait=True)
    print(
        "Item:",
        x[0],
        " |  File:",
        row["file"],
        " |  Line:",
        row["line"],
        " |  Progress:",
        f"{round(100*(x[0]/transcripts.shape[0]))}%",
    )
    data_arr = np.load(osp.join(data_path, f"{row['file']}_{row['line']}.npy"))
    f0_log = data_arr[:, 0]
    row["pitch_log_diff_variance"] = np.nanvar(np.diff(f0_log))
    row["pitch_log_mean"] = np.nanmean(f0_log)
    row["pitch_log_stdev"] = np.nanstd(f0_log)
    (
        row["pitch_log_2pct"],
        row["pitch_log_25pct"],
        row["pitch_log_50pct"],
        row["pitch_log_75pct"],
        row["pitch_log_98pct"],
    ) = np.nanquantile(f0_log, [0.02, 0.25, 0.5, 0.75, 0.98])

    onset_strengths = data_arr[:, 3]
    onset_flags = data_arr[:, 4]
    row["onset_count"] = np.nansum(onset_flags)
    row["onset_str_mean"] = np.nanmean(onset_strengths)
    row["onset_str_stddev"] = np.nanstd(onset_strengths)

    onset_str_histogram = np.histogram(onset_strengths, bins=50, density=True)
    onset_str_data = onset_str_histogram[0]
    onset_str_entropy = -np.sum((onset_str_data * np.log1p(np.abs(onset_str_data))))
    row["onset_str_entropy"] = onset_str_entropy

    onset_times = librosa.onset.onset_detect(
        onset_envelope=onset_strengths, sr=16000, units="time"
    )
    onset_times_diff = np.diff(onset_times)
    row["onset_time_diff_mean"] = np.nanmean(onset_times_diff)
    row["onset_time_diff_stddev"] = np.nanstd(onset_times_diff)

    onset_time_diff_histogram = np.histogram(onset_times_diff, bins=50, density=True)
    onset_time_diff_data = onset_time_diff_histogram[0]
    onset_time_diff_entropy = -np.sum(
        (onset_time_diff_data * np.log1p(np.abs(onset_time_diff_data)))
    )
    row["onset_time_diff_entropy"] = onset_time_diff_entropy

    all_rows.append(row)
print(f"Total Time: {round(time() - st, 2)}s")

Item: 563  |  File: 12-682  |  Line: 205  |  Progress: 100%
Total Time: 1.57s


In [10]:
all_rows_df = pd.DataFrame(all_rows)
all_rows_df["word_rate"] = all_rows_df["word_count"] / all_rows_df["duration"]
all_rows_df["onset_rate"] = all_rows_df["onset_count"] / all_rows_df["duration"]
print(all_rows_df.shape)
all_rows_df.sample(n=5)

(564, 27)


Unnamed: 0,file,line,start,end,speaker,speaker_role,word_count,duration,text,gs_score,...,pitch_log_98pct,onset_count,onset_str_mean,onset_str_stddev,onset_str_entropy,onset_time_diff_mean,onset_time_diff_stddev,onset_time_diff_entropy,word_rate,onset_rate
341,12-515,72,786.089,811.14,Stephen_G_Breyer,scotus_justice,70,25.051,"Now, what you're asking us to do then, if the ...",0.789211,...,8.64336,159.0,0.963122,1.543098,-2.719005,0.191015,0.191093,-102.14911,2.7943,6.347052
287,12-138,132,3008.159,3024.437,Samuel_A_Alito_Jr,scotus_justice,47,16.278,--You're not answering my question. What is --...,-0.168506,...,7.849693,107.0,0.846907,1.264541,-3.130882,0.186419,0.202985,-111.307034,2.887333,6.573289
455,12-515,292,3558.026,3592.347,Elena_Kagan,scotus_justice,91,34.321,"But I would have thought, General Bursch, that...",0.958797,...,8.652693,212.0,0.921536,1.482304,-2.266394,0.20019,0.178339,-106.446611,2.651438,6.176976
254,12-138,69,1961.889,1987.339,Ruth_Bader_Ginsburg,scotus_justice,73,25.45,"Well, let's say you've done all that. And what...",-1.248571,...,8.373026,175.0,0.776803,1.279077,-3.532439,0.201216,0.19546,-70.395426,2.868369,6.876228
457,12-515,296,3621.849,3646.082,Stephen_G_Breyer,scotus_justice,75,24.233,They didn't participate in the convention and ...,1.171219,...,7.623026,178.0,0.923376,1.345811,-3.269906,0.166,0.14727,-166.978878,3.094953,7.345356


In [11]:
features = [
    "word_count",
    "duration",
    "gs_score",
    "pitch_log_diff_variance",
    "pitch_log_mean",
    "pitch_log_stdev",
    "pitch_log_2pct",
    "pitch_log_25pct",
    "pitch_log_50pct",
    "pitch_log_75pct",
    "pitch_log_98pct",
    "onset_count",
    "onset_str_mean",
    "onset_str_stddev",
    "onset_str_entropy",
    "onset_time_diff_mean",
    "onset_time_diff_stddev",
    "onset_time_diff_entropy",
    "word_rate",
    "onset_rate",
]

justice_level = (
    all_rows_df[features + ["speaker"]].groupby(["speaker"]).agg(["mean", "std"])
)
justice_level.columns = [
    "justice_" + "_".join(x) for x in justice_level.columns.ravel()
]
data_with_speaker = all_rows_df.merge(justice_level, how="left", on="speaker")
print(data_with_speaker.shape)

(564, 67)


  "justice_" + "_".join(x) for x in justice_level.columns.ravel()


In [12]:
justice_year_level = (
    all_rows_df[features + ["speaker", "year"]]
    .groupby(["speaker", "year"])
    .agg(["mean", "std"])
)
justice_year_level.columns = [
    "justice_year_" + "_".join(x) for x in justice_year_level.columns.ravel()
]
data_with_speaker_year = data_with_speaker.merge(
    justice_year_level, how="left", on=["speaker", "year"]
)


print(data_with_speaker_year.shape)

KeyError: "['year'] not in index"

In [37]:
data_with_speaker_year.to_csv("../outputs/summary_data.csv", index=False)