## Packages

In [1]:
import os
import os.path as osp
from pathlib import Path
from time import time
import matplotlib.pyplot as plt
import math

import numpy as np
import pandas as pd
import librosa
import re

from IPython.display import Audio, clear_output, display

## Arguments & User Defined Functions

In [4]:
transcript_path = "../outputs/data_transcripts_v2.csv"
transcripts = pd.read_csv(transcript_path)

# # Only for sample purposes:
# file_path = "142-orig.wav"
# file_transcripts = transcripts.loc[transcripts["file"] == file_path]

bert_scores_path = "../outputs/bert_scores_v2.csv"
bert_scores = pd.read_csv(bert_scores_path)

transcripts = transcripts.merge(bert_scores, on=["file", "line"])
transcripts["line"] = transcripts["line"].astype(str)
data_path = "../outputs/npy2"

batch_size = 32
num_workers = 1

sequence_len = 2048  # np.max(np.load("../outputs/npy/142-orig.wav_shapes.npy"))
write_dir = "../outputs/splits/"
if not osp.exists(write_dir):
    os.makedirs(write_dir)

In [None]:
# http://timgolden.me.uk/python/win32_how_do_i/get-document-summary-info.html
import os, sys
import pythoncom
from win32com.shell import shell
from win32com import storagecon

FORMATS = {
    pythoncom.FMTID_SummaryInformation: "SummaryInformation",
    pythoncom.FMTID_DocSummaryInformation: "DocSummaryInformation",
    pythoncom.FMTID_UserDefinedProperties: "UserDefinedProperties",
}
PROPERTIES = {
    pythoncom.FMTID_SummaryInformation: dict(
        (getattr(storagecon, d), d) for d in dir(storagecon) if d.startswith("PIDSI_")
    ),
    pythoncom.FMTID_DocSummaryInformation: dict(
        (getattr(storagecon, d), d) for d in dir(storagecon) if d.startswith("PIDDSI_")
    ),
}

STORAGE_READ = storagecon.STGM_READ | storagecon.STGM_SHARE_EXCLUSIVE


def property_dict(property_set_storage, fmtid):
    properties = {}
    try:
        property_storage = property_set_storage.Open(fmtid, STORAGE_READ)
    except pythoncom.com_error as error:
        if error.strerror == "STG_E_FILENOTFOUND":
            return {}
        else:
            raise

    for name, property_id, vartype in property_storage:
        if name is None:
            name = PROPERTIES.get(fmtid, {}).get(property_id, None)
        if name is None:
            name = hex(property_id)
        try:
            for value in property_storage.ReadMultiple([property_id]):
                properties[name] = value
        #
        # There are certain values we can't read; they
        # raise type errors from within the pythoncom
        # implementation, thumbnail
        #
        except TypeError:
            properties[name] = None
    return properties


def property_sets(filepath):
    pidl, flags = shell.SHILCreateFromPath(os.path.abspath(filepath), 0)
    property_set_storage = shell.SHGetDesktopFolder().BindToStorage(
        pidl, None, pythoncom.IID_IPropertySetStorage
    )
    for fmtid, clsid, flags, ctime, mtime, atime in property_set_storage:
        yield FORMATS.get(fmtid, str(fmtid)), property_dict(property_set_storage, fmtid)
        if fmtid == pythoncom.FMTID_DocSummaryInformation:
            fmtid = pythoncom.FMTID_UserDefinedProperties
            user_defined_properties = property_dict(property_set_storage, fmtid)
            if user_defined_properties:
                yield FORMATS.get(fmtid, str(fmtid)), user_defined_properties

In [5]:
npys = [path for path in Path(data_path).rglob("*.npy")]

npys_name = [n.name for n in npys]
npys_name = pd.DataFrame(npys_name, columns=["name"])
sequences = npys_name.loc[~npys_name["name"].str.contains("shape")].copy()
shapes = npys_name.loc[npys_name["name"].str.contains("shape")].copy()

print("Sequence Arrays:", len(sequences))
sequences["splits"] = sequences["name"].apply(lambda x: x.split("_"))
sequences["file"] = sequences["splits"].apply(lambda x: x[0])
sequences["line"] = sequences["splits"].apply(lambda x: x[1].split(".")[0])

transcripts = transcripts.merge(
    sequences[["file", "line"]], on=["file", "line"]
).reset_index(drop=True)

print("Matching Sequences:", len(transcripts))
transcripts.head()

Sequence Arrays: 17110
Matching Sequences: 17110


Unnamed: 0.1,file,line,start,end,speaker,speaker_role,word_count,duration,text,start_idx,end_idx,Unnamed: 0,gs_score
0,11-681,4,62.906,82.218,Ruth_Bader_Ginsburg,scotus_justice,45,19.312,But how does it differ from the typical bargai...,1006496,1315488,0,-1.173626
1,11-681,6,99.497,119.41,Sonia_Sotomayor,scotus_justice,40,19.913,Is your argument dependent on this being sort ...,1591952,1910560,1,-0.831369
2,11-681,16,201.764,227.298,Antonin_Scalia,scotus_justice,68,25.534,Suppose you have a policeman who -- who is dis...,3228224,3636768,2,0.740117
3,11-681,22,273.827,286.853,Antonin_Scalia,scotus_justice,50,13.026,It seems to me it's always a matter of public ...,4381232,4589648,3,0.326807
4,11-681,32,350.343,397.472,Elena_Kagan,scotus_justice,130,47.129,"But what -- what you're objecting to, to the e...",5605488,6359552,4,0.494308


In [6]:
try:
    transcripts = transcripts.drop("Unnamed: 0", axis=1)
except:
    pass

In [7]:
st = time()

all_rows = []
for x in transcripts.iterrows():
    row = dict(x[1])
    clear_output(wait=True)
    print(
        "Item:",
        x[0],
        " |  File:",
        row["file"],
        " |  Line:",
        row["line"],
        " |  Progress:",
        f"{round(100*(x[0]/transcripts.shape[0]))}%",
    )
    data_arr = np.load(osp.join(data_path, f"{row['file']}_{row['line']}.npy"))
    f0_log = data_arr[:, 0]
    row["pitch_log_diff_variance"] = np.nanvar(np.diff(f0_log))
    row["pitch_log_mean"] = np.nanmean(f0_log)
    row["pitch_log_stdev"] = np.nanstd(f0_log)
    (
        row["pitch_log_2pct"],
        row["pitch_log_25pct"],
        row["pitch_log_50pct"],
        row["pitch_log_75pct"],
        row["pitch_log_98pct"],
    ) = np.nanquantile(f0_log, [0.02, 0.25, 0.5, 0.75, 0.98])

    onset_strengths = data_arr[:, 3]
    onset_flags = data_arr[:, 4]
    row["onset_count"] = np.nansum(onset_flags)
    row["onset_str_mean"] = np.nanmean(onset_strengths)
    row["onset_str_stddev"] = np.nanstd(onset_strengths)

    onset_str_histogram = np.histogram(onset_strengths, bins=50, density=True)
    onset_str_data = onset_str_histogram[0]
    onset_str_entropy = -np.sum((onset_str_data * np.log1p(np.abs(onset_str_data))))
    row["onset_str_entropy"] = onset_str_entropy

    onset_times = librosa.onset.onset_detect(
        onset_envelope=onset_strengths, sr=16000, units="time"
    )
    onset_times_diff = np.diff(onset_times)
    row["onset_time_diff_mean"] = np.nanmean(onset_times_diff)
    row["onset_time_diff_stddev"] = np.nanstd(onset_times_diff)

    onset_time_diff_histogram = np.histogram(onset_times_diff, bins=50, density=True)
    onset_time_diff_data = onset_time_diff_histogram[0]
    onset_time_diff_entropy = -np.sum(
        (onset_time_diff_data * np.log1p(np.abs(onset_time_diff_data)))
    )
    row["onset_time_diff_entropy"] = onset_time_diff_entropy

    all_rows.append(row)
print(f"Total Time: {round(time() - st, 2)}s")

Item: 17109  |  File: 65-orig  |  Line: 197  |  Progress: 100%
Total Time: 46.84s


In [13]:
all_rows_df = pd.DataFrame(all_rows)
all_rows_df["word_rate"] = all_rows_df["word_count"] / all_rows_df["duration"]
all_rows_df["onset_rate"] = all_rows_df["onset_count"] / all_rows_df["duration"]
print(all_rows_df.shape)
all_rows_df.sample(n=5)

(17110, 29)


Unnamed: 0,file,line,start,end,speaker,speaker_role,word_count,duration,text,start_idx,...,pitch_log_98pct,onset_count,onset_str_mean,onset_str_stddev,onset_str_entropy,onset_time_diff_mean,onset_time_diff_stddev,onset_time_diff_entropy,word_rate,onset_rate
5446,14-86,9,149.43,159.495,Anthony_M_Kennedy,scotus_justice,40,10.065,Your position -- your statement that you opene...,2390880,...,8.13136,76.0,0.767857,1.209935,-3.282789,0.19456,0.178609,-119.344645,3.974168,7.550919
4844,14-232,168,2274.98,2324.09,Stephen_G_Breyer,scotus_justice,142,49.11,Look at the finding to support what the -- the...,36399680,...,8.559026,316.0,0.774101,1.089605,-3.268223,0.189992,0.177536,-94.703007,2.891468,6.434535
15597,19-547,156,3458.31,3473.1,Neil_Gorsuch,scotus_justice,42,14.79,"Well, I guess I'm more -- I'm asking don't -- ...",55332960,...,5.153833,78.0,0.838988,1.570716,-3.459425,0.205913,0.213552,-79.41477,2.839757,5.273834
15450,19-511,264,4463.39,4480.83,Amy_Coney_Barrett,scotus_justice,59,17.44,"Mr. Garner, you've talked in a number of these...",71414240,...,8.740526,104.0,0.806373,1.273945,-2.717792,0.217766,0.195775,-83.298015,3.383028,5.963303
9555,16-6219,204,3147.4,3196.895,Elena_Kagan,scotus_justice,121,49.495,Right. Okay. I thought that his point was diff...,50358400,...,8.44036,302.0,0.96647,1.509959,-2.45367,0.203916,0.196563,-81.223053,2.444691,6.101626


In [None]:
mp3s = [path for path in Path("../mp3s/").rglob("*.mp3")]

terms = []
bads = 0
for i, m in enumerate(mp3s):
    clear_output(wait=True)
    print(
        "Item:",
        i,
        " |  File:",
        m.name,
        " |  Progress:",
        f"{round(100*(i/len(mp3s)))}%",
    )
    results = []
    for name, properties in property_sets(m):
        results.append((name, properties))
    try:
        term = re.findall(r"20[0-9]{2} Term", str(results))[0][:4]
        terms.append({"file": m.name.replace(".mp3", ""), "year": term})
    except:
        bads += 1
    print({"file": m.name.replace(".mp3", ""), "year": term})
print(bads)

terms_df = pd.DataFrame(terms)
terms_df

all_rows_df = all_rows_df.merge(terms_df, on="file", how="left")

In [35]:
features = [
    "word_count",
    "duration",
    "gs_score",
    "pitch_log_diff_variance",
    "pitch_log_mean",
    "pitch_log_stdev",
    "pitch_log_2pct",
    "pitch_log_25pct",
    "pitch_log_50pct",
    "pitch_log_75pct",
    "pitch_log_98pct",
    "onset_count",
    "onset_str_mean",
    "onset_str_stddev",
    "onset_str_entropy",
    "onset_time_diff_mean",
    "onset_time_diff_stddev",
    "onset_time_diff_entropy",
    "word_rate",
    "onset_rate",
]

justice_level = (
    all_rows_df[features + ["speaker"]].groupby(["speaker"]).agg(["mean", "std"])
)
justice_level.columns = [
    "justice_" + "_".join(x) for x in justice_level.columns.ravel()
]
data_with_speaker = all_rows_df.merge(justice_level, how="left", on="speaker")
print(data_with_speaker.shape)

(17110, 70)


  justice_level.columns = ["justice_"+"_".join(x) for x in justice_level.columns.ravel()]


Item: 487  |  File: 65-orig.mp3  |  Progress: 100%
{'file': '65-orig', 'year': '2020'}
1


In [28]:
terms_df = pd.DataFrame(terms)
terms_df

Unnamed: 0,file,year
0,11-681,2013
1,11-965,2013
2,12-1036,2013
3,12-1038,2013
4,12-10882,2013
...,...,...
482,20-472,2020
483,20-512,2020
484,20-543,2020
485,20-5904,2020


In [36]:
justice_year_level = (
    all_rows_df[features + ["speaker", "year"]]
    .groupby(["speaker", "year"])
    .agg(["mean", "std"])
)
justice_year_level.columns = [
    "justice_year_" + "_".join(x) for x in justice_year_level.columns.ravel()
]
data_with_speaker_year = data_with_speaker.merge(
    justice_year_level, how="left", on=["speaker", "year"]
)


print(data_with_speaker_year.shape)

(17110, 110)


  justice_year_level.columns = ["justice_year_"+"_".join(x) for x in justice_year_level.columns.ravel()]


In [37]:
data_with_speaker_year.to_csv("../outputs/summary_data.csv", index=False)