## Packages

In [1]:
import os
import os.path as osp
from pathlib import Path
from time import time
import matplotlib.pyplot as plt
import math

import numpy as np
import pandas as pd
import librosa
import re
import scipy

from IPython.display import Audio, clear_output, display

## Arguments & User Defined Functions

In [2]:
# --- Input tables -----------------------------------------------------------
# Transcript rows, one per (file, line); the scores table is joined on the same keys.
transcript_path = "../outputs/data_transcripts_v2.csv"
transcripts = pd.read_csv(transcript_path)

# Only for sample purposes:
# file_path = "142-orig.wav"
# file_transcripts = transcripts.loc[transcripts["file"] == file_path]

bert_scores_path = "../outputs/bert_scores_v2.csv"
bert_scores = pd.read_csv(bert_scores_path)

# Inner join: only rows present in both tables survive.
transcripts = transcripts.merge(bert_scores, on=["file", "line"])
# "line" is compared later against values parsed out of file names, which are strings.
transcripts["line"] = transcripts["line"].astype(str)
# Remove the leftover index column when it exists. errors="ignore" makes this a
# no-op when the column is absent, without hiding unrelated failures the way a
# bare `except: pass` would.
transcripts = transcripts.drop(columns="Unnamed: 0", errors="ignore")

# --- Paths and settings -----------------------------------------------------
# Directory holding the per-segment .npy arrays.
data_path = "../outputs/npy2"

case_summaries = pd.read_json("../outputs/case_summaries.json")

batch_size = 32
num_workers = 1

sequence_len = 2048  # np.max(np.load("../outputs/npy/142-orig.wav_shapes.npy"))
write_dir = "../outputs/splits/"
# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs(write_dir, exist_ok=True)

In [3]:
# Collect every .npy file below data_path (recursively) and index them by name.
npys = list(Path(data_path).rglob("*.npy"))

npys_name = pd.DataFrame([n.name for n in npys], columns=["name"])
# Files whose name contains "shape" are kept apart from the sequence arrays.
is_shape_file = npys_name["name"].str.contains("shape")
sequences = npys_name.loc[~is_shape_file].copy()
shapes = npys_name.loc[is_shape_file].copy()

print("Sequence Arrays:", len(sequences))
# File names are split on "_": the first piece is used as the file id and the
# second piece, up to its first ".", as the line id. The .str accessors yield
# NaN (which simply never matches in the merge) for a name without "_",
# instead of raising IndexError.
sequences["splits"] = sequences["name"].str.split("_")
sequences["file"] = sequences["splits"].str[0]
sequences["line"] = sequences["splits"].str[1].str.split(".").str[0]

# Keep only transcript rows that have a matching sequence array on disk.
transcripts = transcripts.merge(
    sequences[["file", "line"]], on=["file", "line"]
).reset_index(drop=True)

print("Matching Sequences:", len(transcripts))

# Make the cell safe to re-run: columns added by a previous execution are
# removed first so the merge below does not produce *_x / *_y duplicates.
transcripts = transcripts.drop(columns=["docket_number", "term"], errors="ignore")

# Exact duplicate (docket_number, term) pairs would duplicate every matching
# transcript row in a left merge, so they are removed up front.
case_terms = case_summaries[["docket_number", "term"]].drop_duplicates()
rows_before_merge = len(transcripts)
transcripts = transcripts.merge(
    case_terms,
    left_on="file",
    right_on="docket_number",
    how="left",
)
if len(transcripts) != rows_before_merge:
    # Remaining growth means some docket_number maps to more than one term.
    print(
        "Warning: term merge changed row count from",
        rows_before_merge,
        "to",
        len(transcripts),
        "- some docket numbers have multiple terms.",
    )

transcripts.head()

Sequence Arrays: 17110
Matching Sequences: 17110


Unnamed: 0,file,line,start,end,speaker,speaker_role,word_count,duration,text,start_idx,end_idx,gs_score,docket_number,term
0,11-681,4,62.906,82.218,Ruth_Bader_Ginsburg,scotus_justice,45,19.312,But how does it differ from the typical bargai...,1006496,1315488,-1.173626,11-681,2013
1,11-681,6,99.497,119.41,Sonia_Sotomayor,scotus_justice,40,19.913,Is your argument dependent on this being sort ...,1591952,1910560,-0.831369,11-681,2013
2,11-681,16,201.764,227.298,Antonin_Scalia,scotus_justice,68,25.534,Suppose you have a policeman who -- who is dis...,3228224,3636768,0.740117,11-681,2013
3,11-681,22,273.827,286.853,Antonin_Scalia,scotus_justice,50,13.026,It seems to me it's always a matter of public ...,4381232,4589648,0.326807,11-681,2013
4,11-681,32,350.343,397.472,Elena_Kagan,scotus_justice,130,47.129,"But what -- what you're objecting to, to the e...",5605488,6359552,0.494308,11-681,2013


In [79]:
# Feature names used to label the columns of each loaded array, in order.
# NOTE(review): assumes each .npy array is (n_frames, 5) with columns in this
# order — confirm against the code that wrote the arrays.
FEATURE_NAMES = [
    "f0",
    "voiced_flag",
    "voiced_prob",
    "onset_strength",
    "onset_flag",
]


def _summary_stats(arr, prefix):
    """Return NaN-aware column means and stds of ``arr`` as a flat dict.

    Parameters
    ----------
    arr : np.ndarray
        Array reduced along axis 0.
    prefix : str
        Key prefix, e.g. ``"seg"`` gives ``"seg_f0_mean"`` / ``"seg_f0_std"``.

    Returns
    -------
    dict
        All ``{prefix}_{feature}_mean`` entries followed by all
        ``{prefix}_{feature}_std`` entries. ``zip`` stops at the shorter of
        FEATURE_NAMES and the reduced array.
    """
    stats = {}
    reductions = (
        ("mean", np.nanmean(arr, axis=0)),
        ("std", np.nanstd(arr, axis=0)),
    )
    for stat_name, values in reductions:
        keys = [f"{prefix}_{feature}_{stat_name}" for feature in FEATURE_NAMES]
        stats.update(zip(keys, values))
    return stats


# NaN speakers are skipped: `== NaN` never matches, so such a group would be
# empty and np.concatenate would raise on it.
speakers = transcripts["speaker"].dropna().unique()
seg_infos = []
speaker_infos = []
spkr_yr_infos = []

tracker = 0
for s in speakers:
    s_transcripts = transcripts.loc[transcripts["speaker"] == s]
    speaker_arrays = []
    years = s_transcripts["term"].unique()

    for y in years:
        # Build the mask from s_transcripts itself (not the full frame), and
        # match missing terms with isna() because NaN != NaN.
        if pd.isna(y):
            in_year = s_transcripts["term"].isna()
        else:
            in_year = s_transcripts["term"] == y
        s_y_transcripts = s_transcripts.loc[in_year]
        speaker_year_arrays = []

        for _, row in s_y_transcripts.iterrows():
            tracker += 1

            clear_output(wait=True)
            print(
                "Item:",
                tracker,
                " |  Progress:",
                f"{round(100*(tracker/transcripts.shape[0]))}%",
                " |  File:",
                row["file"],
                " |  Line:",
                row["line"],
            )

            # Load Data
            seg_arr = np.load(osp.join(data_path, f"{row['file']}_{row['line']}.npy"))
            speaker_year_arrays.append(seg_arr)
            speaker_arrays.append(seg_arr)

            # Segment Level Summary
            seg_info = {
                "file": row["file"],
                "line": row["line"],
                "speaker": s,
                "year": y,
            }
            seg_info.update(_summary_stats(seg_arr, "seg"))
            seg_infos.append(seg_info)

        # Speaker-Year Level Summary: statistics over all frames of all the
        # speaker's segments in this year, stacked along axis 0.
        spkr_yr_arr = np.concatenate(speaker_year_arrays, axis=0)
        spkr_yr_info = {"speaker": s, "year": y}
        spkr_yr_info.update(_summary_stats(spkr_yr_arr, "spkr_yr"))
        spkr_yr_infos.append(spkr_yr_info)

    # Speaker Level Summary: same statistics over every segment of the speaker.
    speaker_arr = np.concatenate(speaker_arrays, axis=0)
    speaker_info = {"speaker": s}
    speaker_info.update(_summary_stats(speaker_arr, "speaker"))
    speaker_infos.append(speaker_info)

Item: 17147  |  File: 20-5904  |  Line: 254  |  Progress: 100%


In [80]:
# Turn the three lists of record dicts into tables: one row per segment,
# per (speaker, year) pair, and per speaker respectively.
segment_df = pd.DataFrame(seg_infos)
speaker_year_df = pd.DataFrame(spkr_yr_infos)
speaker_df = pd.DataFrame(speaker_infos)
# Displayed as the cell output: (rows, columns) of the segment table.
segment_df.shape

(17147, 14)

In [81]:
# Attach the speaker-year statistics to every segment row, then the
# speaker-level statistics; both are inner joins on the shared key columns.
segment_with_year_stats = pd.merge(
    segment_df, speaker_year_df, on=["speaker", "year"]
)
segment_df = pd.merge(segment_with_year_stats, speaker_df, on="speaker")
segment_df.shape

(17147, 34)

In [82]:
# Persist the merged segment/speaker-year/speaker summary table without the index.
summary_csv_path = "../outputs/nn_summary_info.csv"
segment_df.to_csv(summary_csv_path, index=False)

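## Find the best onset `max_size`

Sweep the `max_size` argument of `librosa.onset.onset_strength` over a sample of clips and rank the values by how strongly the detected onset count correlates with the transcript word count.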

In [2]:
# Candidate values to try for max_size: the integers 1 through 49.
ms = [*range(1, 50)]

In [3]:
data = pd.read_csv("../../capstone/capstone/yagnesh/full_data.csv")
wavs_dir = "../../capstone/capstone/yagnesh/wavs/"
# Fixed random_state so a Restart & Run All draws the same rows, and
# n is capped so frames with fewer than 100 rows do not raise.
# reset_index already returns a new frame, so no extra copy is needed.
smpl = data.sample(n=min(100, len(data)), random_state=42).reset_index(drop=True)

In [7]:
def _count_onsets(clip, sr, max_size):
    """Return how many onsets librosa detects in ``clip`` for one ``max_size``.

    The audio is passed as ``y=`` because newer librosa releases accept it
    only as a keyword argument; the keyword form works on older releases too.
    """
    o_env = librosa.onset.onset_strength(y=clip, sr=sr, max_size=max_size)
    onset_frames = librosa.onset.onset_detect(onset_envelope=o_env, sr=sr)
    return len(onset_frames)


# onset_counts[i, j] holds the onset count for ms[i] on sampled row j.
# Entries stay NaN when the clip could not be processed.
onset_counts = np.full((len(ms), len(smpl)), np.nan)
failures = []

# Each wav is loaded once and reused for every max_size: the clip does not
# depend on m, so reloading it per m only repeated the same disk read.
for pos, (_, row) in enumerate(smpl.iterrows()):
    clear_output(wait=True)
    print(
        "File:",
        pos,
        " |  Progress:",
        f"{round(100*(pos/smpl.shape[0]))}%",
    )
    try:
        wav_file, wav_sr = librosa.load(
            path=osp.join(wavs_dir, f"{row['file']}"),
            sr=16000,
        )
        # int() guards against the indices arriving as floats, which cannot slice.
        clip = wav_file[int(row["start_idx"]):int(row["end_idx"])]
        for i, m in enumerate(ms):
            onset_counts[i, pos] = _count_onsets(clip, wav_sr, m)
    except Exception as err:
        # Best effort: remember the failure and leave the remaining counts as
        # NaN, but do not swallow KeyboardInterrupt/SystemExit like a bare except.
        failures.append((row["file"], repr(err)))

if failures:
    print(len(failures), "of", len(smpl), "clips failed; first few:", failures[:5])

word_counts = smpl["word_count"].to_numpy(dtype=float)
ms_info = []
for m, counts in zip(ms, onset_counts):
    # pearsonr returns NaN if any input is NaN, so failed clips are excluded
    # pairwise instead of invalidating the whole correlation.
    valid = ~np.isnan(counts) & ~np.isnan(word_counts)
    if valid.sum() >= 2:
        corr, p_val = scipy.stats.pearsonr(word_counts[valid], counts[valid])
    else:
        corr, p_val = np.nan, np.nan
    ms_info.append({"m": m, "corr": corr, "p_val": p_val})

Onset Max Size:  49  |  File: 99  |  Progress: 99%


In [10]:
# Tabulate the per-max_size results and show them with the highest correlation first.
ms_results = pd.DataFrame(ms_info)
ms_results.sort_values(by="corr", ascending=False)

Unnamed: 0,m,corr,p_val
0,1,0.953508,7.53451e-53
1,2,0.941445,4.5675409999999995e-48
2,3,0.929549,2.951503e-44
3,4,0.914605,2.541687e-40
4,5,0.902328,1.357245e-37
5,6,0.899405,5.355646e-37
6,7,0.88521,2.425877e-34
7,8,0.874914,1.262367e-32
8,9,0.867369,1.8430340000000001e-31
9,10,0.857138,5.431397e-30


In [25]:
# Run onset detection on a single clip: the first row of `data`.
row = dict(data.iloc[0])
wav_file, wav_sr = librosa.load(
    path=osp.join(wavs_dir, f"{row['file']}"),
    sr=16000,
)

# Cut the clip out of the full recording using the row's sample indices.
start_idx = row["start_idx"]
end_idx = row["end_idx"]
clip = wav_file[start_idx:end_idx]

# The audio is passed as y= because newer librosa releases accept it only as
# a keyword argument; the keyword form also works on older releases.
o_env = librosa.onset.onset_strength(y=clip, sr=wav_sr, max_size=10)
# Time stamp for each frame of the onset envelope.
times = librosa.frames_to_time(np.arange(len(o_env)), sr=wav_sr)
onset_frames = librosa.onset.onset_detect(onset_envelope=o_env, sr=wav_sr)



In [15]:
# Number of onsets detected in the clip.
len(onset_frames)

70

In [None]:
# Scratch copy of the progress readout. The print call was indented as if it
# were still inside a loop, which made the cell an IndentationError; it is
# dedented here so the cell parses. It relies on `tracker`, `transcripts` and
# `row` already existing in the kernel.
clear_output(wait=True)
print(
    "Item:",
    tracker,
    " |  Progress:",
    f"{round(100*(tracker/transcripts.shape[0]))}%",
    " |  File:",
    row["file"],
    " |  Line:",
    row["line"],
)