## Packages

In [None]:
import os.path as osp
from pathlib import Path
from time import time
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import librosa

from IPython.display import Audio, clear_output, display

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

## Arguments

In [None]:
bert_model = "bert-base-uncased"
pretrained_model = "../models/bert_pretrained/"
transcripts_path = "../outputs/data_transcripts_v2.csv"
device = "cpu"

## User-Defined Functions

In [None]:
class GrandStand_Dataset(torch.utils.data.Dataset):
    """Index-addressable dataset pairing tokenizer output with labels and metadata.

    Each item is a dict holding one tensor per encoding field, a ``labels``
    tensor, and the untouched ``info`` entry for the same index.
    """

    def __init__(self, encodings, labels, info):
        """Keep references to the encodings mapping, the labels, and the metadata."""
        self.encodings, self.labels, self.info = encodings, labels, info

    def __len__(self):
        """Return the number of labels, which defines the dataset size."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Assemble the sample stored at position ``idx``."""
        sample = {}
        # One tensor per encoding field, in the order the mapping yields them.
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        # Metadata is passed through as-is (not converted to a tensor).
        sample["info"] = self.info[idx]
        return sample

## Load Transcript

In [None]:
# Read the transcript table from disk, then preview five randomly chosen rows.
transcripts = pd.read_csv(filepath_or_buffer=transcripts_path)
transcripts.sample(5)

In [None]:
# Report how many distinct files there are and how many rows in total.
print("# Cases:", transcripts["file"].nunique())
print("# Speech Segments:", len(transcripts))

In [None]:
# Raw text for the tokenizer, one entry per transcript row.
text = transcripts["text"].tolist()
# (file, line) pair per row, so every score can be traced back to its segment.
metadata = transcripts.apply(
    lambda row: (row["file"], row["line"]),
    axis=1,
).tolist()
# The dataset class requires labels; all-zero placeholders, one per row.
fake_labels = [0] * len(transcripts)

## Load BERT Pre-Trained Weights

In [None]:
# Load the tokenizer that belongs to the checkpoint named by `bert_model`.
tokenizer = BertTokenizer.from_pretrained(bert_model)

In [None]:
# Tokenise every segment in one call and wrap the result in a dataset,
# timing the whole step.
st = time()

scotus_encoder = tokenizer(text, padding=True, truncation=True)
scotus_dataset = GrandStand_Dataset(
    encodings=scotus_encoder,
    labels=fake_labels,
    info=metadata,
)

print(f"{round(time() - st, 2)}s")

In [None]:
# Build the BERT sequence classifier with a single output logit and restore
# the saved weights, timing the whole step.
st = time()
model = BertForSequenceClassification.from_pretrained(bert_model, num_labels=1)

# The checkpoint lives under the `pretrained_model` directory argument.
# map_location remaps the stored tensors onto `device`, so a checkpoint that
# was saved from a GPU still loads on a CPU-only machine.
# NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
model.load_state_dict(
    torch.load(osp.join(pretrained_model, "BERTforGS"), map_location=device)
)
model.eval()  # inference mode: disables dropout
model.to(device)

print(f"{round(time() - st, 2)}s")

## Generate Scores

In [None]:
# Score the speech segments one by one, refreshing a progress read-out for
# each item and collecting (metadata, score) pairs.
st = time()

inputs = []
grandstanding_scores = []
for i, batch in enumerate(scotus_dataset):
    # Overwrite the previous progress lines instead of appending new ones.
    clear_output(wait=True)
    print("Item", i)
    print(f"Elapsed Time: {round(time() - st, 2)}s")

    with torch.no_grad():  # inference only; no gradients needed
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        file_info = batch["info"]
        # Each item is a single sequence; add a leading batch axis of size 1.
        outputs = model(
            input_ids.unsqueeze(0),
            attention_mask=attention_mask.unsqueeze(0),
        )
        # Take the first row of logits and convert it to a Python number.
        score = outputs.logits[0].item()
        grandstanding_scores.append((file_info, score))

print(f"Total Time: {round(time() - st, 2)}s")

In [None]:
# Flatten each ((file, line), score) pair into one row of the results table,
# then preview five random rows.
gs_df = pd.DataFrame(
    columns=["file", "line", "gs_score"],
    data=[(meta[0], meta[1], value) for meta, value in grandstanding_scores],
)
gs_df.sample(5)

In [None]:
# Plot the distribution of scores; binding the return value to `_` keeps the
# notebook from echoing the returned plot object below the figure.
_ = gs_df["gs_score"].hist()

In [None]:
# Persist the scores. NOTE: with pandas' default `index=True`, the row index
# is written too, as an unnamed first column of the CSV.
gs_df.to_csv("../outputs/bert_scores.csv")