In [1]:
# Change the kernel's working directory to the parent directory.
# NOTE(review): the path is relative, so this cell is not idempotent --
# running it a second time without restarting the kernel moves up one more
# level. Later cells use paths relative to the directory entered here.
%cd ../

/home/tk/repos/erc


In [None]:
# Interactive Hugging Face Hub login.
# NOTE(review): presumably only required when uploading the model to the Hub;
# nothing else visible in this notebook appears to need credentials -- confirm
# before removing.
from huggingface_hub import notebook_login

notebook_login()

In [2]:
# --- Run configuration ------------------------------------------------------
# Both paths are relative to the working directory selected by the `%cd ../`
# cell at the top of the notebook, so the notebook is not tied to one
# machine's home directory.
ROOT_DIR = "./multimodal-datasets/"  # forwarded to ErcTextDataset(ROOT_DIR=...)
model_checkpoint = "./emoberta-large"  # local checkpoint directory for tokenizer + model

DATASET = "MELD_IEMOCAP"  # forwarded to get_num_classes() and ErcTextDataset()

# The remaining values are forwarded unchanged to ErcTextDataset.
# NOTE(review): 0 past / 0 future utterances presumably means each utterance is
# classified without conversational context -- confirm against utils.
speaker_mode = None
num_past_utterances = 0
num_future_utterances = 0
SEED = 42

# Echo the effective settings as the cell output.
DATASET, speaker_mode, num_past_utterances, num_future_utterances, SEED

('MELD_IEMOCAP', None, 0, 0, 42)

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from utils import get_num_classes, ErcTextDataset, compute_metrics
import numpy as np
from tqdm.notebook import tqdm
import torch
from sklearn.metrics import f1_score

# Number of output labels for DATASET, as reported by utils.get_num_classes.
NUM_CLASSES = get_num_classes(DATASET)

# The tokenizer and the classifier weights are read from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=NUM_CLASSES
)

# One-off Hub upload, intentionally left disabled:
# model.push_to_hub("emoberta-large", use_temp_dir=True)

In [6]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Put the model in eval mode, then move it to the selected device.
model.eval()
model.to(device)

# Build one ErcTextDataset per split; every argument except SPLIT is shared.
ds = {}
for _split_name in ("train", "val"):
    ds[_split_name] = ErcTextDataset(
        DATASET=DATASET,
        SPLIT=_split_name,
        speaker_mode=speaker_mode,
        num_past_utterances=num_past_utterances,
        num_future_utterances=num_future_utterances,
        model_checkpoint="roberta-base",
        ROOT_DIR=ROOT_DIR,
        SEED=SEED,
    )


def get_random_sample(ds, tokenizer, idx=None, max_tokens=512):
    """Fetch one example from ``ds`` and package it as a batch of size one.

    Args:
        ds: Indexable, sized dataset whose items are mappings with the keys
            ``"input_ids"``, ``"attention_mask"`` and ``"label"``.
        tokenizer: Object exposing ``decode(input_ids)``; used only to produce
            the human-readable ``decoded`` value.
        idx: Position of the example to fetch. When ``None`` a position is
            drawn with ``np.random.randint(0, len(ds))``.
        max_tokens: Unused. Kept only for backward compatibility with existing
            callers; the length/label rejection filter it used to feed is
            disabled, so every sample is accepted.

    Returns:
        Tuple ``(idx_, input_ids, attention_mask, labelid, decoded)`` where
        ``idx_`` is the position actually used, ``input_ids`` and
        ``attention_mask`` are tensors viewed as ``(1, sequence_length)``,
        ``labelid`` is a tensor viewed as ``(1, 1)``, and ``decoded`` is
        whatever ``tokenizer.decode`` returns for the raw input ids.
    """
    idx_ = np.random.randint(0, len(ds)) if idx is None else idx

    sample = ds[idx_]
    input_ids = sample["input_ids"]
    attention_mask = sample["attention_mask"]
    labelid = sample["label"]

    # Decode before the ids are wrapped in a tensor.
    decoded = tokenizer.decode(input_ids)

    # Add a leading batch dimension of size one to each field.
    input_ids = torch.tensor(input_ids).view(-1, len(input_ids))
    attention_mask = torch.tensor(attention_mask).view(-1, len(attention_mask))
    labelid = torch.tensor(labelid).view(-1, 1)

    return idx_, input_ids, attention_mask, labelid, decoded

2022-03-15 12:39:07.725 INFO utils - _string2tokens: converting utterances into tokens ...
2022-03-15 12:39:07.727 INFO utils - _string2tokens: creating input utterance data ... 
100%|██████████| 1424/1424 [00:06<00:00, 237.23it/s]
2022-03-15 12:39:19.499 INFO utils - _create_input: number of truncated utterances: 0
2022-03-15 12:39:19.510 INFO utils - _string2tokens: converting utterances into tokens ...
2022-03-15 12:39:19.511 INFO utils - _string2tokens: creating input utterance data ... 
100%|██████████| 159/159 [00:00<00:00, 197.55it/s]
2022-03-15 12:39:25.665 INFO utils - _create_input: number of truncated utterances: 0


In [9]:
# NOTE(review): hidden-state dependency. `input_ids` is only bound at notebook
# scope by the evaluation loop in the following cell; this cell was executed
# after it (In [9] vs In [7]) and raises NameError on a fresh top-to-bottom run.
input_ids.shape

torch.Size([1, 36])

In [7]:
def _weighted_f1(truths, preds):
    """Weighted F1 over the per-sample label arrays and probability arrays collected so far."""
    return f1_score(
        [truth.item() for truth in truths],
        [pred.argmax(axis=1).item() for pred in preds],
        average="weighted",
    )


# Score the model on every example of each split, one example per forward pass.
# This is evaluation only, so the whole pass runs under no_grad to avoid
# building an autograd graph for each forward call.
with torch.no_grad():
    for split in tqdm(["train", "val"]):
        truths = []
        preds = []
        for i in tqdm(range(len(ds[split]))):
            idx, input_ids, attention_mask, labelid, decoded = get_random_sample(
                ds[split], tokenizer, idx=i
            )
            # `labels` and the two output_* flags are not needed for the F1
            # computation; they are kept so the `outputs` object left behind
            # after the loop carries the same fields as before.
            outputs = model(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
                labels=labelid.to(device),
                output_attentions=True,
                output_hidden_states=True,
            )

            truths.append(labelid.detach().cpu().numpy())
            preds.append(torch.softmax(outputs["logits"].detach().cpu(), dim=1).numpy())

            # The running F1 is only reported every 1000 samples, so it is only
            # computed then; recomputing it on every sample is quadratic in
            # the size of the split.
            if i % 1000 == 0:
                f1_weighted = _weighted_f1(truths, preds)
                print(split, i, f1_weighted)

        # Final score for the split; skipped when the split has no samples.
        if truths:
            f1_weighted = _weighted_f1(truths, preds)
            print(split, i, f1_weighted)

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/18891 [00:00<?, ?it/s]

2022-03-15 12:39:28.444325: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-03-15 12:39:28.444339: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


train 0 1.0
train 1000 0.8056581891628938
train 2000 0.8112814137451431
train 3000 0.8064255690651801
train 4000 0.8084670423628071
train 5000 0.8135609028550669
train 6000 0.8137845495072008
train 7000 0.8106228334211105
train 8000 0.8103210140370583
train 9000 0.8098046924690303
train 10000 0.8121084242925553
train 11000 0.8108629168698758
train 12000 0.8115574320276991
train 13000 0.8118938309023247
train 14000 0.8133958936097679
train 15000 0.8127758515110489
train 16000 0.813287044925905
train 17000 0.8133496055833564
train 18000 0.8142713641861221
train 18890 0.8149141650021035


  0%|          | 0/2346 [00:00<?, ?it/s]

val 0 1.0
val 1000 0.6440798287108728
val 2000 0.6189108382840852
val 2345 0.6281057228463983


In [None]:
# Show the values these names hold after the evaluation loop above, i.e. those
# assigned on its last iteration.
idx, input_ids, attention_mask, labelid, decoded

In [None]:
# Scratch check: the input ids the tokenizer produces for an arbitrary string.
tokenizer("adsf")["input_ids"]

In [None]:
# Scratch check: encode the same arbitrary string and decode the ids back to text.
tokenizer.decode(tokenizer("adsf")["input_ids"])