In [1]:
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import sys

lib_path = os.path.abspath("").replace("notebooks", "src")
sys.path.append(lib_path)

import torch
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm
from sklearn.metrics import balanced_accuracy_score, accuracy_score,confusion_matrix, f1_score, precision_score, recall_score
from transformers import BertTokenizer
from data.dataloader import build_train_test_dataset
from tqdm.auto import tqdm
import numpy as np
from models import networks
from transformers import BertTokenizer, RobertaTokenizer


  from .autonotebook import tqdm as notebook_tqdm


## Eval scripts

In [2]:
from collections import Counter
def calculate_accuracy(y_pred, y_true):
    class_weights = {cls: 1.0/count for cls, count in Counter(y_true).items()}
    wa = balanced_accuracy_score(y_true, y_pred, sample_weight=[class_weights[cls] for cls in y_true])
    ua = accuracy_score(y_true, y_pred)
    return ua, wa

In [3]:
def eval(opt, checkpoint_path, all_state_dict=True):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    network = getattr(networks, opt.model_type)(opt)
    network.to(device)

    # Build dataset
    _, test_ds = build_train_test_dataset(opt)
    weight = torch.load(checkpoint_path, map_location=torch.device(device))
    if all_state_dict:
        weight = weight['state_dict_network']
    else:
        weight = weight.state_dict()
    
    network.load_state_dict(weight)
    network.eval()
    network.to(device)

    y_actu=[]
    y_pred=[]

    for every_test_list in tqdm(test_ds):
        input_ids, audio, label = every_test_list
        input_ids = input_ids.to(device)
        audio = audio.to(device)
        label = label.to(device)
        with torch.no_grad():
            output = network(input_ids,audio)[0]
            _, preds = torch.max(output, 1)
            y_actu.append(label.detach().cpu().numpy()[0])
            y_pred.append(preds.detach().cpu().numpy()[0])
    bacc = balanced_accuracy_score(y_actu, y_pred)
    print("Balanced Accuracy: ", bacc)
    ua, wa = calculate_accuracy(y_actu, y_pred)
    print("Unweighted Accuracy: ", ua)
    print("Weighted Accuracy: ", wa)
    ua_f1 = f1_score(y_actu, y_pred, average='macro')
    # mean_f1 = np.mean(f1_score(y_actu, y_pred, average=None))
    # w_f1 = f1_score(y_actu, y_pred, average='weighted')
    # f1 = f1_score(y_actu, y_pred, average='micro')
    # print("Micro F1: ", f1)
    print("Macro F1: ", ua_f1)
    # print("Weighted F1: ", w_f1)
    # print("Mean F1:", mean_f1)
    # ua_precision = precision_score(y_actu, y_pred, average='macro')
    # w_precision = precision_score(y_actu, y_pred, average='weighted')
    # precision = precision_score(y_actu, y_pred, average='micro')
    # mean_precision = np.mean(precision_score(y_actu, y_pred, average=None))
    # print("Micro Precision: ", precision)
    # print("Macro Precision: ", ua_precision)
    # print("Weighted Precision: ", w_precision)
    # print("Mean precision:", mean_precision)
    # ua_recall = recall_score(y_actu, y_pred, average='macro')
    # w_recall = recall_score(y_actu, y_pred, average='weighted')
    # recall = recall_score(y_actu, y_pred, average='micro')
    # print("Micro Recall: ", recall)
    # print("Macro Recall: ", ua_recall)
    # print("Weighted Recall: ", w_recall)
    
    # cm = confusion_matrix(y_actu, y_pred)
    # print("Confusion Matrix: \n", cm)
    # cmn = (cm.astype('float') / cm.sum(axis=1)[:, np.newaxis])*100

    # ax = plt.subplots(figsize=(8, 5.5))[1]
    # sns.heatmap(cmn, cmap='YlOrBr', annot=True, square=True, linecolor='black', linewidths=0.75, ax = ax, fmt = '.2f', annot_kws={'size': 16})
    # ax.set_xlabel('Predicted', fontsize=18, fontweight='bold')
    # ax.xaxis.set_label_position('bottom')
    # ax.xaxis.set_ticklabels(["Anger", "Happiness", "Sadness", "Neutral"], fontsize=16)
    # ax.set_ylabel('Ground Truth', fontsize=18, fontweight='bold')
    # ax.yaxis.set_ticklabels(["Anger", "Happiness", "Sadness", "Neutral"], fontsize=16)
    # plt.tight_layout()
    # # plt.savefig(opt.name + '.png', format='png', dpi=1200)
    # plt.show()

In [4]:
def eval_svm(opt, checkpoint_path, all_state_dict=True):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    network = getattr(networks, opt.model_type)(opt)
    network.to(device)

    # Build dataset
    train_ds, test_ds = build_train_test_dataset(opt)
    weight = torch.load(checkpoint_path, map_location=torch.device(device))
    if all_state_dict:
        weight = weight['state_dict_network']
    else:
        weight = weight.state_dict()
    
    network.load_state_dict(weight)
    network.eval()
    network.to(device)

    # Get train features
    train_x = []
    train_y = []
    for every_train_list in tqdm(train_ds):
        input_ids, audio, label = every_train_list
        input_ids = input_ids.to(device)
        audio = audio.to(device)
        label = label.to(device)
        with torch.no_grad():
            feature = network(input_ids,audio)[1]
            train_x.append(feature.detach().cpu().numpy()[0])
            train_y.append(label.detach().cpu().numpy()[0])
    
    # SVM
    clf = svm.SVC()
    clf.fit(train_x, train_y)
    
    y_actu=[]
    y_pred=[]

    for every_test_list in tqdm(test_ds):
        input_ids, audio, label = every_test_list
        input_ids = input_ids.to(device)
        audio = audio.to(device)
        label = label.to(device)
        with torch.no_grad():
            feature = network(input_ids,audio)[1]
            preds = clf.predict(feature.detach().cpu().numpy())
            y_actu.append(label.detach().cpu().numpy()[0])
            y_pred.append(preds[0])
    bacc = balanced_accuracy_score(y_actu, y_pred)
    ua, wa = calculate_accuracy(y_actu, y_pred)
    print("Balanced Accuracy: ", bacc)
    print("Unweighted Accuracy: ", ua)
    print("Weighted Accuracy: ", wa)
    
    ua_f1 = f1_score(y_actu, y_pred, average='macro')
    # w_f1 = f1_score(y_actu, y_pred, average='weighted')
    # f1 = f1_score(y_actu, y_pred, average='micro')
    # print("Micro F1: ", f1)
    print("Macro F1: ", ua_f1)
    # print("Weighted F1: ", w_f1)
    # ua_precision = precision_score(y_actu, y_pred, average='macro')
    # w_precision = precision_score(y_actu, y_pred, average='weighted')
    # precision = precision_score(y_actu, y_pred, average='micro')
    # print("Micro Precision: ", precision)
    # print("Macro Precision: ", ua_precision)
    # print("Weighted Precision: ", w_precision)
    # ua_recall = recall_score(y_actu, y_pred, average='macro')
    # w_recall = recall_score(y_actu, y_pred, average='weighted')
    # recall = recall_score(y_actu, y_pred, average='micro')
    # print("Micro Recall: ", recall)
    # print("Macro Recall: ", ua_recall)
    # print("Weighted Recall: ", w_recall)
    
    # cm = confusion_matrix(y_actu, y_pred)
    # print("Confusion Matrix: \n", cm)
    # cmn = (cm.astype('float') / cm.sum(axis=1)[:, np.newaxis])*100

    # ax = plt.subplots(figsize=(8, 5.5))[1]
    # sns.heatmap(cmn, cmap='YlOrBr', annot=True, square=True, linecolor='black', linewidths=0.75, ax = ax, fmt = '.2f', annot_kws={'size': 16})
    # ax.set_xlabel('Predicted', fontsize=18, fontweight='bold')
    # ax.xaxis.set_label_position('bottom')
    # ax.xaxis.set_ticklabels(["Anger", "Happiness", "Sadness", "Neutral"], fontsize=16)
    # ax.set_ylabel('Ground Truth', fontsize=18, fontweight='bold')
    # ax.yaxis.set_ticklabels(["Anger", "Happiness", "Sadness", "Neutral"], fontsize=16)
    # plt.tight_layout()
    # plt.savefig(opt.name + '.png', format='png', dpi=1200)
    # plt.show()

## Eval

In [9]:
from configs.base import Config
checkpoint_path = "/home/kuhaku/Code/EmotionClassification/code/3m-ser-private/scripts/checkpoints_drive/3M-SER_v2_roberta_wav2vec2_losses/FocalLoss_cls/20230910-235436"
opt_path = os.path.join(checkpoint_path,"opt.log")
ckpt_path = os.path.join(checkpoint_path,"weights/best_acc/checkpoint_0_0.pt")

opt = Config()

opt.load(opt_path)
# Set dataset path
opt.data_root="/home/kuhaku/Code/EmotionClassification/code/3m-ser-private/notebooks/data/IEMOCAP/"

eval(opt, ckpt_path)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 933/933 [01:01<00:00, 15.27it/s]

Balanced Accuracy:  0.8083373125339526
Unweighted Accuracy:  0.8027867095391211
Weighted Accuracy:  0.8106921260324083
Macro F1:  0.8089718416956387





In [6]:
from configs.base import Config
checkpoint_path = "/home/kuhaku/Code/EmotionClassification/code/3m-ser-private/scripts/checkpoints_drive/3M-SER_v2_roberta_wav2vec2_losses/ContrastiveCenterLossSER_cls/20230910-235648"
opt_path = os.path.join(checkpoint_path,"opt.log")
ckpt_path = os.path.join(checkpoint_path,"weights/best_acc/checkpoint_0_0.pt")

opt = Config()

opt.load(opt_path)
# Set dataset path
opt.data_root="/home/kuhaku/Code/EmotionClassification/code/3m-ser-private/notebooks/data/IEMOCAP/"

eval_svm(opt, ckpt_path)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 3728/3728 [03:51<00:00, 16.13it/s]
100%|██████████| 933/933 [00:59<00:00, 15.76it/s]


Balanced Accuracy:  0.7884813166528788
Unweighted Accuracy:  0.782422293676313
Weighted Accuracy:  0.7892457206846899
Macro F1:  0.7887573989632025
