In [None]:
%cd /data/codes/prep_ps_pykaldi/
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    ConfusionMatrixDisplay
)
import pandas as pd
import numpy as np
import pickle
import json
import re

In [None]:
def convert_score_to_color(score, YELLOW_GREEN=85/50, RED_YELLOW=30/50):
    """Bucket continuous scores into three colour class ids.

    The mapping is:
        score <  RED_YELLOW                 -> 2 (RED)
        RED_YELLOW <= score < YELLOW_GREEN  -> 1 (YELLOW)
        score >= YELLOW_GREEN               -> 0 (GREEN)

    Args:
        score: torch tensor of scores. It is NOT modified; the class ids are
            written into a copy.
        YELLOW_GREEN: lower bound (inclusive) of the GREEN bucket. The default
            85/50 corresponds to 85 on a scale that has been divided by 50,
            as ``PrepDataset.parse_data`` does.
        RED_YELLOW: lower bound (inclusive) of the YELLOW bucket (default 30/50).

    Returns:
        A new tensor with the same shape and dtype as ``score`` holding the
        class ids. Elements matching no bucket (e.g. NaN) keep their value.
    """
    LABEL2ID = {"GREEN": 0, "YELLOW": 1, "RED": 2}

    # All masks are computed from the untouched input, so writing one class id
    # can never influence the membership test of another bucket.
    red_index = score < RED_YELLOW
    yellow_index = (score >= RED_YELLOW) & (score < YELLOW_GREEN)
    green_index = score >= YELLOW_GREEN

    # Work on a copy so the caller's tensor keeps its original scores.
    colors = score.clone()
    colors[red_index] = LABEL2ID["RED"]
    colors[yellow_index] = LABEL2ID["YELLOW"]
    colors[green_index] = LABEL2ID["GREEN"]

    return colors

def load_data(data_dir):
    """Load the five feature/label arrays stored as ``.npy`` files in ``data_dir``.

    Args:
        data_dir: directory containing ``phone_ids.npy``, ``phone_scores.npy``,
            ``duration.npy``, ``gop.npy`` and ``wavlm_features.npy``.

    Returns:
        Tuple ``(phone_ids, phone_scores, durations, gops, wavlm_features)``
        of the arrays returned by ``np.load``, in that order.
    """
    # Listed in the order the arrays are returned.
    array_paths = [
        f'{data_dir}/phone_ids.npy',
        f'{data_dir}/phone_scores.npy',
        f'{data_dir}/duration.npy',
        f'{data_dir}/gop.npy',
        f'{data_dir}/wavlm_features.npy',
    ]
    return tuple(np.load(path) for path in array_paths)


In [None]:
from torch.utils.data import Dataset, DataLoader
import torch

class PrepDataset(Dataset):
    """Map-style dataset over pre-extracted, index-aligned arrays.

    Item ``i`` is built from row ``i`` of every array passed to the
    constructor; the arrays are kept as given and only converted to tensors
    when an item is requested.
    """

    def __init__(self, phone_ids, phone_scores, durations, gops, wavlm_features):
        self.phone_ids = phone_ids
        self.phone_scores = phone_scores
        self.gops = gops
        self.durations = durations
        self.wavlm_features = wavlm_features

    def __len__(self):
        # Number of items = size of the first axis of phone_ids.
        return self.phone_ids.shape[0]

    def parse_data(self, phone_ids, phone_scores, gops, durations, wavlm_features):
        """Turn one item's raw arrays into the dict of tensors fed to the model.

        Returns:
            dict with
              * ``"features"``: gops, durations (with a trailing axis added)
                and wavlm_features concatenated along the last axis;
              * ``"phone_ids"``: phone ids as a tensor;
              * ``"phone_scores"``: float scores divided by 50, except entries
                equal to -1, which are left untouched.
        """
        ids_t = torch.tensor(phone_ids)
        dur_t = torch.tensor(durations)
        gop_t = torch.tensor(gops)
        scores_t = torch.tensor(phone_scores).float().clone()
        wavlm_t = torch.tensor(wavlm_features)

        # -1 is skipped by the rescaling so it survives unchanged.
        is_scored = scores_t != -1
        scores_t[is_scored] = scores_t[is_scored] / 50

        feature_parts = [gop_t, dur_t.unsqueeze(-1), wavlm_t]
        return {
            "features": torch.concat(feature_parts, dim=-1),
            "phone_ids": ids_t,
            "phone_scores": scores_t,
        }

    def __getitem__(self, index):
        return self.parse_data(
            phone_ids=self.phone_ids[index],
            phone_scores=self.phone_scores[index],
            gops=self.gops[index],
            durations=self.durations[index],
            wavlm_features=self.wavlm_features[index],
        )

# Smoke test: build a dataset/loader on one split and print the shapes of a
# single batch to check that the arrays line up.
data_dir = "/data/codes/prep_ps_pykaldi/exp/sm/in_short"

phone_ids, phone_scores, durations, gops, wavlm_features = load_data(data_dir)
dataset_v1 = PrepDataset(phone_ids, phone_scores, durations, gops, wavlm_features)
dataloader = DataLoader(dataset_v1, batch_size=8)

# Only the first batch is inspected (note the `break`).
# NOTE: this rebinds the notebook-level names `phone_ids` and `phone_scores`
# from the full arrays loaded above to the tensors of this one batch.
for batch in dataloader:
    features = batch["features"]
    phone_ids = batch["phone_ids"]
    phone_scores = batch["phone_scores"]
    
    print(features.shape)
    print(phone_ids.shape)
    print(phone_scores.shape)
    break

# Drop the references to the temporary dataset and loader.
dataset_v1 = None
dataloader = None

In [None]:
import math
import warnings
import torch
import torch.nn as nn
import numpy as np

def get_sinusoid_encoding(n_position, d_hid):
    """Build a fixed sinusoidal position table.

    Entry ``(pos, j)`` starts as ``pos / 10000 ** (2 * (j // 2) / d_hid)``;
    even channels then take the sine of that angle and odd channels the cosine.

    Args:
        n_position: number of positions (rows).
        d_hid: number of channels (columns).

    Returns:
        ``torch.FloatTensor`` of shape ``(1, n_position, d_hid)``.
    """
    # Channels 2i and 2i+1 share a divisor, hence the integer division by 2.
    divisors = [np.power(10000, 2 * (channel // 2) / d_hid) for channel in range(d_hid)]

    angles = np.array(
        [[position / divisor for divisor in divisors] for position in range(n_position)]
    )

    even_channels = angles[:, 0::2]  # dim 2i (view into `angles`)
    odd_channels = angles[:, 1::2]   # dim 2i+1 (view into `angles`)
    np.sin(even_channels, out=even_channels)
    np.cos(odd_channels, out=odd_channels)

    return torch.FloatTensor(angles).unsqueeze(0)


def _no_grad_trunc_normal_(tensor, mean, std, a, b):
    def norm_cdf(x):
        return (1. + math.erf(x / math.sqrt(2.))) / 2.

    if (mean < a - 2 * std) or (mean > b + 2 * std):
        warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
                      "The distribution of values may be incorrect.",
                      stacklevel=2)

    with torch.no_grad():
        l = norm_cdf((a - mean) / std)
        u = norm_cdf((b - mean) / std)

        tensor.uniform_(2 * l - 1, 2 * u - 1)
        tensor.erfinv_()
        tensor.mul_(std * math.sqrt(2.))
        tensor.add_(mean)
        tensor.clamp_(min=a, max=b)
        
        return tensor

def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
    """Fill ``tensor`` in place via ``_no_grad_trunc_normal_`` and return it.

    Args:
        tensor: tensor to overwrite.
        mean: mean of the normal distribution (default 0).
        std: standard deviation of the normal distribution (default 1).
        a: lower bound (default -2).
        b: upper bound (default 2).
    """
    filled = _no_grad_trunc_normal_(tensor, mean=mean, std=std, a=a, b=b)
    return filled

class Attention(nn.Module):
    """Multi-head self-attention with a single fused q/k/v projection.

    Args:
        dim: channel size of the input and output.
        num_heads: number of heads; each head gets ``dim // num_heads`` channels.
        qkv_bias: whether the fused q/k/v projection has a bias.
        qk_scale: factor applied to the q·k scores; a falsy value (``None`` or
            0) selects ``head_dim ** -0.5``.
        attn_drop: dropout probability on the attention weights.
        proj_drop: dropout probability on the output projection.
    """

    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        per_head_dim = dim // num_heads
        self.scale = qk_scale or per_head_dim ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        """Apply self-attention to ``x`` of shape ``(B, N, C)``; output has the same shape."""
        batch, tokens, channels = x.shape
        per_head_dim = channels // self.num_heads

        # (B, N, 3C) -> (B, N, 3, heads, head_dim) -> (3, B, heads, N, head_dim)
        fused = self.qkv(x).reshape(batch, tokens, 3, self.num_heads, per_head_dim)
        fused = fused.permute(2, 0, 3, 1, 4)
        # Explicit indexing instead of tuple-unpacking the tensor (TorchScript-friendly).
        q = fused[0]
        k = fused[1]
        v = fused[2]

        weights = ((q @ k.transpose(-2, -1)) * self.scale).softmax(dim=-1)
        weights = self.attn_drop(weights)

        # Merge the heads back: (B, heads, N, head_dim) -> (B, N, C)
        merged = (weights @ v).transpose(1, 2).reshape(batch, tokens, channels)
        return self.proj_drop(self.proj(merged))

class Mlp(nn.Module):
    """Two-layer feed-forward block: Linear -> activation -> dropout -> Linear -> dropout.

    Args:
        in_features: input channel size.
        hidden_features: hidden size; a falsy value falls back to ``in_features``.
        out_features: output size; a falsy value falls back to ``in_features``.
        act_layer: activation class, instantiated with no arguments.
        drop: dropout probability; the same dropout module is used after the
            activation and after the second linear layer.
    """

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        width_out = out_features if out_features else in_features
        width_hidden = hidden_features if hidden_features else in_features

        self.fc1 = nn.Linear(in_features, width_hidden)
        self.act = act_layer()
        self.fc2 = nn.Linear(width_hidden, width_out)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))

class Block(nn.Module):
    """Pre-norm transformer layer: self-attention and an MLP, each with a residual connection.

    Args:
        dim: channel size.
        num_heads: number of attention heads.
        mlp_ratio: hidden size of the MLP as a multiple of ``dim``.
        qkv_bias, qk_scale, attn_drop: forwarded to ``Attention``.
        drop: dropout probability for the attention output projection and the MLP.
        drop_path: accepted but not used; ``self.drop_path`` is always ``nn.Identity``.
        act_layer: activation class for the MLP.
        norm_layer: normalisation class, instantiated as ``norm_layer(dim)``.
    """

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
        )

        self.drop_path = nn.Identity()
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            act_layer=act_layer,
            drop=drop,
        )

    def forward(self, x):
        attended = self.attn(self.norm1(x))
        x = x + self.drop_path(attended)

        transformed = self.mlp(self.norm2(x))
        return x + self.drop_path(transformed)

class GOPT(nn.Module):
    """Transformer encoder over per-token features with three scalar regression heads.

    Each token's feature vector is (optionally) projected to ``embed_dim``,
    concatenated with an embedding of its phone id, mixed by a linear layer,
    prefixed with a learned CLS token, given learned position embeddings and
    passed through ``depth`` transformer blocks.

    Args:
        embed_dim: model width.
        num_heads: attention heads per block.
        depth: number of transformer blocks.
        input_dim: size of each input feature vector.
        max_length: the position table holds ``max_length + 1`` entries (the
            extra one is for the CLS token).
        num_phone: number of classes used for the one-hot phone encoding.
    """

    def __init__(self, embed_dim, num_heads, depth, input_dim=84, max_length=50, num_phone=40):
        super().__init__()
        self.input_dim = input_dim
        self.embed_dim = embed_dim

        encoder_layers = [Block(dim=embed_dim, num_heads=num_heads) for _ in range(depth)]
        self.blocks = nn.ModuleList(encoder_layers)

        # Learned positions; +1 accounts for the CLS token prepended in forward().
        self.pos_embed = nn.Parameter(torch.zeros(1, max_length + 1, self.embed_dim))
        trunc_normal_(self.pos_embed, std=.02)

        self.in_proj = nn.Linear(self.input_dim, embed_dim)
        # Mixes [features ; phone embedding] (2 * embed_dim) back to embed_dim.
        self.linear = nn.Linear(embed_dim * 2, embed_dim)

        self.mlp_head_phn = nn.Sequential(nn.LayerNorm(embed_dim), nn.Linear(embed_dim, 1))
        self.mlp_head_word = nn.Sequential(nn.LayerNorm(embed_dim), nn.Linear(embed_dim, 1))

        self.num_phone = num_phone
        self.phn_proj = nn.Linear(num_phone, embed_dim)

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.mlp_head_utt = nn.Sequential(nn.LayerNorm(embed_dim), nn.Linear(embed_dim, 1))

        trunc_normal_(self.cls_token, std=.02)

    def forward(self, x, phn):
        """Run the encoder.

        Args:
            x: features of shape ``(B, N, input_dim)``.
            phn: phone ids of shape ``(B, N)``. Ids are shifted by +1 before
                one-hot encoding, so an id of -1 becomes class 0.

        Returns:
            ``(u, p, w)``: ``u`` is the utterance head applied to the CLS
            position, shape ``(B, 1)``; ``p`` and ``w`` are the phone and word
            heads applied to the remaining positions, each ``(B, N, 1)``.
        """
        batch_size = x.shape[0]

        shifted_ids = phn.long() + 1
        phn_one_hot = torch.nn.functional.one_hot(shifted_ids, num_classes=self.num_phone).float()
        phn_embed = self.phn_proj(phn_one_hot)

        # The input projection is skipped when the features already have embed_dim channels.
        if self.embed_dim != self.input_dim:
            x = self.in_proj(x)

        x = self.linear(torch.cat([x, phn_embed], dim=-1))

        cls_token = self.cls_token.expand(batch_size, -1, -1)
        x = torch.cat((cls_token, x), dim=1)
        x = x + self.pos_embed[:, :x.shape[1], :]

        for layer in self.blocks:
            x = layer(x)

        cls_state = x[:, 0]
        token_states = x[:, 1:]
        u = self.mlp_head_utt(cls_state)
        p = self.mlp_head_phn(token_states)
        w = self.mlp_head_word(token_states)
        return u, p, w

In [None]:
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam

# --- Training split: reshuffled on every pass, last partial batch kept. ---
data_dir = "/data/codes/prep_ps_pykaldi/exp/sm/train"
phone_ids, phone_scores, durations, gops, wavlm_features = load_data(data_dir)
trainset = PrepDataset(
    phone_ids=phone_ids,
    phone_scores=phone_scores,
    durations=durations,
    gops=gops,
    wavlm_features=wavlm_features,
)
trainloader = DataLoader(dataset=trainset, batch_size=8, shuffle=True, drop_last=False)

# --- Evaluation split: fixed order, last partial batch kept. ---
# The loaded-array names above are reused, so after this cell they refer to
# the evaluation arrays.
data_dir = "/data/codes/prep_ps_pykaldi/exp/sm/in_short"
phone_ids, phone_scores, durations, gops, wavlm_features = load_data(data_dir)
testset = PrepDataset(
    phone_ids=phone_ids,
    phone_scores=phone_scores,
    durations=durations,
    gops=gops,
    wavlm_features=wavlm_features,
)
testloader = DataLoader(dataset=testset, batch_size=8, shuffle=False, drop_last=False)


In [None]:
from torch.optim.lr_scheduler import MultiStepLR
from torch import nn

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

gopt_model = GOPT(
    embed_dim=32,
    num_heads=1,
    depth=3,
    input_dim=851,
    max_length=128,
    num_phone=62,
)
gopt_model = gopt_model.to(device)

# Optimise only the parameters that require gradients.
trainables = [param for param in gopt_model.parameters() if param.requires_grad]

lr = 3e-4
optimizer = torch.optim.Adam(
    params=trainables,
    lr=lr,
    weight_decay=5e-7,
    betas=(0.95, 0.999),
)

# Halve the learning rate at milestones 10, 15, 20, ..., 95.
scheduler = MultiStepLR(
    optimizer=optimizer,
    milestones=list(range(10, 100, 5)),
    gamma=0.5,
    last_epoch=-1,
)

loss_fn = nn.MSELoss()

In [None]:
def valid_phn(audio_output, target):
    """Compute MSE, MAE and Pearson correlation over the valid tokens only.

    A token is valid when its target is >= 0; negative targets mark padding
    and are excluded from every metric.

    Args:
        audio_output: 2-D predictions (NumPy array or torch tensor).
        target: 2-D reference scores laid out like ``audio_output``; negative
            values mark padded positions.

    Returns:
        Tuple ``(mse, mae, corr)`` over the valid tokens. All three are NaN
        when there is no valid token.
    """
    def _as_numpy(values):
        # Torch tensors may live on an accelerator or carry autograd history;
        # both must be stripped before NumPy can view the data.
        if hasattr(values, "detach"):
            values = values.detach().cpu().numpy()
        return np.asarray(values)

    predictions = _as_numpy(audio_output)
    references = _as_numpy(target)
    # Only the region covered by the predictions is scored.
    references = references[:predictions.shape[0], :predictions.shape[1]]

    # Boolean mask selects valid tokens in row-major order, replacing a
    # Python-level loop over every (row, column) pair.
    valid = references >= 0
    valid_token_pred = predictions[valid]
    valid_token_target = references[valid]

    if valid_token_target.size == 0:
        nan = float("nan")
        return nan, nan, nan

    errors = valid_token_target - valid_token_pred
    valid_token_mse = np.mean(errors ** 2)
    valid_token_mae = np.mean(np.abs(errors))
    corr = np.corrcoef(valid_token_pred, valid_token_target)[0, 1]
    return valid_token_mse, valid_token_mae, corr


In [None]:
import os

from tqdm import tqdm

exp_dir = "/data/codes/prep_ps_pykaldi/exp/preds"
# NOTE: ckpt_dir is defined but nothing in this cell writes to it.
ckpt_dir = "/data/codes/prep_ps_pykaldi/exp/ckpts"

# np.save does not create missing directories.
os.makedirs(exp_dir, exist_ok=True)

global_step = 0
best_mse = float("inf")
for epoch in range(50):
    # ---- training ----
    gopt_model.train()
    train_tqdm = tqdm(trainloader, "Training")

    for batch in train_tqdm:
        optimizer.zero_grad()

        features = batch["features"].to(device)
        phone_ids = batch["phone_ids"].to(device)
        phone_labels = batch["phone_scores"].to(device)

        utterance_preds, phone_preds, word_preds = gopt_model(x=features.float(), phn=phone_ids.long())

        # Padded positions carry a negative label; zero them in both tensors
        # so they contribute nothing to the squared error.
        mask = phone_labels >= 0
        phone_preds = phone_preds.squeeze(2)
        phone_preds = phone_preds * mask
        phone_labels = phone_labels * mask

        # MSELoss averages over every position; rescale so the average is
        # taken over valid positions only. The clamp avoids a division by
        # zero on a batch that contains no valid position.
        loss_phn = loss_fn(phone_preds, phone_labels)
        loss_phn = loss_phn * mask.numel() / torch.sum(mask).clamp(min=1)

        loss_phn.backward()
        optimizer.step()

        global_step += 1
        train_tqdm.set_postfix(loss_phn=loss_phn.item())

    # Advance the LR schedule once per epoch; without this call the
    # MultiStepLR milestones are never reached and the LR stays constant.
    scheduler.step()

    # ---- validation ----
    # eval() + no_grad(): inference behaviour for mode-dependent layers and
    # no autograd graph kept for the whole evaluation set.
    gopt_model.eval()
    A_phn, A_phn_target = [], []
    with torch.no_grad():
        for batch in testloader:
            features = batch["features"].to(device)
            phone_ids = batch["phone_ids"].to(device)
            phone_labels = batch["phone_scores"].to(device)

            utterance_preds, phone_preds, word_preds = gopt_model(x=features.float(), phn=phone_ids.long())

            phone_preds = phone_preds.detach().cpu()
            phone_labels = phone_labels.detach().cpu()

            A_phn.append(phone_preds[:, :, 0])
            A_phn_target.append(phone_labels)

    A_phn, A_phn_target = torch.vstack(A_phn), torch.vstack(A_phn_target)

    # Colour-class report on the non-padded tokens. The clones keep A_phn and
    # A_phn_target intact for the regression metrics below.
    indices = A_phn_target != -1
    _label = A_phn_target[indices].clone()
    _pred = A_phn[indices].clone()

    converted_pred = convert_score_to_color(_pred).view(-1)
    converted_label = convert_score_to_color(_label).view(-1)

    print("### F1 Score: \n", classification_report(y_true=converted_label, y_pred=converted_pred))

    # valid_token_mse, valid_token_mae, corr
    phn_mse, phn_mae, phn_corr = valid_phn(A_phn, A_phn_target)

    # Keep the predictions of the epoch with the lowest validation MSE.
    if phn_mse < best_mse:
        best_mse = phn_mse
        print("### Saved predict and label")
        np.save(f'{exp_dir}/pred.npy', A_phn.numpy())
        np.save(f'{exp_dir}/label.npy', A_phn_target.numpy())

    # Format with :.4f; round() on a NumPy float32 still prints the full
    # float32 expansion (e.g. 0.18320000171661377).
    print(f"### Validation result (epoch={epoch}): MSE={phn_mse:.4f} MAE={phn_mae:.4f} PCC={phn_corr:.4f}")

In [None]:
# Training: 100%|██████████| 18740/18740 [00:54<00:00, 342.27it/s, loss_phn=0.232] 
# ### F1 Score: 
#                precision    recall  f1-score   support

#          0.0       0.90      0.96      0.93    104138
#          1.0       0.32      0.28      0.30     12537
#          2.0       0.73      0.42      0.53     12449

#     accuracy                           0.84    129124
#    macro avg       0.65      0.55      0.59    129124
# weighted avg       0.83      0.84      0.83    129124

# ### Saved predict and label
# ### Validation result (epoch=0): MSE=0.18320000171661377 MAE=0.2574000060558319 PCC=0.7568
# Training: 100%|██████████| 18740/18740 [00:54<00:00, 343.74it/s, loss_phn=0.137] 
# ### F1 Score: 
#                precision    recall  f1-score   support

#          0.0       0.91      0.96      0.93    104138
#          1.0       0.34      0.32      0.33     12537
#          2.0       0.73      0.46      0.57     12449

#     accuracy                           0.85    129124
#    macro avg       0.66      0.58      0.61    129124
# weighted avg       0.84      0.85      0.84    129124

# ### Saved predict and label
# ### Validation result (epoch=1): MSE=0.17579999566078186 MAE=0.26809999346733093 PCC=0.7729
# Training: 100%|██████████| 18740/18740 [00:54<00:00, 342.07it/s, loss_phn=0.175] 
# ### F1 Score: 
#                precision    recall  f1-score   support

#          0.0       0.92      0.95      0.93    104138
#          1.0       0.34      0.34      0.34     12537
#          2.0       0.72      0.50      0.59     12449

#     accuracy                           0.85    129124
#    macro avg       0.66      0.60      0.62    129124
# weighted avg       0.84      0.85      0.84    129124

# ### Saved predict and label
# ### Validation result (epoch=2): MSE=0.1720999926328659 MAE=0.25369998812675476 PCC=0.7801
# Training: 100%|██████████| 18740/18740 [00:54<00:00, 345.56it/s, loss_phn=0.168] 
# ### F1 Score: 
#                precision    recall  f1-score   support

#          0.0       0.92      0.95      0.93    104138
#          1.0       0.34      0.32      0.33     12537
#          2.0       0.68      0.57      0.62     12449

#     accuracy                           0.85    129124
#    macro avg       0.65      0.61      0.63    129124
# weighted avg       0.84      0.85      0.84    129124

# ### Validation result (epoch=3): MSE=0.17890000343322754 MAE=0.25999999046325684 PCC=0.7795
# Training: 100%|██████████| 18740/18740 [00:54<00:00, 344.52it/s, loss_phn=0.0753]
# ### F1 Score: 
#                precision    recall  f1-score   support

#          0.0       0.92      0.95      0.93    104138
#          1.0       0.35      0.29      0.32     12537
#          2.0       0.68      0.60      0.64     12449

#     accuracy                           0.85    129124
#    macro avg       0.65      0.61      0.63    129124
# weighted avg       0.84      0.85      0.85    129124

# ### Saved predict and label
# ### Validation result (epoch=4): MSE=0.1695999950170517 MAE=0.2418999969959259 PCC=0.7867
# Training: 100%|██████████| 18740/18740 [00:54<00:00, 344.80it/s, loss_phn=0.119] 
# ### F1 Score: 
#                precision    recall  f1-score   support

#          0.0       0.91      0.96      0.94    104138
#          1.0       0.34      0.33      0.34     12537
#          2.0       0.75      0.47      0.58     12449

#     accuracy                           0.85    129124
#    macro avg       0.67      0.59      0.62    129124
# weighted avg       0.84      0.85      0.84    129124

# ### Saved predict and label
# ### Validation result (epoch=5): MSE=0.16220000386238098 MAE=0.2500999867916107 PCC=0.7904
# Training: 100%|██████████| 18740/18740 [00:54<00:00, 342.43it/s, loss_phn=0.0393]
# ### F1 Score: 
#                precision    recall  f1-score   support

#          0.0       0.91      0.96      0.94    104138
#          1.0       0.35      0.29      0.32     12537
#          2.0       0.76      0.50      0.60     12449

#     accuracy                           0.85    129124
#    macro avg       0.67      0.58      0.62    129124
# weighted avg       0.84      0.85      0.84    129124

# ### Saved predict and label
# ### Validation result (epoch=6): MSE=0.15809999406337738 MAE=0.2386000007390976 PCC=0.7935
# Training: 100%|██████████| 18740/18740 [00:55<00:00, 339.46it/s, loss_phn=0.0975]
# ### F1 Score: 
#                precision    recall  f1-score   support

#          0.0       0.92      0.96      0.94    104138
#          1.0       0.35      0.30      0.32     12537
#          2.0       0.72      0.54      0.62     12449

#     accuracy                           0.85    129124
#    macro avg       0.66      0.60      0.62    129124
# weighted avg       0.84      0.85      0.85    129124

# ### Validation result (epoch=7): MSE=0.16140000522136688 MAE=0.23399999737739563 PCC=0.793
# Training: 100%|██████████| 18740/18740 [00:54<00:00, 341.84it/s, loss_phn=0.13]  
# ### F1 Score: 
#                precision    recall  f1-score   support

#          0.0       0.92      0.95      0.94    104138
#          1.0       0.35      0.31      0.33     12537
#          2.0       0.71      0.56      0.62     12449

#     accuracy                           0.85    129124
#    macro avg       0.66      0.61      0.63    129124
# weighted avg       0.84      0.85      0.85    129124

# ### Validation result (epoch=8): MSE=0.16449999809265137 MAE=0.23659999668598175 PCC=0.7911
# Training: 100%|██████████| 18740/18740 [00:54<00:00, 341.18it/s, loss_phn=0.215]  
# ### F1 Score: 
#                precision    recall  f1-score   support

#          0.0       0.92      0.95      0.94    104138
#          1.0       0.36      0.31      0.33     12537
#          2.0       0.71      0.58      0.64     12449

#     accuracy                           0.86    129124
#    macro avg       0.66      0.61      0.64    129124
# weighted avg       0.84      0.86      0.85    129124

# ### Validation result (epoch=9): MSE=0.16279999911785126 MAE=0.2371000051498413 PCC=0.7939
# Training: 100%|██████████| 18740/18740 [00:57<00:00, 324.39it/s, loss_phn=0.12]  
# ### F1 Score: 
#                precision    recall  f1-score   support

#          0.0       0.92      0.96      0.94    104138
#          1.0       0.36      0.30      0.33     12537
#          2.0       0.72      0.55      0.63     12449

#     accuracy                           0.86    129124
#    macro avg       0.67      0.61      0.63    129124
# weighted avg       0.84      0.86      0.85    129124

# ### Validation result (epoch=10): MSE=0.16130000352859497 MAE=0.22550000250339508 PCC=0.7938
# Training: 100%|██████████| 18740/18740 [00:54<00:00, 345.07it/s, loss_phn=0.142] 
# ### F1 Score: 
#                precision    recall  f1-score   support

#          0.0       0.91      0.96      0.94    104138
#          1.0       0.36      0.29      0.32     12537
#          2.0       0.75      0.53      0.62     12449

#     accuracy                           0.86    129124
#    macro avg       0.67      0.59      0.62    129124
# weighted avg       0.84      0.86      0.85    129124

# ### Saved predict and label
# ### Validation result (epoch=11): MSE=0.15600000321865082 MAE=0.2273000031709671 PCC=0.7976
# Training: 100%|██████████| 18740/18740 [00:55<00:00, 339.19it/s, loss_phn=0.156] 
# ### F1 Score: 
#                precision    recall  f1-score   support

#          0.0       0.92      0.95      0.94    104138
#          1.0       0.36      0.34      0.35     12537
#          2.0       0.72      0.57      0.64     12449

#     accuracy                           0.86    129124
#    macro avg       0.67      0.62      0.64    129124
# weighted avg       0.85      0.86      0.85    129124

# ### Validation result (epoch=12): MSE=0.16580000519752502 MAE=0.25450000166893005 PCC=0.7952
# Training: 100%|██████████| 18740/18740 [00:58<00:00, 323.06it/s, loss_phn=0.163]  
# ### F1 Score: 
#                precision    recall  f1-score   support

#          0.0       0.91      0.96      0.94    104138
#          1.0       0.36      0.29      0.32     12537
#          2.0       0.73      0.56      0.63     12449

#     accuracy                           0.86    129124
#    macro avg       0.67      0.60      0.63    129124
# weighted avg       0.84      0.86      0.85    129124

# ### Validation result (epoch=13): MSE=0.15639999508857727 MAE=0.23199999332427979 PCC=0.7995
# Training: 100%|██████████| 18740/18740 [00:54<00:00, 341.36it/s, loss_phn=0.163]  
# ### F1 Score: 
#                precision    recall  f1-score   support

#          0.0       0.92      0.96      0.94    104138
#          1.0       0.37      0.31      0.34     12537
#          2.0       0.73      0.56      0.63     12449

#     accuracy                           0.86    129124
#    macro avg       0.67      0.61      0.64    129124
# weighted avg       0.85      0.86      0.85    129124

# ### Validation result (epoch=14): MSE=0.15610000491142273 MAE=0.2280000001192093 PCC=0.8007
# Training: 100%|██████████| 18740/18740 [00:54<00:00, 345.71it/s, loss_phn=0.19]  
# ### F1 Score: 
#                precision    recall  f1-score   support

#          0.0       0.91      0.96      0.94    104138
#          1.0       0.37      0.29      0.32     12537
#          2.0       0.75      0.54      0.63     12449

#     accuracy                           0.86    129124
#    macro avg       0.68      0.60      0.63    129124
# weighted avg       0.84      0.86      0.85    129124

# ### Saved predict and label
# ### Validation result (epoch=15): MSE=0.15489999949932098 MAE=0.2207999974489212 PCC=0.7996
# Training: 100%|██████████| 18740/18740 [00:55<00:00, 338.56it/s, loss_phn=0.179] 
# ### F1 Score: 
#                precision    recall  f1-score   support

#          0.0       0.92      0.95      0.94    104138
#          1.0       0.36      0.31      0.34     12537
#          2.0       0.70      0.61      0.65     12449

#     accuracy                           0.86    129124
#    macro avg       0.66      0.62      0.64    129124
# weighted avg       0.85      0.86      0.85    129124

# ### Validation result (epoch=16): MSE=0.16220000386238098 MAE=0.23729999363422394 PCC=0.7977
# Training: 100%|██████████| 18740/18740 [00:55<00:00, 340.35it/s, loss_phn=0.0687] 
# ### F1 Score: 
#                precision    recall  f1-score   support

#          0.0       0.92      0.96      0.94    104138
#          1.0       0.36      0.31      0.33     12537
#          2.0       0.74      0.56      0.64     12449

#     accuracy                           0.86    129124
#    macro avg       0.67      0.61      0.64    129124
# weighted avg       0.85      0.86      0.85    129124

# ### Saved predict and label
# ### Validation result (epoch=17): MSE=0.15449999272823334 MAE=0.23019999265670776 PCC=0.8019
# Training: 100%|██████████| 18740/18740 [00:54<00:00, 343.38it/s, loss_phn=0.211] 
# ### F1 Score: 
#                precision    recall  f1-score   support

#          0.0       0.91      0.96      0.94    104138
#          1.0       0.37      0.29      0.33     12537
#          2.0       0.74      0.55      0.63     12449

#     accuracy                           0.86    129124
#    macro avg       0.67      0.60      0.63    129124
# weighted avg       0.84      0.86      0.85    129124

# ### Saved predict and label
# ### Validation result (epoch=18): MSE=0.1527000069618225 MAE=0.22200000286102295 PCC=0.8025
# Training: 100%|██████████| 18740/18740 [00:54<00:00, 345.40it/s, loss_phn=0.156]  
# ### F1 Score: 
#                precision    recall  f1-score   support

#          0.0       0.92      0.95      0.94    104138
#          1.0       0.37      0.33      0.35     12537
#          2.0       0.72      0.58      0.65     12449

#     accuracy                           0.86    129124
#    macro avg       0.67      0.62      0.64    129124
# weighted avg       0.85      0.86      0.85    129124

# ### Validation result (epoch=19): MSE=0.15940000116825104 MAE=0.24060000479221344 PCC=0.8
# Training: 100%|██████████| 18740/18740 [00:55<00:00, 338.93it/s, loss_phn=0.133] 
# ### F1 Score: 
#                precision    recall  f1-score   support

#          0.0       0.92      0.96      0.94    104138
#          1.0       0.37      0.30      0.33     12537
#          2.0       0.73      0.56      0.64     12449

#     accuracy                           0.86    129124
#    macro avg       0.67      0.61      0.64    129124
# weighted avg       0.84      0.86      0.85    129124

# ### Validation result (epoch=20): MSE=0.15620000660419464 MAE=0.23100000619888306 PCC=0.7998
# Training: 100%|██████████| 18740/18740 [00:55<00:00, 339.05it/s, loss_phn=0.337] 
# ### F1 Score: 
#                precision    recall  f1-score   support

#          0.0       0.92      0.96      0.94    104138
#          1.0       0.37      0.30      0.33     12537
#          2.0       0.71      0.60      0.65     12449

#     accuracy                           0.86    129124
#    macro avg       0.67      0.62      0.64    129124
# weighted avg       0.85      0.86      0.85    129124

# ### Validation result (epoch=21): MSE=0.156700000166893 MAE=0.22679999470710754 PCC=0.8013
# Training: 100%|██████████| 18740/18740 [00:56<00:00, 331.85it/s, loss_phn=0.197] 
# ### F1 Score: 
#                precision    recall  f1-score   support

#          0.0       0.92      0.95      0.94    104138
#          1.0       0.36      0.30      0.33     12537
#          2.0       0.71      0.58      0.64     12449

#     accuracy                           0.86    129124
#    macro avg       0.66      0.61      0.63    129124
# weighted avg       0.84      0.86      0.85    129124

# ### Validation result (epoch=22): MSE=0.1615000069141388 MAE=0.2387000024318695 PCC=0.7966
# Training: 100%|██████████| 18740/18740 [00:51<00:00, 362.69it/s, loss_phn=0.355] 
# ### F1 Score: 
#                precision    recall  f1-score   support

#          0.0       0.92      0.96      0.94    104138
#          1.0       0.37      0.30      0.33     12537
#          2.0       0.72      0.59      0.65     12449

#     accuracy                           0.86    129124
#    macro avg       0.67      0.62      0.64    129124
# weighted avg       0.85      0.86      0.85    129124

# ### Validation result (epoch=23): MSE=0.1565999984741211 MAE=0.22200000286102295 PCC=0.801
# Training: 100%|██████████| 18740/18740 [00:50<00:00, 371.22it/s, loss_phn=0.236]  
# ### F1 Score: 
#                precision    recall  f1-score   support

#          0.0       0.91      0.96      0.94    104138
#          1.0       0.37      0.30      0.33     12537
#          2.0       0.73      0.57      0.64     12449

#     accuracy                           0.86    129124
#    macro avg       0.67      0.61      0.63    129124
# weighted avg       0.84      0.86      0.85    129124

# ### Validation result (epoch=24): MSE=0.15569999814033508 MAE=0.2321999967098236 PCC=0.8001
# Training: 100%|██████████| 18740/18740 [00:50<00:00, 369.94it/s, loss_phn=0.055]  
# ### F1 Score: 
#                precision    recall  f1-score   support

#          0.0       0.92      0.96      0.94    104138
#          1.0       0.36      0.31      0.34     12537
#          2.0       0.71      0.57      0.63     12449

#     accuracy                           0.86    129124
#    macro avg       0.66      0.61      0.63    129124
# weighted avg       0.84      0.86      0.85    129124

# ### Validation result (epoch=25): MSE=0.16009999811649323 MAE=0.23090000450611115 PCC=0.7975
# Training: 100%|██████████| 18740/18740 [00:51<00:00, 361.52it/s, loss_phn=0.143]  
# ### F1 Score: 
#                precision    recall  f1-score   support

#          0.0       0.92      0.96      0.94    104138
#          1.0       0.37      0.28      0.32     12537
#          2.0       0.71      0.59      0.65     12449

#     accuracy                           0.86    129124
#    macro avg       0.67      0.61      0.63    129124
# weighted avg       0.84      0.86      0.85    129124

# ### Validation result (epoch=26): MSE=0.15809999406337738 MAE=0.22050000727176666 PCC=0.7996
# Training: 100%|██████████| 18740/18740 [00:50<00:00, 368.01it/s, loss_phn=0.173] 
# ### F1 Score: 
#                precision    recall  f1-score   support

#          0.0       0.92      0.95      0.94    104138
#          1.0       0.36      0.29      0.32     12537
#          2.0       0.67      0.64      0.66     12449

#     accuracy                           0.86    129124
#    macro avg       0.65      0.63      0.64    129124
# weighted avg       0.84      0.86      0.85    129124

# ### Validation result (epoch=27): MSE=0.1648000031709671 MAE=0.2402999997138977 PCC=0.7975
# Training:  40%|████      | 7526/18740 [00:20<00:30, 363.49it/s, loss_phn=0.16]   
# ---------------------------------------------------------------------------
# KeyboardInterrupt                         Traceback (most recent call last)
# /data/codes/prep_ps_pykaldi/notebooks/3mh-gopt-dur-wavlmb_main.ipynb Cell 8 line 1
#      11 for batch in train_tqdm:
#      12     optimizer.zero_grad()
# ---> 14     features = batch["features"].to(device)
#      15     phone_ids = batch["phone_ids"].to(device)
#      16     phone_labels = batch["phone_scores"].to(device)

# KeyboardInterrupt: 
# /data/codes/prep_ps_pykaldi
# ---------------------------------------------------------------------------
# FileNotFoundError                         Traceback (most recent call last)
# /data/codes/prep_ps_pykaldi/notebooks/3mh-gopt-dur-wavlmb_main.ipynb Cell 3 line 4
#      38         return self.parse_data(
#      39             phone_ids=phone_ids,
#      40             phone_scores=phone_scores,
#    (...)
#      43             wavlm_features=wavlm_features
#      44         )
#      46 data_dir = "/data/codes/prep_ps_pykaldi/exp/sm/test"
# ---> 48 phone_ids, phone_scores, durations, gops, wavlm_features = load_data(data_dir)
#      49 dataset_v1 = PrepDataset(phone_ids, phone_scores, durations, gops, wavlm_features)
#      50 dataloader = DataLoader(dataset_v1, batch_size=8)

# /data/codes/prep_ps_pykaldi/notebooks/3mh-gopt-dur-wavlmb_main.ipynb Cell 3 line 1
#      13 def load_data(data_dir):
# ---> 14     phone_ids = np.load(f'{data_dir}/phone_ids.npy')
#      15     phone_scores = np.load(f'{data_dir}/phone_scores.npy')
#      16     durations = np.load(f'{data_dir}/duration.npy')

# File ~/miniconda3/envs/ps/lib/python3.8/site-packages/numpy/lib/npyio.py:390, in load(file, mmap_mode, allow_pickle, fix_imports, encoding)
#     388     own_fid = False
#     389 else:
# --> 390     fid = stack.enter_context(open(os_fspath(file), "rb"))
#     391     own_fid = True
#     393 # Code to distinguish from NumPy binary files and pickles.

# FileNotFoundError: [Errno 2] No such file or directory: '/data/codes/prep_ps_pykaldi/exp/sm/test/phone_ids.npy'