<a href="https://colab.research.google.com/github/tihunn/emotion_testing/blob/main/exp/all_files_from_dasha_expirements.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

metrics.py

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score


def get_metrics_df(pred_class, gt_class, model_name=None):
    metric_dict = calculate_metrics(pred_class=pred_class, gt_class=gt_class)
    metrics_df = pd.DataFrame([metric_dict]).T.round(4)

    if model_name is not None:
        metrics_df.columns = [model_name]

    return metrics_df


def weighted_accuracy(y_true, y_pred, n_classes=4):
    y_pred = np.array(y_pred)
    y_true = np.array(y_true)

    class_accuracies = []
    for i in range(n_classes):
        gt_class_mask = y_true == i
        pred_class_mask = y_pred == i
        class_accuracies.append(
            (gt_class_mask * pred_class_mask).sum() / gt_class_mask.sum()
        )

    return np.mean(class_accuracies)


def calculate_metrics(pred_class, gt_class, **kwargs):
    n_classes = 4

    metrics_dict = {
        "accuracy": accuracy_score(y_true=gt_class, y_pred=pred_class),
        "WA": weighted_accuracy(
            y_true=gt_class, y_pred=pred_class, n_classes=n_classes
        ),
        "f1_macro": f1_score(y_true=gt_class, y_pred=pred_class, average="macro"),
    }

    return metrics_dict

learner.py

In [2]:
import copy
from pathlib import Path
import time

import numpy as np
import torch
from torch import nn
from torch.optim import Adam, lr_scheduler
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm


class Learner:
    def __init__(
        self,
        train_dataset,
        val_dataset,
        dataloaders,
        exp_path,
        model_name,
        model,
        batch_size,
        dump_best_checkpoints,
        dump_last_checkpoints,
        best_checkpoints_warmup,
        cuda_device="cuda:0",
    ):

        self.device = torch.device(cuda_device if torch.cuda.is_available() else "cpu")
        self.model = model
        self.model.to(self.device)

        self.__model_name = model_name

        self.dump_last_checkpoints = dump_last_checkpoints
        self.dump_best_checkpoints = dump_best_checkpoints
        self.best_checkpoints_warmup = best_checkpoints_warmup

        self.exp_path = Path(exp_path)
        if dump_best_checkpoints:
            self.best_checkpoints_path = self.exp_path / "best_checkpoints"
            self.best_checkpoints_path.mkdir()
        if dump_last_checkpoints:
            self.last_checkpoints_path = self.exp_path / (
                self.__model_name + "_last_checkpoints"
            )
            self.last_checkpoints_path.mkdir()

        self.batch_size = batch_size

        self.train_dataset = train_dataset
        self.val_dataset = val_dataset

        print(
            "train labels",
            np.unique(self.train_dataset.df.label.values, return_counts=True),
        )
        print(
            "train weights",
            np.unique(
                self.train_dataset.df.sampling_weights.values, return_counts=True
            ),
        )

        self.dataloaders = dataloaders

        self.dataset_sizes = {
            "train": len(self.train_dataset.df),
            "validate": len(self.val_dataset.df),
        }

    def train(self, num_epochs, lr, step_size, gamma, weight_decay=0, clip_grad=False):
        comment_str_list = [
            "MODEL",
            self.__model_name,
            "EPOCHS",
            str(num_epochs),
            "LR",
            str(lr),
            "BATCH",
            str(self.batch_size),
        ]

        comment_str = "_".join(comment_str_list)
        summary_writer = SummaryWriter(log_dir=self.exp_path / 'TB_log' / comment_str)

        criterion = nn.CrossEntropyLoss()
        optimizer = Adam(self.model.parameters(), lr=lr, weight_decay=weight_decay)
        scheduler = lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

        since = time.time()
        # copy.deepcopy(self.model.state_dict())
        best_model_wts = None
        best_loss = 10000000
        best_acc = best_f1 = best_WA = 0
        softmax = nn.Softmax(dim=1)

        try:
            for epoch in range(1, num_epochs + 1):
                print(f"Epoch {epoch}/{num_epochs}")
                for phase in ["train", "validate"]:
                    if phase == "train":
                        self.model.train()
                        cur_step_lr = scheduler.get_last_lr()[-1]
                    else:
                        self.model.eval()

                    running_loss = 0.0
                    running_outputs = []
                    running_labels = []
                    for inputs, labels in tqdm(self.dataloaders[phase]):
                        inputs = inputs.to(self.device)
                        labels = labels.long()
                        labels = labels.to(self.device)
                        optimizer.zero_grad()

                        with torch.set_grad_enabled(phase == "train"):
                            outputs = self.model(inputs)
                            probs = softmax(outputs)
                            loss = criterion(outputs, labels)
                            if phase == "train":
                                loss.backward()
                                if clip_grad:
                                    torch.nn.utils.clip_grad_norm_(
                                        self.model.parameters(), 1.0
                                    )
                                optimizer.step()

                        running_loss += loss.item()
                        if phase == "validate":
                            running_labels.append(labels)
                            running_outputs.append(probs)

                    if phase == "train":
                        scheduler.step()

                    epoch_loss = running_loss / self.dataset_sizes[phase]
                    if phase == "validate":
                        pred_class = np.argmax(
                            torch.cat(running_outputs).cpu().numpy(), axis=1
                        )
                        gt_class = torch.cat(running_labels).cpu().numpy()

                        metric_dict = calculate_metrics(
                            pred_class, gt_class, neg_label=0
                        )

                        summary_writer.add_scalar("Loss/validate", epoch_loss, epoch)
                        for metric_name, metric_value in metric_dict.items():
                            summary_writer.add_scalar(
                                f"Metrics/{metric_name}", metric_value, epoch
                            )

                        epoch_acc = metric_dict["accuracy"]
                        epoch_f1 = metric_dict["f1_macro"]
                        epoch_WA = metric_dict["WA"]

                        print(f"{phase} Loss: {epoch_loss:.4f}")
                        print(f"{phase} Acc: {epoch_acc:.4f}")
                        print(f"{phase} F1 macro: {epoch_f1:.4f}")
                        print(f"{phase} WA: {epoch_WA:.4f}")

                        if epoch_f1 > best_f1:
                            best_f1 = epoch_f1
                            # best_WA = epoch_WA
                            best_acc = epoch_acc
                            best_f1 = epoch_f1

                            best_epoch = epoch
                            best_model_wts = copy.deepcopy(self.model.state_dict())

                            if (
                                self.dump_best_checkpoints
                                and epoch > self.best_checkpoints_warmup
                            ):
                                torch.save(
                                    best_model_wts,
                                    self.best_checkpoints_path
                                    / f"best_checkpoint_{epoch}",
                                )

                        if self.dump_last_checkpoints and abs(epoch - num_epochs) < 6:
                            torch.save(
                                copy.deepcopy(self.model.state_dict()),
                                self.last_checkpoints_path / f"checkpoint_{epoch}",
                            )

                    else:
                        print(f"{phase} Loss: {epoch_loss:.4f}")
                        summary_writer.add_scalar("Loss/train", epoch_loss, epoch)
                        summary_writer.add_scalar("LR/value", cur_step_lr, epoch)

        except KeyboardInterrupt:
            pass

        summary_writer.flush()
        time_elapsed = time.time() - since
        print(
            f"Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s."
            + f" Best model loss: {best_loss:.6f}, best model acc: {best_acc:.6f}, "
            + f"best model f1: {best_f1:.6f}, best epoch {best_epoch}"
        )

        self.model.load_state_dict(best_model_wts)
        self.model.eval()
        return best_model_wts

datasets.py

In [3]:
from pathlib import Path

import numpy as np
import torch
from torch import nn
from torch.distributions.categorical import Categorical
from torch.utils.data import Dataset, Sampler
from torchaudio import transforms as T
from torchvision import transforms

PATH_TO_TENSOR_COL = "tensor"


def load_tensor(path):
    features_tensor = np.fromfile(path, dtype=np.float32)
    return torch.from_numpy(np.reshape(features_tensor, (-1, 64)))


def pad_or_crop_to_shape(tensor, size, rand_side_pad=True):
    assert len(tensor.shape) == 3
    delta = size - tensor.shape[-1]
    if delta > 0:
        if rand_side_pad:
            start_padding = np.random.randint(delta)
            end_padding = delta - start_padding
            res = nn.functional.pad(tensor, pad=(start_padding, end_padding, 0, 0))
        else:
            res = nn.functional.pad(tensor, pad=(0, delta, 0, 0))

        return res
    else:
        return tensor[..., :size]


def adaptive_padding_collate_fn(batch):
    data = []
    target = []
    max_size = max([tens.shape[-1] for (tens, label) in batch])
    for (tens, label) in batch:
        # crop
        data.append(pad_or_crop_to_shape(tens, max_size, rand_side_pad=True))
        target.append(label)

    return torch.stack(data), torch.tensor(target)


def get_augm_func(time_mask_param=80, freq_mask_param=16, crop_augm_max_cut_size=0):
    """
    Returns function for augmentation in MelEmotionsDataset (augm_transform)
    Returned function's input should have [bs, 1, T] shape

    :param time_mask_param:
    :param freq_mask_param:
    :param crop_augm_max_cut_size: if 0 - random crops are not used
    :return:
    """

    t_masking = T.TimeMasking(time_mask_param=time_mask_param)
    f_masking = T.FrequencyMasking(freq_mask_param=freq_mask_param)

    if crop_augm_max_cut_size != 0:
        # we want random crop with random size,
        # so we should sample crop size for each augm_transform call
        def crop_f(tens):
            crop_delta = np.random.randint(crop_augm_max_cut_size)
            random_crop = transforms.RandomCrop(
                np.array(tens.shape)[1:] - np.array([0, crop_delta])
            )

            return random_crop(tens)

        augm_transform = transforms.Compose([f_masking, t_masking, crop_f])
    else:
        augm_transform = transforms.Compose([f_masking, t_masking])

    return augm_transform


class MelEmotionsDataset(Dataset):
    def __init__(
        self, df, *_, augm_transform=None, get_weights_func=None, base_path=None, **__
    ):
        super().__init__()
        df = df.copy()
        if "label" in df.columns:
            df["label"] = df["label"].apply(int)
        else:
            print('There is no column "label" in the TSV')

        if get_weights_func is None:
            df["sampling_weights"] = 1
        else:
            df["sampling_weights"] = get_weights_func(df)

        # sort by length
        if "wav_length" in df.columns:
            df = df.sort_values("wav_length").reset_index(drop=True)
        else:
            print('There is no column "wav_length" in the TSV')

        self.df = df
        self.augm_transform = augm_transform
        self.feature_col = PATH_TO_TENSOR_COL

        if base_path is not None:
            base_path = Path(base_path)
        self.base_path = base_path

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        path = self.df.iloc[idx][self.feature_col]
        if self.base_path is not None:
            path = self.base_path / path

        tens = torch.from_numpy(np.load(path))
        label = self.df.iloc[idx]["label"]

        if self.augm_transform is not None:
            tens = self.augm_transform(tens)

        return tens, label


class LengthWeightedSampler(Sampler[int]):
    def __init__(
        self,
        df,
        batch_size,
        min_length=1,
        max_length=20.5,
        length_delta=0.3,
        decimals=1,
    ):
        # df should be sorted ascending by wav_length
        # we do it in MelEmotionsDataset
        if "wav_length" not in df.columns:
            raise ValueError('There is no column "wav_length" in the TSV')

        super().__init__(df)
        self.df = df
        self.batch_size = batch_size
        self.num_samples = (len(df) // batch_size) * batch_size

        all_lengths = np.round(df["wav_length"].values, decimals)
        _max = max(all_lengths)
        _min = min(all_lengths)

        if max_length is None or max_length > _max:
            max_length = _max
        if min_length is None or min_length < _min:
            min_length = _min

        self.min_length = min_length
        self.max_length = max_length
        self.length_delta = length_delta

        self.decimals = decimals
        self.length_step = np.round(0.1 ** decimals, decimals)

        # is needed to sample batches with max length inclusive
        max_plus_delta = np.round(self.max_length + self.length_step, decimals)

        length_to_index_mapping = {}
        temp_length = 0

        for i, v in enumerate(all_lengths):
            if v > temp_length:
                if v != temp_length + self.length_step:
                    for j in np.arange(
                        temp_length + self.length_step, v, self.length_step
                    ):
                        length_to_index_mapping[np.round(j, decimals)] = i

                length_to_index_mapping[v] = i

                temp_length = v

        # fix to sample batches with max length inclusive
        length_to_index_mapping[
            np.round(np.max(all_lengths) + self.length_step, decimals)
        ] = len(df)

        self.length_to_index_mapping = length_to_index_mapping

        # starts with MIN_LENGTH
        self.lengths, self.lengths_count = np.unique(
            all_lengths[
                length_to_index_mapping[self.min_length] : length_to_index_mapping[
                    max_plus_delta
                ]
            ],
            return_counts=True,
        )

        self.key_length_sampler = Categorical(
            probs=torch.from_numpy(self.lengths_count)
        )

    def __iter__(self):
        N = 0
        res_indexes = []

        while N < self.num_samples:
            key_length = self.lengths[self.key_length_sampler.sample().item()]

            batch_min_length = np.round(
                max(self.min_length, key_length - self.length_delta), self.decimals
            )
            batch_max_length = np.round(
                min(self.max_length, key_length + self.length_delta), self.decimals
            )
            batch_max_length_plus_delta = np.round(
                batch_max_length + self.length_step, self.decimals
            )

            sub_df = self.df.iloc[
                self.length_to_index_mapping[
                    batch_min_length
                ] : self.length_to_index_mapping[batch_max_length_plus_delta]
            ][["sampling_weights"]]

            sampling_weights = torch.from_numpy(
                sub_df.sampling_weights.values.astype(float)
            )
            sub_iloc_indexes = torch.multinomial(
                sampling_weights, self.batch_size, True
            ).tolist()

            batch_indexes = sub_df.iloc[sub_iloc_indexes].index.tolist()
            res_indexes.extend(batch_indexes)

            N += self.batch_size

        return iter(res_indexes)

    def __len__(self):
        return self.num_samples

untils.py

In [4]:
import json
from pathlib import Path

import pandas as pd


def parse_name(tsv_name):
    """
    We have names like
    f"predicts_dataset_{dataset_name}_model_{model_name}.tsv" /
    f"metrics_dataset_{dataset_name}_model_{model_name}.csv"

    Returns: dataset_name, model_name
    """

    if tsv_name.startswith("predicts_dataset_"):
        # len('predicts_dataset_') = 17
        _s = tsv_name[17:]
    elif tsv_name.startswith("metrics_dataset_"):
        _s = tsv_name[16:]
    else:
        raise ValueError(f"tsv_name is {tsv_name}")

    model_prefix_start = _s.find("_model_")
    if model_prefix_start == -1:
        raise ValueError(f"tsv_name is {tsv_name}")

    dataset_name = _s[:model_prefix_start]
    model_name = _s[model_prefix_start + len("_model_") : -4]

    return dataset_name, model_name


def raw_parse_dir(exps_path, prefix="predicts"):
    """
    Pars dir with experiments and returns dicts:
        dataset: model: path
        dataset: set of models

    Args:
        exps_path: path to dir with experiments
        prefix: 'predicts' or 'metrics' - what the function should parse
    """
    exps_path = Path(exps_path)

    # get paths to data
    glob_exp = "**/"

    if prefix == "predicts":
        glob_file = "predicts_*.tsv"
    elif prefix == "metrics":
        glob_file = "metrics_*.csv"
    else:
        raise ValueError(
            f"Get prefix = {prefix}, supports only ['predicts', 'metrics']"
        )

    data_paths = list(exps_path.glob(glob_exp + glob_file))

    data_paths = [
        p
        for p in data_paths
        if str(p.name).startswith(prefix)
        and str(p.name).find("dataset_") > -1
        and str(p.name).find("model_") > -1
    ]

    # init our structure
    # dataset: model: path_to_predict
    dataset_models_paths = {}

    # get all models for all datasets

    # dataset: set of model names
    dataset_models_dict = {}
    for curr_path in data_paths:
        dataset_name, model_name = parse_name(str(curr_path.name))
        if dataset_models_dict.get(dataset_name) is None:
            dataset_models_dict[dataset_name] = {model_name}
            dataset_models_paths[dataset_name] = {model_name: curr_path}
        else:
            dataset_models_dict[dataset_name] |= {model_name}
            dataset_models_paths[dataset_name].update({model_name: curr_path})

    return dataset_models_paths, dataset_models_dict


def collect_metrics_to_one(list_of_metrics_df):
    df = list_of_metrics_df[0]
    df.columns = ["", df.columns[-1]]
    df = df.set_index("")

    for curr_metric_df in list_of_metrics_df[1:]:
        _df = curr_metric_df
        _df.columns = ["", _df.columns[-1]]
        _df = _df.set_index("")
        df = df.join(_df)

    df = df.sort_values("f1_macro", axis=1, ascending=False)

    return df


def load_jsonl_as_df(file_name):
    data = []
    with open(file_name, "r") as file1:
        for line1 in file1:
            data.append(json.loads(line1))
    file1.close()
    df = pd.DataFrame.from_records(data)
    if "label" in df.columns:
        df.label = df.label.astype(int)

    return df

model.py

In [5]:
from typing import Union

import torch
from torch import nn
from torchvision.models.mobilenetv2 import InvertedResidual, _make_divisible


class ConvBNReLU(nn.Sequential):
    def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1, norm_layer=None):
        padding = (kernel_size - 1) // 2
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        super(ConvBNReLU, self).__init__(
            nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False),
            norm_layer(out_planes),
            nn.ReLU6(inplace=True)
        )


AUDIO_PROBAS = ('audio_neg', 'audio_sad', 'audio_neu', 'audio_pos')
AUDIO_COLS = tuple(["audio_pred"] + list(AUDIO_PROBAS))

EMO2LABEL = {'angry': 0,
             'sad': 1,
             'neutral': 2,
             'positive': 3}


class SoftMaxModel(nn.Module):
    def __init__(self, logits_model: nn.Module):
        super().__init__()
        self.logits_model = logits_model
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.logits_model(x)
        x = self.softmax(x)

        return x


# slightly modified fast.ai implementation
# https://medium.com/mlearning-ai/self-attention-in-convolutional-neural-networks-172d947afc00
class ConvSelfAttention(nn.Module):
    """Self attention layer for `n_channels`."""

    def __init__(self, n_channels):
        super().__init__()
        self.query, self.key, self.value = [
            self._conv(n_channels, c)
            for c in (n_channels // 8, n_channels // 8, n_channels)
        ]
        self.gamma = nn.Parameter(torch.tensor([0.0]))

    def _conv(self, n_in, n_out):
        return nn.Conv1d(n_in, n_out, kernel_size=1, bias=False)

    def forward(self, x):
        # Notation from the paper.
        size = x.size()
        x = x.view(*size[:2], -1)
        f, g, h = self.query(x), self.key(x), self.value(x)
        beta = nn.functional.softmax(torch.bmm(f.transpose(1, 2), g), dim=1)
        o = self.gamma * torch.bmm(h, beta) + x
        return o.view(*size).contiguous()


# see deep_pipe
# https://github.com/neuro-ml/deep_pipe/blob/master/dpipe/layers/shape.py#L48
class Reshape(nn.Module):
    """
    Reshape the incoming tensor to the given ``shape``.

    Parameters
    ----------
    shape: Union[int, str]
        the resulting shape. String values denote indices in the input tensor's shape.

    Examples
    --------
    >>> layer = Reshape('0', '1', 500, 500)
    >>> layer(x)
    >>> # same as
    >>> x.reshape(x.shape[0], x.shape[1], 500, 500)
    """

    def __init__(self, *shape: Union[int, str]):
        super().__init__()
        self.shape = shape

    def forward(self, x: torch.Tensor):
        shape = [x.shape[int(i)] if isinstance(i, str) else i for i in self.shape]
        return x.reshape(*shape)


# see torchvision.models.mobilenetv2.MobileNetV2
class ConvSelfAttentionMobileNet(nn.Module):
    def __init__(self, _config, n_classes, last_channel=128, in_channels=1):

        super().__init__()
        self._config = _config
        self.in_channels = in_channels
        self.n_classes = n_classes
        self.last_channel = last_channel

        block = InvertedResidual
        norm_layer = nn.BatchNorm2d
        width_mult = 1.0
        round_nearest = 8

        input_channel = 4

        features = [
            ConvBNReLU(self.in_channels, input_channel, stride=1, norm_layer=norm_layer)
        ]
        for t, c, n, s in _config:
            output_channel = _make_divisible(c * width_mult, round_nearest)
            for i in range(n):
                stride = s if i == 0 else 1
                features.append(
                    block(
                        input_channel,
                        output_channel,
                        stride,
                        expand_ratio=t,
                        norm_layer=norm_layer,
                    )
                )
                input_channel = output_channel
        # building last several layers
        features.append(
            ConvBNReLU(
                input_channel, self.last_channel, kernel_size=1, norm_layer=norm_layer
            )
        )
        # make it nn.Sequential
        self.features = nn.Sequential(*features)

        self.pooling = nn.Sequential(
            ConvSelfAttention(self.last_channel),
            nn.AdaptiveAvgPool2d((1, 1)),
            Reshape("0", self.last_channel),
        )

        self.classifier = nn.Linear(self.last_channel, self.n_classes)

    def forward(self, x):
        x = self.features(x)
        x = self.pooling(x)
        x = self.classifier(x)

        return x

train.py

In [6]:
!pip install lazycon

Collecting lazycon
  Downloading lazycon-0.6.4.tar.gz (21 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: lazycon
  Building wheel for lazycon (pyproject.toml) ... [?25l[?25hdone
  Created wheel for lazycon: filename=lazycon-0.6.4-py3-none-any.whl size=20736 sha256=103b5e1bdd939f1a854dfce0812366e6d2aca9bcfd1c6e64d05655ee52a47782
  Stored in directory: /root/.cache/pip/wheels/ef/40/f1/23fcbc40b224abb6381d572287f72b0e13e3cb2094cc43b3c2
Successfully built lazycon
Installing collected packages: lazycon
Successfully installed lazycon-0.6.4


In [None]:
from pathlib import Path
import random
import shutil

import click
import lazycon
import numpy as np
import torch


# @click.command()
# @click.option(
#     "-config",
#     "--config_path",
#     required=True,
#     type=click.Path(exists=True),
#     help="path to .config file",
# )
# @click.option(
#     "-exp_path",
#     "--exp_path",
#     required=True,
#     type=click.Path(),
#     help="path to dump experiment",
# )
def train_model(config_path, exp_path):
    exp_path = Path(exp_path)
    model_name = exp_path.name
    cfg = lazycon.load(config_path)
    base_path = cfg.base_path
    assert (
        base_path.exists()
    ), f"{base_path} doesn't exist. Correct base_path in configs/data.config"

    exp_path.mkdir(parents=True, exist_ok=True)

    # dump params
    # save compiled config
    cfg.dump(exp_path / "train.config")

    # dump jsonls
    shutil.copy(cfg.train_manifest_path, exp_path / "train.jsonl")
    shutil.copy(cfg.val_manifest_path, exp_path / "val.jsonl")

    model = cfg.model

    # load pretrained model
    if cfg.pt_model_path is not None:
        model.load_state_dict(torch.load(cfg.pt_model_path, map_location="cuda:0"))
        shutil.copy(cfg.pt_model_path, exp_path / "pt_model")

    # init learner
    learner = Learner(
        train_dataset=cfg.train_dataset,
        val_dataset=cfg.val_dataset,
        dataloaders=cfg.dataloaders,
        exp_path=exp_path,
        model_name=model_name,
        model=model,
        batch_size=cfg.batch_size,
        dump_best_checkpoints=cfg.DUMP_BEST_CHECKPOINTS,
        dump_last_checkpoints=cfg.DUMP_LAST_CHECKPOINTS,
        best_checkpoints_warmup=cfg.BEST_CHECKPOINTS_WARMUP,
    )

    # train
    best_model_wts = learner.train(
        num_epochs=cfg.epoch_count,
        lr=cfg.learning_rate,
        step_size=cfg.optimizer_step,
        gamma=cfg.optimizer_gamma,
        weight_decay=cfg.weight_decay,
        clip_grad=cfg.clip_grad,
    )

    # dump best model
    torch.save(best_model_wts, exp_path / model_name)


if __name__ == "__main__":
    # fix seeds for reproducibility
    torch.manual_seed(0)
    random.seed(0)
    np.random.seed(0)
    torch.backends.cudnn.benchmark = False
    torch.use_deterministic_algorithms(True)

    train_model()

TypeError: train_model() missing 2 required positional arguments: 'config_path' and 'exp_path'

inf.py

In [None]:
import os
from pathlib import Path

import click
import lazycon
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

# from core.metrics import get_metrics_df
# from core.model import AUDIO_COLS, SoftMaxModel
# from core.utils import collect_metrics_to_one, load_jsonl_as_df, raw_parse_dir

DEVICE = "cuda:0"


def run_single_inf(exp_path, test_manifest, with_metrics, recalculate, device):
    # parse exp_path
    # it may be exp path or path to model
    if os.path.isdir(exp_path):
        dir_path = exp_path
        _path = Path(exp_path)
        model_path = _path / _path.name
    else:
        dir_path = os.path.dirname(exp_path)
        model_path = Path(exp_path)

    dir_path = Path(dir_path)
    model_name = model_path.name

    # check the config
    config_path = dir_path / "train.config"
    assert os.path.exists(config_path), f"No train.config in {dir_path}"

    # check the model
    if not os.path.exists(model_path):
        print(f"There is no saved model {model_path}. Nothing to inference")
        return None

    # load the model
    cfg = lazycon.load(config_path)
    model = cfg.model
    try:
        model.to(device)
        model.load_state_dict(torch.load(model_path))
        model.eval()
    except Exception as exception:
        print(f"Problem with loading model {model_path}. Skipped")
        print(exception)
        return None

    # add last layer SoftMax to predict probabilities
    model = SoftMaxModel(model)

    # create predicts and metrics paths
    predicts_path = Path(dir_path) / "predicts"
    metrics_path = Path(dir_path) / "metrics"

    predicts_path.mkdir(exist_ok=True)
    metrics_path.mkdir(exist_ok=True)

    # parse --vm folder/df
    paths_to_inf = []

    if os.path.isdir(test_manifest):
        paths_to_inf = list(Path(test_manifest).glob("*.jsonl"))
    else:
        paths_to_inf.append(test_manifest)

    assert len(paths_to_inf) > 0, f"No .jsonl here: {test_manifest}"

    # iterate over datasets for inference
    for dataset_df_path in paths_to_inf:
        dataset_df_path = Path(dataset_df_path)
        dataset_df = load_jsonl_as_df(dataset_df_path)
        # dataset_df = pd.read_csv(dataset_df_path, sep='\t')
        dataset_name = str(dataset_df_path.name).split(".", maxsplit=1)[0]
        if with_metrics:
            assert (
                "label" in dataset_df.columns
            ), f"{dataset_name} hasn't 'label' column, but --with_metrics"

        # predict
        predicts_tsv_path = (
            predicts_path / f"predicts_dataset_{dataset_name}_model_{model_name}.tsv"
        )

        # if predicts exist and we don't want to recalculate it, but want to calculate metrics
        if os.path.exists(predicts_tsv_path) and not recalculate:
            if with_metrics:
                metrics_csv_path = (
                    metrics_path
                    / f"metrics_dataset_{dataset_name}_model_{model_name}.csv"
                )
                if not os.path.exists(metrics_csv_path):
                    print(
                        f"Predicts for {model_name} {dataset_name} exist. Calculating metrics"
                    )
                    pred_df = pd.read_csv(predicts_tsv_path, sep="\t")

                    pred_class = pred_df[AUDIO_COLS[0]].values
                    gt_class = pred_df["label"].values

                    metrics_df = get_metrics_df(
                        pred_class=pred_class, gt_class=gt_class, model_name=model_name
                    )

                    metrics_df.to_csv(metrics_csv_path)
                else:
                    print(
                        f"Predicts and metrics for {model_name} {dataset_name} exist. Skipped"
                    )
            else:
                print(
                    f"Predicts for {model_name} {dataset_name} are existed"
                    + "--no_metrics, so metrics calculation is skipped"
                )
            continue

        # calculate predicts
        running_outputs = []
        ds = cfg.get_val_dataset(_df=dataset_df, ds_base_path=dataset_df_path.parent)
        dataloader = cfg.get_val_dataloader(val_ds=ds)

        print(f"Calculating predicts and metrics: {model_name} {dataset_name}")
        for inputs, _ in tqdm(dataloader):
            inputs = inputs.to(device)
            with torch.no_grad():
                probs = model(inputs)

            running_outputs.append(probs)

        # MelEmotionsDataset changes order in df, so we should match predicts by id
        _df = ds.df.copy()
        pred_class = np.argmax(torch.cat(running_outputs).cpu().numpy(), axis=1)
        probas = torch.cat(running_outputs).cpu().numpy()

        _df[AUDIO_COLS[0]] = pred_class
        for i in range(4):
            _df[AUDIO_COLS[i + 1]] = probas[:, i]

        # match preds by id
        pred_df = dataset_df.copy()
        _df = _df.set_index("id").loc[pred_df.id]
        for _col in AUDIO_COLS:
            pred_df[_col] = _df[_col].values

        pred_df.to_csv(predicts_tsv_path, index=False, sep="\t")

        # calculate metrics
        if with_metrics:
            metrics_csv_path = (
                metrics_path / f"metrics_dataset_{dataset_name}_model_{model_name}.csv"
            )

            pred_class = pred_df[AUDIO_COLS[0]].values
            gt_class = pred_df["label"].values

            metrics_df = get_metrics_df(
                pred_class=pred_class, gt_class=gt_class, model_name=model_name
            )

            metrics_df.to_csv(metrics_csv_path)


@click.command()
@click.option(
    "-exps_path",
    "--exps_path",
    required=True,
    type=click.Path(exists=True),
    help="path folder with experiment folders (the experiment folder must have train.config file in)",
)
@click.option(
    "-vm",
    "--test_manifest",
    required=True,
    type=click.Path(exists=True),
    help="path to JSONL file/dir of JSONLs to inference",
)
@click.option(
    "--with_metrics/--no_metrics",
    default=True,
    help="calculate metrics for experiments",
)
@click.option(
    "--recalculate/--no_recalculate",
    default=False,
    help="recalculate existed predicts and metrics",
)
@click.option(
    "--recalculate_dataset_metrics/--no_dataset_metrics",
    default=True,
    help="recalculate existed grouped by dataset metrics",
)
@click.option(
    "-device", "--device", type=click.STRING, default=DEVICE, help="device to inference"
)
def run_inf(
    exps_path,
    test_manifest,
    with_metrics,
    recalculate,
    recalculate_dataset_metrics,
    device,
):
    # parse folder, find experiments folders
    exps_path = Path(exps_path)
    experiment_paths = [p.parent for p in exps_path.glob("**/train.config")]

    # predict and calc metrics for a single experiment
    for exp_path in experiment_paths:
        run_single_inf(
            exp_path=exp_path,
            test_manifest=test_manifest,
            with_metrics=with_metrics,
            recalculate=recalculate,
            device=device,
        )

    # aggregate metrics
    metrics_dump_dir = exps_path / "metrics"
    metrics_dump_dir.mkdir(exist_ok=True)

    if recalculate_dataset_metrics:
        print("Aggregating metrics")
        dataset_models_paths, dataset_models = raw_parse_dir(
            exps_path=exps_path, prefix="metrics"
        )
        datasets = sorted(dataset_models.keys())
        for dataset_name in datasets:
            metric_dump_dir = metrics_dump_dir / f"exps_{dataset_name}.csv"
            metric_df = collect_metrics_to_one(
                [
                    pd.read_csv(metrics_df_path)
                    for metrics_df_path in dataset_models_paths[dataset_name].values()
                ]
            ).T
            metric_df.to_csv(metric_dump_dir)
    else:
        print("--no_dataset_metrics, so metrics grouped by dataset are skipped")

    agg_metrics_paths = list(metrics_dump_dir.glob("*.csv"))
    if len(agg_metrics_paths) == 0:
        print("There is no grouped by dataset metrics")
    else:
        for agg_metrics_path in agg_metrics_paths:
            # remove exps_ and .csv in aggregated metrics df name
            dataset_name = str(agg_metrics_path.name)[5:-4]
            metric_df = pd.read_csv(agg_metrics_path).set_index("Unnamed: 0")
            metric_df.index.name = ""
            print("DATASET: ", dataset_name)
            print(metric_df)
            print("------------------------------------------------")


if __name__ == "__main__":
    run_inf()

ModuleNotFoundError: No module named 'lazycon'

base.config

In [7]:
from torch.utils.data import DataLoader
from pathlib import Path
base_path = Path('/workspace/data/paper_setups')

# train data
train_manifest_path = base_path / 'train' / 'crowd_train.jsonl'
val_manifest_path = base_path / 'tests' / 'crowd_test.jsonl'

# pretrain
pt_model_path = None

# exp hyperparams
batch_size = 64
epoch_count = 100
learning_rate = 5e-4
optimizer_step = 5
optimizer_gamma = 1
weight_decay = 1e-6
clip_grad = False

# augm and batch iter stuff
collate_fn = adaptive_padding_collate_fn
augm_func = get_augm_func(time_mask_param=40, freq_mask_param=16, crop_augm_max_cut_size=40)

MAX_LENGTH = 16
get_train_weights = None

# model
model_setting = [
    # t, c, n, s
    [1, 16, 1, 1],
    [2, 32, 2, 2],
    [2, 64, 6, 2],
    [2, 128, 6, 2],
]

model = ConvSelfAttentionMobileNet(model_setting,
                                   n_classes=4,
                                   last_channel=128)


def get_train_dataset(_df, ds_base_path):
    return MelEmotionsDataset(_df,
                              get_weights_func=get_train_weights,
                              augm_transform=augm_func,
                              base_path=ds_base_path)


def get_val_dataset(_df, ds_base_path):
    return MelEmotionsDataset(_df, base_path=ds_base_path)


def get_train_dataloader(train_ds):
    return DataLoader(train_ds, batch_size=batch_size, num_workers=1,
                      collate_fn=collate_fn,
                      sampler=LengthWeightedSampler(df=train_ds.df,
                                                    batch_size=batch_size,
                                                    min_length=0.3,
                                                    max_length=MAX_LENGTH,
                                                    length_delta=0.3,
                                                    decimals=1))


def get_val_dataloader(val_ds):
    return DataLoader(val_ds, batch_size=1, num_workers=4, shuffle=False)


train_dataset = get_train_dataset(load_jsonl_as_df(train_manifest_path),
                                  ds_base_path=train_manifest_path.parent)
val_dataset = get_val_dataset(load_jsonl_as_df(val_manifest_path),
                              ds_base_path=val_manifest_path.parent)

dataloaders = {'train': get_train_dataloader(train_ds=train_dataset),
               'validate': get_val_dataloader(val_ds=val_dataset)}

DUMP_BEST_CHECKPOINTS = True
DUMP_LAST_CHECKPOINTS = True
BEST_CHECKPOINTS_WARMUP = 5

FileNotFoundError: [Errno 2] No such file or directory: '/workspace/data/paper_setups/train/crowd_train.jsonl'

crown_largd.config

In [None]:
# data
train_manifest_path = base_path / 'crowd_large.jsonl'
val_manifest_path = base_path / 'test' / 'crowd_test.jsonl'

# pretrain
pt_model_path = None

# exp hyperparams
batch_size = 64
epoch_count = 100
learning_rate = 1e-3
optimizer_step = 5
optimizer_gamma = 1
weight_decay = 1e-6
clip_grad = False