In [2]:
import os
import torch
import torchaudio
import boto3
import botocore

In [3]:
# Report the installed torch / torchaudio versions
print(f'torch version: {torch.__version__}')
print(f'torchaudio version: {torchaudio.__version__}')

torch version: 1.10.1+cu102
torchaudio version: 0.10.1+cu102


In [4]:
# name of the S3 bucket to read from (passed as `Bucket=` to get_object)
bucket_name = 'zge-exp'

# object key inside the bucket (passed as `Key=` to get_object) of the
# audio clip to load
file_path_s3 = 'data/cv-corpus-8.0-2022-01-19/fr/clips/common_voice_fr_27042964.mp3'

In [5]:
# setup s3 client
s3_client = boto3.client('s3')

In [6]:
# Load the audio file from S3 by handing the response body (a file-like
# stream) directly to torchaudio, i.e. without first saving it to local disk.
# The format is given explicitly because a stream has no file extension.
response = s3_client.get_object(Bucket=bucket_name, Key=file_path_s3)
waveform, sample_rate = torchaudio.load(response['Body'], format='mp3')

In [7]:
print(waveform)

tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  1.5378e-05,
         -3.3975e-06, -1.8001e-05]])


In [8]:
# Make the CommonVoice recipe folder the working directory, so that the
# relative paths used in the following cells resolve from there
dir_sb = '/root/speechbrain'
dir_work = os.path.join(dir_sb, 'recipes/CommonVoice')
os.chdir(dir_work)
print(f'current dir: {os.getcwd()}')

current dir: /root/speechbrain/recipes/CommonVoice


In [9]:
import sys
import torch
import logging
import speechbrain as sb
import torchaudio
from hyperpyyaml import load_hyperpyyaml
from speechbrain.tokenizers.SentencePiece import SentencePiece
from speechbrain.utils.data_utils import undo_padding
from speechbrain.utils.distributed import run_on_main

In [15]:
logger = logging.getLogger(__name__)


# Define training procedure
class ASR(sb.core.Brain):
    """Seq2seq ASR training procedure with an auxiliary CTC loss.

    The CTC branch is only used during training, and only while the current
    epoch is ``<= hparams.number_of_ctc_epochs``. Validation and test use the
    seq2seq loss and run beam search so that WER/CER can be computed.
    """

    def _ctc_active(self, stage):
        """Tell whether the CTC branch is used for ``stage`` at the current epoch."""
        return (
            stage == sb.Stage.TRAIN
            and self.hparams.epoch_counter.current
            <= self.hparams.number_of_ctc_epochs
        )

    def compute_forward(self, batch, stage):
        """Forward computations from the waveform batches to the output probabilities.

        Arguments
        ---------
        batch : batch object exposing ``sig`` and ``tokens_bos``
        stage : sb.Stage

        Returns
        -------
        tuple
            * TRAIN with CTC active: ``(p_ctc, p_seq, wav_lens)``
            * TRAIN without CTC: ``(p_seq, wav_lens)``
            * VALID / TEST: ``(p_seq, wav_lens, p_tokens)``, where
              ``p_tokens`` is the first output of ``hparams.beam_searcher``.
        """
        batch = batch.to(self.device)
        wavs, wav_lens = batch.sig
        tokens_bos, _ = batch.tokens_bos
        wavs, wav_lens = wavs.to(self.device), wav_lens.to(self.device)

        # Feature extraction and normalization
        feats = self.hparams.compute_features(wavs)
        feats = self.modules.normalize(feats, wav_lens)

        # Augmentation is optional and only applied while training
        if stage == sb.Stage.TRAIN and hasattr(self.hparams, "augmentation"):
            feats = self.hparams.augmentation(feats)

        # detach(): no gradient is propagated back through the features
        x = self.modules.enc(feats.detach())
        e_in = self.modules.emb(tokens_bos)  # y_in bos + tokens
        h, _ = self.modules.dec(e_in, x, wav_lens)
        # Output layer for seq2seq log-probabilities
        logits = self.modules.seq_lin(h)
        p_seq = self.hparams.log_softmax(logits)

        if stage != sb.Stage.TRAIN:
            # Hypotheses are needed for WER/CER; the scores are not used
            p_tokens, _ = self.hparams.beam_searcher(x, wav_lens)
            return p_seq, wav_lens, p_tokens

        if self._ctc_active(stage):
            # Output layer for ctc log-probabilities
            logits = self.modules.ctc_lin(x)
            p_ctc = self.hparams.log_softmax(logits)
            return p_ctc, p_seq, wav_lens

        return p_seq, wav_lens

    def compute_objectives(self, predictions, batch, stage):
        """Computes the loss (CTC+NLL) given predictions and targets.

        ``predictions`` must be the tuple returned by ``compute_forward`` for
        the same ``stage``. Outside of training, the WER and CER metrics are
        additionally updated with the decoded hypotheses and references.
        """
        use_ctc = self._ctc_active(stage)

        # Unpack according to the layout produced by compute_forward
        if stage != sb.Stage.TRAIN:
            p_seq, wav_lens, predicted_tokens = predictions
        elif use_ctc:
            p_ctc, p_seq, wav_lens = predictions
        else:
            p_seq, wav_lens = predictions

        ids = batch.id
        tokens_eos, tokens_eos_lens = batch.tokens_eos
        tokens, tokens_lens = batch.tokens

        loss_seq = self.hparams.seq_cost(
            p_seq, tokens_eos, length=tokens_eos_lens
        )

        # Interpolate with the ctc loss if necessary
        if use_ctc:
            loss_ctc = self.hparams.ctc_cost(
                p_ctc, tokens, wav_lens, tokens_lens
            )
            loss = self.hparams.ctc_weight * loss_ctc
            loss += (1 - self.hparams.ctc_weight) * loss_seq
        else:
            loss = loss_seq

        if stage != sb.Stage.TRAIN:
            # Decode token terms to words
            predicted_words = self.tokenizer(
                predicted_tokens, task="decode_from_list"
            )

            # Convert indices to words
            target_words = undo_padding(tokens, tokens_lens)
            target_words = self.tokenizer(target_words, task="decode_from_list")

            self.wer_metric.append(ids, predicted_words, target_words)
            self.cer_metric.append(ids, predicted_words, target_words)

        return loss

    def fit_batch(self, batch):
        """Train the parameters given a single batch in input.

        The optimizer step is skipped when ``check_gradients`` rejects the
        batch; gradients are zeroed in either case.
        """
        predictions = self.compute_forward(batch, sb.Stage.TRAIN)
        loss = self.compute_objectives(predictions, batch, sb.Stage.TRAIN)
        loss.backward()
        if self.check_gradients(loss):
            self.optimizer.step()
        self.optimizer.zero_grad()
        return loss.detach()

    def evaluate_batch(self, batch, stage):
        """Computations needed for validation/test batches.

        Both the forward pass and the loss are computed under
        ``torch.no_grad()`` so that no autograd graph is built, even when
        this method is called directly.
        """
        with torch.no_grad():
            predictions = self.compute_forward(batch, stage=stage)
            loss = self.compute_objectives(predictions, batch, stage=stage)
        return loss.detach()

    def on_stage_start(self, stage, epoch):
        """Gets called at the beginning of each stage.

        Creates fresh CER/WER metric objects for the non-training stages.
        """
        if stage != sb.Stage.TRAIN:
            self.cer_metric = self.hparams.cer_computer()
            self.wer_metric = self.hparams.error_rate_computer()

    def on_stage_end(self, stage, stage_loss, epoch):
        """Gets called at the end of each stage.

        TRAIN: remembers the training stats for later logging.
        VALID: anneals the learning rate on the validation loss, logs the
        stats and saves a checkpoint keyed on WER (lower is kept).
        TEST: logs the stats and writes the WER details to ``hparams.wer_file``.
        """
        # Compute/store important stats
        stage_stats = {"loss": stage_loss}
        if stage == sb.Stage.TRAIN:
            self.train_stats = stage_stats
        else:
            stage_stats["CER"] = self.cer_metric.summarize("error_rate")
            stage_stats["WER"] = self.wer_metric.summarize("error_rate")

        # Perform end-of-iteration things, like annealing, logging, etc.
        if stage == sb.Stage.VALID:
            old_lr, new_lr = self.hparams.lr_annealing(stage_stats["loss"])
            sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr)
            self.hparams.train_logger.log_stats(
                stats_meta={"epoch": epoch, "lr": old_lr},
                train_stats=self.train_stats,
                valid_stats=stage_stats,
            )
            self.checkpointer.save_and_keep_only(
                meta={"WER": stage_stats["WER"]}, min_keys=["WER"],
            )
        elif stage == sb.Stage.TEST:
            self.hparams.train_logger.log_stats(
                stats_meta={"Epoch loaded": self.hparams.epoch_counter.current},
                test_stats=stage_stats,
            )
            with open(self.hparams.wer_file, "w") as w:
                self.wer_metric.write_stats(w)


# Define custom data procedure
def dataio_prepare(hparams, tokenizer):
    """This function prepares the datasets to be used in the brain class.
    It also defines the data processing pipeline through user-defined functions.

    Arguments
    ---------
    hparams : dict
        Loaded hyperparameters. Keys read here: "data_folder", "train_csv",
        "valid_csv", "test_csv", "sorting", "avoid_if_longer_than",
        "dataloader_options", "sample_rate", "bos_index", "eos_index".
        NOTE: ``hparams["dataloader_options"]["shuffle"]`` is set to False
        in place when the training data is sorted.
    tokenizer : object
        Tokenizer exposing ``sp.encode_as_ids``.

    Returns
    -------
    tuple
        ``(train_data, valid_data, test_data)``.

    Raises
    ------
    NotImplementedError
        If ``hparams["sorting"]`` is not "random", "ascending" or "descending".
    """

    # 1. Define datasets
    data_folder = hparams["data_folder"]

    train_data = sb.dataio.dataset.DynamicItemDataset.from_csv(
        csv_path=hparams["train_csv"], replacements={"data_root": data_folder},
    )

    sorting = hparams["sorting"]
    if sorting in ("ascending", "descending"):
        # we sort training data to speed up training and get better results.
        train_data = train_data.filtered_sorted(
            sort_key="duration",
            reverse=(sorting == "descending"),
            key_max_value={"duration": hparams["avoid_if_longer_than"]},
        )
        # when sorting do not shuffle in dataloader ! otherwise is pointless
        hparams["dataloader_options"]["shuffle"] = False
    elif sorting != "random":
        raise NotImplementedError(
            "sorting must be random, ascending or descending"
        )

    valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv(
        csv_path=hparams["valid_csv"], replacements={"data_root": data_folder},
    )
    # We also sort the validation data so it is faster to validate
    valid_data = valid_data.filtered_sorted(sort_key="duration")

    test_data = sb.dataio.dataset.DynamicItemDataset.from_csv(
        csv_path=hparams["test_csv"], replacements={"data_root": data_folder},
    )
    # We also sort the test data so it is faster to evaluate
    test_data = test_data.filtered_sorted(sort_key="duration")

    datasets = [train_data, valid_data, test_data]

    # 2. Define audio pipeline:
    @sb.utils.data_pipeline.takes("wav")
    @sb.utils.data_pipeline.provides("sig")
    def audio_pipeline(wav):
        # return_info=True is required to get the (signal, info) pair; without
        # it read_audio returns the signal tensor only and the unpacking fails
        sig, info = sb.dataio.dataio.read_audio(wav, return_info=True)
        if info["num_channels"] > 1:
            # down-mix to mono by averaging over dim 1
            # (assumes sig is (time, channels) — TODO confirm)
            sig = torch.mean(sig, dim=1)
        resampled = torchaudio.transforms.Resample(
            info["sample_rate"], hparams["sample_rate"],
        )(sig)
        return resampled

    sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline)

    # 3. Define text pipeline:
    @sb.utils.data_pipeline.takes("wrd")
    @sb.utils.data_pipeline.provides(
        "tokens_list", "tokens_bos", "tokens_eos", "tokens"
    )
    def text_pipeline(wrd):
        tokens_list = tokenizer.sp.encode_as_ids(wrd)
        yield tokens_list
        # decoder input: <bos> + tokens
        tokens_bos = torch.LongTensor([hparams["bos_index"]] + (tokens_list))
        yield tokens_bos
        # seq2seq target: tokens + <eos>
        tokens_eos = torch.LongTensor(tokens_list + [hparams["eos_index"]])
        yield tokens_eos
        tokens = torch.LongTensor(tokens_list)
        yield tokens

    sb.dataio.dataset.add_dynamic_item(datasets, text_pipeline)

    # 4. Set output:
    sb.dataio.dataset.set_output_keys(
        datasets, ["id", "sig", "tokens_bos", "tokens_eos", "tokens"],
    )
    return train_data, valid_data, test_data


In [10]:
# Emulate the command line: hyperparameter file followed by the overrides
argvs = ['ASR/seq2seq/hparams/train_fr_exp_cv_with_ots.yml', '--batch_size=2']
print(f'arguments: {argvs}')
hparams_file, run_opts, overrides = sb.parse_arguments(argvs)

arguments: ['ASR/seq2seq/hparams/train_fr_exp_cv_with_ots.yml', '--batch_size=2']


In [11]:
with open(hparams_file) as fin:
    hparams = load_hyperpyyaml(fin, overrides)

In [12]:
# Initialise the DDP process group from the parsed run options
# (relevant when distributed_launch=True; sets up the ddp_group with the
# right communication protocol)
sb.utils.distributed.ddp_init_group(run_opts)

# NOTE: the CommonVoice dataset preparation is disabled in this notebook (its
# import is commented out below) — presumably the CSV files referenced in
# hparams already exist; verify before running on a fresh machine.
# from common_voice_prepare_fr import prepare_common_voice  # noqa

# Create the experiment directory from hparams["output_folder"], passing the
# hyperparameter file and the overrides of this run along
sb.create_experiment_directory(
    experiment_directory=hparams["output_folder"],
    hyperparams_to_save=hparams_file,
    overrides=overrides,
)

speechbrain.core - Beginning experiment!
speechbrain.core - Experiment folder: results/cv_with_ots_fr_seq2seq_ctc_attention_now2v_nosmooth


In [13]:
# Defining tokenizer and loading it
tokenizer = SentencePiece(
    model_dir=hparams["save_folder"],
    vocab_size=hparams["output_neurons"],
    annotation_train=hparams["train_csv"],
    annotation_read="wrd",
    model_type=hparams["token_type"],
    character_coverage=hparams["character_coverage"],
)

speechbrain.tokenizers.SentencePiece - Tokenizer is already trained.
speechbrain.tokenizers.SentencePiece - ==== Loading Tokenizer ===
speechbrain.tokenizers.SentencePiece - Tokenizer path: results/cv_with_ots_fr_seq2seq_ctc_attention_now2v_nosmooth/save/500_unigram.model
speechbrain.tokenizers.SentencePiece - Tokenizer vocab_size: 500
speechbrain.tokenizers.SentencePiece - Tokenizer type: unigram


In [16]:
# Create the datasets objects as well as tokenization and encoding :-D
train_data, valid_data, test_data = dataio_prepare(hparams, tokenizer)

In [17]:
sig = sb.dataio.dataio.read_audio(train_data.data[train_data.data_ids[3]]['wav'])
sig[8000:8100]

tensor([-1.0192e-05, -6.7353e-06, -4.1723e-07,  2.3842e-06,  1.6093e-06,
         1.3709e-06,  2.2650e-06,  3.5763e-06,  2.6226e-06, -2.0266e-06,
        -3.9935e-06,  3.5763e-07,  2.9206e-06,  1.7881e-07, -2.3842e-06,
        -4.4703e-06, -7.2718e-06, -8.2254e-06, -9.1195e-06, -1.0669e-05,
        -7.9274e-06, -4.1127e-06, -5.6028e-06, -6.0797e-06, -2.9802e-07,
         4.3511e-06,  3.8743e-06,  3.2187e-06,  5.3644e-06,  9.4175e-06,
         1.1623e-05,  6.1989e-06, -6.7353e-06, -1.7941e-05, -2.0146e-05,
        -1.6868e-05, -1.2994e-05, -6.3181e-06,  2.5034e-06,  5.7220e-06,
         6.5565e-06,  1.2040e-05,  1.3173e-05,  5.1856e-06,  1.1921e-06,
         1.4901e-06, -2.5630e-06, -2.3842e-07,  1.2875e-05,  1.8179e-05,
         1.4067e-05,  1.1921e-05,  5.0068e-06, -4.8876e-06,  1.1921e-07,
         1.2517e-05,  1.1504e-05,  2.7418e-06, -5.9605e-07,  8.3447e-07,
         3.5763e-06,  2.9802e-06, -4.5896e-06, -1.5318e-05, -3.1710e-05,
        -5.8770e-05, -8.3447e-05, -9.2924e-05, -9.2

In [18]:
# Build the S3 URI by hand: S3 URIs always use '/', whereas os.path.join
# depends on the local OS and drops the bucket if the key starts with '/'
file_path_bucket = 's3://{}/{}'.format(bucket_name, file_path_s3)
print('s3 bucket path: {}'.format(file_path_bucket))

s3 bucket path: s3://zge-exp/data/cv-corpus-8.0-2022-01-19/fr/clips/common_voice_fr_27042964.mp3


In [19]:
# Split an "s3://<bucket>/<key>" URI into bucket name, object key and
# file extension. The separator is always '/', independent of the local OS
# (os.sep would be '\\' on Windows and break the split).
waveforms_obj = file_path_bucket
s3_prefix = 's3://'
len_s3_prefix = len(s3_prefix)
bucket_and_key = waveforms_obj[len_s3_prefix:]
bucket_name = bucket_and_key.split('/', 1)[0]
len_bucket_name = len(bucket_name)
# skip the bucket name and the '/' that follows it
s3_path = bucket_and_key[len_bucket_name + 1:]
# ext keeps its leading dot (e.g. '.mp3'); strip it where a bare format is needed
ext = os.path.splitext(s3_path)[1]
print('bucket name: {}'.format(bucket_name))
print('S3 path: {}'.format(s3_path))
print('ext: {}'.format(ext[1:]))

bucket name: zge-exp
S3 path: data/cv-corpus-8.0-2022-01-19/fr/clips/common_voice_fr_27042964.mp3
ext: mp3


In [20]:
# Load the clip again, this time from the bucket/key/extension parsed out of
# the S3 URI, streaming the response body straight into torchaudio
s3_client = boto3.client('s3')
response = s3_client.get_object(Bucket=bucket_name, Key=s3_path)
# ext[1:] drops the leading dot so that it can be used as the format name
audio, sr = torchaudio.load(response['Body'], format=ext[1:])
print('audio shape: {}'.format(audio.shape))
# report the rate returned by THIS load (`sr`), not the stale `sample_rate`
# left over from the earlier cell
print('sample rate: {}'.format(sr))

audio shape: torch.Size([1, 193536])
sample rate: 32000


In [21]:
# Swap the two axes of `audio` ([1, 193536] per the shape printed above, so
# the result is [193536, 1]) and then drop axis 1; squeeze(1) only removes
# that axis when it has size 1, i.e. for single-channel audio
sig = audio.transpose(0, 1).squeeze(1)
# show a short excerpt of samples for comparison
print(sig[8000:8100])

tensor([ 6.8247e-05,  8.0228e-05,  4.6909e-05, -6.5565e-07,  1.7703e-05,
         3.2663e-05, -1.1206e-05, -1.6510e-05,  1.8716e-05,  1.0967e-05,
        -5.9605e-07,  1.0848e-05, -1.3113e-06, -1.3590e-05,  1.6689e-06,
         1.6093e-05,  1.7524e-05,  1.1206e-05,  2.6286e-05,  6.2048e-05,
         4.1842e-05, -1.1742e-05, -7.1526e-07,  1.7703e-05, -1.1206e-05,
        -1.0014e-05,  2.0862e-06, -3.1650e-05, -4.5896e-05, -2.7776e-05,
        -2.3901e-05, -1.3411e-05, -1.2517e-06, -6.0201e-06, -5.0664e-06,
        -1.7583e-05, -2.7895e-05,  2.1994e-05,  6.6876e-05,  4.8637e-05,
         2.9445e-05,  2.1458e-05,  1.7464e-05,  4.0650e-05,  2.9683e-05,
        -1.6451e-05, -1.1563e-05, -2.0742e-05, -7.2062e-05, -2.7359e-05,
         4.7803e-05, -5.3644e-07, -4.1544e-05,  1.8477e-05,  4.4286e-05,
         2.2531e-05,  2.2888e-05, -1.0729e-06, -3.9518e-05, -3.7968e-05,
        -4.6134e-05, -6.4611e-05, -2.5570e-05,  1.8060e-05, -8.5235e-06,
        -5.4538e-05, -7.4446e-05, -7.7188e-05, -4.0

In [22]:
# Read the same clip through SpeechBrain, passing the S3 URI directly and
# asking for the audio metadata as well
sig, info = sb.dataio.dataio.read_audio(file_path_bucket, return_info=True)
print(f'info: {info}')
print(sig[8000:8100])

info: {'sample_rate': 32000, 'num_channels': 1}
tensor([ 6.8247e-05,  8.0228e-05,  4.6909e-05, -6.5565e-07,  1.7703e-05,
         3.2663e-05, -1.1206e-05, -1.6510e-05,  1.8716e-05,  1.0967e-05,
        -5.9605e-07,  1.0848e-05, -1.3113e-06, -1.3590e-05,  1.6689e-06,
         1.6093e-05,  1.7524e-05,  1.1206e-05,  2.6286e-05,  6.2048e-05,
         4.1842e-05, -1.1742e-05, -7.1526e-07,  1.7703e-05, -1.1206e-05,
        -1.0014e-05,  2.0862e-06, -3.1650e-05, -4.5896e-05, -2.7776e-05,
        -2.3901e-05, -1.3411e-05, -1.2517e-06, -6.0201e-06, -5.0664e-06,
        -1.7583e-05, -2.7895e-05,  2.1994e-05,  6.6876e-05,  4.8637e-05,
         2.9445e-05,  2.1458e-05,  1.7464e-05,  4.0650e-05,  2.9683e-05,
        -1.6451e-05, -1.1563e-05, -2.0742e-05, -7.2062e-05, -2.7359e-05,
         4.7803e-05, -5.3644e-07, -4.1544e-05,  1.8477e-05,  4.4286e-05,
         2.2531e-05,  2.2888e-05, -1.0729e-06, -3.9518e-05, -3.7968e-05,
        -4.6134e-05, -6.4611e-05, -2.5570e-05,  1.8060e-05, -8.5235e-06,
   

In [23]:
# Inputs for reading the test CSV manually
data_folder = hparams["data_folder"]
replacements = {"data_root": data_folder}
csv_path = hparams["test_csv"]
csv_path

'../../templates/speech_recognition/filelists/cv_with_ots/old/test_tiny.csv'

In [36]:
import csv
import re

# Peek at the first record of the CSV file.
result = {}
# matches "$name" placeholders such as "$data_root"
variable_finder = re.compile(r"\$([\w.]+)")
# The context manager closes the file handle again (it used to stay open).
# NOTE: `reader` cannot be iterated any further once the block is left.
with open(csv_path, newline='') as csvfile:
    reader = csv.DictReader(csvfile, skipinitialspace=True)
    # None when the file has no data rows (instead of leaving `row` undefined)
    row = next(reader, None)

In [37]:
row

{'ID': 'common_voice_fr_19611490',
 'duration': '4.56',
 'wav': '$data_root/CommonVoice/cv-corpus-8.0-2022-01-19/fr/clips/common_voice_fr_19611490.mp3',
 'spk_id': '2faccf61dc761c9405b237051807fe80e8c8d0507da7998b11327da03217e892d3a9b130d9ffca8216a0a8a8944a95aaa6d7e0978bcfcf0413c4dbbfccdcd4e5',
 'wrd': 'LES DEUX FRÈRES ÉTABLISSENT LEUR RÉSIDENCE À OHLAU'}

In [40]:
csv_path

'../../templates/speech_recognition/filelists/cv_with_ots/old/test_tiny.csv'

In [41]:
from smart_open import smart_open

ModuleNotFoundError: No module named 'smart_open'

In [42]:
reader

<csv.DictReader at 0x7f15f2e9c160>