From db3f88113c59306267f15492b4a35030849cb9ef Mon Sep 17 00:00:00 2001 From: Shuai Wang Date: Tue, 18 Jul 2023 21:27:37 +0800 Subject: [PATCH 1/9] update the plda codes --- README.md | 1 + ROADMAP.md | 22 ++ examples/__init__.py | 0 examples/sre/__init__.py | 0 examples/sre/v2/README.md | 25 +- examples/sre/v2/__init__.py | 0 examples/sre/v2/local/score_plda.sh | 88 +++++++ wespeaker/__init__.py | 0 wespeaker/bin/adapt_plda.py | 58 +++++ wespeaker/bin/eval_plda.py | 27 +-- wespeaker/bin/train_plda.py | 5 +- wespeaker/utils/plda/kaldi_utils.py | 150 ++++++++++++ wespeaker/utils/plda/plda_utils.py | 41 +++- wespeaker/utils/plda/two_cov_plda.py | 344 +++++++++++++++++++-------- wespeaker/utils/utils.py | 7 +- 15 files changed, 636 insertions(+), 132 deletions(-) create mode 100644 examples/__init__.py create mode 100644 examples/sre/__init__.py create mode 100644 examples/sre/v2/__init__.py create mode 100644 examples/sre/v2/local/score_plda.sh create mode 100644 wespeaker/__init__.py create mode 100644 wespeaker/bin/adapt_plda.py create mode 100644 wespeaker/utils/plda/kaldi_utils.py diff --git a/README.md b/README.md index 3aed4e14..360804ff 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,7 @@ pip3 install wespeakerruntime ``` ## 🔥 News +* 2023.07.18: Support the kaldi-compatible PLDA and unsupervised adaptation, see [#178](https://github.com/wenet-e2e/wespeaker/pull/178). * 2023.07.14: Support the [NIST SRE16 recipe](https://www.nist.gov/itl/iad/mig/speaker-recognition-evaluation-2016), see [#177](https://github.com/wenet-e2e/wespeaker/pull/177). * 2023.07.10: Support the [Self-Supervised Learning recipe](https://github.com/wenet-e2e/wespeaker/tree/master/examples/voxceleb/v3) on Voxceleb, including [DINO](https://openaccess.thecvf.com/content/ICCV2021/papers/Caron_Emerging_Properties_in_Self-Supervised_Vision_Transformers_ICCV_2021_paper.pdf), [MoCo](https://openaccess.thecvf.com/content_CVPR_2020/papers/He_Momentum_Contrast_for_Unsupervised_Visual_Representation_Learning_CVPR_2020_paper.pdf) and [SimCLR](http://proceedings.mlr.press/v119/chen20j/chen20j.pdf), see [#180](https://github.com/wenet-e2e/wespeaker/pull/180). diff --git a/ROADMAP.md b/ROADMAP.md index abf10581..2d37d986 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -1,4 +1,26 @@ # Wespeaker Roadmap + + +## Version 2.0 (Time: 2023.09) +This is the roadmap for wespeaker version 2.0. + + +- [ ] SSL support + - [ ] Algorithms + - [x] DINO + - [x] MOCO + - [x] SimCLR + - [ ] Iteratively psudo label prediction and supervised finetuning + - [ ] Recipes + - [x] VoxCeleb + - [ ] WenetSpeech + +- [ ] Recipes + - [ ] 3D-speaker + - [ ] NIST SRE + - [x] SRE16 + - [ ] SRE18 + ## Version 1.0 (Time: 2022.09) This is the roadmap for wespeaker version 1.0. 
diff --git a/examples/__init__.py b/examples/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/examples/sre/__init__.py b/examples/sre/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/examples/sre/v2/README.md b/examples/sre/v2/README.md
index 8f97974a..95be9227 100644
--- a/examples/sre/v2/README.md
+++ b/examples/sre/v2/README.md
@@ -4,18 +4,17 @@
 * Scoring: cosine & PLDA & PLDA Adaptation
 * Metric: EER(%)
-Without PLDA training data augmentation:
-| Model | Params | Backend | Pooled | Tagalog | Cantonese |
-|:------|:------:|:------------:|:------------:|:------------:|:------------:|
-| ResNet34-TSTP-emb256 | 6.63M | Cosine | 15.4 | 19.82 | 10.39 |
-| | | PLDA | 9.36 | 14.26 | 4.513 |
-| | | Adapt PLDA | 6.608 | 10.01 | 2.974 |
+| Model                | Params | Backend    | Pooled | Tagalog | Cantonese |
+|:---------------------|:------:|:----------:|:------:|:-------:|:---------:|
+| ResNet34-TSTP-emb256 | 6.63M  | Cosine     | 15.4   | 19.82   | 10.39     |
+|                      |        | PLDA       | 11.689 | 16.961  | 6.239     |
+|                      |        | Adapt PLDA | 5.788  | 8.974   | 2.674     |
-With PLDA training data augmentation:
-| Model | Params | Backend | Pooled | Tagalog | Cantonese |
-|:------|:------:|:------------:|:------------:|:------------:|:------------:|
-| ResNet34-TSTP-emb256 | 6.63M | Cosine | 15.4 | 19.82 | 10.39 |
-| | | PLDA | 8.944 | 13.54 | 4.462 |
-| | | Adapt PLDA | 6.543 | 9.666 | 3.254 |
+The current PLDA implementation is fully compatible with the Kaldi version. Note that
+the results without adaptation could certainly be improved with parameter tuning and an extra LDA stage as in the Kaldi
+recipe; we did not do this because we focus more on the adapted results, which are good enough under the current setup.
-* 🔥 UPDATE 2023.07.14: Support the [NIST SRE16 recipe](https://www.nist.gov/itl/iad/mig/speaker-recognition-evaluation-2016), see [#177](https://github.com/wenet-e2e/wespeaker/pull/177).
+* 🔥 UPDATE 2023.07.18: Support kaldi-compatible two-covariance PLDA and unsupervised domain adaptation.
+* 🔥 UPDATE 2023.07.14: Support
+  the [NIST SRE16 recipe](https://www.nist.gov/itl/iad/mig/speaker-recognition-evaluation-2016),
+  see [#177](https://github.com/wenet-e2e/wespeaker/pull/177).
diff --git a/examples/sre/v2/__init__.py b/examples/sre/v2/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/examples/sre/v2/local/score_plda.sh b/examples/sre/v2/local/score_plda.sh
new file mode 100644
index 00000000..da09fe61
--- /dev/null
+++ b/examples/sre/v2/local/score_plda.sh
@@ -0,0 +1,88 @@
+#!/bin/bash
+
+# Copyright (c) 2023 Shuai Wang (wsstriving@gmail.com)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+exp_dir=
+trials="trials trials_tgl trials_yue"
+data=data
+aug_plda_data=0
+
+stage=-1
+stop_stage=-1
+
+. tools/parse_options.sh
+. path.sh
+
+if [ $aug_plda_data = 0 ];then
+  sre_plda_data=sre
+else
+  sre_plda_data=sre_aug
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+  echo "train the plda model ..."
+ python wespeaker/bin/train_plda.py \ + --exp_dir ${exp_dir} \ + --scp_path ${exp_dir}/embeddings/${sre_plda_data}/xvector.scp \ + --utt2spk ${data}/${sre_plda_data}/utt2spk \ + --indim 256 \ + --iter 10 + echo "plda training finished" +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + echo "adapt the plda model ..." + python wespeaker/bin/adapt_plda.py \ + -mo ${exp_dir}/plda \ + -ma ${exp_dir}/plda_adapt \ + -ad ${exp_dir}/embeddings/sre16_major/xvector.scp \ + -ws 0.75 \ + -as 0.25 + echo "plda adapted finished" +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "apply plda scoring ..." + mkdir -p ${exp_dir}/scores + trials_dir=${data}/trials + for x in $trials; do + echo "scoring on " $x + python wespeaker/bin/eval_plda.py \ + --enroll_scp_path ${exp_dir}/embeddings/sre16_eval_enroll/xvector.scp \ + --test_scp_path ${exp_dir}/embeddings/sre16_eval_test/xvector.scp \ + --indomain_scp ${exp_dir}/embeddings/sre16_major/xvector.scp \ + --utt2spk data/sre16_eval_enroll/utt2spk \ + --trial ${trials_dir}/${x} \ + --score_path ${exp_dir}/scores/${x}.pldascore \ + --model_path ${exp_dir}/plda_adapt + done +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + echo "compute metrics (EER/minDCF) ..." + scores_dir=${exp_dir}/scores + for x in $trials; do + python wespeaker/bin/compute_metrics.py \ + --p_target 0.01 \ + --c_fa 1 \ + --c_miss 1 \ + ${scores_dir}/${x}.pldascore \ + 2>&1 | tee -a ${scores_dir}/sre16_plda_result + + echo "compute DET curve ..." + python wespeaker/bin/compute_det.py \ + ${scores_dir}/${x}.pldascore + done +fi diff --git a/wespeaker/__init__.py b/wespeaker/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/wespeaker/bin/adapt_plda.py b/wespeaker/bin/adapt_plda.py new file mode 100644 index 00000000..3683cc1e --- /dev/null +++ b/wespeaker/bin/adapt_plda.py @@ -0,0 +1,58 @@ +# Copyright (c) 2023 Brno University of Technology +# Shuai Wang (wsstriving@gmail.com) +# +# Python implementation of Kaldi unsupervised PLDA adaptation +# ( https://github.com/kaldi-asr/kaldi/blob/master/src/ivector/plda.cc#L613 ) +# by Daniel Povey. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import argparse + +from wespeaker.utils.plda.two_cov_plda import TwoCovPLDA + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--adp_scp', '-ad', + type=str, + required=True, + help='Data for unlabeled adaptation.') + parser.add_argument('--across_class_scale', '-as', + type=float, + help='Scaling factor for across class covariance.', + default=0.5) + parser.add_argument('--within_class_scale', '-ws', + type=float, + help='Scaling factor for withn class covariance.', + default=0.5) + parser.add_argument('--mdl_org', '-mo', + type=str, + required=True, + help='Original PLDA mdl.') + parser.add_argument('--mdl_adp', '-ma', + type=str, + required=True, + help='Adapted PLDA mdl.') + parser.add_argument('--mdl_format', '-mf', + type=str, + default='wespeaker', + help='Format of the model wespeaker/kaldi') + + args = parser.parse_args() + + kaldi_format = True if args.mdl_format == 'kaldi' else False + plda = TwoCovPLDA.load_model(args.mdl_org, kaldi_format) + adapt_plda = plda.adapt(args.adp_scp, args.across_class_scale, args.within_class_scale) + adapt_plda.save_model(args.mdl_adp) diff --git a/wespeaker/bin/eval_plda.py b/wespeaker/bin/eval_plda.py index 459b0364..3f076d90 100644 --- a/wespeaker/bin/eval_plda.py +++ b/wespeaker/bin/eval_plda.py @@ -13,7 +13,7 @@ # limitations under the License. import argparse -import os + from wespeaker.utils.plda.two_cov_plda import TwoCovPLDA if __name__ == '__main__': @@ -21,18 +21,17 @@ parser.add_argument('--type', type=str, default='2cov', - help='which type of plda to use') - parser.add_argument('--enroll_scp_path', type=str) - parser.add_argument('--test_scp_path', type=str) - parser.add_argument('--utt2spk', type=str) - parser.add_argument('--exp_dir', type=str) - parser.add_argument('--trial', type=str) + help='which type of plda to use, 2cov|kaldi') + parser.add_argument('--enroll_scp_path', type=str, help='enroll embeddings') + parser.add_argument('--indomain_scp_path', type=str, help='embeddings to compute meanvec') + parser.add_argument('--test_scp_path', type=str, help='test embeddings') + parser.add_argument('--utt2spk', type=str, help='utt2spk for the enroll speakers') + parser.add_argument('--model_path', type=str, help='pretrained plda path') + parser.add_argument('--score_path', type=str, help='score file to write to') + parser.add_argument('--trial', type=str, help='trial file to score upon') args = parser.parse_args() - if args.type == '2cov': - model_path = os.path.join(args.exp_dir, '2cov.plda') - score_path = os.path.join(args.exp_dir, 'scores', - os.path.basename(args.trial) + '.pldascore') - plda = TwoCovPLDA.load_model(model_path) - plda.eval_sv(args.enroll_scp_path, args.utt2spk, args.test_scp_path, - args.trial, score_path) + kaldi_format = True if args.type == 'kaldi' else False + plda = TwoCovPLDA.load_model(args.model_path, kaldi_format) + plda.eval_sv(args.enroll_scp_path, args.utt2spk, args.test_scp_path, + args.trial, args.score_path, args.indomain_scp_path) diff --git a/wespeaker/bin/train_plda.py b/wespeaker/bin/train_plda.py index c3795ab3..c924b03e 100644 --- a/wespeaker/bin/train_plda.py +++ b/wespeaker/bin/train_plda.py @@ -14,6 +14,7 @@ import argparse import os + from wespeaker.utils.plda.two_cov_plda import TwoCovPLDA if __name__ == '__main__': @@ -25,7 +26,7 @@ parser.add_argument('--type', type=str, default='2cov', - help='which type of plda to use') + help='which type of plda to use, we only support kaldi 2cov version currently') parser.add_argument('--scp_path', 
                        type=str,
                        help='the plda training embedding.scp file')
@@ -42,5 +43,5 @@
                      utt2spk_file=args.utt2spk,
                      embed_dim=args.indim)
     plda.train(args.iter)
-    model_path = os.path.join(args.exp_dir, '2cov.plda')
+    model_path = os.path.join(args.exp_dir, 'plda')
     plda.save_model(model_path)
diff --git a/wespeaker/utils/plda/kaldi_utils.py b/wespeaker/utils/plda/kaldi_utils.py
new file mode 100644
index 00000000..9410ff5f
--- /dev/null
+++ b/wespeaker/utils/plda/kaldi_utils.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python
+
+# Copyright 2019 Lukas Burget (burget@fit.vutbr.cz)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import struct
+
+import numpy as np
+from kaldi_io import open_or_fd, BadSampleSize, UnknownMatrixHeader
+from kaldi_io.kaldi_io import _read_compressed_mat, _read_mat_ascii
+
+
+def read_plda(file_or_fd):
+    """ Loads PLDA from a file in kaldi format (binary or text).
+    Input:
+        file_or_fd - file name or file handle with kaldi PLDA model.
+    Output:
+        Tuple (mu, tr, psi) defining a PLDA model using the kaldi parametrization
+        mu  - mean vector
+        tr  - transform whitening within- and diagonalizing across-class
+              covariance matrix
+        psi - diagonal of the across-class covariance in the transformed space
+    """
+    fd = open_or_fd(file_or_fd)
+    try:
+        binary = fd.read(2)
+        if binary == b'\x00B':
+            assert (fd.read(7) == b'<Plda> ')
+            plda_mean = _read_vec_binary(fd)
+            plda_trans = _read_mat_binary(fd)
+            plda_psi = _read_vec_binary(fd)
+        else:
+            assert (binary + fd.read(5) == b'<Plda> ')
+            plda_mean = np.array(fd.readline().strip(' \n[]').split(),
+                                 dtype=float)
+            assert (fd.read(2) == b' [')
+            plda_trans = _read_mat_ascii(fd)
+            plda_psi = np.array(fd.readline().strip(' \n[]').split(),
+                                dtype=float)
+            assert (fd.read(8) == b'</Plda> ')
+    finally:
+        if fd is not file_or_fd:
+            fd.close()
+    return plda_mean, plda_trans, plda_psi
+
+
+def _read_vec_binary(fd):
+    # Data type,
+    type = fd.read(3)
+    if type == b'FV ':
+        sample_size = 4  # floats
+    elif type == b'DV ':
+        sample_size = 8  # doubles
+    else:
+        raise BadSampleSize
+    assert (sample_size > 0)
+    # Dimension,
+    assert fd.read(1) == b'\4'  # int-size
+    vec_size = struct.unpack('<i', fd.read(4))[0]  # vector dim
+    # Read whole vector
+    buf = fd.read(vec_size * sample_size)
+    if sample_size == 4:
+        vec = np.frombuffer(buf, dtype='float32')
+    elif sample_size == 8:
+        vec = np.frombuffer(buf, dtype='float64')
+    else:
+        raise BadSampleSize
+    return vec
+
+
+def _read_mat_binary(fd):
+    # Data type
+    header = fd.read(3).decode()
+    # 'CM', 'CM2', 'CM3' are possible values,
+    if header.startswith('CM'):
+        return _read_compressed_mat(fd, header)
+    elif header == 'FM ':
+        sample_size = 4  # floats
+    elif header == 'DM ':
+        sample_size = 8  # doubles
+    else:
+        raise UnknownMatrixHeader("The header contained '%s'" % header)
+    assert (sample_size > 0)
+    # Dimensions
+    s1, rows, s2, cols = \
+        np.frombuffer(fd.read(10), dtype='int8,int32,int8,int32', count=1)[0]
+    # Read whole matrix
+    buf = fd.read(rows * cols * sample_size)
+    if sample_size == 4:
+        vec = np.frombuffer(buf, dtype='float32')
+    elif sample_size == 8:
+        vec = np.frombuffer(buf, dtype='float64')
+    else:
+        raise BadSampleSize
+    mat = np.reshape(vec, (rows, cols))
+    return mat
+
+
+def _read_sparse_mat(fd, format):
+    """ Read a sparse matrix,
+    """
+    from scipy.sparse import csr_matrix
+    assert (format == 'SM ')
+
+    # Mapping for matrix elements,
+    def read_sparse_vector(fd):
+        _format = fd.read(3).decode()
+        assert (_format == 'SV ')
+        _, dim = np.frombuffer(fd.read(5), dtype='int8,int32', count=1)[0]
+        _, num_elems = np.frombuffer(fd.read(5), dtype='int8,int32', count=1)[0]
+        col = []
+        data = []
+        for j in range(num_elems):
+            size = np.frombuffer(fd.read(1), dtype='int8',
count=1)[0] + dtype = 'int32' if size == 4 else 'int64' + c = np.frombuffer(fd.read(size), dtype=dtype, count=1)[0] + size = np.frombuffer(fd.read(1), dtype='int8', count=1)[0] + dtype = 'float32' if size == 4 else 'float64' + d = np.frombuffer(fd.read(size), dtype=dtype, count=1)[0] + col.append(c) + data.append(d) + return col, data, dim + + _, num_rows = np.frombuffer(fd.read(5), dtype='int8,int32', count=1)[0] + + rows = [] + cols = [] + all_data = [] + max_dim = 0 + for i in range(num_rows): + col, data, dim = read_sparse_vector(fd) + rows += [i] * len(col) + cols += col + all_data += data + max_dim = max(dim, max_dim) + sparse_mat = csr_matrix((all_data, (rows, cols)), shape=(num_rows, max_dim)) + return sparse_mat diff --git a/wespeaker/utils/plda/plda_utils.py b/wespeaker/utils/plda/plda_utils.py index 81f16126..b1ad496c 100644 --- a/wespeaker/utils/plda/plda_utils.py +++ b/wespeaker/utils/plda/plda_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 Shuai Wang (wsstriving@gmail.com) +# Copyright (c) 2023 Shuai Wang (wsstriving@gmail.com) # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,9 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import math -import numpy as np import kaldiio +import numpy as np def read_vec_scp_file(scp_file): @@ -42,12 +43,19 @@ def read_label_file(label_file): return labels_dict -def norm_embeddings(embeddings): +def norm_embeddings(embeddings, kaldi_style=True): + """ + Norm embeddings to unit length + :param embeddings: input embeddings + :param kaldi_style: if true, the norm should be embedding dimension + :return: + """ + scale = math.sqrt(embeddings.shape[-1]) if kaldi_style else 1. if len(embeddings.shape) == 2: - return (embeddings.transpose() / + return (scale * embeddings.transpose() / np.linalg.norm(embeddings, axis=1)).transpose() elif len(embeddings.shape) == 1: - return embeddings / np.linalg.norm(embeddings) + return scale * embeddings / np.linalg.norm(embeddings) def get_data_for_plda(scp_file, utt2spk_file): @@ -63,3 +71,26 @@ def get_data_for_plda(scp_file, utt2spk_file): else: model_dict[label] = [vec] return np.vstack(samples), model_dict + + +def compute_normalizing_transform(covar): + """ + :param covar: + :return: + """ + c = np.linalg.cholesky(covar) + c = np.linalg.inv(c) + return c + + +def sort_svd(s, d): + """ + :param s: + :param d: + :return: + """ + idx = np.argsort(-s) + s1 = s[idx] + d1 = d.T + d1 = d1[idx].T + return s1, d1 diff --git a/wespeaker/utils/plda/two_cov_plda.py b/wespeaker/utils/plda/two_cov_plda.py index e504c83f..e9163612 100644 --- a/wespeaker/utils/plda/two_cov_plda.py +++ b/wespeaker/utils/plda/two_cov_plda.py @@ -1,5 +1,5 @@ # Copyright (c) 2022 Shuai Wang (wsstriving@gmail.com) -# +# 2023 Shuai Wang, Houjun Huang # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,129 +12,272 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import math + import h5py import numpy as np +import scipy.linalg as spl from numpy.linalg import inv from tqdm import tqdm -from wespeaker.utils.plda.plda_utils import norm_embeddings +from wespeaker.utils.plda.kaldi_utils import read_plda + +from wespeaker.utils.plda.plda_utils import compute_normalizing_transform from wespeaker.utils.plda.plda_utils import get_data_for_plda +from wespeaker.utils.plda.plda_utils import norm_embeddings from wespeaker.utils.plda.plda_utils import read_vec_scp_file +from wespeaker.utils.plda.plda_utils import sort_svd +M_LOG_2PI = 1.8378770664093454835606594728112 -class TwoCovPLDA: - def __init__(self, scp_file=None, utt2spk_file=None, embed_dim=256): - if scp_file is not None: - self.embeddings, self.embeddings_dict = \ - get_data_for_plda(scp_file, utt2spk_file) +class ClassInfo(object): + def __init__(self, weight=0, num_example=0, mu=0): + self.weight = weight + self.num_example = num_example + self.mu = mu + - self.embeddings = norm_embeddings(self.embeddings) - self.N = len(self.embeddings) - self.mu = self.embeddings.mean(0) - self.S = np.zeros([embed_dim, embed_dim]) - self.B = np.zeros([embed_dim, embed_dim]) - self.W = np.zeros([embed_dim, embed_dim]) +class PldaStats(object): + def __init__(self, dim): + self.dim = dim + self.num_example, self.num_classes = 0, 0 + self.class_weight, self.example_weight = 0, 0, + self.sum_, self.offset_scatter = np.zeros(dim), np.zeros((dim, dim)) + self.classinfo = [] + def add_samples(self, weight, spk_embeddings): + """ + Add samples of a certain speaker to the PLDA stats + :param weight: class_weight, default set to 1. + :param spk_embeddings: All embedding samples from a certain speaker + :return: + """ + n = spk_embeddings.shape[0] + mean = np.mean(spk_embeddings, axis=0) + tmp = spk_embeddings - mean + self.offset_scatter += weight * np.matmul(tmp.T, tmp) + self.classinfo.append(ClassInfo(weight, n, mean)) + self.num_example += n + self.num_classes += 1 + self.class_weight += weight + self.example_weight += weight * n + self.sum_ += weight * mean + + +class TwoCovPLDA: + def __init__(self, scp_file=None, utt2spk_file=None, embed_dim=256, + normalize_length=True): + self.normalize_length = normalize_length + self.dim = embed_dim + self.mu = np.zeros(self.dim) + # The transform which whitens the within- and + # diagonalizes the across-class covariance matrix + self.transform = np.zeros((self.dim, self.dim)) + # The diagonal of the across-class covariance in the transformed space + self.psi = np.zeros(self.dim) + self.offset = np.zeros(self.dim) + self.stats = PldaStats(self.dim) + self.B = np.eye(self.dim) + self.B_stats = np.zeros((self.dim, self.dim)) + self.B_count = 0 + self.W = np.eye(self.dim) + self.W_stats = np.zeros((self.dim, self.dim)) + self.W_count = 0 + if scp_file is not None: + samples, self.embeddings_dict = get_data_for_plda(scp_file, + utt2spk_file) + train_mean_vec = samples.mean(0) for key, mat in self.embeddings_dict.items(): - # iterate over the i-th speaker mat = np.vstack(mat) + mat = mat - train_mean_vec mat = norm_embeddings(mat) - self.embeddings_dict[key] = mat - mui = mat.mean(0) - Si = mat.T.dot(mat) - self.S += Si - self.W += (mat - mui).T.dot(mat - mui) - self.B += np.outer(mui, mui) + self.stats.add_samples(1.0, mat) + self.mu = self.stats.sum_ / self.stats.class_weight - self.W /= self.N - self.B /= self.N - # self.embeddings = self.embeddings - self.mu + def train(self, num_em_iters): + for i in range(num_em_iters): + print("Plda estimation %d of %d" % (i, num_em_iters)) 
+ self.em_one_iter() + self.get_output() - self.embed_dim = embed_dim + def em_one_iter(self): + self.B_stats, self.B_count = np.zeros( + (self.stats.dim, self.stats.dim)), 0 + self.W_stats, self.W_count = np.zeros( + (self.stats.dim, self.stats.dim)), 0 + self.W_stats += self.stats.offset_scatter + self.W_count += self.stats.example_weight - self.stats.class_weight + B_inv = inv(self.B) + W_inv = inv(self.W) + for i in range(self.stats.num_classes): + info = self.stats.classinfo[i] + m = info.mu - self.stats.sum_ / self.stats.class_weight + weight = info.weight + n = info.num_example + mix_var = inv(B_inv + n * W_inv) + w = np.matmul(mix_var, n * np.matmul(W_inv, m)) + m_w = m - w + self.B_stats += weight * (mix_var + np.outer(w, w)) + self.B_count += weight + self.W_stats += weight * n * (mix_var + np.outer(m_w, m_w)) + self.W_count += weight - def train(self, num_iters): - """ - Implementation following paper - Unifying Probabilistic Linear Discriminant Analysis - Variants in Biometric Authentication - """ - embed_dim = self.embed_dim + self.W = self.W_stats / self.W_count + self.B = self.B_stats / self.B_count + self.W = 0.5 * (self.W + self.W.T) + self.B = 0.5 * (self.B + self.B.T) - T = np.zeros([embed_dim, embed_dim]) - R = np.zeros([embed_dim, embed_dim]) - Y = np.zeros(embed_dim) + print("W_count:", self.W_count, "Trace of W:", np.trace(self.W)) + print("B_count:", self.B_count, "Trace of B:", np.trace(self.B)) - for iteration in range(1, num_iters + 1): - print("iteration: ", iteration) - for key, mat in self.embeddings_dict.items(): - embeddings = mat.T - # E-step - ni = len(mat) - Li = inv(self.B + ni * self.W) - Fi = embeddings.sum(1) - gamma = self.B.dot(self.mu) + self.W.dot(Fi) - Ey = Li.dot(gamma) - Eyy = np.outer(Ey, Ey) + Li - - # M-step - T = T + np.outer(Ey, Fi) - R = R + ni * Eyy - Y = Y + ni * Ey - - # Update the parameters - self.mu = Y / self.N - self.B = (R - (np.outer(self.mu, Y) + np.outer(Y, self.mu))) / \ - self.N + np.outer(self.mu, self.mu) - self.B = inv(self.B) - self.W = inv((self.S - (T + T.T) + R) / self.N) - - def eval_sv(self, enroll_scp, enroll_utt2spk, test_scp, trials, - score_file): + def get_output(self): + self.mu = self.stats.sum_ / self.stats.class_weight + transform1 = compute_normalizing_transform(self.W) + B_proj = np.matmul(transform1, self.B) + B_proj = np.matmul(B_proj, transform1.T) + s, U = np.linalg.eigh(B_proj) + s = np.where(s > 0.0, s, 0.0) + s, U = sort_svd(s, U) + + self.transform = np.matmul(U.T, transform1) + self.psi = s + self.offset = np.zeros(self.dim) + self.offset = -1.0 * np.matmul(self.transform, self.mu) + + def transform_embedding(self, embedding): + transformed_embedding = np.matmul(self.transform, embedding) + transformed_embedding += self.offset + normalization_factor = math.sqrt(self.dim) / np.linalg.norm( + transformed_embedding) + if self.normalize_length: + transformed_embedding = normalization_factor * transformed_embedding + return transformed_embedding + + def log_likelihood_ratio(self, transformed_train_embedding, + transformed_test_embedding): + mean = self.psi / (self.psi + 1.0) * transformed_train_embedding + variance = 1.0 + self.psi / (self.psi + 1.0) + logdet = np.sum(np.log(variance)) + sqdiff = transformed_test_embedding - mean + sqdiff = np.power(sqdiff, 2.0) + variance = 1.0 / variance + loglike_given_class = -0.5 * ( + logdet + M_LOG_2PI * self.dim + np.dot(sqdiff, variance)) + sqdiff = transformed_test_embedding + sqdiff = np.power(sqdiff, 2.0) + variance = self.psi + 1.0 + logdet = 
np.sum(np.log(variance)) + variance = 1.0 / variance + loglike_without_class = -0.5 * ( + logdet + M_LOG_2PI * self.dim + np.dot(sqdiff, variance)) + loglike_ratio = loglike_given_class - loglike_without_class + return loglike_ratio + + def eval_sv(self, enroll_scp, enroll_utt2spk, test_scp, trials, score_file, + indomain_scp=None): """ - Implementations follows - Analysis of I-vector Length Normalization in Speaker Recognition Systems - This function is designed for SV task + Caculate the plda score + :param enroll_scp: + :param enroll_utt2spk: + :param test_scp: + :param trials: + :param score_file: + :param indomain_scp: + :return: """ _, enroll_embeddings_dict = get_data_for_plda(enroll_scp, enroll_utt2spk) test_embeddings_dict = read_vec_scp_file(test_scp) - Stot = inv(self.W) + inv(self.B) - Sac = inv(self.B) - invStot = inv(Stot) - tmp = inv(Stot - Sac.dot(invStot.dot(Sac))) - Q = invStot - tmp # 256 * 256 - P = invStot.dot(Sac).dot(tmp) # 256 * 256 + if indomain_scp is not None: + indomain_embeddings_dict = read_vec_scp_file(indomain_scp) + mean_vec = np.vstack( + list(indomain_embeddings_dict.values()) + ).mean(0) + else: + mean_vec = np.zeros(self.dim) enrollspks = {} testspks = {} for key, value in enroll_embeddings_dict.items(): + value = np.vstack(value) + value = value - mean_vec # Shuai tmp = norm_embeddings(np.mean(value, 0)) - tmp = tmp - self.mu - tmp.reshape(len(tmp), 1) # 256 * 1 - enrollspks[key] = tmp.T.dot(Q).dot(tmp) - enroll_embeddings_dict[key] = P.dot(tmp) + tmp = self.transform_embedding(tmp) + enrollspks[key] = tmp for key, value in test_embeddings_dict.items(): - tmp = norm_embeddings(value) - self.mu - tmp.reshape(len(tmp), 1) # 256 * 1 - testspks[key] = tmp.T.dot(Q).dot(tmp) - test_embeddings_dict[key] = tmp + value = value - mean_vec # Shuai + tmp = norm_embeddings(value) + tmp = self.transform_embedding(tmp) + testspks[key] = tmp with open(score_file, 'w') as write_score: with open(trials, 'r') as read_trials: for line in tqdm(read_trials): tokens = line.strip().split() - score = testspks[tokens[1]] + enrollspks[tokens[0]] + \ - 2.0 * test_embeddings_dict[tokens[1]].T.dot( - enroll_embeddings_dict[tokens[0]]) + score = self.log_likelihood_ratio( + enrollspks[tokens[0]], + testspks[tokens[1]]) segs = line.strip().split() - output_line = ('{} {} {:.5f} {}\n'.format( - segs[0], segs[1], score, segs[2])) + output_line = ( + '{} {} {:.5f} {}\n'.format(segs[0], segs[1], score, + segs[2])) write_score.write(output_line) + def adapt(self, adapt_scp, ac_scale=0.5, wc_scale=0.5): + # Implemented by the BUT speech group + # plda = load_model(model_path, from_kaldi=from_kaldi) + adp_data = np.array(list(read_vec_scp_file(adapt_scp).values())) + mean_vec = adp_data.mean(0) + adp_data = adp_data - mean_vec + adp_data = norm_embeddings(adp_data) + + plda_mean, plda_trans, plda_psi = self.mu, self.transform, self.psi + W = inv(plda_trans.T.dot(plda_trans)) + W = (W + W.T) / 2 + B = np.linalg.inv((plda_trans.T / plda_psi).dot(plda_trans)) + B = (B + B.T) / 2 + T = B + W + # adp_data = np.vstack(self.xvect) + # Covariance of the adaptation data. 
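+        # The adaptation follows Kaldi's unsupervised PLDA adaptation: the
+        # covariance of the centered, length-normalized adaptation data is
+        # compared with the model's total covariance T = B + W through a
+        # generalized eigen-decomposition; along directions where the new data
+        # shows excess variance (eigenvalues > 1), the excess is added to the
+        # across-class (B) and within-class (W) covariances, scaled by
+        # ac_scale and wc_scale respectively.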
+ data_cov = np.cov(adp_data.T) + [v, e] = spl.eigh(data_cov, (T + T.T) / 2) + iet = np.linalg.inv(e.T) + excess = iet[:, v > 1].dot(np.diag(np.sqrt(v[v > 1] - 1))) + V_adp = excess * np.sqrt(ac_scale) + B_adp = B + V_adp.dot(V_adp.T) + U_adp = excess * np.sqrt(wc_scale) + W_adp = W + U_adp.dot(U_adp.T) + mu_adp = np.mean(adp_data, axis=0) + mu, A, B = mu_adp, (B_adp + B_adp.T) / 2.0, (W_adp + W_adp.T) / 2.0 + eps = 1e-9 + [D, V] = np.linalg.eigh(B) + D = np.diag(1.0 / np.sqrt(D + eps)) + # First transform + T1 = np.dot(D, V.T) + # This should equal the identity matrix + B1 = np.dot(np.dot(T1, B), T1.T) + A1 = np.dot(np.dot(T1, A), T1.T) + # Second transform is given by T2.T * (.) * T2 + [D, T2] = np.linalg.eigh(A1) + # Joint transform + T = np.dot(T2.T, T1) + # Transform the matrices + A2 = np.dot(np.dot(T, A), T.T) + B2 = np.dot(np.dot(T, B), T.T) + plda_trans, plda_psi, X = T, np.diag(A2), B2 + + adapt_plda = TwoCovPLDA() + adapt_plda.mu = mu + adapt_plda.transform = plda_trans + adapt_plda.psi = plda_psi + adapt_plda.offset = -1.0 * np.matmul(adapt_plda.transform, + adapt_plda.mu) + + return adapt_plda + def save_model(self, output_file_name): - # assert self.validate(), "Error: wrong PLDA model format" print("saving the trained plda to {}".format(output_file_name)) with h5py.File(output_file_name, "w") as f: f.create_dataset("mu", @@ -142,22 +285,33 @@ def save_model(self, output_file_name): maxshape=(None), compression="gzip", fletcher32=True) - f.create_dataset("B", - data=self.B, + f.create_dataset("transform", + data=self.transform, maxshape=(None, None), compression="gzip", fletcher32=True) - f.create_dataset("W", - data=self.W, - maxshape=(None, None), + f.create_dataset("psi", + data=self.psi, + maxshape=(None), + compression="gzip", + fletcher32=True) + f.create_dataset("offset", + data=self.offset, + maxshape=(None), compression="gzip", fletcher32=True) @staticmethod - def load_model(model_name): - with h5py.File(model_name, "r") as f: - plda = TwoCovPLDA() - plda.mu = f.get("mu")[()] - plda.B = f.get("B")[()] - plda.W = f.get("W")[()] - return plda + def load_model(model_name, from_kaldi=False): + plda = TwoCovPLDA() + if from_kaldi: + plda.mu, plda.transform, plda.psi = read_plda(model_name) + plda.offset = np.zeros(plda.mu.shape[0]) + plda.offset = -1.0 * np.matmul(plda.transform, plda.mu) + else: + with h5py.File(model_name, "r") as f: + plda.mu = f.get("mu")[()] + plda.transform = f.get("transform")[()] + plda.psi = f.get("psi")[()] + plda.offset = f.get("offset")[()] + return plda diff --git a/wespeaker/utils/utils.py b/wespeaker/utils/utils.py index a784b129..4514102d 100644 --- a/wespeaker/utils/utils.py +++ b/wespeaker/utils/utils.py @@ -12,11 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os import logging -import torch -import numpy as np +import os import random + +import numpy as np +import torch import yaml From 5862d2be1796ce2bf00b90bb743e30678a03cc7e Mon Sep 17 00:00:00 2001 From: Shuai Wang Date: Tue, 18 Jul 2023 21:34:34 +0800 Subject: [PATCH 2/9] update the plda code to run.sh --- README.md | 2 +- examples/sre/v2/run.sh | 12 +++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 360804ff..ba5bd90c 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ pip3 install wespeakerruntime ``` ## 🔥 News -* 2023.07.18: Support the kaldi-compatible PLDA and unsupervised adaptation, see [#178](https://github.com/wenet-e2e/wespeaker/pull/178). 
+* 2023.07.18: Support the kaldi-compatible PLDA and unsupervised adaptation, see [#178](https://github.com/wenet-e2e/wespeaker/pull/186). * 2023.07.14: Support the [NIST SRE16 recipe](https://www.nist.gov/itl/iad/mig/speaker-recognition-evaluation-2016), see [#177](https://github.com/wenet-e2e/wespeaker/pull/177). * 2023.07.10: Support the [Self-Supervised Learning recipe](https://github.com/wenet-e2e/wespeaker/tree/master/examples/voxceleb/v3) on Voxceleb, including [DINO](https://openaccess.thecvf.com/content/ICCV2021/papers/Caron_Emerging_Properties_in_Self-Supervised_Vision_Transformers_ICCV_2021_paper.pdf), [MoCo](https://openaccess.thecvf.com/content_CVPR_2020/papers/He_Momentum_Contrast_for_Unsupervised_Visual_Representation_Learning_CVPR_2020_paper.pdf) and [SimCLR](http://proceedings.mlr.press/v119/chen20j/chen20j.pdf), see [#180](https://github.com/wenet-e2e/wespeaker/pull/180). diff --git a/examples/sre/v2/run.sh b/examples/sre/v2/run.sh index 810b48b2..86779c43 100755 --- a/examples/sre/v2/run.sh +++ b/examples/sre/v2/run.sh @@ -106,7 +106,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - echo "Score ..." + echo "Score using Cosine Distance..." local/score.sh \ --stage 1 --stop-stage 2 \ --data ${data} \ @@ -115,6 +115,16 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then fi if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then + echo "Score with adapted PLDA ..." + local/score_plda.sh \ + --stage 1 --stop-stage 4 \ + --data ${data} \ + --exp_dir $exp_dir \ + --aug_plda_data ${aug_plda_data} \ + --trials "$trials" +fi + +if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then echo "Export the best model ..." python wespeaker/bin/export_jit.py \ --config $exp_dir/config.yaml \ From 89d4fa06da9e1de0d8454fd8630462c6244eb567 Mon Sep 17 00:00:00 2001 From: Shuai Wang Date: Tue, 18 Jul 2023 21:36:23 +0800 Subject: [PATCH 3/9] update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ba5bd90c..2ea73a1e 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ pip3 install wespeakerruntime ``` ## 🔥 News -* 2023.07.18: Support the kaldi-compatible PLDA and unsupervised adaptation, see [#178](https://github.com/wenet-e2e/wespeaker/pull/186). +* 2023.07.18: Support the kaldi-compatible PLDA and unsupervised adaptation, see [#186](https://github.com/wenet-e2e/wespeaker/pull/186). * 2023.07.14: Support the [NIST SRE16 recipe](https://www.nist.gov/itl/iad/mig/speaker-recognition-evaluation-2016), see [#177](https://github.com/wenet-e2e/wespeaker/pull/177). * 2023.07.10: Support the [Self-Supervised Learning recipe](https://github.com/wenet-e2e/wespeaker/tree/master/examples/voxceleb/v3) on Voxceleb, including [DINO](https://openaccess.thecvf.com/content/ICCV2021/papers/Caron_Emerging_Properties_in_Self-Supervised_Vision_Transformers_ICCV_2021_paper.pdf), [MoCo](https://openaccess.thecvf.com/content_CVPR_2020/papers/He_Momentum_Contrast_for_Unsupervised_Visual_Representation_Learning_CVPR_2020_paper.pdf) and [SimCLR](http://proceedings.mlr.press/v119/chen20j/chen20j.pdf), see [#180](https://github.com/wenet-e2e/wespeaker/pull/180). 
From 523157be5fbe941b6af7ced7ad400570461adfc2 Mon Sep 17 00:00:00 2001 From: Shuai Wang Date: Tue, 18 Jul 2023 23:36:22 +0800 Subject: [PATCH 4/9] reformat code --- ROADMAP.md | 41 ++++++++++++++--------------- wespeaker/bin/adapt_plda.py | 10 +++---- wespeaker/bin/average_model.py | 3 ++- wespeaker/bin/compute_metrics.py | 4 ++- wespeaker/bin/eval_plda.py | 6 +++-- wespeaker/bin/export_jit.py | 2 +- wespeaker/bin/export_onnx.py | 12 ++++++--- wespeaker/bin/extract.py | 3 ++- wespeaker/bin/extract_deprecated.py | 4 ++- wespeaker/bin/infer_onnx.py | 5 ++-- wespeaker/bin/score.py | 7 ++--- wespeaker/bin/score_norm.py | 16 ++++++----- wespeaker/bin/train.py | 22 +++++++++------- wespeaker/bin/train_deprecated.py | 22 +++++++++------- 14 files changed, 88 insertions(+), 69 deletions(-) diff --git a/ROADMAP.md b/ROADMAP.md index 2d37d986..52b2dedc 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -1,31 +1,29 @@ # Wespeaker Roadmap - ## Version 2.0 (Time: 2023.09) -This is the roadmap for wespeaker version 2.0. +This is the roadmap for wespeaker version 2.0. - [ ] SSL support - - [ ] Algorithms - - [x] DINO - - [x] MOCO - - [x] SimCLR - - [ ] Iteratively psudo label prediction and supervised finetuning - - [ ] Recipes - - [x] VoxCeleb - - [ ] WenetSpeech - + - [ ] Algorithms + - [x] DINO + - [x] MOCO + - [x] SimCLR + - [ ] Iteratively psudo label prediction and supervised finetuning + - [ ] Recipes + - [x] VoxCeleb + - [ ] WenetSpeech + - [ ] Recipes - - [ ] 3D-speaker - - [ ] NIST SRE - - [x] SRE16 - - [ ] SRE18 - + - [ ] 3D-speaker + - [ ] NIST SRE + - [x] SRE16 + - [ ] SRE18 + ## Version 1.0 (Time: 2022.09) This is the roadmap for wespeaker version 1.0. - - [x] Standard dataset support - [x] VoxCeleb - [x] CnCeleb @@ -40,13 +38,14 @@ This is the roadmap for wespeaker version 1.0. - [x] PLDA - [x] UIO for effective industrial-scale dataset processing - [x] Online data augmentation - - Noise && RIR - - Speed Perturb - - Specaug + - Noise && RIR + - Speed Perturb + - Specaug - [x] ONNX support - [x] Triton Server support (GPU) - [ ] ~~ - - Training or finetuning big models such as WavLM might be too costly for current stage + - Training or finetuning big models such as WavLM might be too costly for + current stage - [x] Basic Speaker Diarization Recipe - Embedding based (more related with our speaker embedding learner toolkit) - [x] Interactive Demo diff --git a/wespeaker/bin/adapt_plda.py b/wespeaker/bin/adapt_plda.py index 3683cc1e..3d99baa5 100644 --- a/wespeaker/bin/adapt_plda.py +++ b/wespeaker/bin/adapt_plda.py @@ -1,11 +1,10 @@ -# Copyright (c) 2023 Brno University of Technology +# Copyright (c) 2023 Brno University of Technology # Shuai Wang (wsstriving@gmail.com) # -# Python implementation of Kaldi unsupervised PLDA adaptation -# ( https://github.com/kaldi-asr/kaldi/blob/master/src/ivector/plda.cc#L613 ) +# Python implementation of Kaldi unsupervised PLDA adaptation +# ( https://github.com/kaldi-asr/kaldi/blob/master/src/ivector/plda.cc#L613 ) # by Daniel Povey. # -# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -54,5 +53,6 @@ kaldi_format = True if args.mdl_format == 'kaldi' else False plda = TwoCovPLDA.load_model(args.mdl_org, kaldi_format) - adapt_plda = plda.adapt(args.adp_scp, args.across_class_scale, args.within_class_scale) + adapt_plda = plda.adapt(args.adp_scp, args.across_class_scale, + args.within_class_scale) adapt_plda.save_model(args.mdl_adp) diff --git a/wespeaker/bin/average_model.py b/wespeaker/bin/average_model.py index 9c689ad0..cc1181d7 100644 --- a/wespeaker/bin/average_model.py +++ b/wespeaker/bin/average_model.py @@ -47,7 +47,8 @@ def get_args(): def main(): args = get_args() - path_list = glob.glob('{}/[!avg][!final][!convert]*.pt'.format(args.src_path)) + path_list = glob.glob( + '{}/[!avg][!final][!convert]*.pt'.format(args.src_path)) path_list = sorted( path_list, key=lambda p: int(re.findall(r"(?<=model_)\d*(?=.pt)", p)[0])) diff --git a/wespeaker/bin/compute_metrics.py b/wespeaker/bin/compute_metrics.py index 33847c0a..f6273792 100644 --- a/wespeaker/bin/compute_metrics.py +++ b/wespeaker/bin/compute_metrics.py @@ -14,8 +14,10 @@ # limitations under the License. import os -import numpy as np + import fire +import numpy as np + from wespeaker.utils.score_metrics import (compute_pmiss_pfa_rbst, compute_eer, compute_c_norm) diff --git a/wespeaker/bin/eval_plda.py b/wespeaker/bin/eval_plda.py index 3f076d90..4b10fe8c 100644 --- a/wespeaker/bin/eval_plda.py +++ b/wespeaker/bin/eval_plda.py @@ -23,9 +23,11 @@ default='2cov', help='which type of plda to use, 2cov|kaldi') parser.add_argument('--enroll_scp_path', type=str, help='enroll embeddings') - parser.add_argument('--indomain_scp_path', type=str, help='embeddings to compute meanvec') + parser.add_argument('--indomain_scp_path', type=str, + help='embeddings to compute meanvec') parser.add_argument('--test_scp_path', type=str, help='test embeddings') - parser.add_argument('--utt2spk', type=str, help='utt2spk for the enroll speakers') + parser.add_argument('--utt2spk', type=str, + help='utt2spk for the enroll speakers') parser.add_argument('--model_path', type=str, help='pretrained plda path') parser.add_argument('--score_path', type=str, help='score file to write to') parser.add_argument('--trial', type=str, help='trial file to score upon') diff --git a/wespeaker/bin/export_jit.py b/wespeaker/bin/export_jit.py index 71fee943..0f5c9e69 100644 --- a/wespeaker/bin/export_jit.py +++ b/wespeaker/bin/export_jit.py @@ -20,8 +20,8 @@ import torch import yaml -from wespeaker.utils.checkpoint import load_checkpoint from wespeaker.models.speaker_model import get_speaker_model +from wespeaker.utils.checkpoint import load_checkpoint def get_args(): diff --git a/wespeaker/bin/export_onnx.py b/wespeaker/bin/export_onnx.py index a6b9007d..269373cf 100644 --- a/wespeaker/bin/export_onnx.py +++ b/wespeaker/bin/export_onnx.py @@ -17,23 +17,26 @@ import argparse +import numpy as np import torch +import torch.nn as nn import yaml -from wespeaker.utils.checkpoint import load_checkpoint from wespeaker.models.speaker_model import get_speaker_model -import torch.nn as nn -import numpy as np +from wespeaker.utils.checkpoint import load_checkpoint + def get_args(): parser = argparse.ArgumentParser(description='export your script model') parser.add_argument('--config', required=True, help='config file') parser.add_argument('--checkpoint', required=True, help='checkpoint model') parser.add_argument('--output_model', required=True, help='output file') - parser.add_argument('--mean_vec', required=False, 
default=None, help='mean vector') + parser.add_argument('--mean_vec', required=False, default=None, + help='mean vector') args = parser.parse_args() return args + def main(): args = get_args() @@ -92,5 +95,6 @@ def forward(self, feats): # --fp16 # If it is an model with QDQ nodes, please add --int8 + if __name__ == '__main__': main() diff --git a/wespeaker/bin/extract.py b/wespeaker/bin/extract.py index 89d47c4c..1f07ef5d 100644 --- a/wespeaker/bin/extract.py +++ b/wespeaker/bin/extract.py @@ -13,11 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy import os + import fire import kaldiio import torch -import copy from torch.utils.data import DataLoader from tqdm import tqdm diff --git a/wespeaker/bin/extract_deprecated.py b/wespeaker/bin/extract_deprecated.py index bf2e3a8c..78fa0fd8 100644 --- a/wespeaker/bin/extract_deprecated.py +++ b/wespeaker/bin/extract_deprecated.py @@ -13,6 +13,7 @@ # limitations under the License. import os + import fire import kaldiio import torch @@ -69,7 +70,8 @@ def extract(config='conf/config.yaml', **kwargs): with torch.no_grad(): with kaldiio.WriteHelper('ark,scp:' + embed_ark + "," + embed_scp) as writer: - t_bar = tqdm(ncols=100, total=len(dataloader), desc='extract_embed: ') + t_bar = tqdm(ncols=100, total=len(dataloader), + desc='extract_embed: ') for i, (utts, feats, _) in enumerate(dataloader): t_bar.update() diff --git a/wespeaker/bin/infer_onnx.py b/wespeaker/bin/infer_onnx.py index db158076..2e94a26e 100644 --- a/wespeaker/bin/infer_onnx.py +++ b/wespeaker/bin/infer_onnx.py @@ -12,10 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -import torch -import torchaudio import argparse + import onnxruntime as ort +import torch +import torchaudio import torchaudio.compliance.kaldi as kaldi diff --git a/wespeaker/bin/score.py b/wespeaker/bin/score.py index a857ae31..13025711 100644 --- a/wespeaker/bin/score.py +++ b/wespeaker/bin/score.py @@ -13,12 +13,13 @@ # limitations under the License. import os -import kaldiio -from tqdm import tqdm -import numpy as np from pathlib import Path + import fire +import kaldiio +import numpy as np from sklearn.metrics.pairwise import cosine_similarity +from tqdm import tqdm def calculate_mean_from_kaldi_vec(scp_path): diff --git a/wespeaker/bin/score_norm.py b/wespeaker/bin/score_norm.py index d909e2bc..730bb448 100644 --- a/wespeaker/bin/score_norm.py +++ b/wespeaker/bin/score_norm.py @@ -12,19 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import logging import os -import kaldiio + import fire +import kaldiio import numpy as np from tqdm import tqdm -import logging from wespeaker.utils.file_utils import read_table def get_mean_std(emb, cohort, top_n): - emb = emb / np.sqrt(np.sum(emb**2, axis=1, keepdims=True)) - cohort = cohort / np.sqrt(np.sum(cohort**2, axis=1, keepdims=True)) + emb = emb / np.sqrt(np.sum(emb ** 2, axis=1, keepdims=True)) + cohort = cohort / np.sqrt(np.sum(cohort ** 2, axis=1, keepdims=True)) emb_cohort_score = np.matmul(emb, cohort.T) emb_cohort_score = np.sort(emb_cohort_score, axis=1)[:, ::-1] emb_cohort_score_topn = emb_cohort_score[:, :top_n] @@ -66,7 +67,7 @@ def main(score_norm_method, else: assert os.path.exists( mean_vec_path), "mean_vec file ({}) does not exist !!!".format( - mean_vec_path) + mean_vec_path) mean_vec = np.load(mean_vec_path) # get embedding @@ -103,8 +104,9 @@ def main(score_norm_method, score = float(line[2]) normed_score = 0.5 * ( - (score - enroll_mean[enroll_idx]) / enroll_std[enroll_idx] - + (score - test_mean[test_idx]) / test_std[test_idx]) + (score - enroll_mean[enroll_idx]) / enroll_std[ + enroll_idx] + + (score - test_mean[test_idx]) / test_std[test_idx]) fout.write('{} {} {:.5f} {}\n'.format(line[0], line[1], normed_score, line[3])) logging.info("Over!") diff --git a/wespeaker/bin/train.py b/wespeaker/bin/train.py index ca618301..72ba6c1a 100644 --- a/wespeaker/bin/train.py +++ b/wespeaker/bin/train.py @@ -14,24 +14,25 @@ # limitations under the License. import os +import re from pprint import pformat + import fire -import yaml import tableprint as tp -import re - import torch import torch.distributed as dist +import yaml from torch.utils.data import DataLoader import wespeaker.utils.schedulers as schedulers -from wespeaker.models.speaker_model import get_speaker_model +from wespeaker.dataset.dataset import Dataset from wespeaker.models.projections import get_projection -from wespeaker.utils.utils import get_logger, parse_config_or_kwargs, set_seed, spk2id -from wespeaker.utils.file_utils import read_table -from wespeaker.utils.executor import run_epoch +from wespeaker.models.speaker_model import get_speaker_model from wespeaker.utils.checkpoint import load_checkpoint, save_checkpoint -from wespeaker.dataset.dataset import Dataset +from wespeaker.utils.executor import run_epoch +from wespeaker.utils.file_utils import read_table +from wespeaker.utils.utils import get_logger, parse_config_or_kwargs, set_seed, \ + spk2id def train(config='conf/config.yaml', **kwargs): @@ -117,7 +118,8 @@ def train(config='conf/config.yaml', **kwargs): configs['projection_args']['embed_dim'] = configs['model_args']['embed_dim'] configs['projection_args']['num_class'] = len(spk2id_dict) configs['projection_args']['do_lm'] = configs.get('do_lm', False) - if configs['data_type'] != 'feat' and configs['dataset_args']['speed_perturb']: + if configs['data_type'] != 'feat' and configs['dataset_args'][ + 'speed_perturb']: # diff speed is regarded as diff spk configs['projection_args']['num_class'] *= 3 if configs.get('do_lm', False): @@ -220,7 +222,7 @@ def train(config='conf/config.yaml', **kwargs): if rank == 0: if epoch % configs['save_epoch_interval'] == 0 or epoch >= configs[ - 'num_epochs'] - configs['num_avg']: + 'num_epochs'] - configs['num_avg']: save_checkpoint( model, os.path.join(model_dir, 'model_{}.pt'.format(epoch))) diff --git a/wespeaker/bin/train_deprecated.py b/wespeaker/bin/train_deprecated.py index 9e783d8c..b10b3aa4 100644 --- a/wespeaker/bin/train_deprecated.py +++ 
b/wespeaker/bin/train_deprecated.py @@ -13,25 +13,26 @@ # limitations under the License. import os +import re from pprint import pformat + import fire -import yaml import tableprint as tp -import re - import torch import torch.distributed as dist +import yaml from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler import wespeaker.utils.schedulers as schedulers -from wespeaker.models.speaker_model import get_speaker_model +from wespeaker.dataset.dataset_deprecated import FeatList_LableDict_Dataset from wespeaker.models.projections import get_projection -from wespeaker.utils.utils import get_logger, parse_config_or_kwargs, set_seed, spk2id -from wespeaker.utils.file_utils import read_scp -from wespeaker.utils.executor_deprecated import run_epoch +from wespeaker.models.speaker_model import get_speaker_model from wespeaker.utils.checkpoint import load_checkpoint, save_checkpoint -from wespeaker.dataset.dataset_deprecated import FeatList_LableDict_Dataset +from wespeaker.utils.executor_deprecated import run_epoch +from wespeaker.utils.file_utils import read_scp +from wespeaker.utils.utils import get_logger, parse_config_or_kwargs, set_seed, \ + spk2id def train(config='conf/config.yaml', **kwargs): @@ -120,7 +121,8 @@ def train(config='conf/config.yaml', **kwargs): # projection layer configs['projection_args']['embed_dim'] = configs['model_args']['embed_dim'] configs['projection_args']['num_class'] = len(spk2id_dict) - if configs['feature_args']['raw_wav'] and configs['dataset_args']['speed_perturb']: + if configs['feature_args']['raw_wav'] and configs['dataset_args'][ + 'speed_perturb']: # diff speed is regarded as diff spk configs['projection_args']['num_class'] *= 3 configs['projection_args']['do_lm'] = configs.get('do_lm', False) @@ -211,7 +213,7 @@ def train(config='conf/config.yaml', **kwargs): if rank == 0: if epoch % configs['save_epoch_interval'] == 0 or epoch >= configs[ - 'num_epochs'] - configs['num_avg']: + 'num_epochs'] - configs['num_avg']: save_checkpoint( model, os.path.join(model_dir, 'model_{}.pt'.format(epoch))) From 55e4846084c1139d35721708357a1f7f1eea1eb6 Mon Sep 17 00:00:00 2001 From: Shuai Wang Date: Tue, 18 Jul 2023 23:48:13 +0800 Subject: [PATCH 5/9] fix lint error --- wespeaker/bin/score_norm.py | 10 ++++------ wespeaker/bin/train.py | 4 ++-- wespeaker/bin/train_deprecated.py | 4 ++-- wespeaker/bin/train_plda.py | 3 ++- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/wespeaker/bin/score_norm.py b/wespeaker/bin/score_norm.py index 730bb448..cb2afd37 100644 --- a/wespeaker/bin/score_norm.py +++ b/wespeaker/bin/score_norm.py @@ -102,13 +102,11 @@ def main(score_norm_method, enroll_idx = enroll_utt2idx[line[0]] test_idx = test_utt2idx[line[1]] score = float(line[2]) - normed_score = 0.5 * ( - (score - enroll_mean[enroll_idx]) / enroll_std[ - enroll_idx] - + (score - test_mean[test_idx]) / test_std[test_idx]) - fout.write('{} {} {:.5f} {}\n'.format(line[0], line[1], - normed_score, line[3])) + (score - enroll_mean[enroll_idx]) / enroll_std[enroll_idx] + + (score - test_mean[test_idx]) / test_std[test_idx]) + fout.write('{} {} {:.5f} {}\n'.format( + line[0], line[1], normed_score, line[3])) logging.info("Over!") diff --git a/wespeaker/bin/train.py b/wespeaker/bin/train.py index 72ba6c1a..81696cf0 100644 --- a/wespeaker/bin/train.py +++ b/wespeaker/bin/train.py @@ -119,7 +119,7 @@ def train(config='conf/config.yaml', **kwargs): configs['projection_args']['num_class'] = len(spk2id_dict) 
configs['projection_args']['do_lm'] = configs.get('do_lm', False) if configs['data_type'] != 'feat' and configs['dataset_args'][ - 'speed_perturb']: + 'speed_perturb']: # diff speed is regarded as diff spk configs['projection_args']['num_class'] *= 3 if configs.get('do_lm', False): @@ -222,7 +222,7 @@ def train(config='conf/config.yaml', **kwargs): if rank == 0: if epoch % configs['save_epoch_interval'] == 0 or epoch >= configs[ - 'num_epochs'] - configs['num_avg']: + 'num_epochs'] - configs['num_avg']: save_checkpoint( model, os.path.join(model_dir, 'model_{}.pt'.format(epoch))) diff --git a/wespeaker/bin/train_deprecated.py b/wespeaker/bin/train_deprecated.py index b10b3aa4..63a9c21b 100644 --- a/wespeaker/bin/train_deprecated.py +++ b/wespeaker/bin/train_deprecated.py @@ -122,7 +122,7 @@ def train(config='conf/config.yaml', **kwargs): configs['projection_args']['embed_dim'] = configs['model_args']['embed_dim'] configs['projection_args']['num_class'] = len(spk2id_dict) if configs['feature_args']['raw_wav'] and configs['dataset_args'][ - 'speed_perturb']: + 'speed_perturb']: # diff speed is regarded as diff spk configs['projection_args']['num_class'] *= 3 configs['projection_args']['do_lm'] = configs.get('do_lm', False) @@ -213,7 +213,7 @@ def train(config='conf/config.yaml', **kwargs): if rank == 0: if epoch % configs['save_epoch_interval'] == 0 or epoch >= configs[ - 'num_epochs'] - configs['num_avg']: + 'num_epochs'] - configs['num_avg']: save_checkpoint( model, os.path.join(model_dir, 'model_{}.pt'.format(epoch))) diff --git a/wespeaker/bin/train_plda.py b/wespeaker/bin/train_plda.py index c924b03e..7f464d1d 100644 --- a/wespeaker/bin/train_plda.py +++ b/wespeaker/bin/train_plda.py @@ -26,7 +26,8 @@ parser.add_argument('--type', type=str, default='2cov', - help='which type of plda to use, we only support kaldi 2cov version currently') + help='which type of plda to use, we only support ' + 'kaldi 2cov version currently') parser.add_argument('--scp_path', type=str, help='the plda training embedding.scp file') From 602cfee5489eb971b23cc14c968eabf9d00733a5 Mon Sep 17 00:00:00 2001 From: Shuai Wang Date: Wed, 19 Jul 2023 00:02:16 +0800 Subject: [PATCH 6/9] fix lint error --- examples/sre/v2/local/score_plda.sh | 1 - wespeaker/bin/train.py | 2 +- wespeaker/bin/train_plda.py | 2 +- wespeaker/utils/plda/two_cov_plda.py | 8 ++------ 4 files changed, 4 insertions(+), 9 deletions(-) diff --git a/examples/sre/v2/local/score_plda.sh b/examples/sre/v2/local/score_plda.sh index da09fe61..8b2f7802 100644 --- a/examples/sre/v2/local/score_plda.sh +++ b/examples/sre/v2/local/score_plda.sh @@ -13,7 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- exp_dir= trials="trials trials_tgl trials_yue" data=data diff --git a/wespeaker/bin/train.py b/wespeaker/bin/train.py index 81696cf0..e6d66747 100644 --- a/wespeaker/bin/train.py +++ b/wespeaker/bin/train.py @@ -222,7 +222,7 @@ def train(config='conf/config.yaml', **kwargs): if rank == 0: if epoch % configs['save_epoch_interval'] == 0 or epoch >= configs[ - 'num_epochs'] - configs['num_avg']: + 'num_epochs'] - configs['num_avg']: save_checkpoint( model, os.path.join(model_dir, 'model_{}.pt'.format(epoch))) diff --git a/wespeaker/bin/train_plda.py b/wespeaker/bin/train_plda.py index 7f464d1d..e030c94e 100644 --- a/wespeaker/bin/train_plda.py +++ b/wespeaker/bin/train_plda.py @@ -30,7 +30,7 @@ 'kaldi 2cov version currently') parser.add_argument('--scp_path', type=str, - help='the plda training embedding.scp file') + help='the plda training embedding.scp file') parser.add_argument('--utt2spk', type=str, help='utt2spk file') parser.add_argument('--indim', type=int, diff --git a/wespeaker/utils/plda/two_cov_plda.py b/wespeaker/utils/plda/two_cov_plda.py index e9163612..f2d1ba9d 100644 --- a/wespeaker/utils/plda/two_cov_plda.py +++ b/wespeaker/utils/plda/two_cov_plda.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import collections import math - import h5py import numpy as np import scipy.linalg as spl @@ -30,11 +30,7 @@ M_LOG_2PI = 1.8378770664093454835606594728112 -class ClassInfo(object): - def __init__(self, weight=0, num_example=0, mu=0): - self.weight = weight - self.num_example = num_example - self.mu = mu +ClassInfo = collections.namedtuple('ClassInfo', ['weight', 'num_example', 'mu']) class PldaStats(object): From ebfa7d05419f7d6e56993495e78346871a37b163 Mon Sep 17 00:00:00 2001 From: Shuai Wang Date: Wed, 19 Jul 2023 10:30:20 +0800 Subject: [PATCH 7/9] update vox plda recipe --- examples/__init__.py | 0 examples/sre/__init__.py | 0 examples/sre/v2/__init__.py | 0 examples/voxceleb/v2/local/score_plda.sh | 20 +++++++++++++++++--- 4 files changed, 17 insertions(+), 3 deletions(-) delete mode 100644 examples/__init__.py delete mode 100644 examples/sre/__init__.py delete mode 100644 examples/sre/v2/__init__.py diff --git a/examples/__init__.py b/examples/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/examples/sre/__init__.py b/examples/sre/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/examples/sre/v2/__init__.py b/examples/sre/v2/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/examples/voxceleb/v2/local/score_plda.sh b/examples/voxceleb/v2/local/score_plda.sh index 7354094c..3ef8309e 100755 --- a/examples/voxceleb/v2/local/score_plda.sh +++ b/examples/voxceleb/v2/local/score_plda.sh @@ -26,14 +26,12 @@ stop_stage=-1 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then echo "train the plda model ..." - mkdir -p ${exp_dir}/scores python wespeaker/bin/train_plda.py \ --exp_dir ${exp_dir} \ --scp_path ${exp_dir}/embeddings/vox2_dev/xvector.scp \ --utt2spk ${data}/vox2_dev/utt2spk \ --indim 256 \ - --iter 5 \ - --type '2cov' + --iter 5 echo "plda training finished" fi @@ -52,6 +50,22 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then done fi +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + echo "apply plda scoring ..." 
+ mkdir -p ${exp_dir}/scores + trials_dir=${data}/vox1/trials + for x in $trials; do + echo "scoring on " $x + python wespeaker/bin/eval_plda.py \ + --enroll_scp_path ${exp_dir}/embeddings/vox1/xvector.scp \ + --test_scp_path ${exp_dir}/embeddings/vox1/xvector.scp \ + --utt2spk <(cat ${data}/vox1/utt2spk | awk '{print $1, $1}') \ + --trial ${trials_dir}/${x} \ + --score_path ${exp_dir}/scores/${x}.pldascore \ + --model_path ${exp_dir}/plda + done +fi + if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then echo "compute metrics (EER/minDCF) ..." scores_dir=${exp_dir}/scores From 334e45473d9f89712ed4c927a3d70f9d42dabfbd Mon Sep 17 00:00:00 2001 From: wangshuai Date: Wed, 19 Jul 2023 11:15:44 +0800 Subject: [PATCH 8/9] update vox plda recipe --- examples/voxceleb/v2/README.md | 9 ++++----- examples/voxceleb/v2/local/score_plda.sh | 14 -------------- wespeaker/utils/plda/plda_utils.py | 9 ++++----- 3 files changed, 8 insertions(+), 24 deletions(-) diff --git a/examples/voxceleb/v2/README.md b/examples/voxceleb/v2/README.md index 49b78a01..4397af1b 100644 --- a/examples/voxceleb/v2/README.md +++ b/examples/voxceleb/v2/README.md @@ -61,9 +61,8 @@ If you are interested in the PLDA scoring (which is inferior to the simple cosin local/score_plda.sh --stage 1 --stop-stage 3 --exp_dir exp_name ``` -The results on ResNet293 (large margin, no asnorm) are: +The results on ResNet34 (large margin, no asnorm) are: -|Scoring method| vox1-O-clean | vox1-E-clean | vox1-H-clean | -| :---:|:------------:|:------------:|:------------:| -|cosine| 0.532 | 0.707 | 1.311 | -|plda | 0.744 | 0.794 | 1.374| +| Scoring method | vox1-O-clean | vox1-E-clean | vox1-H-clean | +|:--------------:|:------------:|:------------:|:------------:| +| PLDA | 1.207 | 1.350 | 2.528 | diff --git a/examples/voxceleb/v2/local/score_plda.sh b/examples/voxceleb/v2/local/score_plda.sh index 3ef8309e..8b3c14e5 100755 --- a/examples/voxceleb/v2/local/score_plda.sh +++ b/examples/voxceleb/v2/local/score_plda.sh @@ -35,20 +35,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then echo "plda training finished" fi -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - echo "apply plda scoring ..." - mkdir -p ${exp_dir}/scores - trials_dir=${data}/vox1/trials - for x in $trials; do - echo $x - python wespeaker/bin/eval_plda.py \ - --exp_dir ${exp_dir} \ - --enroll_scp_path ${exp_dir}/embeddings/vox1/xvector.scp \ - --test_scp_path ${exp_dir}/embeddings/vox1/xvector.scp \ - --utt2spk <(cat ${data}/vox1/utt2spk | awk '{print $1, $1}') \ - --trial ${trials_dir}/${x} - done -fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then echo "apply plda scoring ..." 
diff --git a/wespeaker/utils/plda/plda_utils.py b/wespeaker/utils/plda/plda_utils.py index b1ad496c..0b34bdeb 100644 --- a/wespeaker/utils/plda/plda_utils.py +++ b/wespeaker/utils/plda/plda_utils.py @@ -74,11 +74,10 @@ def get_data_for_plda(scp_file, utt2spk_file): def compute_normalizing_transform(covar): - """ - :param covar: - :return: - """ - c = np.linalg.cholesky(covar) + try: + c = np.linalg.cholesky(covar) + except np.linalg.LinAlgError: + c = np.linalg.cholesky(covar + np.eye(covar.shape[0]) * 1e-6) c = np.linalg.inv(c) return c From b6240b1091dab44e623d7abc7d45a4d568f0ff83 Mon Sep 17 00:00:00 2001 From: Shuai Wang Date: Wed, 19 Jul 2023 11:19:45 +0800 Subject: [PATCH 9/9] Delete __init__.py --- wespeaker/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 wespeaker/__init__.py diff --git a/wespeaker/__init__.py b/wespeaker/__init__.py deleted file mode 100644 index e69de29b..00000000
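For reference, a minimal usage sketch of the `TwoCovPLDA` API introduced in this series (training, unsupervised adaptation, and trial scoring); the paths are placeholders modeled on the SRE16 recipe above, not files created by these patches:

```python
from wespeaker.utils.plda.two_cov_plda import TwoCovPLDA

# Train a kaldi-compatible two-covariance PLDA on labeled embeddings
# (mirrors wespeaker/bin/train_plda.py as called in local/score_plda.sh).
plda = TwoCovPLDA(scp_file='exp/embeddings/sre/xvector.scp',   # placeholder path
                  utt2spk_file='data/sre/utt2spk',             # placeholder path
                  embed_dim=256)
plda.train(10)                 # 10 EM iterations, as in the recipe
plda.save_model('exp/plda')

# Unsupervised adaptation on unlabeled in-domain embeddings
# (the recipe passes -as 0.25 -ws 0.75 to wespeaker/bin/adapt_plda.py).
plda = TwoCovPLDA.load_model('exp/plda')   # use from_kaldi=True for a Kaldi model
plda_adapt = plda.adapt('exp/embeddings/sre16_major/xvector.scp',
                        ac_scale=0.25, wc_scale=0.75)
plda_adapt.save_model('exp/plda_adapt')

# Score a trial list; the in-domain scp supplies the mean used for centering.
plda_adapt.eval_sv('exp/embeddings/sre16_eval_enroll/xvector.scp',
                   'data/sre16_eval_enroll/utt2spk',
                   'exp/embeddings/sre16_eval_test/xvector.scp',
                   'data/trials/trials',
                   'exp/scores/trials.pldascore',
                   'exp/embeddings/sre16_major/xvector.scp')
```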