In [1]:
import os, json, tqdm
import pandas as pd
import sacrebleu
from sacrebleu.metrics import BLEU
from speech_to_speech_translation.text_cleaner.cleaners import english_cleaners

# Evaluation configuration: language pair and dataset split to score.
src_lang, tgt_lang = "fr", "en"
gen_subset = "valid"
# Transcript file holding the hypotheses, one per line.
# To score the enhanced_fr-en checkpoint instead, use:
# transcript_txt_path = "/opt/data/private/dsy/project/model/multimodal_S2UT/mm_s2ut/checkpoints/enhanced_fr-en/inference/tts_transcript.txt"
transcript_txt_path = "/opt/data/private/dsy/project/model/multimodal_S2UT/mm_s2ut/checkpoints/textless_fr-en/inference/tts_transcript.txt"
wav_dir = f"/opt/data/private/dsy/project/dataset/multi30k-dataset/data/speech/16khz_wav/{tgt_lang}/{gen_subset}"
ref_txt = f"/opt/data/private/dsy/project/dataset/multi30k-dataset/data/text-clean/{gen_subset}.{tgt_lang}"
tsv_path = f"/opt/data/private/dsy/project/dataset/multi30k-dataset/data/speech/format_data/fr-en/{gen_subset}.tsv"

# The manifest's "id" column is used below as a 1-based index into ref_list.
tsv = pd.read_csv(tsv_path, sep='\t')
ref_id_list = tsv["id"].tolist()

# Both files are only read, so open them read-only ("r+" would additionally
# require write permission) and decode with an explicit encoding instead of
# the locale default.
# NOTE: blank lines are dropped in both files, so the id-based lookup below
# assumes neither file contains blank records.
with open(ref_txt, mode="r", encoding="utf-8") as f:
    ref_list = [text for text in map(str.strip, f) if text]

with open(transcript_txt_path, mode="r", encoding="utf-8") as f:
    hyp_list = [text for text in map(str.strip, f) if text]

# Pair the k-th hypothesis with the reference named by the k-th manifest id.
# Indexing (rather than zip) keeps the IndexError when there are more
# hypotheses than manifest rows.
hyp_ref_list = [
    [hyp, ref_list[ref_id_list[pos] - 1]]
    for pos, hyp in enumerate(hyp_list)
]

In [2]:
def remove_end_punc(line):
    """Return ``line`` without one trailing ``" ."`` (space + period), if present.

    Only a single occurrence at the very end is removed; any other text is
    returned unchanged.
    """
    return line[:-2] if line.endswith(" .") else line

# Hypothesis and reference counts must agree before scoring.
assert len(ref_list) == len(hyp_list)

# Normalize both sides of every pair with english_cleaners, updating
# hyp_ref_list in place. remove_end_punc is not applied in this evaluation.
hyp_ref_list[:] = [
    [english_cleaners(hyp), english_cleaners(ref)]
    for hyp, ref in hyp_ref_list
]

# Corpus-level BLEU with a single reference stream.
hypotheses = [pair[0] for pair in hyp_ref_list]
references = [pair[1] for pair in hyp_ref_list]
bleu_score = sacrebleu.corpus_bleu(hypotheses, [references])
print(bleu_score)

BLEU = 24.95 58.7/35.2/21.7/13.6 (BP = 0.893 ratio = 0.898 hyp_len = 11938 ref_len = 13289)


# 原始音频转录 (Transcription of the original audio)

In [16]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch, torchaudio, tqdm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_path = "/root/autodl-tmp/liuwenrui/project/model/transformer_ckpt/wav2vec2-large-960h-lv60-self"

def generate_transcription(
    tts_wav_dir: str,
    transcript_txt: str,
):
    """Transcribe every ``.wav`` file in a directory and save the text, one line per file.

    The wav2vec 2.0 CTC model and processor are loaded from the module-level
    ``model_path`` and run on the module-level ``device``. Files are processed
    in ascending order of the integer that precedes the first ``.`` in their
    name, and the greedy (argmax) transcription of each file is written to
    ``transcript_txt`` in that order, separated by newlines.

    Args:
        tts_wav_dir: Directory to scan; entries not ending in ``.wav`` are ignored.
        transcript_txt: Path of the text file to write (overwritten if it
            exists). Its parent directory is created when missing.

    Returns:
        None. The transcript file is the only output.

    Raises:
        ValueError: If the part of a ``.wav`` file name before the first
            ``.`` is not an integer.
    """
    # Create the output directory before the slow transcription loop so a
    # missing directory is not discovered only at the final write.
    out_dir = os.path.dirname(transcript_txt)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # load model and processor
    processor = Wav2Vec2Processor.from_pretrained(model_path)
    model = Wav2Vec2ForCTC.from_pretrained(model_path)
    model.to(device)

    tts_wav_files = [file for file in os.listdir(tts_wav_dir) if file.endswith(".wav")]
    # Numeric (not lexicographic) order, so "10.wav" sorts after "2.wav".
    tts_wav_files.sort(key=lambda x: int(x.split('.')[0]))

    transcriptions = []
    # Inference only: disable autograd so no graph is kept per utterance.
    with torch.no_grad():
        for wav_file in tqdm.tqdm(tts_wav_files):
            wav_path = os.path.join(tts_wav_dir, wav_file)
            audio, sr = torchaudio.load(wav_path)
            # Keep only the first row of the loaded tensor (the first channel
            # under torchaudio's default channels-first layout).
            audio = audio[0]
            # tokenize
            input_values = processor(audio, sampling_rate=sr, return_tensors="pt", padding="longest").input_values
            input_values = input_values.to(device)
            # retrieve logits
            logits = model(input_values).logits.cpu()
            # take argmax and decode
            predicted_ids = torch.argmax(logits, dim=-1)
            transcriptions.append(processor.batch_decode(predicted_ids)[0])

    # The file is only written, so plain "w" is sufficient.
    with open(transcript_txt, mode="w", encoding="utf-8") as f:
        f.write('\n'.join(transcriptions))

In [23]:
# Dataset split to transcribe; uncomment exactly one alternative to switch.
# subset = "valid.en"
# subset = "test.2016.en"
# subset = "test.2017.en"
# subset = "test.coco.en"
subset = "train.en"
audio_dir = f"/root/autodl-tmp/liuwenrui/project/dataset/multi30k-dataset/data/speech/16khz_wav/{subset}"
output_dir = "/root/autodl-tmp/liuwenrui/project/model/multimodal_S2UT/mm_s2ut/checkpoints/wav2vec2-en/ori"
# Create the transcript directory up front: the transcript is only written
# after all files have been transcribed, so a missing directory would
# otherwise waste the whole run.
os.makedirs(output_dir, exist_ok=True)
generate_transcription(
    tts_wav_dir=audio_dir,
    transcript_txt=os.path.join(output_dir, f"{subset}.txt"),
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at /root/autodl-tmp/liuwenrui/project/model/transformer_ckpt/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 29000/29000 [11:43<00:00, 41.20it/s]


In [47]:
import os, json, tqdm
import pandas as pd
import sacrebleu
import evaluate
from sacrebleu.metrics import BLEU
from speech_to_speech_translation.text_cleaner.cleaners import english_cleaners
wer_evaluator = evaluate.load("wer")

# Dataset split to score; uncomment exactly one alternative to switch.
# subset = "train.en"
# subset = "valid.en"
# subset = "test.2016.en"
# subset = "test.2017.en"
subset = "test.coco.en"

# NOTE: output_dir is not assigned in this cell; it must already hold the
# directory the transcripts were written to.
transcript_txt_path = os.path.join(output_dir, f"{subset}.txt")
ref_txt = f"/root/autodl-tmp/liuwenrui/project/dataset/multi30k-dataset/data/text-clean/{subset}"

# Both files are only read, so open them read-only ("r+" would additionally
# require write permission) and decode with an explicit encoding instead of
# the locale default. Blank lines are dropped from both files.
with open(ref_txt, mode="r", encoding="utf-8") as f:
    ref_list = [text for text in map(str.strip, f) if text]

with open(transcript_txt_path, mode="r", encoding="utf-8") as f:
    hyp_list = [text for text in map(str.strip, f) if text]

def remove_end_punc(line):
    """Drop a single trailing ``" ."`` (space followed by a period) from ``line``.

    Lines that do not end with that exact two-character suffix are returned
    as they are.
    """
    suffix = " ."
    if not line.endswith(suffix):
        return line
    return line[: -len(suffix)]

# Hypothesis and reference counts must agree before scoring.
assert len(ref_list) == len(hyp_list)

# Normalize every sentence (english_cleaners, then strip a trailing " ."),
# updating both lists in place.
hyp_list[:] = [remove_end_punc(english_cleaners(text)) for text in hyp_list]
ref_list[:] = [remove_end_punc(english_cleaners(text)) for text in ref_list]

# Corpus-level BLEU with a single reference stream.
bleu_score = sacrebleu.corpus_bleu(list(hyp_list), [list(ref_list)])
print(bleu_score)

# Word error rate over the same normalized sentences.
wer = wer_evaluator.compute(predictions=hyp_list, references=ref_list)
print(f"word_error_rate = {wer}")

BLEU = 89.44 95.0/91.3/87.8/84.1 (BP = 1.000 ratio = 1.008 hyp_len = 4899 ref_len = 4860)
word_error_rate = 0.05576131687242798


## Original audio transcript
| subset       | WER   | ASR BLEU |
| -----------  | ----- | ----------- |
| train.en     | 6.22% | BLEU = 88.44 95.1/90.6/86.4/82.2 (BP = 1.000 ratio = 1.000 hyp_len = 349629 ref_len = 349505) |
| valid.en     | 6.21% | BLEU = 88.49 95.1/90.7/86.5/82.2 (BP = 1.000 ratio = 1.001 hyp_len = 12331 ref_len = 12323) |
| test.2016.en | 5.49% | BLEU = 89.70 95.6/91.5/87.9/84.2 (BP = 1.000 ratio = 1.001 hyp_len = 12028 ref_len = 12012) |
| test.2017.en | 5.34% | BLEU = 90.03 95.7/91.8/88.3/84.8 (BP = 1.000 ratio = 1.001 hyp_len = 10632 ref_len = 10617) |
| test.coco.en | 5.57% | BLEU = 89.44 95.0/91.3/87.8/84.1 (BP = 1.000 ratio = 1.008 hyp_len = 4899 ref_len = 4860)   |

# unit HiFiGAN重建音频 (Audio reconstructed with the unit HiFi-GAN vocoder)

In [62]:
import os, json, tqdm
import pandas as pd
import soundfile as sf
import IPython.display as ipd
from fairseq import hub_utils
from fairseq.models.text_to_speech import CodeHiFiGANVocoder
from fairseq.models.text_to_speech.hub_interface import VocoderHubInterface

# Dataset split to resynthesize; uncomment exactly one alternative to switch.
# subset = "train.en"
# subset = "valid.en"
# subset = "test.2016.en"
# subset = "test.2017.en"
subset = "test.coco.en"
# The manifest file is named after the subset with its last dot-separated
# component removed.
tsv_path = f"/root/autodl-tmp/liuwenrui/project/dataset/multi30k-dataset/data/speech/format_data/fr-en_enhanced/{'.'.join(subset.split('.')[:-1])}.tsv"
output_dir = f"/root/autodl-tmp/liuwenrui/project/model/multimodal_S2UT/mm_s2ut/checkpoints/unit_hifigan/{subset}"
os.makedirs(output_dir, exist_ok=True)

# Load the vocoder checkpoint description through fairseq's hub utilities.
cache_dir = "/root/autodl-tmp/liuwenrui/project/model/transformer_ckpt/unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj_dur"
x = hub_utils.from_pretrained(
    cache_dir,
    "model.pt",
    ".",
    archive_map=CodeHiFiGANVocoder.hub_models(),
    config_yaml="config.json",
    fp16=False,
    is_vocoder=True,
)

# Read the vocoder configuration stored next to the checkpoint.
with open(f"{x['args']['data']}/config.json") as f:
    vocoder_cfg = json.load(f)

# Exactly one checkpoint path is expected.
assert (
    len(x["args"]["model_path"]) == 1
), "Too many vocoder models in the input"
vocoder = CodeHiFiGANVocoder(x["args"]["model_path"][0], vocoder_cfg)
tts_model = VocoderHubInterface(vocoder_cfg, vocoder)

# Synthesize one wav per manifest row from its "tgt_text" units and save it
# as "<id>.wav" in output_dir.
tsv = pd.read_csv(tsv_path, sep='\t')
for i, row in tqdm.tqdm(tsv.iterrows(), total=len(tsv)):
    tts_sample = tts_model.get_model_input(row["tgt_text"])
    wav, sr = tts_model.get_prediction(tts_sample)
    wav_path = os.path.join(output_dir, f"{row['id']}.wav")
    sf.write(wav_path, wav, sr)

2023-05-02 23:31:17 | INFO | fairseq.file_utils | loading archive file /root/autodl-tmp/liuwenrui/project/model/transformer_ckpt/unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj_dur
2023-05-02 23:31:17 | INFO | fairseq.models.text_to_speech.vocoder | loaded CodeHiFiGAN checkpoint from /root/autodl-tmp/liuwenrui/project/model/transformer_ckpt/unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj_dur/model.pt


Removing weight norm...


100%|██████████| 461/461 [01:49<00:00,  4.20it/s]


In [63]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch, torchaudio, tqdm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_path = "/root/autodl-tmp/liuwenrui/project/model/transformer_ckpt/wav2vec2-large-960h-lv60-self"

def generate_transcription(
    tts_wav_dir: str,
    transcript_txt: str,
):
    """Transcribe every ``.wav`` file in a directory and save the text, one line per file.

    The wav2vec 2.0 CTC model and processor are loaded from the module-level
    ``model_path`` and run on the module-level ``device``. Files are processed
    in ascending order of the integer that precedes the first ``.`` in their
    name, and the greedy (argmax) transcription of each file is written to
    ``transcript_txt`` in that order, separated by newlines.

    Args:
        tts_wav_dir: Directory to scan; entries not ending in ``.wav`` are ignored.
        transcript_txt: Path of the text file to write (overwritten if it
            exists). Its parent directory is created when missing.

    Returns:
        None. The transcript file is the only output.

    Raises:
        ValueError: If the part of a ``.wav`` file name before the first
            ``.`` is not an integer.
    """
    # Create the output directory before the slow transcription loop so a
    # missing directory is not discovered only at the final write.
    out_dir = os.path.dirname(transcript_txt)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # load model and processor
    processor = Wav2Vec2Processor.from_pretrained(model_path)
    model = Wav2Vec2ForCTC.from_pretrained(model_path)
    model.to(device)

    tts_wav_files = [file for file in os.listdir(tts_wav_dir) if file.endswith(".wav")]
    # Numeric (not lexicographic) order, so "10.wav" sorts after "2.wav".
    tts_wav_files.sort(key=lambda x: int(x.split('.')[0]))

    transcriptions = []
    # Inference only: disable autograd so no graph is kept per utterance.
    with torch.no_grad():
        for wav_file in tqdm.tqdm(tts_wav_files):
            wav_path = os.path.join(tts_wav_dir, wav_file)
            audio, sr = torchaudio.load(wav_path)
            # Keep only the first row of the loaded tensor (the first channel
            # under torchaudio's default channels-first layout).
            audio = audio[0]
            # tokenize
            input_values = processor(audio, sampling_rate=sr, return_tensors="pt", padding="longest").input_values
            input_values = input_values.to(device)
            # retrieve logits
            logits = model(input_values).logits.cpu()
            # take argmax and decode
            predicted_ids = torch.argmax(logits, dim=-1)
            transcriptions.append(processor.batch_decode(predicted_ids)[0])

    # The file is only written, so plain "w" is sufficient.
    with open(transcript_txt, mode="w", encoding="utf-8") as f:
        f.write('\n'.join(transcriptions))

# Directory holding the vocoder-resynthesized wavs for this subset. It is
# spelled out rather than aliased from output_dir because output_dir is
# rebound on the next line: with `audio_dir = output_dir`, re-running this
# cell would transcribe the transcript directory instead of the wavs.
# Keep this path in sync with the directory the resynthesis cell writes to.
audio_dir = f"/root/autodl-tmp/liuwenrui/project/model/multimodal_S2UT/mm_s2ut/checkpoints/unit_hifigan/{subset}"
output_dir = "/root/autodl-tmp/liuwenrui/project/model/multimodal_S2UT/mm_s2ut/checkpoints/wav2vec2-en/reconstruct"
# Create the transcript directory up front so the final write cannot fail
# on a missing directory after all files have been transcribed.
os.makedirs(output_dir, exist_ok=True)
generate_transcription(
    tts_wav_dir=audio_dir,
    transcript_txt=os.path.join(output_dir, f"{subset}.txt"),
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at /root/autodl-tmp/liuwenrui/project/model/transformer_ckpt/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 461/461 [00:10<00:00, 44.60it/s]


In [64]:
import os, json, tqdm
import pandas as pd
import sacrebleu
import evaluate
from sacrebleu.metrics import BLEU
from speech_to_speech_translation.text_cleaner.cleaners import english_cleaners
wer_evaluator = evaluate.load("wer")

# NOTE: neither subset nor output_dir is assigned in this cell; the values
# left by earlier cells are used. Uncomment one line to override the subset.
# subset = "train.en"
# subset = "valid.en"
# subset = "test.2016.en"
# subset = "test.2017.en"
# subset = "test.coco.en"

transcript_txt_path = os.path.join(output_dir, f"{subset}.txt")
ref_txt = f"/root/autodl-tmp/liuwenrui/project/dataset/multi30k-dataset/data/text-clean/{subset}"

# Both files are only read, so open them read-only ("r+" would additionally
# require write permission) and decode with an explicit encoding instead of
# the locale default. Blank lines are dropped from both files.
with open(ref_txt, mode="r", encoding="utf-8") as f:
    ref_list = [text for text in map(str.strip, f) if text]

with open(transcript_txt_path, mode="r", encoding="utf-8") as f:
    hyp_list = [text for text in map(str.strip, f) if text]

def remove_end_punc(line):
    """Return ``line`` with one final ``" ."`` (space + period) cut off, if it ends with it.

    Any line not ending in that exact suffix comes back unchanged.
    """
    head, sep, tail = line.rpartition(" .")
    # sep is non-empty only when " ." occurs; an empty tail means the last
    # occurrence sits at the very end of the line.
    if sep and not tail:
        return head
    return line

# Hypothesis and reference counts must agree before scoring.
assert len(ref_list) == len(hyp_list)

# Normalize every sentence (english_cleaners, then strip a trailing " ."),
# updating both lists in place.
hyp_list[:] = [remove_end_punc(english_cleaners(text)) for text in hyp_list]
ref_list[:] = [remove_end_punc(english_cleaners(text)) for text in ref_list]

# Corpus-level BLEU with a single reference stream.
bleu_score = sacrebleu.corpus_bleu(list(hyp_list), [list(ref_list)])
print(subset)
print(bleu_score)

# Word error rate over the same normalized sentences.
wer = wer_evaluator.compute(predictions=hyp_list, references=ref_list)
print(f"word_error_rate = {wer}")

test.coco.en
BLEU = 72.03 86.1/76.6/68.3/60.5 (BP = 0.997 ratio = 0.997 hyp_len = 4844 ref_len = 4860)
word_error_rate = 0.15864197530864196


## Reconstructed audio transcript
| subset       | WER   | ASR BLEU |
| -----------  | ----- | ----------- |
| train.en     | 15.42% | BLEU = 71.79 87.0/77.0/68.4/60.6 (BP = 0.989 ratio = 0.989 hyp_len = 345592 ref_len = 349505) |
| valid.en     | 14.90% | BLEU = 72.85 87.3/77.8/69.5/61.7 (BP = 0.992 ratio = 0.992 hyp_len = 12222 ref_len = 12323) |
| test.2016.en | 15.29% | BLEU = 71.85 86.9/76.9/68.3/60.5 (BP = 0.991 ratio = 0.992 hyp_len = 11910 ref_len = 12012) |
| test.2017.en | 16.07% | BLEU = 70.88 86.0/75.7/67.3/59.7 (BP = 0.991 ratio = 0.991 hyp_len = 10524 ref_len = 10617) |
| test.coco.en | 15.86% | BLEU = 72.03 86.1/76.6/68.3/60.5 (BP = 0.997 ratio = 0.997 hyp_len = 4844 ref_len = 4860) |