# 统计音频时长

In [None]:
import os, pydub, tqdm

# lang = "en"
lang = "fr"
audio_root = "/opt/data/private/dsy/project/dataset/multi30k-dataset/data/speech/16khz_wav"
audio_root = os.path.join(audio_root, lang)
subset_list = ["train", "valid", "test.2016", "test.2017", "test.coco"]
# Maps sub-directory name -> [total duration in seconds, number of .wav files].
subset_info = {}

# Accumulate the total duration and file count of every sub-directory under
# the language folder. The loop variables deliberately avoid the name `dir`,
# which would shadow the builtin for the rest of the kernel session.
for subset_name in tqdm.tqdm(os.listdir(audio_root)):
    audio_dir = os.path.join(audio_root, subset_name)
    if not os.path.isdir(audio_dir):
        continue
    stats = subset_info[subset_name] = [0.0, 0]
    for wav_name in os.listdir(audio_dir):
        if not wav_name.endswith(".wav"):
            continue
        audio = pydub.AudioSegment.from_wav(os.path.join(audio_dir, wav_name))
        stats[0] += audio.duration_seconds
        stats[1] += 1

# Report the subsets in a fixed order; a subset whose directory was not found
# is reported as missing instead of aborting the whole report with a KeyError.
for subset in subset_list:
    print(subset)
    if subset not in subset_info:
        print("\t(no audio directory found)")
        continue
    total_seconds, item_count = subset_info[subset]
    h = total_seconds / 60 / 60  # seconds -> minutes -> hours
    print(f"\tduration = {total_seconds}s = {h}h")
    print(f"\titem = {item_count}")

| Dataset   | Number of Items | Duration (hours)-En | Duration (hours)-Fr | Duration (hours)-Es |
|-----------|-----------------|---------------------|---------------------|---------------------|
| train     | 29000           | 34.56               | 34.52               |                     |
| valid     | 1014            | 1.21                | 1.21                |                     |
| test.2016 | 1000            | 1.19                | 1.19                |                     |
| test.2017 | 1000            | 1.09                | 1.11                |                     |
| test.coco | 461             | 0.49                | 0.50                |                     |


# check for multi30k dataset

In [None]:
import os, tqdm
import sacrebleu
from sacrebleu.metrics import BLEU

src_lang = "fr"
tgt_lang = "en"
ref_root = "/opt/data/private/dsy/project/dataset/multi30k-dataset/data/text-clean"
subset_list = ["train", "valid", "test.2016", "test.2017", "test.coco"]
# Maps subset name -> corpus-level score object returned by sacrebleu.corpus_bleu.
subset_info = {}


def remove_end_punc(line):
    """Return `line` without a trailing " ." (space + period), if present.

    Defined in this cell so that the cell runs on a fresh kernel; a later
    cell in this notebook defines the same helper.
    """
    if line.endswith(" ."):
        line = line[: len(line) - 2]
    return line


# NOTE(review): this scores the `src_lang` text against the `tgt_lang` text of
# the same subset -- presumably a sanity baseline; confirm this is intended.
for subset in tqdm.tqdm(subset_list):
    hyp_txt = os.path.join(ref_root, f"{subset}.{src_lang}")
    ref_txt = os.path.join(ref_root, f"{subset}.{tgt_lang}")
    # The files are only read, so open them read-only ("r+" would also demand
    # write permission) and with an explicit encoding.
    # NOTE(review): blank lines are dropped independently from each file, so a
    # blank line present in only one of them shifts the pairing or trips the
    # assert below.
    with open(ref_txt, mode="r", encoding="utf-8") as f:
        ref_list = [line.strip() for line in f if line.strip()]
    with open(hyp_txt, mode="r", encoding="utf-8") as f:
        hyp_list = [line.strip() for line in f if line.strip()]
    assert len(ref_list) == len(hyp_list), (
        f"{subset}: {len(hyp_list)} hypothesis lines vs {len(ref_list)} reference lines"
    )
    hyp_list = [remove_end_punc(hyp) for hyp in hyp_list]
    ref_list = [remove_end_punc(ref) for ref in ref_list]
    # corpus_bleu takes the hypotheses and a list of reference streams.
    subset_info[subset] = sacrebleu.corpus_bleu(hyp_list, [ref_list])

for subset in subset_list:
    print(subset)
    # Each entry is the score object itself (not a list), so print it directly.
    print(f"\t{subset_info[subset]}")

# check for synthesized dataset

In [9]:
import os
# Set before torch is imported so the restriction is in place when CUDA is initialised.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch, torchaudio, tqdm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_path = "/opt/data/private/dsy/project/checkpoint/wav2vec2-large-960h-lv60-self"
output_dir_root = "/opt/data/private/dsy/project/model/multimodal_S2UT/mm_s2ut/checkpoints/transcript"

processor = Wav2Vec2Processor.from_pretrained(model_path)
model = Wav2Vec2ForCTC.from_pretrained(model_path)
model.to(device)
model.eval()  # inference only: make sure dropout is disabled

lang = "en"
# lang = "fr"
audio_root = "/opt/data/private/dsy/project/dataset/multi30k-dataset/data/speech/16khz_wav"
audio_root = os.path.join(audio_root, lang)
ref_root = "/opt/data/private/dsy/project/dataset/multi30k-dataset/data/text-clean"
subset_list = ["train", "valid", "test.2016", "test.2017", "test.coco"]
# subset_list = ["test.coco"]
subset_info = {}

# The transcript files are written here; create the directory if it is missing.
os.makedirs(output_dir_root, exist_ok=True)

# Transcribe every .wav file of each subset and write one transcript per line
# to "<output_dir_root>/<subset>.<lang>", ordered by the numeric file stem.
for subset in tqdm.tqdm(subset_list):
    audio_dir = os.path.join(audio_root, subset)
    if not os.path.isdir(audio_dir):
        continue
    # Keep only .wav files *before* sorting: the numeric sort key would raise
    # ValueError on any other directory entry whose stem is not an integer.
    audio_list = [name for name in os.listdir(audio_dir) if name.endswith(".wav")]
    audio_list.sort(key=lambda x: int(x.split('.')[0]))
    transcriptions = []
    for wav_name in audio_list:
        audio_path = os.path.join(audio_dir, wav_name)
        audio, sr = torchaudio.load(audio_path)
        audio = audio[0]  # first channel only
        input_values = processor(audio, sampling_rate=sr, return_tensors="pt", padding="longest").input_values
        input_values = input_values.to(device)
        # No gradients are needed for transcription; skipping autograd
        # bookkeeping saves memory and time on every forward pass.
        with torch.no_grad():
            logits = model(input_values).logits
        logits = logits.cpu()
        # take argmax and decode
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)
        transcriptions.append(transcription[0])
    transcript_txt = os.path.join(output_dir_root, f"{subset}.{lang}")
    with open(transcript_txt, mode="w", encoding="utf-8") as f:
        f.write('\n'.join(transcriptions))

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at /opt/data/private/dsy/project/checkpoint/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 5/5 [12:08<00:00, 145.66s/it]


In [11]:
import os, tqdm
import evaluate
import sacrebleu
from sacrebleu.metrics import BLEU
from speech_to_speech_translation.text_cleaner.cleaners import english_cleaners
from wer import WER
wer_evaluator = WER()
# wer_evaluator = evaluate.load("wer")

# Language code; used as the file extension of both the transcript and the reference files.
lang = "en"
# lang = "fr"
# Directory holding the transcript (hypothesis) files, named "<subset>.<lang>".
hyp_root = "/opt/data/private/dsy/project/model/multimodal_S2UT/mm_s2ut/checkpoints/transcript"
# Directory holding the reference text files, named "<subset>.<lang>".
ref_root = "/opt/data/private/dsy/project/dataset/multi30k-dataset/data/text-clean"
# Subsets to evaluate, in reporting order.
subset_list = ["train", "valid", "test.2016", "test.2017", "test.coco"]
# subset_list = ["test.coco"]
# Maps subset name -> [BLEU score object, word error rate]; filled by the evaluation loop in this cell.
subset_info = {}

def remove_end_punc(line):
    """Return `line` with one trailing " ." (space followed by a period) removed.

    Lines that do not end with exactly that two-character suffix are returned
    unchanged; only a single occurrence is stripped.
    """
    suffix = " ."
    return line[: -len(suffix)] if line.endswith(suffix) else line

# Score each subset's transcripts against the reference text: corpus-level
# BLEU via sacrebleu and word error rate via `wer_evaluator`.
for subset in tqdm.tqdm(subset_list):
    hyp_txt = os.path.join(hyp_root, f"{subset}.{lang}")
    ref_txt = os.path.join(ref_root, f"{subset}.{lang}")
    # The files are only read, so open them read-only ("r+" would also demand
    # write permission) and with an explicit encoding.
    # NOTE(review): blank lines are dropped independently from each file, so a
    # blank line present in only one of them (e.g. an empty transcript) shifts
    # the pairing or trips the assert below.
    with open(ref_txt, mode="r", encoding="utf-8") as f:
        ref_list = [line.strip() for line in f if line.strip()]
    with open(hyp_txt, mode="r", encoding="utf-8") as f:
        hyp_list = [line.strip() for line in f if line.strip()]
    assert len(ref_list) == len(hyp_list), (
        f"{subset}: {len(hyp_list)} hypothesis lines vs {len(ref_list)} reference lines"
    )
    # Normalise both sides identically: text cleaner first, then drop a trailing " .".
    hyp_list = [remove_end_punc(english_cleaners(hyp)) for hyp in hyp_list]
    ref_list = [remove_end_punc(english_cleaners(ref)) for ref in ref_list]
    # corpus_bleu takes the hypotheses and a list of reference streams.
    bleu_score = sacrebleu.corpus_bleu(hyp_list, [ref_list])
    wer = wer_evaluator.compute(predictions=hyp_list, references=ref_list)
    subset_info[subset] = [bleu_score, wer]

for subset in subset_list:
    print(subset)
    print(f"\t{subset_info[subset][0]}")
    print(f"\tword_error_rate = {subset_info[subset][1]}")

100%|██████████| 5/5 [00:08<00:00,  1.74s/it]

train
	BLEU = 88.44 95.1/90.6/86.4/82.2 (BP = 1.000 ratio = 1.000 hyp_len = 349629 ref_len = 349505)
	word_error_rate = 0.06228860846424586
valid
	BLEU = 88.49 95.1/90.7/86.5/82.2 (BP = 1.000 ratio = 1.001 hyp_len = 12331 ref_len = 12323)
	word_error_rate = 0.06218541971099204
test.2016
	BLEU = 89.70 95.6/91.5/87.9/84.2 (BP = 1.000 ratio = 1.001 hyp_len = 12028 ref_len = 12012)
	word_error_rate = 0.05496793537103356
test.2017
	BLEU = 90.03 95.7/91.8/88.3/84.8 (BP = 1.000 ratio = 1.001 hyp_len = 10632 ref_len = 10617)
	word_error_rate = 0.053420011305822496
test.coco
	BLEU = 89.44 95.0/91.3/87.8/84.1 (BP = 1.000 ratio = 1.008 hyp_len = 4899 ref_len = 4860)
	word_error_rate = 0.05576131687242798





In [12]:
# Re-print the stored [BLEU, WER] pair of every subset, one subset per group of lines.
for subset in subset_list:
    print(subset)
    scores = subset_info[subset]
    print(f"\t{scores[0]}")
    print(f"\tword_error_rate = {scores[1]}")

train
	BLEU = 88.44 95.1/90.6/86.4/82.2 (BP = 1.000 ratio = 1.000 hyp_len = 349629 ref_len = 349505)
	word_error_rate = 0.06228860846424586
valid
	BLEU = 88.49 95.1/90.7/86.5/82.2 (BP = 1.000 ratio = 1.001 hyp_len = 12331 ref_len = 12323)
	word_error_rate = 0.06218541971099204
test.2016
	BLEU = 89.70 95.6/91.5/87.9/84.2 (BP = 1.000 ratio = 1.001 hyp_len = 12028 ref_len = 12012)
	word_error_rate = 0.05496793537103356
test.2017
	BLEU = 90.03 95.7/91.8/88.3/84.8 (BP = 1.000 ratio = 1.001 hyp_len = 10632 ref_len = 10617)
	word_error_rate = 0.053420011305822496
test.coco
	BLEU = 89.44 95.0/91.3/87.8/84.1 (BP = 1.000 ratio = 1.008 hyp_len = 4899 ref_len = 4860)
	word_error_rate = 0.05576131687242798


| Dataset   | BLEU  | N-gram precisions (1/2/3/4) | Brevity penalty / length ratio | Hyp_len | Ref_len | Word_error_rate |
| --------- | ----- | ----------------------------- | ------------------------------- | ------- | ------- | --------------- |
| train     | 88.44 | 95.1/90.6/86.4/82.2           | (BP = 1.000 ratio = 1.000)        | 349629  | 349505  | 0.062           |
| valid     | 88.49 | 95.1/90.7/86.5/82.2           | (BP = 1.000 ratio = 1.001)        | 12331   | 12323   | 0.062           |
| test.2016 | 89.70 | 95.6/91.5/87.9/84.2           | (BP = 1.000 ratio = 1.001)        | 12028   | 12012   | 0.055           |
| test.2017 | 90.03 | 95.7/91.8/88.3/84.8           | (BP = 1.000 ratio = 1.001)        | 10632   | 10617   | 0.053           |
| test.coco | 89.44 | 95.0/91.3/87.8/84.1           | (BP = 1.000 ratio = 1.008)        | 4899    | 4860    | 0.056           |

# Which test split do test, test1 and test2 correspond to?

In [2]:
import os, torch

pth_dir = "/opt/data/private/dsy/project/dataset/multi30k-dataset/data/image_feat/vit_base_patch16_384"
pth_list = ["test", "test1", "test2"]
# Print the shape of each saved feature file so the files can be matched to
# the test splits by their number of items.
for pth_name in pth_list:
    # Only the shape is inspected, so always load onto the CPU: without
    # map_location, tensors saved from a GPU would be restored onto that GPU.
    # NOTE: torch.load unpickles the file -- only use it on trusted files.
    feats = torch.load(os.path.join(pth_dir, f"{pth_name}.pth"), map_location="cpu")
    print(pth_name, feats.shape)

test torch.Size([1000, 577, 768])
test1 torch.Size([1000, 577, 768])
test2 torch.Size([461, 577, 768])


| test名称    | test顺序 |
|-----------|--------|
| test.2016 | test   |
| test.2017 | test1  |
| test.coco | test2  |