<a href="https://colab.research.google.com/github/y-chiba1008/talk-support-asr/blob/main/notebooks/dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install Hugging Face `datasets` together with its audio extras.
# Alternative command that additionally installs audiomentations and soundfile:
# ! pip install -q "datasets[audio]" audiomentations soundfile
! pip install -q "datasets[audio]"

In [None]:
# ! pip uninstall -y torchcodec

In [None]:
# import sys

# # torchcodecをシステムから完全に隠蔽する
# sys.modules['torchcodec'] = None

In [None]:
# Mount the "data" folder stored on Google Drive

from google.colab import drive
import os

## Mount Google Drive
drive.mount('/drive')

## Create a symbolic link
### drive_folder_path: path of the target folder inside the mounted Drive
drive_folder_path = '/drive/Othercomputers/マイ コンピュータ/data'

### colab_link_path: shortened access path inside Colab
colab_link_path = '/content/data'

### If the link path already exists, delete it first
### NOTE(review): os.path.isdir follows symlinks, so a dangling link would not
### be detected here and the `ln -s` below would presumably fail — confirm.
if os.path.isdir(colab_link_path):
    print(f'{colab_link_path}がすでに存在する為、一度削除します')
    !rm -rf "$colab_link_path"

### Create the symbolic link
!ln -s "$drive_folder_path" "$colab_link_path"

In [None]:
# Constants
# Dataset variant; it selects the data/<data_name>/ sub-folder used below
# data_name = '01_short'
# data_name = '02_all'
data_name = '03_data_augment'
APPLY_AUGMENT = True  # data-augmentation switch

WAV_DIR = 'data/wav'  # directory containing the source recordings
JSON_PATH = f'data/{data_name}/label_studio.json'  # annotation JSON read by create_audiofolder
AUDIOFOLDER_PATH = f'data/{data_name}/audiofolder'  # output root for the per-split clips
PREPROCESSED_DATA_PATH = f'data/{data_name}/preprocessed_data'  # where the mapped dataset is saved
BASE_MODEL = 'openai/whisper-small'  # checkpoint name passed to WhisperProcessor.from_pretrained
NUM_PROC = 4  # num_proc passed to dataset.map
SEED = 42  # seed for the train/validation/test shuffle

## audiofolder作成

In [None]:
from pathlib import Path

def reshape_data(json_data: list[dict]) -> list[dict]:
    '''Flatten the annotation JSON into one record per transcribed region.

    Every annotation result whose ``type`` is ``'textarea'`` becomes a dict
    with these keys:
      - wav_path: ``WAV_DIR`` joined with the file name of the task's audio
      - start: region start taken from the result value
      - end: region end taken from the result value
      - sentence: first entry of the result's ``text`` list

    Results of any other type are ignored. Records keep the order of the
    tasks, annotations and results in ``json_data``.
    '''
    wav_root = Path(WAV_DIR)
    records = []

    for task in json_data:
        # Only the file name of the exported audio reference is kept; the
        # recording itself is looked up under WAV_DIR.
        audio_name = Path(task['data']['audio']).name
        wav_path = str(wav_root / audio_name)

        for annotation in task['annotations']:
            for result in annotation['result']:
                if result['type'] != 'textarea':
                    continue

                value = result['value']
                records.append({
                    'wav_path': wav_path,
                    'start': value['start'],
                    'end': value['end'],
                    'sentence': value['text'][0],
                })

    return records

In [None]:
import csv
import json
import random
import shutil
import librosa
import soundfile as sf
import numpy as np
from pathlib import Path
from tqdm import tqdm
# from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift


def create_audiofolder(json_path, audiofolder_path,
                       train_size, valid_size, test_size,
                       apply_augment=False, # whether to apply noise-based data augmentation to the audio
                       ):
    '''Cut the annotated regions out of the recordings into an audiofolder.

    Reads the annotation JSON, assigns every record at random to train,
    validation or test, writes each region as a 16 kHz mono WAV file under
    ``<audiofolder_path>/<split>/`` and appends its transcript to that
    split's ``metadata.csv``. An existing ``audiofolder_path`` is deleted
    before anything is written.

    Args:
        json_path: Path of the annotation JSON handed to ``reshape_data``.
        audiofolder_path: Output root directory (removed, then recreated).
        train_size: Fraction of the records assigned to ``train``.
        valid_size: Fraction of the records assigned to ``validation``.
        test_size: Not used in the computation; ``test`` receives every
            record left over after the train and validation counts.
        apply_augment: Currently has no effect because the augmentation
            code is commented out.
    '''
    # Load the JSON; the encoding is explicit so the transcripts do not
    # depend on the platform's default locale.
    with open(json_path, encoding='utf-8') as f:
        json_data = json.load(f)

    # Reshape into flat records
    data_list = reshape_data(json_data)

    # Randomly split into train, validation and test
    n_all = len(data_list)
    n_train = int(n_all * train_size)
    n_valid = int(n_all * valid_size)
    n_test = n_all - n_train - n_valid
    dirname = ['train'] * n_train + ['validation'] * n_valid + ['test'] * n_test
    random.seed(SEED)
    random.shuffle(dirname)
    audiofolder_path = Path(audiofolder_path)
    for i, record in enumerate(data_list):
        record['save_path'] = audiofolder_path / dirname[i] / f'{i:03d}.wav'

    # Data augmentation definition
    # if apply_augment:
    #     augment = Compose([
    #         # Add Gaussian noise (amplitude between 0.001 and 0.015)
    #         AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
    #         # Randomly change the speed between 0.8x and 1.2x
    #         TimeStretch(min_rate=0.8, max_rate=1.2, p=0.5),
    #         # Shift the pitch up or down by up to 2 semitones
    #         PitchShift(min_semitones=-2, max_semitones=2, p=0.5),
    #     ])
    # else:
    #     augment = None

    # Reset (delete) the audiofolder
    if audiofolder_path.exists():
        shutil.rmtree(audiofolder_path)

    for record in tqdm(data_list):
        # Region to cut out, expressed as offset and duration
        start = record['start']
        end = record['end']
        duration = end - start

        # Load the audio (this also cuts the region, resamples to 16 kHz
        # and mixes down to mono)
        wav_path = record['wav_path']
        data, sr = librosa.load(wav_path,
                                offset=start,
                                duration=duration,
                                sr=16000,
                                mono=True)

        # Data augmentation
        # if augment is not None:
        #     data = augment(samples=data.astype(np.float32), sample_rate=16000)

        # Create the destination folder and save the clip
        save_path = record['save_path']
        save_path.parent.mkdir(parents=True, exist_ok=True)
        sf.write(save_path, data, sr)

        # Append to the split's metadata.csv, writing the header only when
        # the file is first created.
        csv_path = save_path.parent / 'metadata.csv'
        new_file = not csv_path.exists()
        # newline='' leaves line endings to the csv module; utf-8 keeps the
        # transcripts independent of the platform's default encoding.
        with open(csv_path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['file_name', 'sentence'])
            if new_file:
                writer.writeheader()
            writer.writerow({'file_name': save_path.name,
                             'sentence': record['sentence']})

In [None]:
#################################
# Main processing (audiofolder) #
#################################
# Build the audiofolder from the annotation JSON with an 80/10/10 split.
create_audiofolder(
    json_path=JSON_PATH,
    audiofolder_path=AUDIOFOLDER_PATH,
    train_size=0.8,
    valid_size=0.1,
    test_size=0.1,
    # apply_augment=APPLY_AUGMENT,
    apply_augment=False,
)

In [None]:
from IPython.display import Audio, display
from datasets import load_dataset

def play_data(type, idx):
    '''Play one clip from the audiofolder and print its transcript.

    type: split name used to index the loaded dataset (e.g. 'train').
    idx: position of the record inside that split.
    '''
    splits = load_dataset('audiofolder', data_dir=AUDIOFOLDER_PATH)
    sample = splits[type][idx]
    audio = sample['audio']

    # Render an inline audio player, then show the matching sentence.
    player = Audio(data=audio['array'], rate=audio['sampling_rate'])
    display(player)
    print(sample['sentence'])

# Quick check: play the first record of the train split and print its sentence
play_data('train', 0)

In [None]:
from transformers import WhisperProcessor

# Processor for BASE_MODEL; its feature extractor and tokenizer are used by
# create_features_and_labels. Configured for Japanese transcription.
processor = WhisperProcessor.from_pretrained(
    BASE_MODEL,
    language='Japanese',
    task='transcribe',
)

def create_features_and_labels(record):
    '''Extract input features and tokenize the target text (for use with map).

    Adds two keys to ``record`` and returns it:
      - input_features: first element of the feature extractor output for
        the record's audio array
      - labels: token ids of ``record['sentence']``

    The sampling rate stored with the audio is passed through unchanged;
    no resampling is done here.
    '''
    audio = record['audio']

    # Compute the input features from the raw audio array
    extracted = processor.feature_extractor(
        audio['array'],
        sampling_rate=audio['sampling_rate'],
    )
    record['input_features'] = extracted.input_features[0]

    # Encode the target text into label ids
    tokenized = processor.tokenizer(record['sentence'])
    record['labels'] = tokenized.input_ids

    return record

In [None]:
from datasets import load_dataset

def create_preprocessed_data(audiofolder_path, preprocessed_data_path):
    '''Turn the audiofolder into a feature/label dataset saved on disk.

    audiofolder_path: directory loaded with the 'audiofolder' builder.
    preprocessed_data_path: output directory; removed first if it exists.
    '''
    # Load the audiofolder
    raw = load_dataset('audiofolder', data_dir=audiofolder_path)

    # Feature extraction and tokenization; the columns listed for the
    # 'train' split are dropped from the mapped result.
    original_columns = raw.column_names['train']
    processed = raw.map(
        create_features_and_labels,
        remove_columns=original_columns,
        num_proc=NUM_PROC,
    )

    # Reset (delete) any previous output only after mapping has finished
    out_dir = Path(preprocessed_data_path)
    if out_dir.exists():
        shutil.rmtree(out_dir)

    # Save
    processed.save_to_disk(out_dir)

In [None]:
#######################################
# Main processing (preprocessed_data) #
#######################################
# Convert the audiofolder into the preprocessed dataset and save it.
create_preprocessed_data(
    audiofolder_path=AUDIOFOLDER_PATH,
    preprocessed_data_path=PREPROCESSED_DATA_PATH,
)