In [1]:
import torch
from torch import nn
import pandas as pd
from torch.utils.data import Dataset
import torchaudio
from torch.utils.data import DataLoader
import os
import numpy as np
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

In [2]:
import flash
from flash.audio import SpeechRecognition, SpeechRecognitionData

2023-05-22 10:08:41.695028: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-05-22 10:08:41.741110: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
print(torch.__version__)
print(torchaudio.__version__)

2.0.1+cu117
2.0.2+cu117


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
df = pd.read_csv("text/part1.csv")
df

Unnamed: 0,id,form,original_form,speaker_id,start,end,age,sex
0,SDRW2000000319.1.1.1,병역 특례를 받아,병역 특례를 받아,SD2001645,4.04903,5.83905,10대,여성
1,SDRW2000000319.1.1.2,법정 봉사활동 기 시간을 채워야 하는,법정 봉사활동 기 시간을 채워야 하는,SD2001645,5.84901,8.89405,10대,여성
2,SDRW2000000319.1.1.3,예술,예술,SD2001645,8.90407,9.52506,10대,여성
3,SDRW2000000319.1.1.4,또는 체육 요원의 절반가량이,또는 체육 요원의 절반가량이,SD2001645,9.53506,12.05203,10대,여성
4,SDRW2000000319.1.1.5,허위 자료를 내거나,허위 자료를 내거나,SD2001645,12.06204,13.79504,10대,여성
...,...,...,...,...,...,...,...,...
213188,SDRW2000000418.1.1.326,우선,우선,SD2000552,908.12707,909.98106,10대,여성
213189,SDRW2000000418.1.1.327,맛있는 음식들 먹으면서,맛있는 음식들 먹으면서,SD2000552,909.99104,912.25405,10대,여성
213190,SDRW2000000418.1.1.328,겝,겝,SD2000552,912.26403,913.64807,10대,여성
213191,SDRW2000000418.1.1.329,먹으면서 저도 같이 맛있어 보이는 느낌이라서,먹으면서 저도 같이 맛있어 보이는 느낌이라서,SD2000552,913.65802,917.87305,10대,여성


In [6]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, wav_dir):
        self.data = dataframe
        self.wav_dir = wav_dir

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        file_id = self.data.iloc[index]['id']
        audio = os.path.join(self.wav_dir, f'{file_id}.wav')
        text = str(self.data.iloc[index]['form'])  

        max_len = len(text)
        text = text.ljust(max_len)  

        return audio, text

# 데이터셋 생성
wav_dir = './wav_all_stereo(fixed_length)'
dataset = CustomDataset(df, wav_dir)

In [7]:
audio, text = dataset[0]
print("Audio:", audio)
print("Text:", text)

Audio: ./wav_all_stereo(fixed_length)/SDRW2000000319.1.1.1.wav
Text: 병역 특례를 받아


In [8]:
data = []
for i in range(len(dataset)):
    audio, text = dataset[i]
    data.append([audio, text])
df = pd.DataFrame(data, columns=['audio', 'text'])

In [9]:
df

Unnamed: 0,audio,text
0,./wav_all_stereo(fixed_length)/SDRW2000000319....,병역 특례를 받아
1,./wav_all_stereo(fixed_length)/SDRW2000000319....,법정 봉사활동 기 시간을 채워야 하는
2,./wav_all_stereo(fixed_length)/SDRW2000000319....,예술
3,./wav_all_stereo(fixed_length)/SDRW2000000319....,또는 체육 요원의 절반가량이
4,./wav_all_stereo(fixed_length)/SDRW2000000319....,허위 자료를 내거나
...,...,...
213188,./wav_all_stereo(fixed_length)/SDRW2000000418....,우선
213189,./wav_all_stereo(fixed_length)/SDRW2000000418....,맛있는 음식들 먹으면서
213190,./wav_all_stereo(fixed_length)/SDRW2000000418....,겝
213191,./wav_all_stereo(fixed_length)/SDRW2000000418....,먹으면서 저도 같이 맛있어 보이는 느낌이라서


In [10]:
df.isnull().sum()

audio    0
text     0
dtype: int64

In [11]:
df1 = df[:200000]

In [12]:
df2 = df[200000:]

In [13]:
df1.to_csv('train.csv',index=False,encoding='utf-8')

In [14]:
df2.to_csv('test.csv',index=False,encoding='utf-8')

In [23]:
datamodule = SpeechRecognitionData.from_csv(
    'audio',
    'text',
    train_file='train.csv',
    val_file='test.csv',
    batch_size=8  
)

### http://mohitmayank.com/a_lazy_data_science_guide/audio_intelligence/wav2vec2/

In [17]:
# wav2vec 모델 초기화
model = SpeechRecognition(backbone="facebook/wav2vec2-base-960h")

Using 'facebook/wav2vec2-base-960h' provided by Hugging Face/transformers (https://github.com/huggingface/transformers) and PyTorch/fairseq (https://github.com/pytorch/fairseq).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
trainer = flash.Trainer()
trainer.finetune(model, datamodule=datamodule)  

GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type           | Params
-------------------------------------------------
0 | model         | Wav2Vec2ForCTC | 94.4 M
1 | train_metrics | ModuleDict     | 0     
2 | val_metrics   | ModuleDict     | 0     
3 | test_metrics  | ModuleDict     | 0     
-------------------------------------------------
94.4 M    Trainable params
0         Non-trainable params
94.4 M    Total params
377.585   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf fou

KeyError: 'labels'

In [20]:
checkpoint_path = "model_checkpoint.pt"

In [21]:
trainer.save_checkpoint(checkpoint_path)