In [1]:
# This assignment has to run before `transformers` is imported (the import
# happens in the next cell), so keep this cell first.
# NOTE(review): confirm the installed transformers version actually reads
# TRANSFORMERS_NO_TF; presumably the intent is to skip its TensorFlow backend.
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"

In [2]:
import pandas as pd
import numpy as np
import os
import torchaudio
import soundfile as sf
import subprocess
from transformers import AutoProcessor, AutoModelForCTC
from transformers import TrainingArguments, Trainer
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the utterance table from a CSV located relative to the working directory.
df = pd.read_csv('transcripts.csv')

In [4]:
# Bare last expression: the frame is rendered as this cell's output.
df

Unnamed: 0,SentenceID,Transcript,Device
0,speaker01_m_nn_utt01,aku libur sedina merga ana banjir,Samsung M15
1,speaker01_m_nn_utt02,aku pengin mangan dhawet nang omah,Samsung M15
2,speaker01_m_nn_utt03,aku kara adhiku mangan sega goreng,Samsung M15
3,speaker01_m_nn_utt04,aku tuku buku nang sekolahan,Samsung M15
4,speaker01_m_nn_utt05,aku mung duwe panuwun supaya anakku pinter lan...,Samsung M15
...,...,...,...
2095,speaker70_m_nn_utt26,Tulung priksa apa kabeh data wis cocog.,Laptop Asus Vivobook 15
2096,speaker70_m_nn_utt27,Kita lagi nglumpukake masukan saka para peserta.,Laptop Asus Vivobook 15
2097,speaker70_m_nn_utt28,Aku bakal nerusake gaweanku sawise ngaso sak b...,Laptop Asus Vivobook 15
2098,speaker70_m_nn_utt29,Aku lagi nyiapake jadwal kanggo dina iki.,Laptop Asus Vivobook 15


In [5]:
# Print the number of distinct values found in each column.
print(df.nunique())

SentenceID    2100
Transcript    2088
Device          47
dtype: int64


In [6]:
# Lower-case every transcript with the vectorised string accessor.
# Unlike a per-row `i.lower()` loop, a missing transcript stays missing
# instead of aborting the cell with an AttributeError.
df['Transcript'] = df['Transcript'].str.lower()

In [7]:
# Hold out 5% of the rows for validation; the fixed random_state keeps the
# shuffled split the same on every run.
split_options = dict(test_size=0.05, random_state=42, shuffle=True)
train_df, val_df = train_test_split(df, **split_options)

In [8]:
# Names of every entry in the raw-audio input directory (no filtering here).
files = os.listdir('./../audio/audio_input')

In [14]:
# Convert every WAV file in the input directory to mono, 16 kHz, 16-bit PCM
# with ffmpeg, writing the result under the same (lower-cased) file name.
wav_in_dir = "./../audio/audio_input"
wav_out_dir = "./../audio/audio_waved"

# Make sure the destination directory exists before the first conversion.
os.makedirs(wav_out_dir, exist_ok=True)

failed_conversions = []
for f in files:
    # Case-insensitive extension check so `.WAV` files are converted too,
    # even when this cell runs before the inputs are renamed to lower case.
    if f.lower().endswith(".wav"):
        in_file = f"{wav_in_dir}/{f}"
        # Lower-case the output name, as the later rename cell does for inputs.
        out_file = f"{wav_out_dir}/{f.lower()}"

        cmd = [
            "ffmpeg", "-y",
            "-i", in_file,
            "-ac", "1",
            "-ar", "16000",
            "-sample_fmt", "s16",
            out_file,
        ]
        result = subprocess.run(cmd, capture_output=True, text=True, errors="replace")

        # Report failures instead of silently discarding ffmpeg's exit status.
        if result.returncode != 0:
            failed_conversions.append(f)
            stderr_lines = result.stderr.strip().splitlines()
            reason = stderr_lines[-1] if stderr_lines else "no error output"
            print(f"ffmpeg failed for {f} (exit code {result.returncode}): {reason}")
    else:
        # Anything that is not a WAV file is only listed, never converted.
        print(f)

if failed_conversions:
    print(f"{len(failed_conversions)} file(s) could not be converted")

In [10]:
# Rename every listed input file to its lower-cased name, in place.
for name in files:
    src = f"./../audio/audio_input/{name}"
    dst = f"./../audio/audio_input/{name.lower()}"
    os.rename(src, dst)

In [11]:
# Rebind `files` to the entries of the ffmpeg output directory.
files = os.listdir('./../audio/audio_waved')

In [12]:
# Re-save each converted file as a 16 kHz, 16-bit PCM WAV in the
# preprocessed directory, keeping the file name.
for fname in files:
    waveform, orig_sr = torchaudio.load(f'./../audio/audio_waved/{fname}')

    # Average over the first dimension whenever the tensor has more than one.
    waveform = waveform.mean(dim=0) if waveform.ndim > 1 else waveform

    resampled = torchaudio.functional.resample(waveform, orig_sr, 16000)
    samples = resampled.squeeze().numpy()

    sf.write(
        f'./../audio/audio_preprocessed/{fname}',
        samples,
        16000,
        subtype='PCM_16',
        format='WAV',
    )

In [10]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [11]:
# Identifier of the pretrained checkpoint handed to both from_pretrained calls.
model_name = "johaness14/wav2vec2-conformer-rope-jv-openslr"

# Load the processor and the CTC model from the same checkpoint.
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModelForCTC.from_pretrained(model_name)

In [12]:
# Move the model to the selected device; as the cell's last expression, the
# returned module is also printed as the cell output.
model.to(device)

Wav2Vec2ConformerForCTC(
  (wav2vec2_conformer): Wav2Vec2ConformerModel(
    (feature_extractor): Wav2Vec2ConformerFeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2ConformerLayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Wav2Vec2ConformerLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2ConformerLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2ConformerFeatureProjection(
      (layer_n

In [13]:
class ConformerDataset(Dataset):
    """Map rows of a transcript table to CTC training examples.

    Each item loads ``<audio_dir>/<SentenceID>.wav``, downmixes it to a single
    channel, resamples it to ``sampling_rate`` and runs the processor on the
    waveform together with the row's ``Transcript``.

    Args:
        dataset: Table indexed positionally via ``.iloc``; each row must
            provide ``SentenceID`` and ``Transcript``.
        processor: Callable accepting the waveform plus ``text=`` and
            returning a mapping with ``input_values`` and ``labels``.
        audio_dir: Directory holding the ``.wav`` files. Defaults to the
            preprocessed-audio directory used elsewhere in this notebook.
        sampling_rate: Rate the audio is resampled to and reported to the
            processor. Defaults to 16000.
    """

    def __init__(self, dataset, processor,
                 audio_dir="./../audio/audio_preprocessed",
                 sampling_rate=16000):
        self.dataset = dataset
        self.processor = processor
        self.audio_dir = audio_dir
        self.sampling_rate = sampling_rate

    def __len__(self):
        """Return the number of rows in the wrapped table."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``{"input_values", "labels"}`` for the row at position ``idx``.

        The leading dimension added by ``return_tensors="pt"`` is removed
        from both tensors.
        """
        row = self.dataset.iloc[idx]

        audio, sr = torchaudio.load(f"{self.audio_dir}/{row['SentenceID']}.wav")

        # Downmix by averaging over the first (channel) dimension, as the
        # preprocessing cell does. A plain squeeze() would leave a
        # multi-channel file 2-D and the processor would receive several
        # rows instead of one waveform.
        if audio.ndim > 1:
            audio = audio.mean(dim=0)

        audio = torchaudio.functional.resample(audio, sr, self.sampling_rate)

        inputs = self.processor(
            audio.numpy(),
            sampling_rate=self.sampling_rate,
            text=row['Transcript'],
            return_tensors="pt",
            padding=True
        )

        return {
            "input_values": inputs["input_values"].squeeze(0),
            "labels": inputs["labels"].squeeze(0)
        }

In [14]:
# Wrap both splits so a row is turned into model inputs when it is indexed.
train_dataset, val_dataset = (
    ConformerDataset(split, processor) for split in (train_df, val_df)
)

# Batch-of-4 loaders; only the training loader shuffles.
# NOTE(review): no collate_fn is given here, and the Trainer further down is
# handed the datasets rather than these loaders. Confirm that default
# collation copes with utterances of different lengths before iterating them.
train_loader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=4)

In [15]:
from typing import Dict, List, Union
import torch

class DataCollatorCTCWithPadding:
    """Collate per-example dicts into one padded batch for CTC training.

    Audio inputs and label ids are padded by two separate calls to
    ``processor.pad``. Label positions whose attention mask is not 1 are
    overwritten with -100.

    Args:
        processor: Object exposing ``pad(...)`` for both inputs and labels.
        padding: Forwarded unchanged as the ``padding`` argument of both
            ``pad`` calls.
    """

    def __init__(self, processor, padding=True):
        self.processor = processor
        self.padding = padding

    def __call__(
        self,
        features: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        """Pad ``features`` and return the batch with a ``labels`` entry added."""
        # Split each example into its audio part and its label part.
        audio_items = []
        target_items = []
        for example in features:
            audio_items.append({"input_values": example["input_values"]})
            target_items.append({"input_ids": example["labels"]})

        # Inputs first: the padded input batch is the object that gets returned.
        batch = self.processor.pad(
            audio_items, padding=self.padding, return_tensors="pt"
        )

        # Labels are padded by their own call, via the `labels` keyword.
        padded_targets = self.processor.pad(
            labels=target_items, padding=self.padding, return_tensors="pt"
        )

        # Every position not marked 1 in the label attention mask becomes -100.
        is_padding = padded_targets.attention_mask.ne(1)
        batch["labels"] = padded_targets["input_ids"].masked_fill(is_padding, -100)

        return batch

In [None]:
# Collator instance later passed to the Trainer; 'longest' is stored and
# forwarded as the `padding` argument of each processor.pad call.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding='longest')

: 

In [None]:
# Training configuration: one example per device step, with gradients
# accumulated over 8 steps. Checkpoints and evaluation happen once per
# epoch and the loss is logged every 10 steps.
training_args = TrainingArguments(
    output_dir="./conformer-test",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    dataloader_num_workers=0,
    # Enable fp16 only when CUDA is present. This notebook explicitly allows
    # a CPU fallback, and a hard-coded fp16=True is rejected on such machines.
    fp16=torch.cuda.is_available(),
    save_strategy="epoch",
    eval_strategy="epoch",
    logging_steps=10,
)

# The Trainer builds its own batches from the datasets through data_collator.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()

Epoch,Training Loss,Validation Loss
