In [27]:
# !pip install -q datasets transformers evaluate huggingface_hub jiwer pythainlp

In [2]:
%pip install --upgrade torch torchaudio

Collecting torch
  Downloading torch-2.7.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting torchaudio
  Downloading torchaudio-2.7.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (6.6 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.6.77 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.6.77 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.6.80 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.6.80-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.5.1.17 (from torch)
  Downloading nvidia_cudnn_cu12-9.5.1.17-py3-none-manylinux_2_28_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.6.4.1 (

In [3]:
# NOTE(review): this cell cannot run on a fresh kernel. `y_true`, `y_pred` and
# `predict_df` are not defined anywhere in the visible notebook and `jiwer` is
# used without being imported. It looks like a leftover from another analysis;
# confirm whether it belongs in this notebook.
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, multilabel_confusion_matrix

# Print the per-label matrices returned by multilabel_confusion_matrix,
# numbering the labels from 1.
mcm = multilabel_confusion_matrix(y_true, y_pred)
for i, label_mcm in enumerate(mcm):
    print(f"Label {i+1}:")
    print(label_mcm)

# Compare the denoised transcripts against the reference column and show the
# 'wer' entry of the resulting measures.
references = predict_df['Gemini_Transcript ไว้แก้'].to_list()
hypothesis = predict_df['denoised_text'].to_list()
error_metrics = jiwer.compute_measures(references, hypothesis)
error_metrics['wer']

NameError: name 'y_true' is not defined

In [6]:
import os
import evaluate
import pandas as pd
import torch
import torchaudio
from torch.utils.data import Dataset
from datasets import load_dataset, Audio
from transformers import (
    WhisperFeatureExtractor,
    WhisperTokenizer,
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    pipeline
)
from tqdm.notebook import tqdm
from pythainlp.tokenize import word_tokenize
from huggingface_hub import login
from dataclasses import dataclass
import string
from torch.utils.data import DataLoader

AttributeError: partially initialized module 'torchvision' has no attribute 'extension' (most likely due to a circular import)

In [3]:
# Set WANDB_MODE before any trainer is created, presumably so Weights & Biases
# keeps its logs local instead of syncing them; confirm this is still wanted.
os.environ["WANDB_MODE"] = "offline"

In [4]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [5]:
# Language and task passed to the tokenizer, the processor and the model's
# generation config, plus the Hugging Face checkpoint everything is loaded from.
lang = "th"
task = "transcribe"
model_name = "biodatlab/whisper-th-medium-combined"

## Load Model

In [6]:
# Load the pretrained checkpoint and set language/task on its generation config.
model = WhisperForConditionalGeneration.from_pretrained(model_name)
model.generation_config.language = lang
model.generation_config.task = task
# Clear any forced decoder ids carried by the checkpoint's generation config.
model.generation_config.forced_decoder_ids = None
# Move the weights to the device selected earlier.
model = model.to(device)

In [7]:

class CustomDataset(Dataset):
    """Map-style dataset pairing audio files with their transcripts.

    The CSV must provide an ``audio`` column (file names relative to
    ``audio_folder``) and a ``sentence`` column (reference text). Extra columns
    are ignored. ASCII punctuation is stripped from every sentence when the
    dataset is constructed.

    Args:
        csv_file: Path of the CSV file listing the samples.
        audio_folder: Directory the ``audio`` file names are joined onto.
        target_sr: Sampling rate every waveform is resampled to.
        feature_extractor: Callable applied to the 1-D waveform; the first
            entry of its ``input_features`` becomes the model input.
        tokenizer: Callable applied to the sentence; its ``input_ids`` become
            the labels.
    """

    def __init__(self, 
        csv_file: str, 
        audio_folder: str, 
        target_sr: int = 16000,
        feature_extractor: "WhisperFeatureExtractor" = None,
        tokenizer: "WhisperTokenizer" = None
    ):
        super().__init__()
        self.audio_folder = audio_folder
        self.target_sr = target_sr
        self.csv_file = pd.read_csv(csv_file)
        self.csv_file['audio'] = self.csv_file['audio'].apply(
            lambda name: os.path.join(audio_folder, name)
        )
        # Build the translation table once instead of once per row.
        punctuation_table = str.maketrans('', '', string.punctuation)
        self.csv_file['sentence'] = self.csv_file['sentence'].apply(
            lambda txt: txt.translate(punctuation_table)
        )

        self.tokenizer = tokenizer
        self.feature_extractor = feature_extractor

    def __len__(self):
        """Return the number of rows in the CSV."""
        return self.csv_file.shape[0]

    def _load_waveform(self, audio_path):
        """Load ``audio_path`` and return ``(waveform, sampling_rate)``.

        torchaudio is tried first; if it fails for any reason the file is read
        with soundfile instead. The soundfile result is requested as float32
        (its float64 default does not match the resampler) and always 2-D, then
        transposed to the (channels, frames) layout used downstream.
        """
        try:
            return torchaudio.load(audio_path)
        except Exception:
            import soundfile as sf
            samples, sampling_rate = sf.read(audio_path, dtype="float32", always_2d=True)
            return torch.from_numpy(samples).T.contiguous(), sampling_rate

    def __getitem__(self, idx):
        """Return ``{"input_features": ..., "labels": ...}`` for row ``idx``.

        Raises:
            IndexError: if ``idx`` is outside the CSV.

        If the audio cannot be loaded or processed, the error is printed and an
        all-zero feature matrix with the labels of an empty sentence is
        returned instead, so one corrupt file does not abort a run.
        """
        # Resolve the row by column name and outside the try block: an
        # out-of-range idx then raises IndexError, and the error message below
        # can always refer to audio_path.
        row = self.csv_file.iloc[idx]
        audio_path, sentence = row['audio'], row['sentence']

        try:
            waveform, sampling_rate = self._load_waveform(audio_path)

            # Down-mix multi-channel audio; flattening it directly would
            # concatenate the channels one after another.
            if waveform.dim() > 1 and waveform.size(0) > 1:
                waveform = waveform.mean(dim=0, keepdim=True)

            if sampling_rate != self.target_sr:
                resampler = torchaudio.transforms.Resample(sampling_rate, self.target_sr)
                waveform = resampler(waveform)
                sampling_rate = self.target_sr

            array = waveform.detach().numpy().flatten()

            input_features = self.feature_extractor(array, sampling_rate=sampling_rate).input_features[0]
            labels = self.tokenizer(sentence).input_ids

            return dict(
                input_features=input_features,
                labels=labels
            )
        except Exception as e:
            print(f"Error loading {audio_path}: {str(e)}")
            # Placeholder sample; its shape follows the feature extractor when
            # it exposes one and otherwise falls back to (80, 3000).
            n_mels = getattr(self.feature_extractor, "feature_size", 80)
            n_frames = getattr(self.feature_extractor, "nb_max_frames", 3000)
            return dict(
                input_features=torch.zeros((n_mels, n_frames)),
                labels=self.tokenizer("").input_ids
            )

In [8]:
# Feature extractor and tokenizer loaded from the same checkpoint as the model;
# the tokenizer is configured with the language and task set above.
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)
tokenizer = WhisperTokenizer.from_pretrained(model_name, language=lang, task=task)

In [9]:
# Processor loaded from the same checkpoint; the data collator pads with its
# feature extractor and tokenizer.
processor = WhisperProcessor.from_pretrained(model_name, language=lang, task=task)

## Paths to the data CSV files and audio folders

In [9]:
# Root folder of the challenge data. Change it here to point the notebook at
# another copy of the dataset.
DATA_ROOT = "/data/502507_pre/502507_pre/hcu-speech-recognition-challenge-2025"

# One CustomDataset per split: <DATA_ROOT>/<split>.csv lists the samples and
# <DATA_ROOT>/<split>/ holds the audio files.
datasets = {
    split: CustomDataset(
        csv_file=os.path.join(DATA_ROOT, f"{split}.csv"),
        audio_folder=os.path.join(DATA_ROOT, split),
        feature_extractor=feature_extractor,
        tokenizer=tokenizer,
    )
    for split in ("train", "dev")
}

## Data Collator


In [10]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Pad a list of dataset samples into a single training batch.

    Attributes:
        processor: Object exposing ``feature_extractor.pad`` and
            ``tokenizer.pad``.
        decoder_start_token_id: Token id removed from the front of the labels
            when every sequence in the batch starts with it.
    """

    # Declared as dataclass fields so the generated __init__, __repr__ and
    # __eq__ reflect the collator's state. The constructor still accepts
    # (processor, decoder_start_token_id) positionally or by keyword.
    processor: "WhisperProcessor"
    decoder_start_token_id: int

    def __call__(self, features):
        """Collate ``features`` (dicts with ``input_features`` and ``labels``).

        Returns:
            The padded input batch with an added ``labels`` tensor in which
            padded positions are set to -100.
        """
        # Prepare input features by padding and converting to tensor
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Prepare labels by padding and converting to tensor
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Mask padding tokens in labels
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # Remove the decoder start token, but only when every row begins with it
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [11]:
# Collator instance for the trainer; the start token id comes from the model config.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

## Eval Metrics CER

In [12]:
# Load the "cer" metric through the evaluate library; compute_metrics uses it.
metric = evaluate.load("cer")

def compute_metrics(pred):
    """Compute the error rate (in percent) of decoded predictions.

    Args:
        pred: Object with ``predictions`` and ``label_ids`` arrays of token
            ids, where -100 marks positions to ignore. Neither array is
            modified.

    Returns:
        ``{"cer": value}`` where ``value`` is 100 times the metric result.
    """
    pred_ids = pred.predictions
    # Defensive: accept a tuple whose first element holds the token ids.
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]

    def _replace_ignore_index(ids):
        # -100 is not a decodable token id. Swap it for the pad token on a
        # copy so the caller's arrays stay untouched.
        ids = ids.copy()
        ids[ids == -100] = tokenizer.pad_token_id
        return ids

    pred_str = tokenizer.batch_decode(_replace_ignore_index(pred_ids), skip_special_tokens=True)
    label_str = tokenizer.batch_decode(_replace_ignore_index(pred.label_ids), skip_special_tokens=True)

    cer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"cer": cer}

## Fine-tuning

In [13]:
from accelerate import Accelerator

In [14]:
# Add before trainer initialization
# NOTE(review): the model is passed through this Accelerator and then handed to
# Seq2SeqTrainer in a later cell; confirm the extra prepare step is needed.
accelerator = Accelerator()
model = accelerator.prepare(model)

In [15]:
# Arguments for one fine-tuning epoch. Evaluation (with generation) and
# checkpointing both run once per epoch.
training_args = Seq2SeqTrainingArguments(
    output_dir="/data/Chuniji/texttospeech/outss2",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    num_train_epochs=1.0,
    learning_rate=9e-6,
    gradient_checkpointing=True,
    # Half precision only when a GPU is present, matching the CPU fallback below.
    fp16=torch.cuda.is_available(),
    bf16=False,
    optim="adamw_torch_fused",
    eval_strategy="epoch",
    per_device_eval_batch_size=1,
    predict_with_generate=True,
    generation_max_length=64,
    save_strategy="epoch",
    save_total_limit=5,
    logging_steps=100,
    # The string "none" disables logging integrations; Python None does not.
    report_to="none",
    push_to_hub=False,
    # `use_cpu` replaces the deprecated `no_cuda` flag.
    use_cpu=not torch.cuda.is_available(),
    ddp_find_unused_parameters=False,
    local_rank=-1
)

In [None]:
# Build the trainer from the arguments, datasets, collator and metric defined
# in the cells above.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=datasets['train'],
    eval_dataset=datasets['dev'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # `processing_class` replaces the deprecated `tokenizer=` argument.
    processing_class=processor.feature_extractor,
)
# The former `accelerator.prepare(trainer)` call was dropped: a Trainer is not
# a model, optimizer, scheduler or dataloader, so it was handed back unchanged.

  trainer = Seq2SeqTrainer(


In [17]:
# Save the processor files into the training output directory.
processor.save_pretrained(training_args.output_dir)

[]

In [18]:
# Run fine-tuning with the trainer configured above.
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Epoch,Training Loss,Validation Loss,Cer
1,No log,0.084174,22.163196


Error loading /data/502507_pre/502507_pre/hcu-speech-recognition-challenge-2025/train/650020_AUG_1088.mp3: Error opening '/data/502507_pre/502507_pre/hcu-speech-recognition-challenge-2025/train/650020_AUG_1088.mp3': System error.


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


TrainOutput(global_step=50, training_loss=0.3283563232421875, metrics={'train_runtime': 453.7785, 'train_samples_per_second': 3.524, 'train_steps_per_second': 0.11, 'total_flos': 1.63194724712448e+18, 'train_loss': 0.3283563232421875, 'epoch': 1.0})

In [46]:
# !apt-get install -y libsndfile1
# %pip install soundfile

## Inference

In [None]:
def list_checkpoints(output_dir):
    """Return the ``checkpoint-<step>`` folder names directly under ``output_dir``.

    Names are ordered by their numeric step, so ``checkpoint-1000`` comes after
    ``checkpoint-50`` (a plain string sort would put it first). Only immediate
    sub-folders are considered. A missing ``output_dir`` yields an empty list.
    """
    if not os.path.isdir(output_dir):
        return []
    step_by_name = {}
    for entry in os.listdir(output_dir):
        prefix, _, step = entry.rpartition("-")
        if prefix == "checkpoint" and step.isdecimal() and os.path.isdir(os.path.join(output_dir, entry)):
            step_by_name[entry] = int(step)
    return sorted(step_by_name, key=step_by_name.get)


# Folder names only; the last element is the most recent checkpoint.
checkpoints = list_checkpoints("/data/Chuniji/texttospeech/outss2")
print(checkpoints)

checkpoint-50
checkpoint-525
checkpoint-850
['checkpoint-50', 'checkpoint-525', 'checkpoint-850']


In [None]:
# Folder the test audio file names are joined onto during transcription.
dataset_folder_path="/data/week4data/cleaned_test"
# sub_df=pd.read_csv("/kaggle/input/dsi-443-tu-speech-recognition-challenge/datasets/sample_submission.csv")
# CSV listing the test audio files.
# NOTE(review): the frame displayed later has an "Unnamed: 0" column, so this
# CSV seems to carry a saved index; confirm whether it should be kept.
test_df=pd.read_csv("/data/502507_pre/502507_pre/week4/test_id.csv")

In [None]:
def cut_word(txt):
    """Re-segment ``txt`` into words separated by single spaces.

    All existing spaces are removed, the remaining text is split with
    ``word_tokenize`` (``newmm`` engine) and the tokens are joined back with
    one space between them.

    Args:
        txt: Text to segment.

    Returns:
        The re-segmented text.
    """
    # str.replace returns a new string. The original call discarded it, so the
    # spaces were never removed.
    txt = txt.replace(" ", "").strip()
    return " ".join(word_tokenize(text=txt, engine="newmm"))

In [None]:
# %pip install --upgrade transformers

In [None]:
# %pip install --upgrade transformers torch torchaudio
# %pip install -U accelerate
# %pip install --force-reinstall transformers

In [None]:
# Resolve where to load from: the last listed checkpoint folder when there is
# one, otherwise the base model name.
if checkpoints:
    last_checkpoint_path = os.path.join(training_args.output_dir, checkpoints[-1])
else:
    last_checkpoint_path = model_name

# Reload the model from that location.
model = WhisperForConditionalGeneration.from_pretrained(last_checkpoint_path)

## TEST CHUNK

In [1]:
from transformers import WhisperTokenizer, pipeline, WhisperForConditionalGeneration
import torch
import os

# NOTE(review): this cell relies on `last_checkpoint_path`, `model_name`,
# `lang`, `task` and `processor` from earlier cells; the recorded NameError
# shows it was run on a kernel where those cells had not been executed.

# Setup device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Set your local model directory
# NOTE(review): `base_model_path` is not used anywhere in the visible notebook.
base_model_path = "/data/Chuniji/model"

# Load model and tokenizer directly
model = WhisperForConditionalGeneration.from_pretrained(last_checkpoint_path)
tokenizer = WhisperTokenizer.from_pretrained(model_name, language=lang, task=task)

# Move model to device
model = model.to(device)

# Create pipeline (audio is processed in 15-second chunks)
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=15,
    device=0 if device == "cuda" else -1
)

NameError: name 'last_checkpoint_path' is not defined

In [None]:
# Put the pipeline's model in evaluation mode.
pipe.model.eval()

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1024, 1024, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1024)
      (layers): ModuleList(
        (0-23): 24 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias

## TEST CHUNK

In [None]:
# Smoke test: transcribe a single audio file and display the resulting text.
text = pipe('/data/week4data/audio-understanding/speechs/speechs/train/d76f8990-f74f-4f11-8bc1-ada4928725ad.wav')["text"]
text



'ดิฉันชื่อวิภดา หรือเรียกว่าวิวก็ได้ค่ะ เป็นที่ปรึกษาจากธนาคารไทยพาณิชย์ วันนี้ได้รับมอบหมายให้เข้ามาพูดคุยกับคุณผชน เกี่ยวกับการปรับเปลี่ยนแผนการเงินบางส่วน เพื่อให้การบริหารพอร์ตของคุณาชนเหมาะสมกับเป้าหมายในระยะยาวมากขึ้น ค่ะ ในการพูดคุยครั้งนี้ ดิฉันจะช่วยชี้แนะแนวทางในการลดความเสี่ยง รวมถึงเพิ่มโอกาสในการสร้างผลตอบแทนที่ดียิ่งขึ้น ซึ่งทั้งหมดนี้จะอิงกับข้อมูลการลงทุนของคุณพชรพลครับ นอกจากนี้ยังมีข้อมูลอัปเดตที่สำคัญจากสถานการณ์ตลาดที่ดิฉันเตรียมมา เพื่อช่วยให้การตัดสินใจของคุณแม่นยำยิ่งขึ้นค่ะ ดิฉันขอเวลา 20 นาทีสำหรับการพูดคุยครั้งนี้ แต่พยายามไม่ใช้เวลานานจนเกินไป หมายทราบว่าคุณเภชรพลสะดวกที่จะพูดคุยในช่วงนี้หรือยังคะ'

In [None]:
# %pip install -U ipywidgets

In [None]:
# %pip install -U ipywidgets jupyter
# !jupyter nbextension enable --py widgetsnbextension

## Transcription part

In [None]:
from tqdm import tqdm  # Change this import

pipe.model.eval()
# Walk over (index label, file name) pairs so each result is written back
# under the row's own label. Reading by position while writing by label only
# lines up when the index is exactly 0..n-1.
audio_items = list(test_df["audio"].items())
for row_label, audio_name in tqdm(audio_items, desc="Processing audio files"):
    audio_path = os.path.join(dataset_folder_path, audio_name)
    with torch.no_grad():
        text = pipe(audio_path)["text"]
        # text = cut_word(text)
    # Written row by row so finished transcriptions survive a later failure.
    test_df.at[row_label, "sentence"] = text

Processing audio files: 100%|██████████| 300/300 [33:58<00:00,  6.80s/it]


In [None]:
# Display the test frame with its "sentence" column filled in.
test_df

Unnamed: 0,audio,sentence
0,2e378ce7-b5f5-49b7-a337-0bc64b1115cd.wav,เม้นท์มาคุยกับคุณวัณนิดาในวันนี้เพื่ออยากจะแนะ...
1,70e43e4b-42f2-40f8-9822-2ec5ee0c94d4.wav,ดิฉันชื่อจิตาเป็นที่ปรึกษาทางการเงินค่ะก่อนเริ...
2,b1b69e87-0a65-4aac-9eeb-910328f8bb1f.wav,ผมปั๊บครับ เป็นที่ปรึกษาทางการเงินแก่ธนาคารไทย...
3,292e0419-aed1-43d0-aa60-9635dc80574b.wav,สวัสดีค่ะ คุณบุญธิชา ดิฉันชื่อบุนดา หรือเรียกว...
4,51f656f7-7558-4e4b-ab0a-fa91a772d196.wav,บุหรี่ชั้นชื่อพัทธ์ ตรา อิน ท ว ท น เป็นที่ปรึ...
...,...,...
295,dd74a676-a365-42ee-8793-89dc1b732596.wav,ชื่อ นายสมบัติ บุญการจน หรือเรียกสั้นๆว่า บัฏ ...
296,4ee7be59-b578-48a1-9b2d-74e5e18d845d.wav,สวัสดีค่ะ คุณวิชากร ดิชั้นชื่อ นางสาวรวีวัณย์ ...
297,e7089064-4bfa-42d6-8f53-abfded8653b8.wav,สวัสดีครับ สวัสดีครับ สวัสดีครับ สวัสดีครับ สว...
298,a43136f2-aed3-41d3-9921-9abbdd5c6f8e.wav,ชื่อ นายอิทธิมล สวัสดิคุณ หรือเรียกผมว่า มิ้น ...


In [None]:
# Write the transcriptions to test_text.csv without the DataFrame index.
test_df.to_csv("test_text.csv", index=False)

In [None]:
# Save the trainer's model to the final model directory.
trainer.save_model("/data/Chuniji/texttospeech/final_model")