In [None]:
!pip install transformers datasets torch torchaudio soundfile librosa

Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch)
  Downloading nvidia_cusparse_cu12-12.3.1.170-p

In [None]:
import os
import glob
from datasets import Dataset, Audio

# Directory holding the paired audio (.wav) and transcript (.txt) files
data_dir = "/kaggle/input/asr-datatrain/vlsp2020_train_set_02"

# Collect every .wav and .txt file present in the directory
wav_files = sorted(glob.glob(os.path.join(data_dir, "*.wav")))
txt_files = sorted(glob.glob(os.path.join(data_dir, "*.txt")))

# Keep only the utterances that have BOTH an audio file and a transcript
wav_basenames = [os.path.splitext(os.path.basename(f))[0] for f in wav_files]
txt_basenames = [os.path.splitext(os.path.basename(f))[0] for f in txt_files]
common_basenames = sorted(set(wav_basenames) & set(txt_basenames))

# Fail early with a clear message: an empty pairing would otherwise only show up
# later as an IndexError when the first dataset row is printed.
if not common_basenames:
    raise FileNotFoundError(f"Không tìm thấy cặp file .wav/.txt nào trong {data_dir}")

# Report files that were dropped because their partner is missing. (Comparing the
# lengths of the rebuilt lists below cannot detect this: both are derived from
# common_basenames and are therefore always the same length.)
unmatched = len(set(wav_basenames) ^ set(txt_basenames))
if unmatched:
    print("Số lượng file không có cặp (bị bỏ qua):", unmatched)

wav_files = [os.path.join(data_dir, f"{basename}.wav") for basename in common_basenames]
txt_files = [os.path.join(data_dir, f"{basename}.txt") for basename in common_basenames]

# Show how many pairs were kept
print("Số lượng file .wav:", len(wav_files))
print("Số lượng file .txt:", len(txt_files))

# Read each transcript, stripping surrounding whitespace
transcriptions = []
for txt_file in txt_files:
    with open(txt_file, "r", encoding="utf-8") as f:
        transcriptions.append(f.read().strip())

# Build the dataset: one row per utterance (audio path + transcript)
data = {
    "file": wav_files,
    "text": transcriptions
}
dataset = Dataset.from_dict(data)

# Cast the "file" column to Audio with a 16 kHz sampling rate
dataset = dataset.cast_column("file", Audio(sampling_rate=16000))


# Inspect the first row
print(dataset[0])

In [None]:
# Keep at most the first NUM_SAMPLES examples. The min() cap guards against a
# dataset smaller than the limit, where a fixed range would index out of bounds.
NUM_SAMPLES = 35000
dataset = dataset.select(range(min(NUM_SAMPLES, len(dataset))))
print("Kích thước dataset thử nghiệm:", len(dataset))

In [None]:
from transformers import Wav2Vec2Processor

# Load the processor published with the pretrained Vietnamese wav2vec2 checkpoint;
# it is used below both to turn audio into input values and to tokenize transcripts.
processor = Wav2Vec2Processor.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h")

def preprocess(batch):
    """Convert a batch of audio/transcript pairs into model inputs and labels.

    Args:
        batch: Mapping with a "file" list (each item exposing "array" and
            "sampling_rate") and a parallel "text" list of transcripts.

    Returns:
        The same ``batch`` object with two entries added: "input_values"
        (first processor output per clip) and "labels" (tokenizer ids per
        transcript).
    """
    # One (input values, token ids) pair per example, in input order.
    encoded = [
        (
            processor(clip["array"], sampling_rate=clip["sampling_rate"]).input_values[0],
            processor.tokenizer(transcript).input_ids,
        )
        for clip, transcript in zip(batch["file"], batch["text"])
    ]

    # Unzip the pairs back into the two output columns.
    batch["input_values"] = [values for values, _ in encoded]
    batch["labels"] = [token_ids for _, token_ids in encoded]
    return batch

# Apply the preprocessing in batches of 100 and drop the raw columns afterwards
dataset = dataset.map(preprocess, remove_columns=["file", "text"], batched=True, batch_size=100)

# Hold out 10% for evaluation; the fixed seed makes the split reproducible, so
# re-running the notebook evaluates on the same examples every time.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

In [None]:
# Print a summary of the training split produced by the train/test split
print(dataset["train"])

In [None]:
from transformers import Wav2Vec2ForCTC

# Load the pretrained CTC model from the same checkpoint name as the processor.
# NOTE(review): low_cpu_mem_usage=True is presumably meant to lower peak RAM while
# loading — confirm against the transformers from_pretrained documentation.
model = Wav2Vec2ForCTC.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h", low_cpu_mem_usage=True)

In [None]:
import torch
from transformers import TrainingArguments, Trainer

# Restrict this process to CUDA device index 0
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Report which CUDA hardware is visible
cuda_ready = torch.cuda.is_available()
print("GPU có sẵn:", cuda_ready)
print("Số lượng GPU:", torch.cuda.device_count())
if cuda_ready:
    print("Tên GPU:", torch.cuda.get_device_name(0))

# Move the model to the GPU when one is available, otherwise keep it on the CPU
device = torch.device("cuda") if cuda_ready else torch.device("cpu")
model.to(device)
print("Thiết bị của mô hình:", next(model.parameters()).device)

class DataCollatorCTCWithPadding:
    """Collate preprocessed examples into a single padded batch.

    Audio inputs and label ids are padded through two separate calls
    (``processor.pad`` and ``processor.tokenizer.pad``); padded label
    positions are then overwritten with -100.
    """

    def __init__(self, processor, padding=True):
        """Keep the processor and the padding option forwarded to its pad calls."""
        self.processor = processor
        self.padding = padding

    def __call__(self, features):
        """Pad a list of examples and attach the masked labels.

        Args:
            features: Sequence of mappings, each with "input_values" and "labels".

        Returns:
            The object returned by ``processor.pad`` for the audio inputs, with
            a "labels" entry added.
        """
        audio_items = [dict(input_values=example["input_values"]) for example in features]
        label_items = [dict(input_ids=example["labels"]) for example in features]

        batch = self.processor.pad(audio_items, padding=self.padding, return_tensors="pt")
        padded_labels = self.processor.tokenizer.pad(label_items, padding=self.padding, return_tensors="pt")

        # Positions whose attention mask is not 1 are padding; replace them with -100.
        # NOTE(review): -100 is presumably the value the loss ignores — confirm against the model.
        is_padding = padded_labels.attention_mask.ne(1)
        batch["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)
        return batch

# Collator that pads audio inputs and labels per batch through the processor
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

# Training configuration.
# Per device, 4 samples x 6 accumulation steps = 24 samples per optimizer step.
# Checkpoints are saved and evaluation is run every 500 steps.
training_args = TrainingArguments(
    output_dir="/kaggle/working/wav2vec2-finetuned",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=6,
    num_train_epochs=5,
    learning_rate=3e-5,
    save_steps=500,
    eval_steps=500,
    logging_steps=100,
    warmup_steps=500,
    save_total_limit=2,
    eval_strategy="steps",
    load_best_model_at_end=True,
    fp16=False,  # Mixed precision is turned OFF here; set to True to train in fp16 on a GPU
    report_to="none",
    dataloader_num_workers=4,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    processing_class=processor,  # Changed from the `tokenizer` argument
    data_collator=data_collator,  # Added the data collator
    
)

# Run the fine-tuning loop, printing a marker before and after
print("Bắt đầu huấn luyện...")
trainer.train()
print("Hoàn thành huấn luyện.")

In [None]:
# Evaluate on the eval dataset passed to the Trainer; as the cell's last
# expression, the returned value is displayed by the notebook.
trainer.evaluate()

In [None]:
# Persist the fine-tuned model and then the processor into the same directory
for artifact in (model, processor):
    artifact.save_pretrained("/kaggle/working/wav2vec2-finetuned")

In [None]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from datasets import load_dataset
import soundfile as sf
import torch

# Reload the fine-tuned processor and model from the saved directory
processor = Wav2Vec2Processor.from_pretrained("/kaggle/working/wav2vec2-finetuned")
model = Wav2Vec2ForCTC.from_pretrained("/kaggle/working/wav2vec2-finetuned")


def map_to_array(batch):
    """Read the sound file named by ``batch["file"]`` into memory.

    Args:
        batch: Mapping with a "file" entry holding the path of a sound file.

    Returns:
        The same mapping with "speech" (the samples returned by ``sf.read``)
        and "sampling_rate" (the sample rate reported for the file) added.
    """
    speech, sampling_rate = sf.read(batch["file"])
    batch["speech"] = speech
    # Keep the file's real sample rate so it can be handed to the processor
    # instead of being silently assumed.
    batch["sampling_rate"] = sampling_rate
    return batch


# Read a single utterance from the training directory as a smoke test
ds = map_to_array({
    "file": '/kaggle/input/asr-datatrain/vlsp2020_train_set_02/database_sa1_Jan08_Mar19_cleaned_utt_0000000005-1.wav'
})

# Extract input values. The sampling rate is passed explicitly, matching how the
# processor is called during preprocessing, so that a rate mismatch is surfaced
# by the processor rather than producing a silently wrong transcription.
input_values = processor(
    ds["speech"],
    sampling_rate=ds["sampling_rate"],
    return_tensors="pt",
    padding="longest",
).input_values  # Batch size 1

# Forward pass; no gradients are needed for inference, so skip autograd tracking
with torch.no_grad():
    logits = model(input_values).logits

# Take the most likely token id per frame and decode the ids to text
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)


In [None]:
# Show the decoded text produced by processor.batch_decode for the sample utterance
print(transcription)