# **Setup**

In [None]:
!pip install datasets transformers torchaudio accelerate librosa evaluate jiwer

# **Load data**

In [None]:
import os
import json

def create_manifest(folder_path, output_jsonl, max_samples=None):
    """Write a JSON-lines manifest pairing each ``.wav`` file with its transcript.

    For every ``<name>.wav`` in ``folder_path`` that has a sibling
    ``<name>.txt``, one line of the form
    ``{"audio": <wav path>, "transcription": <stripped text>}`` is written to
    ``output_jsonl``.  ``.wav`` files without a transcript file are skipped.

    Files are visited in sorted name order so the manifest -- and in
    particular the subset kept when ``max_samples`` is set -- is identical on
    every run; ``os.listdir`` on its own returns entries in arbitrary order.

    Args:
        folder_path: Directory containing the ``.wav`` / ``.txt`` pairs.
        output_jsonl: Path of the manifest file to create (overwritten).
        max_samples: Optional cap on the number of pairs written.  ``None``
            (or ``0``) means no limit.
    """
    samples = []
    for filename in sorted(os.listdir(folder_path)):
        # Stop as soon as the requested number of pairs has been collected.
        if max_samples and len(samples) >= max_samples:
            break
        if not filename.endswith(".wav"):
            continue

        base_name = filename[:-4]
        wav_path = os.path.join(folder_path, filename)
        txt_path = os.path.join(folder_path, base_name + ".txt")
        if not os.path.exists(txt_path):
            # No transcript for this recording: leave it out of the manifest.
            continue

        with open(txt_path, "r", encoding="utf-8") as f:
            transcription = f.read().strip()
        samples.append({"audio": wav_path, "transcription": transcription})

    with open(output_jsonl, "w", encoding="utf-8") as f:
        for sample in samples:
            # ensure_ascii=False keeps non-ASCII transcript text readable in the file.
            json.dump(sample, f, ensure_ascii=False)
            f.write("\n")

# Build the train and test manifests from the Kaggle input folders; the test
# manifest is capped at 1000 samples.
create_manifest(
    folder_path="/kaggle/input/train1-asr/audio_short",
    output_jsonl="train.jsonl",
)
create_manifest(
    folder_path="/kaggle/input/test-asr/test",
    output_jsonl="test.jsonl",
    max_samples=1000,
)


In [None]:
from datasets import load_dataset, Audio

# Each manifest is a single JSONL file, so it is loaded as the "train" split.
train_data = load_dataset("json", data_files="train.jsonl", split="train")
test_data = load_dataset("json", data_files="test.jsonl", split="train")

# Decode the "audio" paths and resample to 16 kHz.  prepare_dataset passes
# sampling_rate=16000 to the feature extractor, so the waveform has to really
# be at that rate; a bare Audio() would keep every file's native rate.
train_data = train_data.cast_column("audio", Audio(sampling_rate=16000))
test_data = test_data.cast_column("audio", Audio(sampling_rate=16000))


In [None]:
train_data = train_data.map(prepare_dataset)
test_data = test_data.map(prepare_dataset)

# **Setup model/load model** 

In [None]:
from transformers import WhisperTokenizer, WhisperProcessor, WhisperFeatureExtractor, WhisperForConditionalGeneration, TrainingArguments, Seq2SeqTrainer
import torch

# model_name = "openai/whisper-small"

# processor = WhisperProcessor.from_pretrained(model_name)
# tokenizer = processor.tokenizer
# feature_extractor = processor.feature_extractor
# model = WhisperForConditionalGeneration.from_pretrained(model_name)
model_path = "/kaggle/input/train1_asr/transformers/default/1/checkpoint-419"

# Load processor và model từ thư mục local
processor = WhisperProcessor.from_pretrained(model_path)
tokenizer = processor.tokenizer
feature_extractor = processor.feature_extractor
model = WhisperForConditionalGeneration.from_pretrained(model_path)
# Xử lý dữ liệu đầu vào
def prepare_dataset(batch):
    audio = batch["audio"]

    # Lấy đặc trưng đầu vào
    inputs = processor.feature_extractor(audio["array"], sampling_rate=16000)
    batch["input_features"] = inputs["input_features"][0]

    # Encode target text
    batch["labels"] = processor.tokenizer(
        batch["transcription"],
        padding="max_length",
        max_length=128,
        truncation=True
    ).input_ids

    return batch






In [None]:
import numpy as np
import evaluate

# "wer" metric loaded through the `evaluate` library; compute_metrics calls
# wer_metric.compute on the decoded predictions and references.
wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    """Compute the word error rate for a set of evaluation predictions.

    Args:
        pred: Object exposing ``predictions`` and ``label_ids``.
            ``predictions`` is either an array of token ids or a tuple whose
            first element is a logits array, in which case the argmax over
            the last axis is taken.  ``label_ids`` is an array of token ids
            in which ``-100`` marks ignored positions.

    Returns:
        ``{"wer": value}`` where ``value`` is whatever ``wer_metric.compute``
        returns for the decoded prediction / reference strings.
    """
    pred_ids = pred.predictions
    if isinstance(pred_ids, tuple):  # logits rather than token ids
        pred_ids = np.argmax(pred_ids[0], axis=-1)

    pad_id = processor.tokenizer.pad_token_id

    # np.where builds new arrays, so the caller's pred.label_ids is left
    # untouched (the previous version overwrote the -100 entries in place).
    label_ids = np.where(pred.label_ids == -100, pad_id, pred.label_ids)
    # Predictions can also carry -100 padding; map it to the pad token so it
    # can be decoded.  This is a no-op when no -100 is present.
    pred_ids = np.where(np.asarray(pred_ids) == -100, pad_id, pred_ids)

    pred_str = processor.tokenizer.batch_decode(pred_ids.tolist(), skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids.tolist(), skip_special_tokens=True)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}





In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Định nghĩa training args
# Training configuration: one epoch, 4 samples per device step with 4-step
# gradient accumulation, fp16, evaluated and checkpointed once per epoch.
training_args = Seq2SeqTrainingArguments(
    # Where checkpoints are written and how many are kept.
    output_dir="./whisper-finetune1",
    save_strategy="epoch",
    save_total_limit=5,
    # Batch sizes.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    # Optimisation schedule.
    num_train_epochs=1,
    learning_rate=1e-5,
    warmup_steps=500,
    fp16=True,
    # Evaluation and best-checkpoint selection (a lower "wer" is better).
    eval_strategy="epoch",
    metric_for_best_model="wer",
    greater_is_better=False,
    load_best_model_at_end=True,
    # Logging: progress bars on, no reporting integrations.
    logging_steps=100,
    disable_tqdm=False,
    report_to=[],
)

# Trainer over the prepared train/test splits; the feature extractor is
# passed through the `tokenizer` argument.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)


In [None]:
import os
# Sets the WANDB_DISABLED environment variable for this process (presumably
# read by the Weights & Biases integration -- verify).
# NOTE(review): the training arguments in this notebook already pass
# report_to=[], so this may be redundant.
os.environ["WANDB_DISABLED"] = "true"


# **Train**

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Build the manifest for the /kaggle/input/tran2-asr data (this overwrites the
# train.jsonl written earlier) and turn it into model inputs.
create_manifest("/kaggle/input/tran2-asr/audio_short", "train.jsonl")
train_data1 = load_dataset("json", data_files="train.jsonl", split="train")
# Resample to 16 kHz on decode: prepare_dataset declares sampling_rate=16000
# to the feature extractor, and a bare Audio() keeps each file's native rate.
train_data1 = train_data1.cast_column("audio", Audio(sampling_rate=16000))
train_data1 = train_data1.map(prepare_dataset)

# Two epochs, evaluated and checkpointed once per epoch; the checkpoint with
# the lowest "wer" is reloaded at the end.
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-finetune1",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    fp16=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=100,
    learning_rate=1e-5,
    warmup_steps=500,
    save_total_limit=5,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    disable_tqdm=False,
    report_to=[],
)

# Continues training the `model` object already in memory.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train_data1,
    eval_dataset=test_data,
    tokenizer=processor.feature_extractor,
    compute_metrics=compute_metrics,
)
trainer.train()

In [None]:
# Build the manifest for the /kaggle/input/train3-asr data (this overwrites
# train.jsonl again) and turn it into model inputs.
create_manifest("/kaggle/input/train3-asr/data_fillter1/audio_short", "train.jsonl")
train_data2 = load_dataset("json", data_files="train.jsonl", split="train")
# Resample to 16 kHz on decode: prepare_dataset declares sampling_rate=16000
# to the feature extractor, and a bare Audio() keeps each file's native rate.
train_data2 = train_data2.cast_column("audio", Audio(sampling_rate=16000))
train_data2 = train_data2.map(prepare_dataset)

# Five epochs into a separate output directory, evaluated and checkpointed
# once per epoch; the checkpoint with the lowest "wer" is reloaded at the end.
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-finetune2",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    fp16=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=100,
    learning_rate=1e-5,
    warmup_steps=500,
    save_total_limit=5,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    disable_tqdm=False,
    report_to=[],
)

# Continues training the `model` object already in memory.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train_data2,
    eval_dataset=test_data,
    tokenizer=processor.feature_extractor,
    compute_metrics=compute_metrics,
)
trainer.train()

# **Generate transcripts**

In [None]:
import os
import torch
import torchaudio
from transformers import GenerationConfig
from transformers import WhisperForConditionalGeneration, AutoProcessor

def predict_audio(audio_path, model, processor, sampling_rate=16000, device="cuda" if torch.cuda.is_available() else "cpu"):
    """Transcribe a single audio file and return the decoded text.

    The file is loaded with torchaudio, resampled to ``sampling_rate`` when
    its native rate differs, and averaged down to one channel.  The model is
    moved to ``device``, any ``forced_decoder_ids`` on its config objects are
    cleared, and its ``generation_config`` is replaced before ``generate`` is
    called.  Note that ``model`` is modified as a side effect.

    Args:
        audio_path: Path of the audio file to transcribe.
        model: Model exposing ``to``, ``config`` and ``generate``.
        processor: Object exposing ``feature_extractor`` and ``tokenizer``.
        sampling_rate: Rate the waveform is resampled to and declared at.
        device: Device the input features and the model are moved to.

    Returns:
        The first decoded sequence, with special tokens skipped.
    """
    signal, native_rate = torchaudio.load(audio_path)

    # Resample first, then collapse multi-channel audio to mono.
    if native_rate != sampling_rate:
        signal = torchaudio.transforms.Resample(native_rate, sampling_rate)(signal)
    if signal.shape[0] > 1:
        signal = signal.mean(dim=0, keepdim=True)

    extracted = processor.feature_extractor(
        signal.squeeze().numpy(),
        sampling_rate=sampling_rate,
        return_tensors="pt",
    )
    input_features = extracted.input_features.to(device)

    model = model.to(device)

    # Drop forced decoder prompts wherever they are stored.
    for cfg in (model.config, getattr(model, "generation_config", None)):
        if hasattr(cfg, "forced_decoder_ids"):
            cfg.forced_decoder_ids = None

    text_tokenizer = processor.tokenizer
    model.generation_config = GenerationConfig(
        max_length=128,
        pad_token_id=text_tokenizer.pad_token_id,
        eos_token_id=text_tokenizer.eos_token_id,
        decoder_start_token_id=model.config.decoder_start_token_id,
        use_cache=False,
    )

    with torch.no_grad():
        predicted_ids = model.generate(input_features)

    decoded = text_tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
    return decoded[0]

if __name__ == "__main__":
    # Load the final model and its processor from the local model directory.
    model_name = "/kaggle/input/asr_final/transformers/default/1"
    processor = AutoProcessor.from_pretrained(model_name)
    model = WhisperForConditionalGeneration.from_pretrained(model_name)

    input_dir = "/kaggle/input/asr-data/private_test/private-test-data-asr"
    output_file = "/kaggle/working/predictions1.txt"
    # Scratch file that .mp3 inputs are re-encoded into before transcription.
    temp_path = "/kaggle/working/temp.wav"

    try:
        with open(output_file, "w", encoding="utf-8") as out_f:
            # Sorted so the prediction file has a stable line order.
            for filename in sorted(os.listdir(input_dir)):
                if not filename.endswith((".wav", ".mp3")):
                    continue
                file_path = os.path.join(input_dir, filename)

                # For .mp3 inputs, convert to a temporary .wav first.
                if filename.endswith(".mp3"):
                    try:
                        waveform, sr = torchaudio.load(file_path)
                        torchaudio.save(temp_path, waveform, sr)
                        audio_path = temp_path
                    except Exception as e:
                        print(f"✗ Error converting {filename}: {e}")
                        continue
                else:
                    audio_path = file_path

                # One output line per file: "<file name without extension> <text>".
                # A file that fails is reported and gets no line.
                try:
                    transcription = predict_audio(audio_path, model, processor)
                    audio_id = os.path.splitext(filename)[0]
                    out_f.write(f"{audio_id} {transcription.strip()}\n")
                    print(f"✓ {audio_id}")
                except Exception as e:
                    print(f"✗ Error processing {filename}: {e}")
    finally:
        # Remove the scratch file even when the loop is interrupted part-way.
        if os.path.exists(temp_path):
            os.remove(temp_path)

# **Sử dụng LLM** 

In [None]:

!pip install requests
!pip install google-generativeai
 
import requests
import json
 
import os

# The Gemini API key is read from the GEMINI_API_KEY environment variable so
# that it is never stored in the notebook.  With the variable unset the key
# is empty and requests fail with an HTTP error instead of using a baked-in
# credential.
# NOTE(review): the key that used to be hard-coded on this line was published
# together with the notebook and should be revoked.
API_KEY = os.environ.get("GEMINI_API_KEY", "")
URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={API_KEY}"
 
def get_gemini_response(question, timeout=30):
    """Send ``question`` to the Gemini endpoint at ``URL`` and return the reply.

    Args:
        question: Prompt text, sent as the single text part of the request.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The text of the first part of the first candidate when the request
        succeeds.  On any failure (network error, non-200 status, or a 200
        response without the expected candidate structure) a string starting
        with ``"Lỗi:"`` describing the problem is returned instead.
    """
    headers = {"Content-Type": "application/json"}
    data = {"contents": [{"parts": [{"text": question}]}]}

    try:
        response = requests.post(URL, headers=headers, json=data, timeout=timeout)
    except requests.RequestException as exc:
        # Connection problems and timeouts are reported like HTTP errors.
        return f"Lỗi: {exc}"

    if response.status_code != 200:
        return f"Lỗi: {response.status_code} - {response.text}"

    result = response.json()
    try:
        return result["candidates"][0]["content"]["parts"][0]["text"]
    except (KeyError, IndexError, TypeError):
        # A 200 reply without a usable candidate: surface the raw body.
        return f"Lỗi: {response.status_code} - {response.text}"
 
# Sample input sentence; it is interpolated into the rewriting prompt below.
cau_ban_dau = "Hôm qua tui đi New York chơi, gặp thằng bạn từ Washington qua, nó rủ đi ăn burger ở McDonald's"
 
prompt = f"""Viết lại câu cho tôi loại bỏ những lỗi chính tả.
              Câu ban đầu là: {cau_ban_dau}.viết lại cho tôi theo yêu cầu trên"""
 
# Send the prompt to Gemini and print what comes back: the reply text, or the
# "Lỗi: ..." string get_gemini_response returns for a non-200 status.
answer = get_gemini_response(prompt)
print("Câu trả lời:", answer)