In [1]:
!git clone https://github.com/khanld/chunkformer.git

Cloning into 'chunkformer'...
remote: Enumerating objects: 94, done.[K
remote: Counting objects: 100% (64/64), done.[K
remote: Compressing objects: 100% (31/31), done.[K
remote: Total 94 (delta 45), reused 33 (delta 33), pack-reused 30 (from 1)[K
Receiving objects: 100% (94/94), 1.30 MiB | 10.53 MiB/s, done.
Resolving deltas: 100% (46/46), done.


In [2]:
%cd chunkformer
!pip install -r requirements.txt

/kaggle/working/chunkformer
Collecting textgrid (from -r requirements.txt (line 4))
  Downloading TextGrid-1.6.1.tar.gz (9.4 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jiwer (from -r requirements.txt (line 6))
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer->-r requirements.txt (line 6))
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading jiwer-4.0.0-py3-none-any.whl (23 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: textgrid
  Building wheel for textgrid (setup.py) ... [?25l[?25hdone
  Created wheel for textgrid: filename=TextGrid-1.6.1-py3-none-any.whl size=10146 sha256=46b221a47a5cc1cfb4950a9bd697156d1b31aa3279b0e

In [3]:
!git lfs install
!git clone https://huggingface.co/khanhld/chunkformer-large-vie

Updated git hooks.
Git LFS initialized.
Cloning into 'chunkformer-large-vie'...
remote: Enumerating objects: 235, done.[K
remote: Counting objects: 100% (231/231), done.[K
remote: Compressing objects: 100% (228/228), done.[K
remote: Total 235 (delta 139), reused 0 (delta 0), pack-reused 4 (from 1)[K
Receiving objects: 100% (235/235), 753.81 KiB | 7.11 MiB/s, done.
Resolving deltas: 100% (139/139), done.


In [4]:
ls

[0m[01;34mchunkformer-large-vie[0m/  decode.py  [01;34mmodel[0m/     requirements.txt
[01;34mdata[0m/                   [01;34mdocs[0m/      README.md


In [5]:
import os
import sys
import torch
import torchaudio
import yaml
from pathlib import Path
from typing import List, Dict

# Add chunkformer to Python path
if "__file__" in globals():
    # Trường hợp chạy script .py
    BASE_DIR = Path(__file__).parent
else:
    # Trường hợp chạy trong notebook (kaggle/jupyter)
    BASE_DIR = Path(os.getcwd())

CHUNKFORMER_DIR = BASE_DIR / "chunkformer"
sys.path.append(str(CHUNKFORMER_DIR))

# Now imports should work
from model.utils.init_model import init_model
from model.utils.checkpoint import load_checkpoint
from model.utils.file_utils import read_symbol_table
from model.utils.ctc_utils import get_output_with_timestamps
import torchaudio.compliance.kaldi as kaldi
from pydub import AudioSegment

class SpeechDecoder:
    def __init__(self, model_dir: str, device: str = None):
        self.model_dir = Path(model_dir)
        self.device = torch.device(device or ("cuda" if torch.cuda.is_available() else "cpu"))
        self.model, self.char_dict = self._init_model()

    def _init_model(self):
        """Khởi tạo model từ checkpoint"""
        config_path = self.model_dir / "config.yaml"
        checkpoint_path = self.model_dir / "pytorch_model.bin"
        symbol_table_path = self.model_dir / "vocab.txt"

        # Load config
        with open(config_path, 'r', encoding='utf-8') as f:
            config = yaml.safe_load(f)

        # Initialize model
        model = init_model(config, str(config_path))
        model.eval()
        
        # Load checkpoint
        load_checkpoint(model, str(checkpoint_path))
        
        # Move to device
        model.encoder = model.encoder.to(self.device)
        model.ctc = model.ctc.to(self.device)

        # Load vocabulary
        symbol_table = read_symbol_table(str(symbol_table_path))
        char_dict = {v: k for k, v in symbol_table.items()}

        return model, char_dict

    def load_audio(self, audio_path: str) -> torch.Tensor:
        """Load and preprocess audio file"""
        audio = AudioSegment.from_file(audio_path)
        audio = audio.set_frame_rate(16000)
        audio = audio.set_sample_width(2)
        audio = audio.set_channels(1)
        waveform = torch.as_tensor(audio.get_array_of_samples(), dtype=torch.float32).unsqueeze(0)
        return waveform

    @torch.no_grad()
    def decode_audio(self, audio_path: str, 
                    chunk_size: int = 64,
                    left_context: int = 128,
                    right_context: int = 128,
                    batch_duration: int = 3600) -> List[Dict]:
        """
        Decode audio file và trả về list các đoạn text với timestamp
        Returns:
            List[Dict]: List of dicts with keys 'start', 'end', 'text'
        """
        # Load audio
        waveform = self.load_audio(audio_path)
        
        # Extract features
        features = kaldi.fbank(
            waveform,
            num_mel_bins=80,
            frame_length=25,
            frame_shift=10,
            dither=0.0,
            energy_floor=0.0,
            sample_frequency=16000
        ).unsqueeze(0)

        # Model config
        subsampling_factor = self.model.encoder.embed.subsampling_factor
        conv_kernel = self.model.encoder.cnn_module_kernel // 2
        max_length = int((batch_duration // 0.01)) // 2

        # Initialize caches
        multiply_n = max_length // chunk_size // subsampling_factor
        truncated_size = chunk_size * multiply_n
        rel_right_size = (right_context + max(chunk_size, right_context) * 
                         (self.model.encoder.num_blocks-1)) * subsampling_factor

        # Process audio in chunks
        hyps = []
        offset = torch.zeros(1, dtype=torch.int, device=self.device)
        att_cache = torch.zeros(
            (self.model.encoder.num_blocks, left_context, 
             self.model.encoder.attention_heads,
             self.model.encoder._output_size * 2 // self.model.encoder.attention_heads)
        ).to(self.device)
        cnn_cache = torch.zeros(
            (self.model.encoder.num_blocks, 
             self.model.encoder._output_size, 
             conv_kernel)
        ).to(self.device)

        # Process chunks
        for idx in range(0, features.shape[1], truncated_size * subsampling_factor):
            start = max(truncated_size * subsampling_factor * idx, 0)
            end = min(truncated_size * subsampling_factor * (idx+1) + 7, features.shape[1])

            x = features[:, start:end+rel_right_size]
            x_len = torch.tensor([x[0].shape[0]], dtype=torch.int).to(self.device)

            # Forward pass
            encoder_out, encoder_len, _, att_cache, cnn_cache, offset = (
                self.model.encoder.forward_parallel_chunk(
                    xs=x,
                    xs_origin_lens=x_len,
                    chunk_size=chunk_size,
                    left_context_size=left_context,
                    right_context_size=right_context,
                    att_cache=att_cache,
                    cnn_cache=cnn_cache,
                    truncated_context_size=truncated_size,
                    offset=offset
                )
            )

            # Process output
            encoder_out = encoder_out.reshape(1, -1, encoder_out.shape[-1])[:, :encoder_len]
            if chunk_size * multiply_n * subsampling_factor * idx + rel_right_size < features.shape[1]:
                encoder_out = encoder_out[:, :truncated_size]

            hyp = self.model.encoder.ctc_forward(encoder_out).squeeze(0)
            hyps.append(hyp)

            if chunk_size * multiply_n * subsampling_factor * idx + rel_right_size >= features.shape[1]:
                break

        # Get final output
        hyps = torch.cat(hyps)
        results = get_output_with_timestamps([hyps], self.char_dict)[0]
        
        # Format results
        output = []
        for item in results:
            output.append({
                'start': item['start'],
                'end': item['end'],
                'text': item['decode']
            })
            
        return output

In [6]:
import os
import subprocess
import torch
from pathlib import Path
import re
import csv
import sys

class ChunkformerProcessor:
    def __init__(self, repo_dir: str, model_dir: str, device: str = None):
        """
        repo_dir: Thư mục chứa code chunkformer (đã git clone)
        model_dir: Thư mục chứa model chunkformer-large-vie (đã git lfs clone)
        device: 'cuda' hoặc 'cpu'. Nếu None sẽ tự động chọn.
        """
        self.repo_dir = Path(repo_dir)
        self.model_dir = Path(model_dir)
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.decoder = SpeechDecoder(model_dir, device)

        if not self.repo_dir.exists():
            raise FileNotFoundError(f"Repo directory not found: {self.repo_dir}")
        if not self.model_dir.exists():
            raise FileNotFoundError(f"Model directory not found: {self.model_dir}")

    def convert_to_wav(self, input_path: str, output_path: str, sample_rate: int = 16000):
        """Dùng ffmpeg để chuyển audio/video sang wav mono"""
        cmd = [
            "ffmpeg", "-y", "-i", input_path,
            "-ar", str(sample_rate),
            "-ac", "1",
            output_path
        ]
        subprocess.run(cmd, check=True)

    def transcribe(self, audio_path: str,
                   total_batch_duration: int = 3600,
                   chunk_size: int = 64,
                   left_context: int = 128,
                   right_context: int = 128):
        """
        Gọi decode.py để nhận transcript.
        """
        current_dir = os.getcwd()
        try:
            os.chdir(self.repo_dir)
            
            # Set PYTHONIOENCODING environment variable
            my_env = os.environ.copy()
            my_env["PYTHONIOENCODING"] = "utf-8"
            
            cmd = [
                sys.executable,
                "decode.py",
                "--model_checkpoint", str(self.model_dir),
                "--long_form_audio", str(Path(audio_path).absolute()),
                "--total_batch_duration", str(total_batch_duration),
                "--chunk_size", str(chunk_size),
                "--left_context_size", str(left_context),
                "--right_context_size", str(right_context)
            ]
            
            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                universal_newlines=True,
                encoding='utf-8',
                env=my_env
            )
            
            stdout, stderr = process.communicate()
            
            if process.returncode != 0:
                print("Error output:", stderr)
                raise subprocess.CalledProcessError(process.returncode, cmd)
                
            return stdout
            
        finally:
            os.chdir(current_dir)

    def transcribe_to_csv(self, audio_path: str, csv_path: str,
                          total_batch_duration: int = 3600,
                          chunk_size: int = 64,
                          left_context: int = 128,
                          right_context: int = 128):
        """
        Chạy transcribe và lưu kết quả ra CSV.
        """
        results = self.decoder.decode_audio(
            audio_path,
            chunk_size=chunk_size,
            left_context=left_context,
            right_context=right_context,
            batch_duration=total_batch_duration
        )

        with open(csv_path, "w", newline='', encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(["start_time", "end_time", "text"])
            for item in results:
                writer.writerow([item['start'], item['end'], item['text']])

        print(f"✅ Transcript đã lưu vào {csv_path} ({len(results)} dòng)")


    def get_fps(self, video_path: str) -> float:
        """Lấy FPS của video bằng ffprobe"""
        cmd = [
            "ffprobe", "-v", "0", "-of", "csv=p=0",
            "-select_streams", "v:0", "-show_entries", "stream=r_frame_rate",
            video_path
        ]
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        fps_str = result.stdout.strip()
        if fps_str and "/" in fps_str:
            num, den = map(int, fps_str.split("/"))
            return num / den
        return float(fps_str)

    def extract_audio_segment(self, video_path: str, frame_idx: int, output_wav: str, margin: int = 10, fps: float = None):
        """
        Cắt audio từ video quanh frame_idx, lấy trước và sau 'margin' giây.
        """
        if fps is None:  # fallback nếu chưa có fps
            fps = self.get_fps(video_path)
    
        center_time = frame_idx / fps
        start_time = max(center_time - margin, 0)
        duration = margin * 2
    
        cmd = [
            "ffmpeg", "-y", "-i", video_path,
            "-ss", str(start_time),
            "-t", str(duration),
            "-ar", "16000", "-ac", "1",
            output_wav
        ]
        subprocess.run(cmd, check=True)
    
        return start_time, start_time + duration
    
    
    def transcribe_frame(self, video_path: str, frame_idx: int,
                         margin: int = 10,
                         total_batch_duration: int = 3600,
                         chunk_size: int = 64,
                         left_context: int = 128,
                         right_context: int = 128,
                         fps: float = None):
        """
        Trích xuất text quanh frame_idx (± margin giây).
        """
        temp_wav = "temp_segment.wav"
        start_time, end_time = self.extract_audio_segment(video_path, frame_idx, temp_wav, margin=margin, fps=fps)
    
        results = self.decoder.decode_audio(
            temp_wav,
            chunk_size=chunk_size,
            left_context=left_context,
            right_context=right_context,
            batch_duration=total_batch_duration
        )
    
        return {
            "frame": frame_idx,
            "time_range": (start_time, end_time),
            "results": results
        }

In [7]:
import os
print(os.getcwd())

/kaggle/working/chunkformer


In [8]:
processor = ChunkformerProcessor(
    repo_dir=".",   # thư mục code
    model_dir="./chunkformer-large-vie",  # thư mục model
    device="cuda"
)

video_path = "/kaggle/input/video-demo/vtv24.mp4" 

fps = processor.get_fps(video_path)

key_frames = [1771, 1872, 2033, 2194, 2332]

results = {}
for frame in key_frames:
    output = processor.transcribe_frame(video_path, frame, margin=10)
    results[frame] = output

# In kết quả
for frame, data in results.items():
    print(f"\n=== Frame {frame} | Time range: {data['time_range']} ===")
    for seg in data["results"]:
        print(f"[{seg['start']} - {seg['end']}] {seg['text']}")

ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab


=== Frame 1771 | Time range: (60.84, 80.84) ===
[00:00:00:000 - 00:00:19:760]  có khả năng khử màu nước nhuộm đầu tiên là tái chế được ra li thường từ pin cũ sau đó thì đã tạo ra được một loại vật liệu mới và có thể ứng dụng vào thực tế bọn em cũng đã có khoảng thời gian đi thực nghiệm tại làng lụa vạn phúc ở hà đông hà nội thì chúng em cũng đã ờ xin được mẫu nước về và sau đó tự thử nghiệm

=== Frame 1872 | Time range: (64.88, 84.88) ===
[00:00:00:000 - 00:00:19:760]  a li thường từ pin cũ sau đó thì đã tạo ra được một loại vật liệu mới và có thể ứng dụng vào thực tế bọn em cũng đã có khoảng thời gian đi thực nghiệm tại làng lụa vạn phúc ở hà đông hà nội vâng thì chúng em cũng đã ờ xin được mẫu nước về và sau đó tự thử nghiệm vận động của chúng em đã xử lý được màu xanh của các cá

=== Frame 2033 | Time range: (71.32, 91.32) ===
[00:00:00:000 - 00:00:19:760]  có khoảng thời gian đi thực nghiệm tại làng lụa vạn phúc ở hà đông hà nội vâng thì chúng em cũng đã xin được mẫu nước về và sa

size=     625kB time=00:00:19.97 bitrate= 256.3kbits/s speed=  93x    
video:0kB audio:625kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.012187%
