# Audio Extract

In [None]:
%%capture
!pip install pyannote.audio==3.3.1
!pip install https://github.com/kpu/kenlm/archive/master.zip
!pip install pyctcdecode==v0.1.0

In [None]:
import os
import torch
import glob
import json
from tqdm import tqdm
from pyannote.audio import Pipeline

# SECURITY: the Hugging Face access token must never be committed with the notebook.
# It is read from the HF_TOKEN environment variable instead (on Kaggle, populate it
# from a notebook secret before running this cell). The token that used to be
# hardcoded here is public now and should be revoked.
# NOTE(review): when HF_TOKEN is unset, None is passed and the hub client is expected
# to fall back to locally cached credentials — confirm for the pinned pyannote version.
hf_token = os.environ.get("HF_TOKEN")

# Pretrained voice-activity-detection pipeline; called later with a wav path to get
# the speech regions of each extracted audio track.
pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection",
                                    use_auth_token=hf_token)
if pipeline is None:
    # Fail here with an actionable message rather than with an AttributeError on `.to`.
    raise RuntimeError(
        "Could not load 'pyannote/voice-activity-detection'. Set the HF_TOKEN "
        "environment variable to a Hugging Face token that has access to the model."
    )

# Run the pipeline on the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipeline = pipeline.to(device)

In [None]:
import os
import json
import zipfile
import kenlm
import torch
import librosa    
from tqdm import tqdm
import soundfile as sf
from huggingface_hub import hf_hub_download
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from pyctcdecode import Alphabet, BeamSearchDecoderCTC, LanguageModel

In [None]:
import os

def delete_wav_file(folder_path):
    """Recursively delete every file whose name ends with ``.wav`` under ``folder_path``.

    The match is case-sensitive. Each successful removal is reported on stdout;
    a failure on one file is reported as well and does not stop the traversal.

    Args:
        folder_path: Root directory to walk.
    """
    for current_dir, _subdirs, filenames in os.walk(folder_path):
        for name in filenames:
            if not name.endswith(".wav"):
                continue
            wav_path = os.path.join(current_dir, name)
            try:
                os.remove(wav_path)
                print(f"Deleted: {wav_path}")
            except Exception as err:
                # Best effort: log the problem and keep going with the other files.
                print(f"Delete error: {wav_path}. Error: {err}")

In [None]:
# Get model
def get_decoder_ngram_model(tokenizer, ngram_lm_path):
    """Build a CTC beam-search decoder that rescores with a KenLM n-gram model.

    Args:
        tokenizer: Tokenizer providing ``get_vocab()`` and the ``pad_token_id``,
            ``unk_token_id`` and ``word_delimiter_token_id`` attributes.
        ngram_lm_path: Path of the language-model file handed to ``kenlm.Model``.

    Returns:
        A ``BeamSearchDecoderCTC`` built from the tokenizer alphabet and the
        language model.
    """
    # Order the tokens by their id.
    # NOTE(review): the index assignments below assume ids are contiguous from 0,
    # so that list position == token id — confirm for the tokenizer in use.
    id_token_pairs = sorted(
        (token_id, token) for token, token_id in tokenizer.get_vocab().items()
    )
    # The two highest-id tokens are dropped.
    # NOTE(review): presumably sentence-boundary tokens that have no column in the
    # logit matrix — confirm.
    labels = [token for _, token in id_token_pairs][:-2]

    # The pad token acts as the CTC blank and is rendered as an empty string.
    labels[tokenizer.pad_token_id] = ""
    # Unknown tokens are rendered as an empty string too.
    labels[tokenizer.unk_token_id] = ""
    # The word delimiter becomes a plain space.
    labels[tokenizer.word_delimiter_token_id] = " "

    # Tell the alphabet explicitly where the CTC blank is, since by convention it
    # would otherwise be taken as the last entry of the logit matrix.
    alphabet = Alphabet.build_alphabet(labels, ctc_token_idx=tokenizer.pad_token_id)
    return BeamSearchDecoderCTC(
        alphabet,
        language_model=LanguageModel(kenlm.Model(ngram_lm_path)),
    )

In [None]:
# Load the wav2vec2 processor (feature extractor + tokenizer) and the CTC model.
# The same Hugging Face repository also hosts the zipped 4-gram language model.
ASR_REPO_ID = "nguyenvulebinh/wav2vec2-base-vietnamese-250h"
processor = Wav2Vec2Processor.from_pretrained(ASR_REPO_ID)
model = Wav2Vec2ForCTC.from_pretrained(ASR_REPO_ID)

# Download the language-model archive, unpack it into ./Data and build the decoder
# from the extracted binary.
lm_file = hf_hub_download(ASR_REPO_ID, filename='vi_lm_4grams.bin.zip')
with zipfile.ZipFile(lm_file, 'r') as zip_ref:
    zip_ref.extractall('./Data')
ngram_lm_model = get_decoder_ngram_model(processor.tokenizer, 'Data/vi_lm_4grams.bin')

In [None]:
# Remove the ./Data directory that the language-model archive was unpacked into.
# NOTE(review): the decoder was built from Data/vi_lm_4grams.bin; this assumes it no
# longer needs the file on disk once constructed — confirm.
!rm -rf Data

In [None]:
# Audio recognition setup: move the ASR model to the GPU when one is available and
# make sure the output directory for the transcripts exists.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
save_dir_all = '/kaggle/working/audio_recognized'
# makedirs(exist_ok=True) is safe to re-run and also creates missing parent
# directories, unlike the exists()/mkdir() pair it replaces.
os.makedirs(save_dir_all, exist_ok=True)

In [None]:
import os
import sys
import subprocess
from tqdm import tqdm

def convert_video_to_audio_ffmpeg(video_file, save_path, output_ext="wav"):
    """Extract the audio of a video by running the ``ffmpeg`` command.

    ffmpeg's own console output is discarded. A non-zero exit status is reported
    on stderr and returned, so a failed conversion is no longer silent.

    Args:
        video_file: Path of the input video.
        save_path: Output path WITHOUT extension.
        output_ext: Extension appended to ``save_path`` to form the output file name.

    Returns:
        int: The exit status of the ffmpeg process (0 means success).
    """
    output_file = f"{save_path}.{output_ext}"
    # "-y" makes ffmpeg overwrite an existing output file instead of prompting.
    exit_code = subprocess.call(["ffmpeg", "-y", "-i", video_file, output_file],
                                stdout=subprocess.DEVNULL,
                                stderr=subprocess.STDOUT)
    if exit_code != 0:
        print(f"ffmpeg exited with status {exit_code} while converting "
              f"{video_file} -> {output_file}", file=sys.stderr)
    return exit_code
    

# Settings used by `extraction` (same values as the literals they replace).
_SKIPPED_BATCHES = ("L25", "L26")  # a data part whose name contains one of these tokens is skipped
_MIN_CHUNK_SECONDS = 1             # speech (or a leftover tail) shorter than this is not transcribed
_MAX_CHUNK_SECONDS = 10            # longer speech segments are cut into windows of this length
_BEAM_WIDTH = 500                  # beam width passed to the CTC beam-search decoder


def _collect_video_paths(video_dir):
    """Return ``{data_part: {video_id: video_path}}`` for the videos under ``video_dir``."""
    video_index = dict()
    for entry in sorted(os.listdir(video_dir)):
        tokens = entry.split('_')
        if any(tag in tokens for tag in _SKIPPED_BATCHES):
            continue
        # A data part is identified by the last two '_'-separated tokens of the entry name.
        video_index[tokens[-2] + "_" + tokens[-1]] = dict()

    for data_part in sorted(video_index.keys()):
        data_part_path = f'{video_dir}/Videos_{data_part}/video'
        for file_name in sorted(os.listdir(data_part_path)):
            # The video id is the last '_'-separated token of the file name without '.mp4'.
            video_id = file_name.replace('.mp4', '').split('_')[-1]
            video_index[data_part][video_id] = f'{data_part_path}/{file_name}'
    return video_index


def _split_segment(waveform, sampling_rate, start, end):
    """Cut the ``[start, end]`` span (in seconds) of ``waveform`` into windows of bounded length."""
    total_samples = len(waveform)
    chunks = []
    while (end - start) >= _MIN_CHUNK_SECONDS:
        first_sample = int(start * sampling_rate)
        if (end - start) <= _MAX_CHUNK_SECONDS:
            chunks.append(waveform[first_sample:min(total_samples, round(end * sampling_rate))])
            break
        window_end = start + _MAX_CHUNK_SECONDS
        chunks.append(waveform[first_sample:min(total_samples, round(window_end * sampling_rate))])
        start = window_end
    return chunks


def _transcribe_chunks(chunks, sampling_rate):
    """Transcribe the chunks of one speech segment in a single batch and join them with spaces."""
    if not chunks:
        return ""
    input_values = processor(chunks, sampling_rate=sampling_rate, return_tensors="pt",
                             padding="longest").input_values.to(device)
    # Inference only: without no_grad() autograd state is recorded for every batch,
    # which wastes GPU memory over a long run.
    with torch.no_grad():
        logits = model(input_values).logits
    return " ".join(
        ngram_lm_model.decode(logit.cpu().numpy(), beam_width=_BEAM_WIDTH)
        for logit in logits
    )


def extraction(video_dir, save_dir, save_recog_dir):
    """Detect speech in every video and transcribe it.

    Videos are looked up at ``{video_dir}/Videos_<data_part>/video/*``, where
    ``<data_part>`` is built from the last two '_'-separated tokens of each entry
    of ``video_dir``; entries containing an ``L25`` or ``L26`` token are skipped.

    For each video the function:
      1. converts it to ``{save_dir}/audio_extracted/<data_part>/<video_id>.wav``;
      2. runs the voice-activity pipeline on the wav and stores the merged speech
         regions as ``[[start, end], ...]`` (seconds) in ``<video_id>.json`` next to it;
      3. loads the wav as 16 kHz mono, cuts every speech region into windows of at
         most 10 s and transcribes them; a region shorter than 1 s yields ``""``
         and a leftover tail shorter than 1 s is dropped;
      4. writes the transcripts, one string per speech region, to
         ``{save_recog_dir}/<data_part>/<video_id>.json``;
      5. deletes the wav files of the data part again.

    Relies on the notebook-level objects ``pipeline``, ``processor``, ``model``,
    ``device`` and ``ngram_lm_model``.

    Args:
        video_dir: Root directory of the video dataset.
        save_dir: Directory under which ``audio_extracted`` is created.
        save_recog_dir: Directory that receives the transcript JSON files.

    Raises:
        FileNotFoundError: If the conversion produced no wav file for a video.
    """
    all_video_paths = _collect_video_paths(video_dir)

    audio_root = f'{save_dir}/audio_extracted'
    os.makedirs(audio_root, exist_ok=True)

    for key in tqdm(all_video_paths.keys()):
        # Distinct local names: the `save_dir` argument is no longer overwritten.
        part_audio_dir = f'{audio_root}/{key}'
        part_recog_dir = f'{save_recog_dir}/{key}'
        # makedirs also creates missing parents (e.g. save_recog_dir itself).
        os.makedirs(part_audio_dir, exist_ok=True)
        os.makedirs(part_recog_dir, exist_ok=True)

        video_paths_dict = all_video_paths[key]
        for video_id in tqdm(sorted(video_paths_dict.keys())):
            video_path = video_paths_dict[video_id]
            save_path = f'{part_audio_dir}/{video_id}'
            audio_path = f'{save_path}.wav'

            convert_video_to_audio_ffmpeg(video_path, save_path)
            if not os.path.isfile(audio_path):
                # Fail with a clear message instead of an obscure audio-loading error.
                raise FileNotFoundError(
                    f"ffmpeg did not produce {audio_path} from {video_path}")

            # Voice-activity detection: merged speech regions as [start, end] pairs.
            vad_output = pipeline(audio_path)
            audio_shots = [[region.start, region.end]
                           for region in vad_output.get_timeline().support()]
            with open(f'{save_path}.json', 'w') as f:
                json.dump(audio_shots, f)

            waveform, sampling_rate = librosa.load(audio_path, mono=True, sr=16000)
            waveform = waveform.astype('float64')

            torch.cuda.empty_cache()
            results = [
                _transcribe_chunks(_split_segment(waveform, sampling_rate, start, end),
                                   sampling_rate)
                for start, end in audio_shots
            ]

            with open(f'{part_recog_dir}/{video_id}.json', 'w', encoding='utf-8') as f:
                json.dump(results, f, ensure_ascii=False)

            # The wav is only an intermediate artefact; remove it to save disk space.
            delete_wav_file(part_audio_dir)

    print("Converting Videos to Audio Completed")

In [None]:
# Input dataset root, working directory, and the transcript directory inside it.
video_dir = "/kaggle/input/video-v3-aic2024"
save_dir_all = '/kaggle/working'
save_recog_dir = f'{save_dir_all}/audio_recognized'

In [None]:
# Allow TensorFloat-32 (TF32) for CUDA matrix multiplications and for cuDNN operations.
# NOTE(review): per the PyTorch docs TF32 trades some float32 precision for speed on
# GPUs that support it — confirm the accuracy impact is acceptable for recognition.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

In [None]:
# Run the whole job: convert each video to audio, detect speech, transcribe it and
# write the JSON results under save_dir_all and save_recog_dir.
extraction(video_dir, save_dir_all, save_recog_dir)