In [None]:
# Mount Google Drive at /content/drive (this cell only works inside Google Colab).
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
%cd drive/MyDrive/c4gt

/content/drive/MyDrive/c4gt


In [None]:
# List the working directory to check which audio and metadata files are available.
%ls

0116_003.json  0116_003.tsv  0116_003.vtt  [0m[01;34maudio[0m/              [01;34mdmp_audio[0m/     transcription.txt
0116_003.srt   0116_003.txt  0116_003.wav  audio_metadata.csv  DMP_audio.csv


In [None]:
!pip install transformers torch datasets IPython

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoModelForCTC, AutoProcessor,Wav2Vec2Processor,Wav2Vec2ForCTC,AutoTokenizer
import torchaudio.functional as F

class Model:
  """Singleton wrapper around the MMS CTC model configured for one target language.

  The first ``Model()`` call loads the checkpoint named by ``MODEL_ID``, its
  processor and tokenizer, and switches all of them to ``TARGET_LANG``.
  Every later call returns that same instance.
  """
  _instance=None
  DEVICE=torch.device("cuda" if torch.cuda.is_available() else "cpu")
  MODEL_ID="facebook/mms-1b-all"
  TARGET_LANG="hin"
  # Sampling rate of the audio handed to the processor; matches load_audio's default.
  SAMPLE_RATE=16000

  def __new__(cls,*args,**kwargs):
    """Return the shared instance, loading the model on first use."""
    if cls._instance is None:
      instance=super().__new__(cls)
      instance.model=Wav2Vec2ForCTC.from_pretrained(cls.MODEL_ID).to(cls.DEVICE)
      instance.processor=Wav2Vec2Processor.from_pretrained(cls.MODEL_ID)
      instance.processor.tokenizer.set_target_lang(cls.TARGET_LANG)
      instance.model.load_adapter(cls.TARGET_LANG)
      instance.tokenizer=AutoTokenizer.from_pretrained(cls.MODEL_ID)
      instance.tokenizer.set_target_lang(cls.TARGET_LANG)
      # Publish the singleton only after everything loaded, so a failed load
      # is retried on the next call instead of caching a half-built object.
      cls._instance=instance

    return cls._instance

  def tokenize(self,transcript):
    """Convert a transcript string into the list of token ids used for alignment."""
    tokens=self.tokenizer.tokenize(transcript)
    token_ids=self.tokenizer.convert_tokens_to_ids(tokens)
    return token_ids

  def prepare_input(self,audio,sampling_rate=None):
    """Turn a raw waveform into model input values on ``DEVICE``.

    Args:
        audio: Waveform accepted by the processor (e.g. a 1-D float array).
        sampling_rate: Sampling rate of ``audio``; defaults to ``SAMPLE_RATE``.
    Returns:
        Tensor of input values moved to ``DEVICE``.
    """
    if sampling_rate is None:
      sampling_rate=self.SAMPLE_RATE

    # Passing the rate explicitly lets the processor reject mismatched audio
    # instead of silently producing a wrong result.
    input_values=self.processor(audio,sampling_rate=sampling_rate,return_tensors="pt").input_values
    return input_values.to(self.DEVICE)

  def inference(self, audio):
    """Run the model and return the frame-wise log-probabilities of the first item.

    Returns:
        CPU tensor indexed as (frame, token) holding log-probabilities, so
        that ``.exp()`` of any entry is a probability in [0, 1].
    """
    input_values=self.prepare_input(audio)

    with torch.no_grad():
      logits=self.model(input_values).logits

    # The alignment code treats emissions as log-probabilities (it calls
    # .exp() to get scores), so raw logits must be normalised first.
    emission=torch.log_softmax(logits,dim=-1)

    return emission.cpu().detach()[0]

In [None]:
# Build (or reuse) the shared Model instance; the first call loads the checkpoint.
model=Model()

Some weights of the model checkpoint at facebook/mms-1b-all were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/mms-1b-all and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

In [None]:
# utils
from dataclasses import dataclass

@dataclass
class Point:
  """One step of an alignment path: a token position, a frame position and a score."""
  token_index: int
  time_index: int
  score: float

@dataclass
class Segment:
  """A labelled half-open frame interval ``[start, end)`` with a score."""
  label: str
  start: int
  end: int
  # Averaged score, hence a float (it is formatted with ':4.2f' in __repr__).
  score: float

  def __repr__(self)->str:
    return f"{self.label}\t({self.score:4.2f}): [{self.start:5d}, {self.end:5d})"

  @property
  def length(self):
    """Number of frames covered by the segment."""
    return self.end - self.start

In [None]:
import torchaudio

def compose_graph(emission, tokens,blank_id=0):
  """Build the alignment trellis for ``tokens`` over the frames of ``emission``.

  Args:
      emission: 2-D tensor indexed as (frame, token id) with frame-wise scores.
      tokens: Sequence of token ids of the transcript.
      blank_id: Id of the blank token, default 0.
  Returns:
      Tensor of shape (num_frames, len(tokens)) with the best cumulative score
      of being at each token position at each frame.
  """
  num_frame=emission.size(0)
  num_tokens=len(tokens)

  graph=torch.zeros((num_frame,num_tokens))
  graph[1:,0]=torch.cumsum(emission[1:,blank_id],0)
  graph[0,1:]=-float("inf")
  # Forbid staying on the first token during the last num_tokens-1 frames.
  # Guarded because with a single token the slice [-0:] would cover the whole column.
  if num_tokens>1:
    graph[-(num_tokens-1):,0]=float("inf")

  # Loop-invariant index of the tokens that can be entered from the previous one.
  next_tokens=torch.as_tensor(tokens[1:],dtype=torch.long)

  for t in range(num_frame-1):

    graph[t+1,1:]=torch.maximum(
        # Score for staying on the same token.
        graph[t,1:]+emission[t,blank_id],
        # Score for moving on to the next token.
        graph[t,:-1]+emission[t,next_tokens],
    )

  return graph

def backtrack(graph,emission,tokens,blank_id=0):
    """
    Walk the trellis backwards from its last cell and return the best path.

    Args:
        graph: Trellis tensor of dim (len(emission), len(tokens)).
        emission: Tensor of frame-wise scores, indexed as (frame, token id).
        tokens: Tokenized transcript (token ids).
        blank_id: Id of the blank token, default 0.
    Returns:
        path: list of Point objects ordered from the first frame to the last.
    """

    frame=graph.size(0)-1
    tok=graph.size(1)-1

    # The path is collected end-to-start and reversed once at the end.
    reversed_path=[Point(tok,frame,emission[frame,blank_id].exp().item())]

    while tok>0:
        # Running out of frames before reaching the first token means no valid path.
        assert frame>0

        prev=frame-1
        stay_score=emission[prev,blank_id]
        change_score=emission[prev,int(tokens[tok])]

        # Compare the cumulative score of having stayed on this token
        # with that of having just arrived from the previous token.
        took_change=graph[prev,tok-1]+change_score > graph[prev,tok]+stay_score

        frame=prev
        if took_change:
            tok-=1
            step_score=change_score
        else:
            step_score=stay_score

        reversed_path.append(Point(tok,frame,step_score.exp().item()))

    # First token reached: fill the remaining leading frames with blank scores.
    for prev in range(frame-1,-1,-1):
        reversed_path.append(Point(tok,prev,emission[prev,blank_id].exp().item()))

    reversed_path.reverse()
    return reversed_path

def merge_repeats(path,transcript):
    """Collapse consecutive path points that share a token index into segments.

    Args:
        path: Sequence of points exposing ``token_index``, ``time_index`` and ``score``.
        transcript: Indexable of labels; ``transcript[token_index]`` names a segment.
    Returns:
        List of Segment, one per run, scored with the mean score of its points.
    """
    segments=[]
    total=len(path)
    run_start=0

    while run_start<total:
        token_index=path[run_start].token_index

        # Extend the run while the following points stay on the same token.
        run_end=run_start+1
        while run_end<total and path[run_end].token_index==token_index:
            run_end+=1

        run=path[run_start:run_end]
        mean_score=sum(point.score for point in run)/len(run)

        segments.append(
            Segment(
                transcript[token_index],
                run[0].time_index,
                run[-1].time_index+1,
                mean_score,
            )
        )
        run_start=run_end

    return segments

def merge_words(segments, separator="|"):
    """Join the segments found between separators into word-level segments.

    Args:
        segments: Sequence of segments exposing ``label``, ``start``, ``end``,
            ``score`` and ``length``.
        separator: Label that marks a word boundary; it is not part of any word.
    Returns:
        List of Segment, one per word, scored with the length-weighted mean
        of the scores of its parts.
    """
    words = []
    pending = []

    def _flush():
        # Emit the word collected so far; empty runs (adjacent separators) are ignored.
        if pending:
            label = "".join(seg.label for seg in pending)
            score = sum(seg.score * seg.length for seg in pending) / sum(seg.length for seg in pending)
            words.append(Segment(label, pending[0].start, pending[-1].end, score))
            pending.clear()

    for seg in segments:
        if seg.label == separator:
            _flush()
        else:
            pending.append(seg)

    # The last word has no trailing separator.
    _flush()
    return words


def generate_segments(wave_form,graph,word_segments,sample_rate):
  """Cut the waveform into one audio slice per aligned word.

  Args:
      wave_form: 1-D waveform; a torch tensor or a NumPy array (anything with
          ``.shape`` and slicing).
      graph: Trellis whose first dimension is the number of frames.
      word_segments: Iterable of word segments exposing ``label``, ``start``
          and ``end`` (frame indices).
      sample_rate: Samples per second of ``wave_form``.
  Returns:
      List of ``(label, "start-end sec", audio_slice)`` tuples.
  """
  # .shape[0] works for both torch tensors and the NumPy arrays that
  # load_audio returns, whereas .size(0) is torch-only.
  ratio=wave_form.shape[0]/graph.shape[0]
  segments=[]

  for word in word_segments:
    # Convert frame indices into sample indices.
    x0=int(ratio*word.start)
    x1=int(ratio*word.end)
    time_interval=f"{x0/ sample_rate:.3f}-{x1/sample_rate:.3f} sec"
    segments.append((word.label,time_interval,wave_form[x0:x1]))

  return segments



In [None]:
import numpy as np
import subprocess

SAMPLE_RATE=16000

def load_audio(file:str,sr: int=SAMPLE_RATE):
    """Decode an audio file to a mono float32 waveform using the ffmpeg CLI.

    Args:
        file: Path of the audio file to decode.
        sr: Sampling rate to resample to.
    Returns:
        1-D float32 NumPy array. The values are the decoded 16-bit PCM samples
        cast to float; they are not rescaled to [-1, 1].
    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    ffmpeg_args=(
        ["ffmpeg","-nostdin","-threads","0","-i",file]
        # Raw signed 16-bit little-endian PCM, downmixed to one channel.
        +["-f","s16le","-ac","1","-acodec","pcm_s16le"]
        # Resample and write to stdout.
        +["-ar",str(sr),"-"]
    )

    try:
        completed=subprocess.run(ffmpeg_args,capture_output=True,check=True)
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e

    pcm=np.frombuffer(completed.stdout,np.int16)
    return pcm.flatten().astype(np.float32)


In [None]:
import pandas as pd
# Metadata table; the cells below read its "audio_path" and "transcripts" columns.
df=pd.read_csv("audio_metadata.csv")

In [None]:
# Preview the first rows of the metadata table.
df.head()

Unnamed: 0.1,Unnamed: 0,audio_path,transcripts
0,0,audio/0116_003.wav,और अपने पेट को माँ की स्वादिष्ट गरमगरम जलेबिया...
1,1,audio/0116_008.wav,और अपने पेट को माँ की स्वादिष्ट गरमगरम जलेबिया...
2,2,audio/0116_025.wav,और अपने पेट को माँ की स्वादिष्ट गरमगरम जलेबिया...
3,3,audio/0116_036.wav,और अपने पेट को माँ की स्वादिष्ट गरमगरम जलेबिया...
4,4,audio/0116_061.wav,और अपने पेट को माँ की स्वादिष्ट गरमगरम जलेबिया...


In [None]:
# Pick one utterance (row 15) and load its waveform together with its reference transcript.
audio=load_audio(df["audio_path"][15])
transcript=df["transcripts"][15]

In [None]:
# Token ids of the transcript; these are the states the alignment moves through.
token_ids=model.tokenize(transcript)

In [None]:
# Run the acoustic model to get the per-frame, per-token emission matrix for this utterance.
emission=model.inference(audio)

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


In [None]:
# Build the (frames x tokens) trellis used to find the best alignment.
graph=compose_graph(emission,token_ids)

In [None]:
# Inspect the trellis values.
graph

tensor([[ 0.0000e+00,        -inf,        -inf,  ...,        -inf,
                -inf,        -inf],
        [ 1.1257e+01, -1.2005e+00,        -inf,  ...,        -inf,
                -inf,        -inf],
        [ 2.2291e+01,  1.0727e+01, -2.1313e+00,  ...,        -inf,
                -inf,        -inf],
        ...,
        [        inf,         inf,         inf,  ...,  2.8292e+03,
          2.8363e+03,  2.8439e+03],
        [        inf,         inf,         inf,  ...,         inf,
          2.8378e+03,  2.8454e+03],
        [        inf,         inf,         inf,  ...,         inf,
                 inf,  2.8445e+03]])

In [None]:
# Trace the best path through the trellis, one Point per frame.
path=backtrack(graph,emission,token_ids)

In [None]:
# Show the reference transcript being aligned.
transcript

'मुनिया ने उन्हें मछली पकड़ने की बंसीे ले कर जाते हुए देखा'

In [None]:
# Derive the label string from the transcript itself (spaces become the "|" word separator)
# instead of a hand-typed copy, so the labels always match the text that was tokenized.
segments = merge_repeats(path,transcript.replace(" ","|"))

In [None]:
# Group the character-level segments into words, splitting on the default "|" separator.
word_segments=merge_words(segments)

In [None]:
import IPython
def display_segment(i, sample_rate=16000):
    """Print the timing of word segment ``i`` and return a player for its audio.

    Reads the notebook-level ``audio``, ``graph`` and ``word_segments``.

    Args:
        i: Index into ``word_segments``.
        sample_rate: Samples per second of ``audio``, default 16000.
    Returns:
        ``IPython.display.Audio`` widget for the selected slice.
    """
    # Number of audio samples represented by one trellis frame.
    ratio = audio.shape[0] / graph.size(0)
    word = word_segments[i]
    x0 = int(ratio * word.start)
    x1 = int(ratio * word.end)
    print(f"{word.label} ({word.score:.2f}): {x0 / sample_rate:.3f} - {x1 / sample_rate:.3f} sec")
    segment = audio[x0:x1]
    return IPython.display.Audio(segment, rate=sample_rate)


# Keep the original (misspelled) name working for any existing caller.
display_segmenat = display_segment


In [None]:
# Show the word-level segments with their scores and frame ranges.
word_segments

[मुनिया	(21336.01): [    0,    55),
 ने	(45740.64): [   57,    63),
 उन्हें	(178747.00): [   65,    83),
 मछली	(21888.93): [   85,   104),
 पकड़ने	(34218.14): [  106,   129),
 की	(94118.48): [  131,   140),
 बंसीे	(13848.72): [  143,   161),
 ले	(137997.32): [  162,   168),
 कर	(91333.66): [  169,   178),
 जाते	(95343.97): [  179,   194),
 हुए	(4944.21): [  196,   204),
 देखा	(80843.79): [  206,   219)]

In [None]:
# Listen to the word segment at index 11.
display_segment(11)

देखा (80843.79): 4.139 - 4.400 sec


In [None]:
# Map the ids computed above back to their token strings; `tokens` was never defined in this notebook.
model.processor.tokenizer.convert_ids_to_tokens(token_ids)

['औ',
 'र',
 '|',
 'अ',
 'प',
 'न',
 'े',
 '|',
 'प',
 'े',
 'ट',
 '|',
 'क',
 'ो',
 '|',
 'म',
 'ा',
 'ँ',
 '|',
 'क',
 'ी',
 '|',
 'स',
 '्',
 'व',
 'ा',
 'द',
 'ि',
 'ष',
 '्',
 'ट',
 '|',
 'ग',
 'र',
 'म',
 'ग',
 'र',
 'म',
 '|',
 'ज',
 'ल',
 'े',
 'ब',
 'ि',
 'य',
 'ा',
 'ँ',
 '|',
 'ह',
 '<unk>',
 'प',
 'त',
 'े']