# Running MMS-LID inference in Colab

## Step 1: Clone fairseq-py and install latest version

In [1]:
import os

!git clone https://github.com/pytorch/fairseq

# Change current working directory
!pwd
%cd "/content/fairseq"
!pip install --editable ./
!pip install tensorboardX


Cloning into 'fairseq'...
remote: Enumerating objects: 35391, done.[K
remote: Counting objects: 100% (19/19), done.[K
remote: Compressing objects: 100% (14/14), done.[K
remote: Total 35391 (delta 8), reused 5 (delta 5), pack-reused 35372 (from 2)[K
Receiving objects: 100% (35391/35391), 25.48 MiB | 11.93 MiB/s, done.
Resolving deltas: 100% (25541/25541), done.
/content
/content/fairseq
Obtaining file:///content/fairseq
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Collecting hydra-core<1.1,>=1.0.7 (from fairseq==0.12.2)
  Downloading hydra_core-1.0.7-py3-none-any.whl.metadata (3.7 kB)
Collecting omegaconf<2.1 (from fairseq==0.12.2)
  Downloading omegaconf-2.0.6-py3-none-any.whl.metadata (3.0 kB)
Requested omegaconf<2.1 from https://files.pythonhosted.org/packages/d0/eb/9d63ce

## 2. Download MMS-LID model



In [2]:
available_models = ["l126", "l256", "l512", "l1024", "l2048", "l4017"]

# We will use L126 model which can recognize 126 languages
model_name = available_models[0] # l126
print(f"Using model - {model_name}")
print(f"Visit https://dl.fbaipublicfiles.com/mms/lid/mms1b_{model_name}_langs.html to check all the languages supported by this model.")

! mkdir -p /content/models_lid
!wget -P /content/models_lid/{model_name} 'https://dl.fbaipublicfiles.com/mms/lid/mms1b_{model_name}.pt'
!wget -P /content/models_lid/{model_name} 'https://dl.fbaipublicfiles.com/mms/lid/dict/l126/dict.lang.txt'



Using model - l126
Visit https://dl.fbaipublicfiles.com/mms/lid/mms1b_l126_langs.html to check all the languages supported by this model.
--2025-06-02 21:22:24--  https://dl.fbaipublicfiles.com/mms/lid/mms1b_l126.pt
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 3.163.189.108, 3.163.189.14, 3.163.189.51, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|3.163.189.108|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3856229421 (3.6G) [binary/octet-stream]
Saving to: ‘/content/models_lid/l126/mms1b_l126.pt’


2025-06-02 21:24:06 (35.8 MB/s) - ‘/content/models_lid/l126/mms1b_l126.pt’ saved [3856229421/3856229421]

--2025-06-02 21:24:07--  https://dl.fbaipublicfiles.com/mms/lid/dict/l126/dict.lang.txt
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 3.163.189.96, 3.163.189.108, 3.163.189.51, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|3.163.189.96|:443... connected.
HTTP request sent, awaiting response

## 3. Prepare manifest files
Create a folder on path '/content/audio_samples/' and upload your .wav audio files that you need to recognize e.g. '/content/audio_samples/abc.wav' , '/content/audio_samples/def.wav' etc...

Note: You need to make sure that the audio data you are using has a sample rate of 16kHz You can easily do this with FFMPEG like the example below that converts .mp3 file to .flac and fixing the audio sample rate

Here, we use three examples - one audio file from English, Hindi, Chinese each.

In [3]:
! mkdir -p /content/audio_samples/
for key in ["en_us", "hi_in", "cmn_hans_cn"]:
  !wget -O /content/audio_samples/tmp.mp3 https://datasets-server.huggingface.co/assets/google/fleurs/--/{key}/train/0/audio/audio.mp3
  !ffmpeg -hide_banner -loglevel error -y -i   /content/audio_samples/tmp.mp3 -ar 16000 /content/audio_samples/{key}.wav

! mkdir -p /content/audio_samples/


--2025-06-02 21:24:47--  https://datasets-server.huggingface.co/assets/google/fleurs/--/en_us/train/0/audio/audio.mp3
Resolving datasets-server.huggingface.co (datasets-server.huggingface.co)... 13.224.14.92, 13.224.14.109, 13.224.14.100, ...
Connecting to datasets-server.huggingface.co (datasets-server.huggingface.co)|13.224.14.92|:443... connected.
HTTP request sent, awaiting response... 403 Forbidden
2025-06-02 21:24:47 ERROR 403: Forbidden.

[0;35m[mp3 @ 0x5575ed000040] [0m[1;31mFailed to read frame size: Could not seek to 1026.
[0m[1;31m/content/audio_samples/tmp.mp3: Invalid argument
[0m--2025-06-02 21:24:48--  https://datasets-server.huggingface.co/assets/google/fleurs/--/hi_in/train/0/audio/audio.mp3
Resolving datasets-server.huggingface.co (datasets-server.huggingface.co)... 13.224.14.92, 13.224.14.109, 13.224.14.100, ...
Connecting to datasets-server.huggingface.co (datasets-server.huggingface.co)|13.224.14.92|:443... connected.
HTTP request sent, awaiting response... 4

In [4]:
! mkdir -p /content/manifest/
import os
with open("/content/manifest/dev.tsv", "w") as ftsv, open("/content/manifest/dev.lang", "w") as flang:
  ftsv.write("/\n")

  for fl in os.listdir("/content/audio_samples/"):
    if not fl.endswith(".wav"):
      continue
    audio_path = f"/content/audio_samples/{fl}"
    # duration should be number of samples in audio. For inference, using a random value should be fine.
    duration = 1234
    ftsv.write(f"{audio_path}\t{duration}\n")
    flang.write("eng\n") # This is the "true" language for the audio. For inference, using a random value should be fine.


# 4: Run Inference and transcribe your audio(s)


In [5]:
import os

os.environ["PYTHONPATH"] = "/content/fairseq"
os.environ["PREFIX"] = "INFER"
os.environ["HYDRA_FULL_ERROR"] = "1"
os.environ["USER"] = "mms_lid_user"

!python3 examples/mms/lid/infer.py /content/models_lid/{model_name} --path /content/models_lid/{model_name}/mms1b_l126.pt \
  --task audio_classification  --infer-manifest /content/manifest/dev.tsv --output-path /content/manifest/

Traceback (most recent call last):
  File "/content/fairseq/examples/mms/lid/infer.py", line 2, in <module>
    from fairseq.data.text_compressor import TextCompressionLevel, TextCompressor
  File "/content/fairseq/fairseq/__init__.py", line 20, in <module>
    from fairseq.distributed import utils as distributed_utils
  File "/content/fairseq/fairseq/distributed/__init__.py", line 7, in <module>
    from .fully_sharded_data_parallel import (
  File "/content/fairseq/fairseq/distributed/fully_sharded_data_parallel.py", line 10, in <module>
    from fairseq.dataclass.configs import DistributedTrainingConfig
  File "/content/fairseq/fairseq/dataclass/__init__.py", line 6, in <module>
    from .configs import FairseqDataclass
  File "/content/fairseq/fairseq/dataclass/configs.py", line 1127, in <module>
    @dataclass
     ^^^^^^^^^
  File "/usr/lib/python3.11/dataclasses.py", line 1232, in dataclass
    return wrap(cls)
           ^^^^^^^^^
  File "/usr/lib/python3.11/dataclasses.py", li

In [None]:
print("----- INPUT FILES -----")
! tail -n +2 /content/manifest/dev.tsv

print("\n----- TOP-K PREDICTONS WITH SCORE -----")
! cat /content/manifest//predictions.txt

----- INPUT FILES -----
/content/audio_samples/hi_in.wav	1234
/content/audio_samples/en_us.wav	1234
/content/audio_samples/cmn_hans_cn.wav	1234

----- TOP-K PREDICTONS WITH SCORE -----
[["hin", 0.9931250810623169], ["urd", 0.005808886140584946], ["snd", 0.0005312535213306546]]
[["eng", 0.9989539980888367], ["fas", 0.00036296260077506304], ["haw", 7.031611312413588e-05]]
[["cmn", 0.9996059536933899], ["bod", 0.0002111078501911834], ["kor", 9.211552242049947e-05]]


In [7]:
%pwd
!git clone https://github.com/jaywalnut310/vits.git
!python --version
%cd vits/

!pip install Cython==0.29.21
!pip install librosa==0.8.0
!pip install phonemizer==2.2.1
!pip install scipy
!pip install numpy
!pip install torch
!pip install torchvision
!pip install matplotlib
!pip install Unidecode==1.1.1

%cd monotonic_align/
%mkdir monotonic_align
!python3 setup.py build_ext --inplace
%cd ../
%pwd

Cloning into 'vits'...
remote: Enumerating objects: 81, done.[K
remote: Total 81 (delta 0), reused 0 (delta 0), pack-reused 81 (from 1)[K
Receiving objects: 100% (81/81), 3.33 MiB | 19.40 MiB/s, done.
Resolving deltas: 100% (22/22), done.
Python 3.11.12
/content/fairseq/vits
Collecting Cython==0.29.21
  Downloading Cython-0.29.21-py2.py3-none-any.whl.metadata (2.6 kB)
Downloading Cython-0.29.21-py2.py3-none-any.whl (974 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m974.2/974.2 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Cython
  Attempting uninstall: Cython
    Found existing installation: Cython 3.0.12
    Uninstalling Cython-3.0.12:
      Successfully uninstalled Cython-3.0.12
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fairseq 0.12.2 requires bitarray, which is not installed.
fairseq 0.12

'/content/fairseq/vits'

In [8]:
import os
import subprocess
import locale
locale.getpreferredencoding = lambda: "UTF-8"

def download(lang, tgt_dir="./"):
  lang_fn, lang_dir = os.path.join(tgt_dir, lang+'.tar.gz'), os.path.join(tgt_dir, lang)
  cmd = ";".join([
        f"wget https://dl.fbaipublicfiles.com/mms/tts/{lang}.tar.gz -O {lang_fn}",
        f"tar zxvf {lang_fn}"
  ])
  print(f"Download model for language: {lang}")
  subprocess.check_output(cmd, shell=True)
  print(f"Model checkpoints in {lang_dir}: {os.listdir(lang_dir)}")
  return lang_dir

LANG = "eng"
ckpt_dir = download(LANG)

Download model for language: eng
Model checkpoints in ./eng: ['config.json', 'G_100000.pth', 'vocab.txt']


In [11]:
from IPython.display import Audio
import os
import re
import glob
import json
import tempfile
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
import numpy as np
import commons
import utils
import argparse
import subprocess
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from scipy.io.wavfile import write

def preprocess_char(text, lang=None):
    """
    Special treatement of characters in certain languages
    """
    print(lang)
    if lang == 'ron':
        text = text.replace("ț", "ţ")
    return text

class TextMapper(object):
    def __init__(self, vocab_file):
        self.symbols = [x.replace("\n", "") for x in open(vocab_file, encoding="utf-8").readlines()]
        self.SPACE_ID = self.symbols.index(" ")
        self._symbol_to_id = {s: i for i, s in enumerate(self.symbols)}
        self._id_to_symbol = {i: s for i, s in enumerate(self.symbols)}

    def text_to_sequence(self, text, cleaner_names):
        '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
        Args:
        text: string to convert to a sequence
        cleaner_names: names of the cleaner functions to run the text through
        Returns:
        List of integers corresponding to the symbols in the text
        '''
        sequence = []
        clean_text = text.strip()
        for symbol in clean_text:
            symbol_id = self._symbol_to_id[symbol]
            sequence += [symbol_id]
        return sequence

    def uromanize(self, text, uroman_pl):
        iso = "xxx"
        with tempfile.NamedTemporaryFile() as tf, \
             tempfile.NamedTemporaryFile() as tf2:
            with open(tf.name, "w") as f:
                f.write("\n".join([text]))
            cmd = f"perl " + uroman_pl
            cmd += f" -l {iso} "
            cmd +=  f" < {tf.name} > {tf2.name}"
            os.system(cmd)
            outtexts = []
            with open(tf2.name) as f:
                for line in f:
                    line =  re.sub(r"\s+", " ", line).strip()
                    outtexts.append(line)
            outtext = outtexts[0]
        return outtext

    def get_text(self, text, hps):
        text_norm = self.text_to_sequence(text, hps.data.text_cleaners)
        if hps.data.add_blank:
            text_norm = commons.intersperse(text_norm, 0)
        text_norm = torch.LongTensor(text_norm)
        return text_norm

    def filter_oov(self, text):
        val_chars = self._symbol_to_id
        txt_filt = "".join(list(filter(lambda x: x in val_chars, text)))
        print(f"text after filtering OOV: {txt_filt}")
        return txt_filt

def preprocess_text(txt, text_mapper, hps, uroman_dir=None, lang=None):
    txt = preprocess_char(txt, lang=lang)
    is_uroman = hps.data.training_files.split('.')[-1] == 'uroman'
    if is_uroman:
        with tempfile.TemporaryDirectory() as tmp_dir:
            if uroman_dir is None:
                cmd = f"git clone git@github.com:isi-nlp/uroman.git {tmp_dir}"
                print(cmd)
                subprocess.check_output(cmd, shell=True)
                uroman_dir = tmp_dir
            uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
            print(f"uromanize")
            txt = text_mapper.uromanize(txt, uroman_pl)
            print(f"uroman text: {txt}")
    txt = txt.lower()
    txt = text_mapper.filter_oov(txt)
    return txt

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Run inference with {device}")
vocab_file = f"{ckpt_dir}/vocab.txt"
config_file = f"{ckpt_dir}/config.json"
assert os.path.isfile(config_file), f"{config_file} doesn't exist"
hps = utils.get_hparams_from_file(config_file)
text_mapper = TextMapper(vocab_file)
net_g = SynthesizerTrn(
    len(text_mapper.symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model)
net_g.to(device)
_ = net_g.eval()

g_pth = f"{ckpt_dir}/G_100000.pth"
print(f"load {g_pth}")

_ = utils.load_checkpoint(g_pth, net_g, None)

AttributeError: module 'numpy' has no attribute 'complex'.
`np.complex` was a deprecated alias for the builtin `complex`. To avoid this error in existing code, use `complex` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.complex128` here.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

In [6]:
txt = "Expanding the language coverage of speech technology has the potential to improve access to information for many more people"

print(f"text: {txt}")
txt = preprocess_text(txt, text_mapper, hps, lang=LANG)
stn_tst = text_mapper.get_text(txt, hps)
with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0).to(device)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
    hyp = net_g.infer(
        x_tst, x_tst_lengths, noise_scale=.667,
        noise_scale_w=0.8, length_scale=1.0
    )[0][0,0].cpu().float().numpy()

print(f"Generated audio")
Audio(hyp, rate=hps.data.sampling_rate)

Traceback (most recent call last):
  File "/content/fairseq/examples/mms/tts/infer.py", line 17, in <module>
    import commons
ModuleNotFoundError: No module named 'commons'
