### Dataset

http://bark.phon.ioc.ee/voxlingua107/

### Model and toolkit

Architecture: https://arxiv.org/pdf/2005.07143.pdf

Toolkit: https://speechbrain.github.io

### HuggingFace reference

https://huggingface.co/speechbrain/lang-id-voxlingua107-ecapa

### Additional info

A related resource: a language-identification dataset for text rather than speech: https://huggingface.co/datasets/papluca/language-identification#additional-information

In [110]:
import torchaudio
from speechbrain.pretrained import EncoderClassifier
from typing import List, Tuple, Dict
import numpy as np
import torch

In [99]:
# Load the pretrained VoxLingua107 ECAPA language-identification model;
# its files are stored under the local "ckpt" directory.
# To run on CUDA, additionally pass run_opts={"device": "cuda"}.
model = EncoderClassifier.from_hparams(
    source="speechbrain/lang-id-voxlingua107-ecapa",
    savedir="ckpt",
)

In [100]:
import os

# Directory that holds the audio clips to classify.
DATA_DIR = 'data'

# Names (not full paths) of the .wav files found directly inside DATA_DIR,
# in whatever order os.listdir returns them.
wav_filenames = [
    entry
    for entry in os.listdir(DATA_DIR)
    if entry.endswith('.wav')
]

In [101]:
print(wav_filenames)

['jamie_Korean.wav', 'anders_English.wav', 'kazka_Ukrainian.wav', 'kleine_Dutch.wav', 'reezy_German.wav', 'by-india_Russian.wav', 'speech_Russian.wav', 'stromae_French.wav', 'miki_Japanese.wav', 'disfruto_Spanish.wav']


In [102]:
# Read one label per line from labels.txt, stripping surrounding whitespace.
with open('labels.txt', 'r') as labels_file:
    labels = [raw_line.strip() for raw_line in labels_file]

# The first entry is 'multilingual'; the model wasn't pretrained on that
# class, so it is dropped from the list.
labels = labels[1:]

In [105]:
import librosa
import librosa.display
import matplotlib.pyplot as plt

def plot_mel_spectrogram(wav_file_path: str):
  """Show the mel spectrogram (in dB) of one audio file.

  The audio is loaded with librosa, turned into a 128-band mel
  spectrogram, converted to decibels relative to the spectrogram's
  maximum, and displayed in a matplotlib figure.

  Args:
    wav_file_path: Path of the audio file to visualise.
  """
  # sr=None asks librosa to keep the file's own sampling rate.
  signal, sample_rate = librosa.load(wav_file_path, sr=None)

  mel_power = librosa.feature.melspectrogram(
    y=signal,
    sr=sample_rate,
    n_mels=128,
  )
  # Express the values in dB, with the maximum as the reference level.
  mel_db = librosa.power_to_db(mel_power, ref=np.max)

  plt.figure(figsize=(10, 4))
  librosa.display.specshow(
    mel_db,
    sr=sample_rate,
    x_axis='time',
    y_axis='mel',
    cmap='coolwarm',
  )
  plt.title('Mel spectrogram')
  plt.colorbar(format='%+02.0f dB')
  plt.tight_layout()
  plt.show()

def get_ground_truth_labels(wav_names: List[str]) -> List[Dict[str, str]]:
  """Derive the ground-truth language of each file from its name.

  File names are expected to look like ``<speaker>_<Language>.wav``.
  The label is the text between the LAST underscore and the first dot
  after it, so a speaker part that itself contains underscores (e.g.
  ``by_india_Russian.wav``) still yields the language.

  Args:
    wav_names: File names to annotate.

  Returns:
    One ``{'filename': <name>, 'label': <language>}`` dict per input
    name, in input order. An empty input gives an empty list.

  Raises:
    IndexError: If a file name contains no underscore.
  """
  annotated_data = []
  for wav_name in wav_names:
    # Split on the last underscore only; everything before it is the
    # speaker part, whatever characters it contains.
    label_with_ext = wav_name.rsplit('_', 1)[1]
    # Drop the extension (anything from the first dot onwards).
    label = label_with_ext.split('.', 1)[0]
    annotated_data.append({'filename': wav_name, 'label': label})
  return annotated_data

def make_top_5_prediction_from_sample(samples: List[str]) -> List[Dict]:
  """Classify each audio file and return its five best-scoring labels.

  Relies on the module-level ``model`` (the classifier), ``labels``
  (label names indexed by class position) and ``DATA_DIR`` (directory
  the sample files live in).

  Args:
    samples: File names, relative to ``DATA_DIR``, of the clips to
      classify.

  Returns:
    One ``{"filename": <name>, "prediction": [<label>, ...]}`` dict per
    sample, in input order; the labels are sorted from highest to
    lowest score.
  """
  preds = []
  for sample in samples:
    # Resolve against DATA_DIR, the directory the file list was built
    # from, rather than a second hard-coded copy of the path.
    wav = model.load_audio(os.path.join(DATA_DIR, sample))
    prediction = model.classify_batch(wav)

    # The first element of the output is used as the per-class scores
    # (presumably log-likelihoods; confirm against the SpeechBrain docs).
    # squeeze() drops the size-1 dimensions.
    scores = prediction[0].squeeze()
    # Highest score first; convert to plain ints before indexing `labels`.
    top_5_idc = torch.argsort(scores, descending=True)[:5].tolist()
    top_5_preds = [labels[ix] for ix in top_5_idc]

    preds.append({"filename": sample, "prediction": top_5_preds})
  return preds

In [106]:
wavname_to_gt = get_ground_truth_labels(wav_filenames)

In [107]:
wavname_to_gt

[{'filename': 'jamie_Korean.wav', 'label': 'Korean'},
 {'filename': 'anders_English.wav', 'label': 'English'},
 {'filename': 'kazka_Ukrainian.wav', 'label': 'Ukrainian'},
 {'filename': 'kleine_Dutch.wav', 'label': 'Dutch'},
 {'filename': 'reezy_German.wav', 'label': 'German'},
 {'filename': 'by-india_Russian.wav', 'label': 'Russian'},
 {'filename': 'speech_Russian.wav', 'label': 'Russian'},
 {'filename': 'stromae_French.wav', 'label': 'French'},
 {'filename': 'miki_Japanese.wav', 'label': 'Japanese'},
 {'filename': 'disfruto_Spanish.wav', 'label': 'Spanish'}]

In [108]:
preds = make_top_5_prediction_from_sample(wav_filenames)

In [109]:
preds

[{'filename': 'jamie_Korean.wav',
  'prediction': ['Sanskrit', 'Tatar', 'Vietnamese', 'Tibetan', 'Polish']},
 {'filename': 'anders_English.wav',
  'prediction': ['Lithuanian',
   'Esperanto',
   'Burmese',
   'Hungarian',
   'Portuguese']},
 {'filename': 'kazka_Ukrainian.wav',
  'prediction': ['Portuguese', 'Sanskrit', 'Ukrainian', 'Russian', 'Bislama']},
 {'filename': 'kleine_Dutch.wav',
  'prediction': ['Dutch', 'Icelandic', 'Danish', 'English', 'Hungarian']},
 {'filename': 'reezy_German.wav',
  'prediction': ['Luxembourgish', 'Macedonian', 'German', 'Yiddish', 'Dutch']},
 {'filename': 'by-india_Russian.wav',
  'prediction': ['Romanian', 'Turkish', 'Polish', 'Russian', 'Ukrainian']},
 {'filename': 'speech_Russian.wav',
  'prediction': ['Russian',
   'Belarusian',
   'Ukrainian',
   'Slovak',
   'Lithuanian']},
 {'filename': 'stromae_French.wav',
  'prediction': ['French', 'Ukrainian', 'Tatar', 'Haitian', 'Norwegian']},
 {'filename': 'miki_Japanese.wav',
  'prediction': ['Burmese', 'T

In [None]:
# Plot the mel spectrogram of the first clip listed in wav_filenames.
plot_mel_spectrogram(os.path.join(DATA_DIR, wav_filenames[0]))