In [60]:
import pickle 

import pandas as pd
from pathlib import Path
import pandas as pd
import numpy as np

import torch
import torchaudio
from speechbrain.pretrained import EncoderClassifier
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification, AutoConfig, AudioClassificationPipeline

from tqdm.auto import tqdm
from sklearn.neighbors import KNeighborsClassifier

In [61]:
"""
TalTechNLP/voxlingua107-epaca-tdnn - 0.66
speechbrain/google_speech_command_xvector - 0.90
anton-l/distilhubert-ft-common-language - 0
anton-l/sew-d-mid-400k-ft-keyword-spotting - >300 MB


w11wo/distil-wav2vec2-adult-child-cls-v3 - 199MB 
anantoj/distil-wav2vec2-adult-child-cls - 144MB
ntu-spml/distilhubert - 89MB
"""

'\nTalTechNLP/voxlingua107-epaca-tdnn - 0.66\nspeechbrain/google_speech_command_xvector - 0.90\nanton-l/distilhubert-ft-common-language - 0\nanton-l/sew-d-mid-400k-ft-keyword-spotting - >300 MB\n\n\nw11wo/distil-wav2vec2-adult-child-cls-v3 - 199MB \nanantoj/distil-wav2vec2-adult-child-cls - 144MB\nntu-spml/distilhubert - 89MB\n'

In [62]:
DATA_PATH = Path("../data")
WEIGHTS_PATH = Path("speechbrain/google_speech_command_xvector")
EXP_NAME = WEIGHTS_PATH.name
DEVICE = "cuda"

CLASSES = {
    'yes': 0, 
    'no': 1, 
    'up': 2, 
    'down': 3, 
    'left': 4, 
    'right': 5, 
    'on': 6, 
    'off': 7, 
    'stop': 8, 
    'go': 9
}
INV_CLASSES = dict([[v, k] for k, v in CLASSES.items()])
cos_sim = torch.nn.CosineSimilarity(dim=1)
N_NEIGHBORS = 12

## Speechbrain

In [24]:
enc_classifier = EncoderClassifier.from_hparams(
    source=WEIGHTS_PATH,
    savedir=Path("pretrained_models") / EXP_NAME,
    run_opts={"device": DEVICE},
)
audio_normalizer = enc_classifier.audio_normalizer
label_encoder = enc_classifier.hparams.label_encoder

In [25]:
def load_audio(path):
    signal, sr = torchaudio.load(str(path), channels_first=False)
    return audio_normalizer(signal, sr)

In [63]:
classifier = KNeighborsClassifier(
    n_neighbors=N_NEIGHBORS,
    metric="cosine",
    n_jobs=-1,
)
classifier

KNeighborsClassifier(metric='cosine', n_jobs=-1, n_neighbors=12)

In [27]:
audio_filepaths = sorted(list((DATA_PATH / "train").rglob("*.wav")))
rel_length = torch.tensor([1.0])
lengths = []
classes = []
embeddings = []
for audiofile in tqdm(audio_filepaths):
    class_name = audiofile.parts[-2]

    wav = load_audio(audiofile).unsqueeze(0)
    embedding = enc_classifier.encode_batch(wav, rel_length).squeeze(0).squeeze(0)

    classes.append(class_name)
    embeddings.append(embedding.cpu().numpy())
    lengths.append(wav.shape[-1] / 16000)

  0%|          | 0/88790 [00:00<?, ?it/s]

In [64]:
min(lengths)

0.418

## AutoFeatureExtractor

In [10]:
# config = AutoConfig.from_pretrained(WEIGHTS_PATH)
# print(config)
enc_model = AutoModelForAudioClassification.from_pretrained(WEIGHTS_PATH)
enc_model.save_pretrained(Path("pretrained_models") / EXP_NAME)
enc_model.classifier = torch.nn.Linear(256, 256)
enc_model = enc_model.to(DEVICE)
print(enc_model)


SEWDForSequenceClassification(
  (sew_d): SEWDModel(
    (feature_extractor): SEWDFeatureEncoder(
      (conv_layers): ModuleList(
        (0): SEWDGroupNormConvLayer(
          (conv): Conv1d(1, 64, kernel_size=(10,), stride=(5,), bias=False)
          (layer_norm): GroupNorm(64, 64, eps=1e-05, affine=True)
        )
        (1): SEWDNoLayerNormConvLayer(
          (conv): Conv1d(64, 128, kernel_size=(3,), stride=(2,), bias=False)
        )
        (2): SEWDNoLayerNormConvLayer(
          (conv): Conv1d(128, 128, kernel_size=(1,), stride=(1,), bias=False)
        )
        (3): SEWDNoLayerNormConvLayer(
          (conv): Conv1d(128, 128, kernel_size=(3,), stride=(2,), bias=False)
        )
        (4): SEWDNoLayerNormConvLayer(
          (conv): Conv1d(128, 128, kernel_size=(1,), stride=(1,), bias=False)
        )
        (5): SEWDNoLayerNormConvLayer(
          (conv): Conv1d(128, 256, kernel_size=(3,), stride=(2,), bias=False)
        )
        (6): SEWDNoLayerNormConvLayer(
       

In [5]:
audio_filepaths = sorted(list((DATA_PATH / "train").rglob("*.wav")))

classes = []
embeddings = []
for audiofile in tqdm(audio_filepaths):
    class_name = audiofile.parts[-2]

    wav, sr = torchaudio.load(audiofile)
    wav = wav.to(DEVICE)
    embedding = enc_model(wav).logits

    classes.append(class_name)
    embeddings.append(embedding.detach().cpu().numpy())

  0%|          | 0/88790 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Submit

In [7]:
train_df = pd.DataFrame(
    {
        "filepath": [str(ap) for ap in audio_filepaths],
        "category": [CLASSES[name] for name in classes],
        "embedding": embeddings 
    }
)

In [8]:
train_df.head()

Unnamed: 0,filepath,category,embedding
0,../data/train/down/0.wav,3,"[15.127331, 11.903127, 3.8200674, -0.08791563,..."
1,../data/train/down/1.wav,3,"[14.426889, 15.1516075, 3.523267, 2.2925537, 9..."
2,../data/train/down/10.wav,3,"[17.777142, 2.83203, 0.29956368, 9.412637, 12...."
3,../data/train/down/100.wav,3,"[12.743387, 12.060911, 0.9379118, 7.697887, 14..."
4,../data/train/down/1000.wav,3,"[14.446474, 8.608712, 8.189811, 10.2905855, 8...."


In [9]:
train_df.shape

(88790, 3)

In [None]:
# train_df.to_csv("../data/train_xvector.csv", index=False)

In [8]:
train_embeddings = torch.tensor(embeddings, device=DEVICE)

pred = []
test_audio_filepaths = sorted(list((DATA_PATH / "test").glob("*.wav")))
for audiofile in tqdm(test_audio_filepaths):
    # wav = load_audio(audiofile).unsqueeze(0)
    # embedding = enc_classifier.encode_batch(wav, rel_length).squeeze(0)

    wav, sr = torchaudio.load(audiofile)
    wav = wav.to(DEVICE)
    embedding = enc_model(wav).logits
    
    similarities = cos_sim(embedding, train_embeddings)
    max_similar_idx = similarities.argmax()
    class_name = classes[max_similar_idx]

    pred.append(class_name)

  0%|          | 0/29620 [00:00<?, ?it/s]

In [9]:
pd.Series(pred).value_counts()

down    29620
dtype: int64

In [13]:
pd.Series(pred).value_counts()

down     3208
stop     2967
left     2965
on       2954
up       2950
off      2932
yes      2932
no       2924
right    2916
go       2872
dtype: int64

In [10]:
sub = pd.DataFrame(data={
    "id": [a.stem for a in test_audio_filepaths],
    "category": pred,
})
sub.to_csv(f"submission_{EXP_NAME}_cos_sim.csv", index=False)

## KNN

In [66]:
# Train KNN
classifier.fit(embeddings, [CLASSES[name] for name in classes])
# save knn model
with open("knn_xvector.pkl", "wb") as fout:
    pickle.dump(classifier, fout)

In [33]:
test_audio_filepaths = sorted(list((DATA_PATH / "test").glob("*.wav")))
test_embeddings = []
for audiofile in tqdm(test_audio_filepaths):
    wav = load_audio(audiofile).unsqueeze(0)
    embedding = enc_classifier.encode_batch(wav, rel_length).squeeze(0).squeeze(0)
    test_embeddings.append(embedding.cpu().numpy())

  0%|          | 0/29620 [00:00<?, ?it/s]

In [67]:
pred = [INV_CLASSES[idx] for idx in classifier.predict(test_embeddings)]

In [68]:
sub = pd.DataFrame(data={
    "id": [a.stem for a in test_audio_filepaths],
    "category": pred,
})
sub.to_csv(f"submission_{EXP_NAME}_knn12.csv", index=False)